10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 51623Stw21770 * Common Development and Distribution License (the "License"). 61623Stw21770 * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate /* 221623Stw21770 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 230Sstevel@tonic-gate * Use is subject to license terms. 240Sstevel@tonic-gate */ 250Sstevel@tonic-gate 260Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 270Sstevel@tonic-gate 280Sstevel@tonic-gate /* 290Sstevel@tonic-gate * Just in case we're not in a build environment, make sure that 300Sstevel@tonic-gate * TEXT_DOMAIN gets set to something. 
310Sstevel@tonic-gate */ 320Sstevel@tonic-gate #if !defined(TEXT_DOMAIN) 330Sstevel@tonic-gate #define TEXT_DOMAIN "SYS_TEST" 340Sstevel@tonic-gate #endif 350Sstevel@tonic-gate 360Sstevel@tonic-gate /* 370Sstevel@tonic-gate * Metadevice diskset interfaces 380Sstevel@tonic-gate */ 390Sstevel@tonic-gate 400Sstevel@tonic-gate #include "meta_set_prv.h" 410Sstevel@tonic-gate #include <meta.h> 420Sstevel@tonic-gate #include <metad.h> 430Sstevel@tonic-gate #include <mdmn_changelog.h> 440Sstevel@tonic-gate #include <sys/lvm/md_crc.h> 450Sstevel@tonic-gate #include <sys/utsname.h> 460Sstevel@tonic-gate #include <sdssc.h> 470Sstevel@tonic-gate 480Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h> 490Sstevel@tonic-gate #include <sys/sysevent/svm.h> 500Sstevel@tonic-gate extern char *blkname(char *); 510Sstevel@tonic-gate 520Sstevel@tonic-gate static md_drive_desc * 530Sstevel@tonic-gate dr2drivedesc( 540Sstevel@tonic-gate mdsetname_t *sp, 550Sstevel@tonic-gate side_t sideno, 560Sstevel@tonic-gate int flags, 570Sstevel@tonic-gate md_error_t *ep 580Sstevel@tonic-gate ) 590Sstevel@tonic-gate { 600Sstevel@tonic-gate md_set_record *sr; 610Sstevel@tonic-gate md_drive_record *dr; 620Sstevel@tonic-gate mddrivename_t *dnp; 630Sstevel@tonic-gate md_drive_desc *dd_head = NULL; 640Sstevel@tonic-gate md_set_desc *sd; 650Sstevel@tonic-gate 660Sstevel@tonic-gate if (flags & MD_BYPASS_DAEMON) { 670Sstevel@tonic-gate if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) 680Sstevel@tonic-gate return (NULL); 690Sstevel@tonic-gate sd = metaget_setdesc(sp, ep); 700Sstevel@tonic-gate sideno = getnodeside(mynode(), sd); 710Sstevel@tonic-gate sp = metafakesetname(sp->setno, sr->sr_setname); 720Sstevel@tonic-gate } else { 730Sstevel@tonic-gate if ((sr = getsetbyname(sp->setname, ep)) == NULL) 740Sstevel@tonic-gate return (NULL); 750Sstevel@tonic-gate } 760Sstevel@tonic-gate 770Sstevel@tonic-gate assert(sideno != MD_SIDEWILD); 780Sstevel@tonic-gate 790Sstevel@tonic-gate /* 800Sstevel@tonic-gate * 
WARNING: 810Sstevel@tonic-gate * The act of getting the dnp from the namespace means that we 820Sstevel@tonic-gate * will get the devid of the disk as recorded in the namespace. 830Sstevel@tonic-gate * This devid has the potential to be stale if the disk is being 840Sstevel@tonic-gate * replaced via a rebind, this means that any code that relies 850Sstevel@tonic-gate * on any of the dnp information should take the appropriate action 860Sstevel@tonic-gate * to preserve that information. For example in the rebind code the 870Sstevel@tonic-gate * devid of the new disk is saved off and then copied back in once 880Sstevel@tonic-gate * the code that has called this function has completed. 890Sstevel@tonic-gate */ 900Sstevel@tonic-gate for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { 910Sstevel@tonic-gate if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, 920Sstevel@tonic-gate flags, ep)) == NULL) { 930Sstevel@tonic-gate if (!(flags & MD_BYPASS_DAEMON)) 940Sstevel@tonic-gate free_sr(sr); 950Sstevel@tonic-gate metafreedrivedesc(&dd_head); 960Sstevel@tonic-gate return (NULL); 970Sstevel@tonic-gate } 980Sstevel@tonic-gate 990Sstevel@tonic-gate (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, 1000Sstevel@tonic-gate dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); 1010Sstevel@tonic-gate } 1020Sstevel@tonic-gate 1030Sstevel@tonic-gate if (!(flags & MD_BYPASS_DAEMON)) { 1040Sstevel@tonic-gate free_sr(sr); 1050Sstevel@tonic-gate } 1060Sstevel@tonic-gate return (dd_head); 1070Sstevel@tonic-gate } 1080Sstevel@tonic-gate 1090Sstevel@tonic-gate static int 1100Sstevel@tonic-gate get_sidenmlist( 1110Sstevel@tonic-gate mdsetname_t *sp, 1120Sstevel@tonic-gate mddrivename_t *dnp, 1130Sstevel@tonic-gate md_error_t *ep 1140Sstevel@tonic-gate ) 1150Sstevel@tonic-gate { 1160Sstevel@tonic-gate md_set_desc *sd; 1170Sstevel@tonic-gate mdsidenames_t *sn, **sn_next; 1180Sstevel@tonic-gate int i; 1190Sstevel@tonic-gate 1200Sstevel@tonic-gate if ((sd = 
metaget_setdesc(sp, ep)) == NULL) 1210Sstevel@tonic-gate return (-1); 1220Sstevel@tonic-gate 1230Sstevel@tonic-gate metaflushsidenames(dnp); 1240Sstevel@tonic-gate sn_next = &dnp->side_names; 1250Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 1260Sstevel@tonic-gate /* 1270Sstevel@tonic-gate * Only get sidenames for this node since 1280Sstevel@tonic-gate * that is the only side information stored in 1290Sstevel@tonic-gate * the local mddb for a multi-node diskset. 1300Sstevel@tonic-gate */ 1310Sstevel@tonic-gate if (sd->sd_mn_mynode) { 1320Sstevel@tonic-gate sn = Zalloc(sizeof (*sn)); 1330Sstevel@tonic-gate sn->sideno = sd->sd_mn_mynode->nd_nodeid; 1340Sstevel@tonic-gate if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 1350Sstevel@tonic-gate sn->sideno, dnp->side_names_key, &sn->dname, 1360Sstevel@tonic-gate &sn->mnum, NULL, ep)) == NULL) { 1370Sstevel@tonic-gate if (sn->dname != NULL) 1380Sstevel@tonic-gate Free(sn->dname); 1390Sstevel@tonic-gate Free(sn); 1400Sstevel@tonic-gate return (-1); 1410Sstevel@tonic-gate } 1420Sstevel@tonic-gate 1430Sstevel@tonic-gate /* Add to the end of the linked list */ 1440Sstevel@tonic-gate assert(*sn_next == NULL); 1450Sstevel@tonic-gate *sn_next = sn; 1460Sstevel@tonic-gate sn_next = &sn->next; 1470Sstevel@tonic-gate } 1480Sstevel@tonic-gate } else { 1490Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 1500Sstevel@tonic-gate /* Skip empty slots */ 1510Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 1520Sstevel@tonic-gate continue; 1530Sstevel@tonic-gate 1540Sstevel@tonic-gate sn = Zalloc(sizeof (*sn)); 1550Sstevel@tonic-gate sn->sideno = i; 1560Sstevel@tonic-gate if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 1570Sstevel@tonic-gate i+SKEW, dnp->side_names_key, &sn->dname, 1580Sstevel@tonic-gate &sn->mnum, NULL, ep)) == NULL) { 1590Sstevel@tonic-gate /* 1600Sstevel@tonic-gate * It is possible that during the add of a 1610Sstevel@tonic-gate * host to have a 'missing' side as the side 1620Sstevel@tonic-gate * for this disk 
will be added later. So ignore 1630Sstevel@tonic-gate * the error. The 'missing' side will be added 1640Sstevel@tonic-gate * once the addhosts process has completed. 1650Sstevel@tonic-gate */ 1660Sstevel@tonic-gate if (mdissyserror(ep, ENOENT)) { 1670Sstevel@tonic-gate mdclrerror(ep); 1680Sstevel@tonic-gate Free(sn); 1690Sstevel@tonic-gate continue; 1700Sstevel@tonic-gate } 1710Sstevel@tonic-gate 1720Sstevel@tonic-gate if (sn->dname != NULL) 1730Sstevel@tonic-gate Free(sn->dname); 1740Sstevel@tonic-gate Free(sn); 1750Sstevel@tonic-gate return (-1); 1760Sstevel@tonic-gate } 1770Sstevel@tonic-gate 1780Sstevel@tonic-gate /* Add to the end of the linked list */ 1790Sstevel@tonic-gate assert(*sn_next == NULL); 1800Sstevel@tonic-gate *sn_next = sn; 1810Sstevel@tonic-gate sn_next = &sn->next; 1820Sstevel@tonic-gate } 1830Sstevel@tonic-gate } 1840Sstevel@tonic-gate 1850Sstevel@tonic-gate return (0); 1860Sstevel@tonic-gate } 1870Sstevel@tonic-gate 1880Sstevel@tonic-gate static md_drive_desc * 1890Sstevel@tonic-gate rl_to_dd( 1900Sstevel@tonic-gate mdsetname_t *sp, 1910Sstevel@tonic-gate md_replicalist_t *rlp, 1920Sstevel@tonic-gate md_error_t *ep 1930Sstevel@tonic-gate ) 1940Sstevel@tonic-gate { 1950Sstevel@tonic-gate md_replicalist_t *rl; 1960Sstevel@tonic-gate md_replica_t *r; 1970Sstevel@tonic-gate md_drive_desc *dd = NULL; 1980Sstevel@tonic-gate md_drive_desc *d; 1990Sstevel@tonic-gate int found; 2000Sstevel@tonic-gate md_set_desc *sd; 2010Sstevel@tonic-gate daddr_t nblks = 0; 2020Sstevel@tonic-gate 2030Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 2040Sstevel@tonic-gate return (NULL); 2050Sstevel@tonic-gate 2060Sstevel@tonic-gate /* find the smallest existing replica */ 2070Sstevel@tonic-gate for (rl = rlp; rl != NULL; rl = rl->rl_next) { 2080Sstevel@tonic-gate r = rl->rl_repp; 2090Sstevel@tonic-gate nblks = ((nblks == 0) ? 
r->r_nblk : min(r->r_nblk, nblks)); 2100Sstevel@tonic-gate } 2110Sstevel@tonic-gate 2120Sstevel@tonic-gate if (nblks <= 0) 2130Sstevel@tonic-gate nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 2140Sstevel@tonic-gate 2150Sstevel@tonic-gate for (rl = rlp; rl != NULL; rl = rl->rl_next) { 2160Sstevel@tonic-gate r = rl->rl_repp; 2170Sstevel@tonic-gate 2180Sstevel@tonic-gate found = 0; 2190Sstevel@tonic-gate for (d = dd; d != NULL; d = d->dd_next) { 2200Sstevel@tonic-gate if (strcmp(r->r_namep->drivenamep->cname, 2210Sstevel@tonic-gate d->dd_dnp->cname) == 0) { 2220Sstevel@tonic-gate found = 1; 2230Sstevel@tonic-gate dd->dd_dbcnt++; 2240Sstevel@tonic-gate break; 2250Sstevel@tonic-gate } 2260Sstevel@tonic-gate } 2270Sstevel@tonic-gate 2280Sstevel@tonic-gate if (! found) 2290Sstevel@tonic-gate (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, 2300Sstevel@tonic-gate 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); 2310Sstevel@tonic-gate } 2320Sstevel@tonic-gate 2330Sstevel@tonic-gate return (dd); 2340Sstevel@tonic-gate } 2350Sstevel@tonic-gate 2360Sstevel@tonic-gate /* 2370Sstevel@tonic-gate * Exported Entry Points 2380Sstevel@tonic-gate */ 2390Sstevel@tonic-gate 2400Sstevel@tonic-gate set_t 2410Sstevel@tonic-gate get_max_sets(md_error_t *ep) 2420Sstevel@tonic-gate { 2430Sstevel@tonic-gate 2440Sstevel@tonic-gate static set_t max_sets = 0; 2450Sstevel@tonic-gate 2460Sstevel@tonic-gate if (max_sets == 0) 2470Sstevel@tonic-gate if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) 2480Sstevel@tonic-gate return (0); 2490Sstevel@tonic-gate 2500Sstevel@tonic-gate return (max_sets); 2510Sstevel@tonic-gate } 2520Sstevel@tonic-gate 2530Sstevel@tonic-gate int 2540Sstevel@tonic-gate get_max_meds(md_error_t *ep) 2550Sstevel@tonic-gate { 2560Sstevel@tonic-gate static int max_meds = 0; 2570Sstevel@tonic-gate 2580Sstevel@tonic-gate if (max_meds == 0) 2590Sstevel@tonic-gate if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) 2600Sstevel@tonic-gate return (0); 
2610Sstevel@tonic-gate 2620Sstevel@tonic-gate return (max_meds); 2630Sstevel@tonic-gate } 2640Sstevel@tonic-gate 2650Sstevel@tonic-gate side_t 2660Sstevel@tonic-gate getmyside(mdsetname_t *sp, md_error_t *ep) 2670Sstevel@tonic-gate { 2680Sstevel@tonic-gate md_set_desc *sd; 2690Sstevel@tonic-gate char *node = NULL; 2700Sstevel@tonic-gate side_t sideno; 2710Sstevel@tonic-gate 2720Sstevel@tonic-gate if (sp->setno == 0) 2730Sstevel@tonic-gate return (0); 2740Sstevel@tonic-gate 2750Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 2760Sstevel@tonic-gate return (MD_SIDEWILD); 2770Sstevel@tonic-gate 2780Sstevel@tonic-gate node = mynode(); 2790Sstevel@tonic-gate 2800Sstevel@tonic-gate assert(node != NULL); 2810Sstevel@tonic-gate 2820Sstevel@tonic-gate sideno = getnodeside(node, sd); 2830Sstevel@tonic-gate 2840Sstevel@tonic-gate if (sideno != MD_SIDEWILD) 2850Sstevel@tonic-gate return (sideno); 2860Sstevel@tonic-gate 2870Sstevel@tonic-gate return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); 2880Sstevel@tonic-gate } 2890Sstevel@tonic-gate 2900Sstevel@tonic-gate /* 2910Sstevel@tonic-gate * get set info from name 2920Sstevel@tonic-gate */ 2930Sstevel@tonic-gate md_set_record * 2940Sstevel@tonic-gate getsetbyname(char *setname, md_error_t *ep) 2950Sstevel@tonic-gate { 2960Sstevel@tonic-gate md_set_record *sr = NULL; 2970Sstevel@tonic-gate md_mnset_record *mnsr = NULL; 2980Sstevel@tonic-gate char *p; 2990Sstevel@tonic-gate size_t len; 3000Sstevel@tonic-gate 3010Sstevel@tonic-gate /* get set info from daemon */ 3020Sstevel@tonic-gate if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) 3030Sstevel@tonic-gate return (NULL); 3040Sstevel@tonic-gate if (sr != NULL) { 3050Sstevel@tonic-gate /* 3060Sstevel@tonic-gate * Returned record could be for a multi-node set or a 3070Sstevel@tonic-gate * non-multi-node set. 
3080Sstevel@tonic-gate */ 3090Sstevel@tonic-gate if (MD_MNSET_REC(sr)) { 3100Sstevel@tonic-gate /* 3110Sstevel@tonic-gate * Record is for a multi-node set. Reissue call 3120Sstevel@tonic-gate * to get mnset information. Need to free 3130Sstevel@tonic-gate * record as if a non-multi-node set record since 3140Sstevel@tonic-gate * that is what clnt_getset gave us. If in 3150Sstevel@tonic-gate * the daemon, don't free since this is a pointer 3160Sstevel@tonic-gate * into the setrecords array. 3170Sstevel@tonic-gate */ 3180Sstevel@tonic-gate if (! md_in_daemon) { 3190Sstevel@tonic-gate sr->sr_flags &= ~MD_SR_MN; 3200Sstevel@tonic-gate free_sr(sr); 3210Sstevel@tonic-gate } 3220Sstevel@tonic-gate if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, 3230Sstevel@tonic-gate ep) == -1) 3240Sstevel@tonic-gate return (NULL); 3250Sstevel@tonic-gate if (mnsr != NULL) 3260Sstevel@tonic-gate return ((struct md_set_record *)mnsr); 3270Sstevel@tonic-gate } else { 3280Sstevel@tonic-gate return (sr); 3290Sstevel@tonic-gate } 3300Sstevel@tonic-gate } 3310Sstevel@tonic-gate 3320Sstevel@tonic-gate /* no such set */ 3330Sstevel@tonic-gate len = strlen(setname) + 30; 3340Sstevel@tonic-gate p = Malloc(len); 3350Sstevel@tonic-gate (void) snprintf(p, len, "setname \"%s\"", setname); 3360Sstevel@tonic-gate (void) mderror(ep, MDE_NO_SET, p); 3370Sstevel@tonic-gate Free(p); 3380Sstevel@tonic-gate return (NULL); 3390Sstevel@tonic-gate } 3400Sstevel@tonic-gate 3410Sstevel@tonic-gate /* 3420Sstevel@tonic-gate * get set info from number 3430Sstevel@tonic-gate */ 3440Sstevel@tonic-gate md_set_record * 3450Sstevel@tonic-gate getsetbynum(set_t setno, md_error_t *ep) 3460Sstevel@tonic-gate { 3470Sstevel@tonic-gate md_set_record *sr; 3480Sstevel@tonic-gate md_mnset_record *mnsr = NULL; 3490Sstevel@tonic-gate char buf[100]; 3500Sstevel@tonic-gate 3510Sstevel@tonic-gate if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1) 3520Sstevel@tonic-gate return (NULL); 3530Sstevel@tonic-gate 
3540Sstevel@tonic-gate if (sr != NULL) { 3550Sstevel@tonic-gate /* 3560Sstevel@tonic-gate * Record is for a multi-node set. Reissue call 3570Sstevel@tonic-gate * to get mnset information. Need to free 3580Sstevel@tonic-gate * record as if a non-multi-node set record since 3590Sstevel@tonic-gate * that is what clnt_getset gave us. If in 3600Sstevel@tonic-gate * the daemon, don't free since this is a pointer 3610Sstevel@tonic-gate * into the setrecords array. 3620Sstevel@tonic-gate */ 3630Sstevel@tonic-gate if (MD_MNSET_REC(sr)) { 3640Sstevel@tonic-gate /* 3650Sstevel@tonic-gate * Record is for a multi-node set. Reissue call 3660Sstevel@tonic-gate * to get mnset information. 3670Sstevel@tonic-gate */ 3680Sstevel@tonic-gate if (! md_in_daemon) { 3690Sstevel@tonic-gate sr->sr_flags &= ~MD_SR_MN; 3700Sstevel@tonic-gate free_sr(sr); 3710Sstevel@tonic-gate } 3720Sstevel@tonic-gate if (clnt_mngetset(mynode(), NULL, setno, &mnsr, 3730Sstevel@tonic-gate ep) == -1) 3740Sstevel@tonic-gate return (NULL); 3750Sstevel@tonic-gate if (mnsr != NULL) 3760Sstevel@tonic-gate return ((struct md_set_record *)mnsr); 3770Sstevel@tonic-gate } else { 3780Sstevel@tonic-gate return (sr); 3790Sstevel@tonic-gate } 3800Sstevel@tonic-gate } 3810Sstevel@tonic-gate 3820Sstevel@tonic-gate (void) sprintf(buf, "setno %u", setno); 3830Sstevel@tonic-gate (void) mderror(ep, MDE_NO_SET, buf); 3840Sstevel@tonic-gate return (NULL); 3850Sstevel@tonic-gate } 3860Sstevel@tonic-gate 3870Sstevel@tonic-gate int 3880Sstevel@tonic-gate meta_check_drive_inuse( 3890Sstevel@tonic-gate mdsetname_t *sp, 3900Sstevel@tonic-gate mddrivename_t *dnp, 3910Sstevel@tonic-gate int check_db, 3920Sstevel@tonic-gate md_error_t *ep 3930Sstevel@tonic-gate ) 3940Sstevel@tonic-gate { 3950Sstevel@tonic-gate mdnamelist_t *nlp = NULL; 3960Sstevel@tonic-gate mdnamelist_t *p; 3970Sstevel@tonic-gate int rval = 0; 3980Sstevel@tonic-gate 3990Sstevel@tonic-gate /* get all underlying partitions */ 4000Sstevel@tonic-gate if (meta_getalldevs(sp, 
&nlp, check_db, ep) != 0) 4010Sstevel@tonic-gate return (-1); 4020Sstevel@tonic-gate 4030Sstevel@tonic-gate /* search for drive */ 4040Sstevel@tonic-gate for (p = nlp; (p != NULL); p = p->next) { 4050Sstevel@tonic-gate mdname_t *np = p->namep; 4060Sstevel@tonic-gate 4070Sstevel@tonic-gate if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { 4080Sstevel@tonic-gate rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, 4090Sstevel@tonic-gate NULL, dnp->cname, sp->setname)); 4100Sstevel@tonic-gate break; 4110Sstevel@tonic-gate } 4120Sstevel@tonic-gate } 4130Sstevel@tonic-gate 4140Sstevel@tonic-gate /* cleanup, return success */ 4150Sstevel@tonic-gate metafreenamelist(nlp); 4160Sstevel@tonic-gate return (rval); 4170Sstevel@tonic-gate } 4180Sstevel@tonic-gate 4190Sstevel@tonic-gate /* 4200Sstevel@tonic-gate * simple check for ownership 4210Sstevel@tonic-gate */ 4220Sstevel@tonic-gate int 4230Sstevel@tonic-gate meta_check_ownership(mdsetname_t *sp, md_error_t *ep) 4240Sstevel@tonic-gate { 4250Sstevel@tonic-gate int ownset; 4260Sstevel@tonic-gate md_set_desc *sd; 4270Sstevel@tonic-gate md_drive_desc *dd; 4280Sstevel@tonic-gate md_replicalist_t *rlp = NULL; 4290Sstevel@tonic-gate md_error_t xep = mdnullerror; 4300Sstevel@tonic-gate 4310Sstevel@tonic-gate if (metaislocalset(sp)) 4320Sstevel@tonic-gate return (0); 4330Sstevel@tonic-gate 4340Sstevel@tonic-gate ownset = own_set(sp, NULL, TRUE, ep); 4350Sstevel@tonic-gate if (! mdisok(ep)) 4360Sstevel@tonic-gate return (-1); 4370Sstevel@tonic-gate 4380Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 4390Sstevel@tonic-gate return (-1); 4400Sstevel@tonic-gate 4410Sstevel@tonic-gate dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 4420Sstevel@tonic-gate if (! 
mdisok(ep)) 4430Sstevel@tonic-gate return (-1); 4440Sstevel@tonic-gate 4450Sstevel@tonic-gate /* If we have no drive descriptors, check for no ownership */ 4460Sstevel@tonic-gate if (dd == NULL) { 4470Sstevel@tonic-gate if (ownset == MD_SETOWNER_NONE) 4480Sstevel@tonic-gate return (0); 4490Sstevel@tonic-gate 4500Sstevel@tonic-gate /* If ownership somehow has come to exist, we must clean up */ 4510Sstevel@tonic-gate 4520Sstevel@tonic-gate if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, 4530Sstevel@tonic-gate &xep) < 0) 4540Sstevel@tonic-gate mdclrerror(&xep); 4550Sstevel@tonic-gate 4560Sstevel@tonic-gate if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) 4570Sstevel@tonic-gate if (! mdisok(&xep)) 4580Sstevel@tonic-gate mdclrerror(&xep); 4590Sstevel@tonic-gate 4600Sstevel@tonic-gate if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 4610Sstevel@tonic-gate if (rel_own_bydd(sp, dd, TRUE, &xep)) 4620Sstevel@tonic-gate mdclrerror(&xep); 4630Sstevel@tonic-gate } 4640Sstevel@tonic-gate 4650Sstevel@tonic-gate if (halt_set(sp, &xep)) 4660Sstevel@tonic-gate mdclrerror(&xep); 4670Sstevel@tonic-gate 4680Sstevel@tonic-gate metafreereplicalist(rlp); 4690Sstevel@tonic-gate 4700Sstevel@tonic-gate metafreedrivedesc(&dd); 4710Sstevel@tonic-gate 4720Sstevel@tonic-gate return (0); 4730Sstevel@tonic-gate } 4740Sstevel@tonic-gate 4750Sstevel@tonic-gate metafreedrivedesc(&sd->sd_drvs); 4760Sstevel@tonic-gate 4770Sstevel@tonic-gate if (ownset == MD_SETOWNER_YES) 4780Sstevel@tonic-gate return (0); 4790Sstevel@tonic-gate 4800Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, 4810Sstevel@tonic-gate sp->setname)); 4820Sstevel@tonic-gate } 4830Sstevel@tonic-gate 4840Sstevel@tonic-gate /* 4850Sstevel@tonic-gate * simple check for ownership 4860Sstevel@tonic-gate */ 4870Sstevel@tonic-gate int 4880Sstevel@tonic-gate meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) 4890Sstevel@tonic-gate { 4900Sstevel@tonic-gate md_set_desc *sd; 
4910Sstevel@tonic-gate md_drive_desc *dd; 4920Sstevel@tonic-gate int bool; 4930Sstevel@tonic-gate 4940Sstevel@tonic-gate if (metaislocalset(sp)) 4950Sstevel@tonic-gate return (0); 4960Sstevel@tonic-gate 4970Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 4980Sstevel@tonic-gate return (-1); 4990Sstevel@tonic-gate 5000Sstevel@tonic-gate if (getnodeside(hostname, sd) == MD_SIDEWILD) 5010Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 5020Sstevel@tonic-gate hostname, NULL, sp->setname)); 5030Sstevel@tonic-gate 5040Sstevel@tonic-gate dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 5050Sstevel@tonic-gate if (! mdisok(ep)) 5060Sstevel@tonic-gate return (-1); 5070Sstevel@tonic-gate 5080Sstevel@tonic-gate if (clnt_ownset(hostname, sp, &bool, ep) == -1) 5090Sstevel@tonic-gate return (-1); 5100Sstevel@tonic-gate 5110Sstevel@tonic-gate if (dd == NULL) 5120Sstevel@tonic-gate return (0); 5130Sstevel@tonic-gate 5140Sstevel@tonic-gate metafreedrivedesc(&sd->sd_drvs); 5150Sstevel@tonic-gate 5160Sstevel@tonic-gate if (bool == TRUE) 5170Sstevel@tonic-gate return (0); 5180Sstevel@tonic-gate 5190Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, 5200Sstevel@tonic-gate sp->setname)); 5210Sstevel@tonic-gate } 5220Sstevel@tonic-gate 5230Sstevel@tonic-gate /* 5240Sstevel@tonic-gate * Function that determines if a node is in the multinode diskset 5250Sstevel@tonic-gate * membership list. Calling node passes in node to be checked and 5260Sstevel@tonic-gate * the nodelist as returned from meta_read_nodelist. This routine 5270Sstevel@tonic-gate * anticipates being called many times using the same diskset membership 5280Sstevel@tonic-gate * list which is why the alloc and free of the diskset membership list 5290Sstevel@tonic-gate * is left to the calling routine. 
5300Sstevel@tonic-gate * Returns: 5310Sstevel@tonic-gate * 1 - if a member 5320Sstevel@tonic-gate * 0 - not a member 5330Sstevel@tonic-gate */ 5340Sstevel@tonic-gate int 5350Sstevel@tonic-gate meta_is_member( 5360Sstevel@tonic-gate char *node_name, 5370Sstevel@tonic-gate md_mn_nodeid_t node_id, 5380Sstevel@tonic-gate mndiskset_membershiplist_t *nl 5390Sstevel@tonic-gate ) 5400Sstevel@tonic-gate { 5410Sstevel@tonic-gate mndiskset_membershiplist_t *nl2; 5420Sstevel@tonic-gate int flag_check_name; 5430Sstevel@tonic-gate 5440Sstevel@tonic-gate if (node_id != 0) 5450Sstevel@tonic-gate flag_check_name = 0; 5460Sstevel@tonic-gate else if (node_name != NULL) 5470Sstevel@tonic-gate flag_check_name = 1; 5480Sstevel@tonic-gate else 5490Sstevel@tonic-gate return (0); 5500Sstevel@tonic-gate 5510Sstevel@tonic-gate nl2 = nl; 5520Sstevel@tonic-gate while (nl2) { 5530Sstevel@tonic-gate if (flag_check_name) { 5540Sstevel@tonic-gate /* Compare given name against name in member list */ 5550Sstevel@tonic-gate if (strcmp(nl2->msl_node_name, node_name) == 0) 5560Sstevel@tonic-gate break; 5570Sstevel@tonic-gate } else { 5580Sstevel@tonic-gate /* Compare given nodeid against nodeid in member list */ 5590Sstevel@tonic-gate if (nl2->msl_node_id == node_id) 5600Sstevel@tonic-gate break; 5610Sstevel@tonic-gate } 5620Sstevel@tonic-gate nl2 = nl2->next; 5630Sstevel@tonic-gate } 5640Sstevel@tonic-gate /* No match found in member list */ 5650Sstevel@tonic-gate if (nl2 == NULL) { 5660Sstevel@tonic-gate return (0); 5670Sstevel@tonic-gate } 5680Sstevel@tonic-gate /* Return 1 if node is in member list */ 5690Sstevel@tonic-gate return (1); 5700Sstevel@tonic-gate } 5710Sstevel@tonic-gate 5720Sstevel@tonic-gate /* 5730Sstevel@tonic-gate * meta_getnext_devinfo should go to the host that 5740Sstevel@tonic-gate * has the device, to return the device name, driver name, minor num. 
5750Sstevel@tonic-gate * We can take the big cheat for now, since it is a requirement 5760Sstevel@tonic-gate * that the device names and device numbers are the same, and 5770Sstevel@tonic-gate * just get the info locally. 5780Sstevel@tonic-gate * 5790Sstevel@tonic-gate * This routine is very similar to meta_getnextside_devinfo except 5800Sstevel@tonic-gate * that the specific side to be used is being passed in. 5810Sstevel@tonic-gate * 5820Sstevel@tonic-gate * Exit status: 5830Sstevel@tonic-gate * 0 - No more side info to return 5840Sstevel@tonic-gate * 1 - More side info's to return 5850Sstevel@tonic-gate * -1 - An error has been detected 5860Sstevel@tonic-gate */ 5870Sstevel@tonic-gate /*ARGSUSED*/ 5880Sstevel@tonic-gate int 5890Sstevel@tonic-gate meta_getside_devinfo( 5900Sstevel@tonic-gate mdsetname_t *sp, /* for this set */ 5910Sstevel@tonic-gate char *bname, /* local block name (myside) */ 5920Sstevel@tonic-gate side_t sideno, /* sideno */ 5930Sstevel@tonic-gate char **ret_bname, /* block device name of returned side */ 5940Sstevel@tonic-gate char **ret_dname, /* driver name of returned side */ 5950Sstevel@tonic-gate minor_t *ret_mnum, /* minor number of returned side */ 5960Sstevel@tonic-gate md_error_t *ep 5970Sstevel@tonic-gate ) 5980Sstevel@tonic-gate { 5990Sstevel@tonic-gate mdname_t *np; 6000Sstevel@tonic-gate 6010Sstevel@tonic-gate if (ret_bname != NULL) 6020Sstevel@tonic-gate *ret_bname = NULL; 6030Sstevel@tonic-gate if (ret_dname != NULL) 6040Sstevel@tonic-gate *ret_dname = NULL; 6050Sstevel@tonic-gate if (ret_mnum != NULL) 6060Sstevel@tonic-gate *ret_mnum = NODEV32; 6070Sstevel@tonic-gate 6080Sstevel@tonic-gate 6091623Stw21770 if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL) 6100Sstevel@tonic-gate return (-1); 6110Sstevel@tonic-gate 6120Sstevel@tonic-gate /* 6130Sstevel@tonic-gate * NOTE (future) - There will be more work here once devids are integrated 6140Sstevel@tonic-gate * into disksets. 
Then the side should be used to find the correct 6150Sstevel@tonic-gate * host and the b/d names should be gotten from that host. 6160Sstevel@tonic-gate */ 6170Sstevel@tonic-gate 6180Sstevel@tonic-gate /* 6190Sstevel@tonic-gate * Return the side info. 6200Sstevel@tonic-gate */ 6210Sstevel@tonic-gate if (ret_bname != NULL) 6220Sstevel@tonic-gate *ret_bname = Strdup(np->bname); 6230Sstevel@tonic-gate 6240Sstevel@tonic-gate if (ret_dname != NULL) { 6250Sstevel@tonic-gate mdcinfo_t *cinfo; 6260Sstevel@tonic-gate 6270Sstevel@tonic-gate if ((cinfo = metagetcinfo(np, ep)) == NULL) 6280Sstevel@tonic-gate return (-1); 6290Sstevel@tonic-gate 6300Sstevel@tonic-gate *ret_dname = Strdup(cinfo->dname); 6310Sstevel@tonic-gate } 6320Sstevel@tonic-gate 6330Sstevel@tonic-gate if (ret_mnum != NULL) 6340Sstevel@tonic-gate *ret_mnum = meta_getminor(np->dev); 6350Sstevel@tonic-gate 6360Sstevel@tonic-gate return (1); 6370Sstevel@tonic-gate } 6380Sstevel@tonic-gate 6390Sstevel@tonic-gate /* 6400Sstevel@tonic-gate * Get the information on the device from the remote node using the devid 6410Sstevel@tonic-gate * of the disk. 
6420Sstevel@tonic-gate * 6430Sstevel@tonic-gate * Exit status: 6440Sstevel@tonic-gate * 0 - No more side info to return 6450Sstevel@tonic-gate * 1 - More side info's to return 6460Sstevel@tonic-gate * -1 - An error has been detected 6470Sstevel@tonic-gate */ 6480Sstevel@tonic-gate int 6490Sstevel@tonic-gate meta_getnextside_devinfo( 6500Sstevel@tonic-gate mdsetname_t *sp, /* for this set */ 6510Sstevel@tonic-gate char *bname, /* local block name (myside) */ 6520Sstevel@tonic-gate side_t *sideno, /* previous sideno & returned sideno */ 6530Sstevel@tonic-gate char **ret_bname, /* block device name of returned side */ 6540Sstevel@tonic-gate char **ret_dname, /* driver name of returned side */ 6550Sstevel@tonic-gate minor_t *ret_mnum, /* minor number of returned side */ 6560Sstevel@tonic-gate md_error_t *ep 6570Sstevel@tonic-gate ) 6580Sstevel@tonic-gate { 6590Sstevel@tonic-gate md_set_desc *sd; 6600Sstevel@tonic-gate int i; 6610Sstevel@tonic-gate mdname_t *np; 6620Sstevel@tonic-gate mddrivename_t *dnp; 6630Sstevel@tonic-gate char *devidstr = NULL; 6640Sstevel@tonic-gate int devidstrlen; 6650Sstevel@tonic-gate md_dev64_t retdev = NODEV64; 6660Sstevel@tonic-gate char *ret_devname = NULL; 6670Sstevel@tonic-gate char *ret_blkdevname = NULL; 6680Sstevel@tonic-gate char *ret_driver = NULL; 6690Sstevel@tonic-gate char *nodename; 6700Sstevel@tonic-gate int fd; 6710Sstevel@tonic-gate int ret = -1; 6720Sstevel@tonic-gate char *minor_name = NULL; 6730Sstevel@tonic-gate md_mnnode_desc *nd; 6740Sstevel@tonic-gate 6750Sstevel@tonic-gate 6760Sstevel@tonic-gate if (ret_bname != NULL) 6770Sstevel@tonic-gate *ret_bname = NULL; 6780Sstevel@tonic-gate if (ret_dname != NULL) 6790Sstevel@tonic-gate *ret_dname = NULL; 6800Sstevel@tonic-gate if (ret_mnum != NULL) 6810Sstevel@tonic-gate *ret_mnum = NODEV32; 6820Sstevel@tonic-gate 6830Sstevel@tonic-gate if (metaislocalset(sp)) { 6840Sstevel@tonic-gate /* no more sides - we are done */ 6850Sstevel@tonic-gate if (*sideno != MD_SIDEWILD) 
6860Sstevel@tonic-gate return (0); 6870Sstevel@tonic-gate 6880Sstevel@tonic-gate /* First time through - set up return sideno */ 6890Sstevel@tonic-gate *sideno = 0; 6900Sstevel@tonic-gate } else { 6910Sstevel@tonic-gate 6920Sstevel@tonic-gate /* 6930Sstevel@tonic-gate * Find the next sideno, starting after the one given. 6940Sstevel@tonic-gate */ 6950Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 6960Sstevel@tonic-gate return (-1); 6970Sstevel@tonic-gate 6980Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 6990Sstevel@tonic-gate nd = sd->sd_nodelist; 7000Sstevel@tonic-gate if ((*sideno == MD_SIDEWILD) && 7010Sstevel@tonic-gate (nd != (struct md_mnnode_desc *)NULL)) { 7020Sstevel@tonic-gate *sideno = nd->nd_nodeid; 7030Sstevel@tonic-gate } else { 7040Sstevel@tonic-gate while (nd) { 7050Sstevel@tonic-gate /* 7060Sstevel@tonic-gate * Found given sideno, now find 7070Sstevel@tonic-gate * next sideno, if there is one. 7080Sstevel@tonic-gate */ 7090Sstevel@tonic-gate if ((*sideno == nd->nd_nodeid) && 7100Sstevel@tonic-gate (nd->nd_next != 7110Sstevel@tonic-gate (struct md_mnnode_desc *)NULL)) { 7120Sstevel@tonic-gate *sideno = 7130Sstevel@tonic-gate nd->nd_next->nd_nodeid; 7140Sstevel@tonic-gate break; 7150Sstevel@tonic-gate } 7160Sstevel@tonic-gate nd = nd->nd_next; 7170Sstevel@tonic-gate } 7180Sstevel@tonic-gate if (nd == NULL) { 7190Sstevel@tonic-gate return (0); 7200Sstevel@tonic-gate } 7210Sstevel@tonic-gate } 7220Sstevel@tonic-gate if (*sideno == MD_SIDEWILD) 7230Sstevel@tonic-gate return (0); 7240Sstevel@tonic-gate } else { 7250Sstevel@tonic-gate for (i = (*sideno)+1; i < MD_MAXSIDES; i++) 7260Sstevel@tonic-gate /* Find next full slot */ 7270Sstevel@tonic-gate if (sd->sd_nodes[i][0] != '\0') 7280Sstevel@tonic-gate break; 7290Sstevel@tonic-gate 7300Sstevel@tonic-gate /* No more sides - we are done */ 7310Sstevel@tonic-gate if (i == MD_MAXSIDES) 7320Sstevel@tonic-gate return (0); 7330Sstevel@tonic-gate 7340Sstevel@tonic-gate /* Set up the return sideno */ 
7350Sstevel@tonic-gate *sideno = i; 7360Sstevel@tonic-gate nodename = (char *)sd->sd_nodes[i]; 7370Sstevel@tonic-gate } 7380Sstevel@tonic-gate } 7390Sstevel@tonic-gate 7400Sstevel@tonic-gate /* 7410Sstevel@tonic-gate * Need to pass the node the devid of the disk and get it to 7420Sstevel@tonic-gate * send back the details of the disk from that side. 7430Sstevel@tonic-gate */ 7441623Stw21770 if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL) 7450Sstevel@tonic-gate return (-1); 7460Sstevel@tonic-gate 7470Sstevel@tonic-gate dnp = np->drivenamep; 7480Sstevel@tonic-gate 7490Sstevel@tonic-gate /* 7500Sstevel@tonic-gate * By default, set up the parameters so that they are copied out. 7510Sstevel@tonic-gate */ 7520Sstevel@tonic-gate if (ret_bname != NULL) 7530Sstevel@tonic-gate *ret_bname = Strdup(np->bname); 7540Sstevel@tonic-gate 7550Sstevel@tonic-gate if (ret_dname != NULL) { 7560Sstevel@tonic-gate mdcinfo_t *cinfo; 7570Sstevel@tonic-gate 7580Sstevel@tonic-gate if ((cinfo = metagetcinfo(np, ep)) == NULL) 7590Sstevel@tonic-gate return (-1); 7600Sstevel@tonic-gate 7610Sstevel@tonic-gate *ret_dname = Strdup(cinfo->dname); 7620Sstevel@tonic-gate } 7630Sstevel@tonic-gate 7640Sstevel@tonic-gate if (ret_mnum != NULL) 7650Sstevel@tonic-gate *ret_mnum = meta_getminor(np->dev); 7660Sstevel@tonic-gate 7670Sstevel@tonic-gate /* 7680Sstevel@tonic-gate * Try some optimization. If this is the local set or the device 7690Sstevel@tonic-gate * is a metadevice then just copy the information. If the device 7700Sstevel@tonic-gate * does not have a devid (due to not having a minor name) then 7710Sstevel@tonic-gate * fall back to the pre-devid behaviour of copying the information 7720Sstevel@tonic-gate * on the device: this is okay because the sanity checks before this 7730Sstevel@tonic-gate * call would have found any issues with the device. If it's a 7740Sstevel@tonic-gate * multi-node diskset also just return ie. copy. 
7750Sstevel@tonic-gate */ 7760Sstevel@tonic-gate if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || 7770Sstevel@tonic-gate (MD_MNSET_DESC(sd))) 7780Sstevel@tonic-gate return (1); 7790Sstevel@tonic-gate 7800Sstevel@tonic-gate if (np->minor_name == (char *)NULL) { 7810Sstevel@tonic-gate /* 7820Sstevel@tonic-gate * Have to get the minor name then. The slice should exist 7830Sstevel@tonic-gate * on the disk because it will have already been repartitioned 7840Sstevel@tonic-gate * up prior to getting to this point. 7850Sstevel@tonic-gate */ 7860Sstevel@tonic-gate if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { 7870Sstevel@tonic-gate (void) mdsyserror(ep, errno, np->bname); 7880Sstevel@tonic-gate return (-1); 7890Sstevel@tonic-gate } 7900Sstevel@tonic-gate (void) devid_get_minor_name(fd, &minor_name); 7910Sstevel@tonic-gate np->minor_name = Strdup(minor_name); 7920Sstevel@tonic-gate devid_str_free(minor_name); 7930Sstevel@tonic-gate (void) close(fd); 7940Sstevel@tonic-gate } 7950Sstevel@tonic-gate 7960Sstevel@tonic-gate /* allocate extra space for "/" and NULL hence +2 */ 7970Sstevel@tonic-gate devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; 7980Sstevel@tonic-gate devidstr = (char *)Malloc(devidstrlen); 7990Sstevel@tonic-gate 8000Sstevel@tonic-gate /* 8010Sstevel@tonic-gate * As a minor name is supplied then the ret_devname will be 8020Sstevel@tonic-gate * appropriate to that minor_name and in this case it will be 8030Sstevel@tonic-gate * a block device ie /dev/dsk. 
8040Sstevel@tonic-gate */ 8050Sstevel@tonic-gate (void) snprintf(devidstr, devidstrlen, 8060Sstevel@tonic-gate "%s/%s", dnp->devid, np->minor_name); 8070Sstevel@tonic-gate 8080Sstevel@tonic-gate ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, 8090Sstevel@tonic-gate np->bname, &ret_devname, &ret_driver, ep); 8100Sstevel@tonic-gate 8110Sstevel@tonic-gate Free(devidstr); 8120Sstevel@tonic-gate 8130Sstevel@tonic-gate /* 8140Sstevel@tonic-gate * If the other side is not running device id in disksets, 8150Sstevel@tonic-gate * 'ret' is set to ENOTSUP in which case we fallback to 8160Sstevel@tonic-gate * the existing behaviour 8170Sstevel@tonic-gate */ 8180Sstevel@tonic-gate if (ret == ENOTSUP) 8190Sstevel@tonic-gate return (1); 8200Sstevel@tonic-gate else if (ret == -1) 8210Sstevel@tonic-gate return (-1); 8220Sstevel@tonic-gate 8230Sstevel@tonic-gate /* 8240Sstevel@tonic-gate * ret_devname comes from the rpc call and is a 8250Sstevel@tonic-gate * raw device name. We need to make this into a 8260Sstevel@tonic-gate * block device via blkname for further processing. 8270Sstevel@tonic-gate * Unfortunately, when our device id isn't found in 8280Sstevel@tonic-gate * the system, the rpc call will return a " " in 8290Sstevel@tonic-gate * ret_devname in which case we need to fill that in 8300Sstevel@tonic-gate * as ret_blkname because blkname of " " returns NULL. 
8310Sstevel@tonic-gate */ 8320Sstevel@tonic-gate if (ret_bname != NULL && ret_devname != NULL) { 8330Sstevel@tonic-gate ret_blkdevname = blkname(ret_devname); 8340Sstevel@tonic-gate if (ret_blkdevname == NULL) 8350Sstevel@tonic-gate *ret_bname = Strdup(ret_devname); 8360Sstevel@tonic-gate else 8370Sstevel@tonic-gate *ret_bname = Strdup(ret_blkdevname); 8380Sstevel@tonic-gate } 8390Sstevel@tonic-gate 8400Sstevel@tonic-gate if (ret_dname != NULL && ret_driver != NULL) 8410Sstevel@tonic-gate *ret_dname = Strdup(ret_driver); 8420Sstevel@tonic-gate 8430Sstevel@tonic-gate if (ret_mnum != NULL) 8440Sstevel@tonic-gate *ret_mnum = meta_getminor(retdev); 8450Sstevel@tonic-gate 8460Sstevel@tonic-gate return (1); 8470Sstevel@tonic-gate } 8480Sstevel@tonic-gate 8490Sstevel@tonic-gate int 8500Sstevel@tonic-gate meta_is_drive_in_anyset( 8510Sstevel@tonic-gate mddrivename_t *dnp, 8520Sstevel@tonic-gate mdsetname_t **spp, 8530Sstevel@tonic-gate int bypass_daemon, 8540Sstevel@tonic-gate md_error_t *ep 8550Sstevel@tonic-gate ) 8560Sstevel@tonic-gate { 8570Sstevel@tonic-gate set_t setno; 8580Sstevel@tonic-gate mdsetname_t *this_sp; 8590Sstevel@tonic-gate int is_it; 8600Sstevel@tonic-gate set_t max_sets; 8610Sstevel@tonic-gate 8620Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) 8630Sstevel@tonic-gate return (-1); 8640Sstevel@tonic-gate 8650Sstevel@tonic-gate assert(spp != NULL); 8660Sstevel@tonic-gate *spp = NULL; 8670Sstevel@tonic-gate 8680Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 8690Sstevel@tonic-gate if (!bypass_daemon) { 8700Sstevel@tonic-gate if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 8710Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_NODB)) { 8720Sstevel@tonic-gate mdclrerror(ep); 8730Sstevel@tonic-gate return (0); 8740Sstevel@tonic-gate } 8750Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 8760Sstevel@tonic-gate mdclrerror(ep); 8770Sstevel@tonic-gate continue; 8780Sstevel@tonic-gate } 8790Sstevel@tonic-gate return (-1); 
8800Sstevel@tonic-gate } 8810Sstevel@tonic-gate } else 8820Sstevel@tonic-gate this_sp = metafakesetname(setno, NULL); 8830Sstevel@tonic-gate 8840Sstevel@tonic-gate if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, 8850Sstevel@tonic-gate bypass_daemon, ep)) == -1) { 8860Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 8870Sstevel@tonic-gate mdclrerror(ep); 8880Sstevel@tonic-gate continue; 8890Sstevel@tonic-gate } 8900Sstevel@tonic-gate return (-1); 8910Sstevel@tonic-gate } 8920Sstevel@tonic-gate if (is_it) { 8930Sstevel@tonic-gate *spp = this_sp; 8940Sstevel@tonic-gate return (0); 8950Sstevel@tonic-gate } 8960Sstevel@tonic-gate } 8970Sstevel@tonic-gate return (0); 8980Sstevel@tonic-gate } 8990Sstevel@tonic-gate 9000Sstevel@tonic-gate int 9010Sstevel@tonic-gate meta_is_drive_in_thisset( 9020Sstevel@tonic-gate mdsetname_t *sp, 9030Sstevel@tonic-gate mddrivename_t *dnp, 9040Sstevel@tonic-gate int bypass_daemon, 9050Sstevel@tonic-gate md_error_t *ep 9060Sstevel@tonic-gate ) 9070Sstevel@tonic-gate { 9080Sstevel@tonic-gate md_drive_desc *dd, *p; 9090Sstevel@tonic-gate 9100Sstevel@tonic-gate if (bypass_daemon) 9110Sstevel@tonic-gate dd = dr2drivedesc(sp, MD_SIDEWILD, 9120Sstevel@tonic-gate (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); 9130Sstevel@tonic-gate else 9140Sstevel@tonic-gate dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 9150Sstevel@tonic-gate 9160Sstevel@tonic-gate if (dd == NULL) { 9170Sstevel@tonic-gate if (! mdisok(ep)) 9180Sstevel@tonic-gate return (-1); 9190Sstevel@tonic-gate return (0); 9200Sstevel@tonic-gate } 9210Sstevel@tonic-gate 9220Sstevel@tonic-gate 9230Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next) 9240Sstevel@tonic-gate if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) 9250Sstevel@tonic-gate return (1); 9260Sstevel@tonic-gate return (0); 9270Sstevel@tonic-gate } 9280Sstevel@tonic-gate 929*1945Sjeanm /* 930*1945Sjeanm * Check to see if devid is in use in any diskset. 
931*1945Sjeanm * This is used in the case when a partial diskset is being imported 932*1945Sjeanm * to make sure that the unvailable drive isn't already in use in an 933*1945Sjeanm * already imported partial diskset. Can't check on the cname since the 934*1945Sjeanm * unavailable disk's cname is from the previous system and may collide 935*1945Sjeanm * with a cname on this system. 936*1945Sjeanm * Return values: 937*1945Sjeanm * 1: devid has been found in a diskset 938*1945Sjeanm * 0: devid not found in any diskset 939*1945Sjeanm */ 940*1945Sjeanm int 941*1945Sjeanm meta_is_devid_in_anyset( 942*1945Sjeanm void *devid, 943*1945Sjeanm mdsetname_t **spp, 944*1945Sjeanm md_error_t *ep 945*1945Sjeanm ) 946*1945Sjeanm { 947*1945Sjeanm set_t setno; 948*1945Sjeanm mdsetname_t *this_sp; 949*1945Sjeanm int is_it; 950*1945Sjeanm set_t max_sets; 951*1945Sjeanm 952*1945Sjeanm if ((max_sets = get_max_sets(ep)) == 0) 953*1945Sjeanm return (-1); 954*1945Sjeanm 955*1945Sjeanm assert(spp != NULL); 956*1945Sjeanm *spp = NULL; 957*1945Sjeanm 958*1945Sjeanm for (setno = 1; setno < max_sets; setno++) { 959*1945Sjeanm if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 960*1945Sjeanm if (mdismddberror(ep, MDE_DB_NODB)) { 961*1945Sjeanm mdclrerror(ep); 962*1945Sjeanm return (0); 963*1945Sjeanm } 964*1945Sjeanm if (mdiserror(ep, MDE_NO_SET)) { 965*1945Sjeanm mdclrerror(ep); 966*1945Sjeanm continue; 967*1945Sjeanm } 968*1945Sjeanm return (-1); 969*1945Sjeanm } 970*1945Sjeanm 971*1945Sjeanm if ((is_it = meta_is_devid_in_thisset(this_sp, 972*1945Sjeanm devid, ep)) == -1) { 973*1945Sjeanm if (mdiserror(ep, MDE_NO_SET)) { 974*1945Sjeanm mdclrerror(ep); 975*1945Sjeanm continue; 976*1945Sjeanm } 977*1945Sjeanm return (-1); 978*1945Sjeanm } 979*1945Sjeanm if (is_it) { 980*1945Sjeanm *spp = this_sp; 981*1945Sjeanm return (0); 982*1945Sjeanm } 983*1945Sjeanm } 984*1945Sjeanm return (0); 985*1945Sjeanm } 986*1945Sjeanm 987*1945Sjeanm int 988*1945Sjeanm meta_is_devid_in_thisset( 989*1945Sjeanm 
mdsetname_t *sp, 990*1945Sjeanm void *devid, 991*1945Sjeanm md_error_t *ep 992*1945Sjeanm ) 993*1945Sjeanm { 994*1945Sjeanm md_drive_desc *dd, *p; 995*1945Sjeanm ddi_devid_t dd_devid; 996*1945Sjeanm 997*1945Sjeanm dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 998*1945Sjeanm if (dd == NULL) { 999*1945Sjeanm if (! mdisok(ep)) 1000*1945Sjeanm return (-1); 1001*1945Sjeanm return (0); 1002*1945Sjeanm } 1003*1945Sjeanm 1004*1945Sjeanm for (p = dd; p != NULL; p = p->dd_next) { 1005*1945Sjeanm if (p->dd_dnp->devid == NULL) 1006*1945Sjeanm continue; 1007*1945Sjeanm (void) devid_str_decode(p->dd_dnp->devid, 1008*1945Sjeanm &dd_devid, NULL); 1009*1945Sjeanm if (dd_devid == NULL) 1010*1945Sjeanm continue; 1011*1945Sjeanm if (devid_compare(devid, dd_devid) == 0) { 1012*1945Sjeanm devid_free(dd_devid); 1013*1945Sjeanm return (1); 1014*1945Sjeanm } 1015*1945Sjeanm devid_free(dd_devid); 1016*1945Sjeanm } 1017*1945Sjeanm return (0); 1018*1945Sjeanm } 1019*1945Sjeanm 10200Sstevel@tonic-gate int 10210Sstevel@tonic-gate meta_set_balance( 10220Sstevel@tonic-gate mdsetname_t *sp, 10230Sstevel@tonic-gate md_error_t *ep 10240Sstevel@tonic-gate ) 10250Sstevel@tonic-gate { 10260Sstevel@tonic-gate md_set_desc *sd; 10270Sstevel@tonic-gate md_drive_desc *dd, *curdd; 10280Sstevel@tonic-gate daddr_t dbsize; 10290Sstevel@tonic-gate daddr_t nblks; 10300Sstevel@tonic-gate int i; 10310Sstevel@tonic-gate int rval = 0; 10320Sstevel@tonic-gate sigset_t oldsigs; 10330Sstevel@tonic-gate md_setkey_t *cl_sk; 10340Sstevel@tonic-gate md_error_t xep = mdnullerror; 10350Sstevel@tonic-gate md_mnnode_desc *nd; 10360Sstevel@tonic-gate int suspend1_flag = 0; 10370Sstevel@tonic-gate 10380Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 10390Sstevel@tonic-gate return (-1); 10400Sstevel@tonic-gate 10410Sstevel@tonic-gate dbsize = (MD_MNSET_DESC(sd)) ? 
MD_MN_DBSIZE : MD_DBSIZE; 10420Sstevel@tonic-gate 10430Sstevel@tonic-gate /* Make sure we own the set */ 10440Sstevel@tonic-gate if (meta_check_ownership(sp, ep) != 0) 10450Sstevel@tonic-gate return (-1); 10460Sstevel@tonic-gate 10470Sstevel@tonic-gate /* END CHECK CODE */ 10480Sstevel@tonic-gate 10490Sstevel@tonic-gate /* 10500Sstevel@tonic-gate * Get drive descriptors for the drives that are currently in the set. 10510Sstevel@tonic-gate */ 10520Sstevel@tonic-gate curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); 10530Sstevel@tonic-gate 10540Sstevel@tonic-gate if (! mdisok(ep)) 10550Sstevel@tonic-gate return (-1); 10560Sstevel@tonic-gate 10570Sstevel@tonic-gate /* Find the minimum replica size in use is or use the default */ 10580Sstevel@tonic-gate if ((nblks = meta_db_minreplica(sp, ep)) < 0) 10590Sstevel@tonic-gate mdclrerror(ep); 10600Sstevel@tonic-gate else 10610Sstevel@tonic-gate dbsize = nblks; /* adjust replica size */ 10620Sstevel@tonic-gate 10630Sstevel@tonic-gate /* Make sure we are blocking all signals */ 10640Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0) 10650Sstevel@tonic-gate mdclrerror(&xep); 10660Sstevel@tonic-gate 10670Sstevel@tonic-gate /* 10680Sstevel@tonic-gate * Lock the set on current set members. 10690Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against 10700Sstevel@tonic-gate * other meta* commands running on the other nodes. 
10710Sstevel@tonic-gate */ 10720Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 10730Sstevel@tonic-gate nd = sd->sd_nodelist; 10740Sstevel@tonic-gate while (nd) { 10750Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 10760Sstevel@tonic-gate nd = nd->nd_next; 10770Sstevel@tonic-gate continue; 10780Sstevel@tonic-gate } 10790Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 10800Sstevel@tonic-gate rval = -1; 10810Sstevel@tonic-gate goto out; 10820Sstevel@tonic-gate } 10830Sstevel@tonic-gate nd = nd->nd_next; 10840Sstevel@tonic-gate } 10850Sstevel@tonic-gate /* 10860Sstevel@tonic-gate * Lock out other meta* commands by suspending 10870Sstevel@tonic-gate * class 1 messages across the diskset. 10880Sstevel@tonic-gate */ 10890Sstevel@tonic-gate nd = sd->sd_nodelist; 10900Sstevel@tonic-gate while (nd) { 10910Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 10920Sstevel@tonic-gate nd = nd->nd_next; 10930Sstevel@tonic-gate continue; 10940Sstevel@tonic-gate } 10950Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, 10960Sstevel@tonic-gate COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 10970Sstevel@tonic-gate MD_MSCF_NO_FLAGS, ep)) { 10980Sstevel@tonic-gate rval = -1; 10990Sstevel@tonic-gate goto out; 11000Sstevel@tonic-gate } 11010Sstevel@tonic-gate suspend1_flag = 1; 11020Sstevel@tonic-gate nd = nd->nd_next; 11030Sstevel@tonic-gate } 11040Sstevel@tonic-gate } else { 11050Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 11060Sstevel@tonic-gate /* Skip empty slots */ 11070Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') continue; 11080Sstevel@tonic-gate 11090Sstevel@tonic-gate if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 11100Sstevel@tonic-gate rval = -1; 11110Sstevel@tonic-gate goto out; 11120Sstevel@tonic-gate } 11130Sstevel@tonic-gate } 11140Sstevel@tonic-gate } 11150Sstevel@tonic-gate 11160Sstevel@tonic-gate /* We are not adding or deleting any drives, just balancing */ 11170Sstevel@tonic-gate dd = NULL; 11180Sstevel@tonic-gate 
11190Sstevel@tonic-gate /* 11200Sstevel@tonic-gate * Balance the DB's according to the list of existing drives and the 11210Sstevel@tonic-gate * list of added drives. 11220Sstevel@tonic-gate */ 11230Sstevel@tonic-gate if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) 11240Sstevel@tonic-gate goto out; 11250Sstevel@tonic-gate 11260Sstevel@tonic-gate out: 11270Sstevel@tonic-gate /* 11280Sstevel@tonic-gate * Unlock diskset by resuming class 1 messages across the diskset. 11290Sstevel@tonic-gate * Just resume all classes so that resume is the same whether 11300Sstevel@tonic-gate * just one class was locked or all classes were locked. 11310Sstevel@tonic-gate */ 11320Sstevel@tonic-gate if (suspend1_flag) { 11330Sstevel@tonic-gate nd = sd->sd_nodelist; 11340Sstevel@tonic-gate while (nd) { 11350Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 11360Sstevel@tonic-gate nd = nd->nd_next; 11370Sstevel@tonic-gate continue; 11380Sstevel@tonic-gate } 11390Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 11400Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 11410Sstevel@tonic-gate /* 11420Sstevel@tonic-gate * We are here because we failed to resume 11430Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 11440Sstevel@tonic-gate * an error from the previous call 11450Sstevel@tonic-gate * (meta_db_balance). If the previous call 11460Sstevel@tonic-gate * did fail, we capture that error and 11470Sstevel@tonic-gate * generate a perror withthe string, 11480Sstevel@tonic-gate * "Unable to resume...". 11490Sstevel@tonic-gate * Setting rval to -1 ensures that in the 11500Sstevel@tonic-gate * next iteration of the loop, ep is not 11510Sstevel@tonic-gate * clobbered. 
11520Sstevel@tonic-gate */ 11530Sstevel@tonic-gate if (rval == 0) 11540Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 11550Sstevel@tonic-gate else 11560Sstevel@tonic-gate mdclrerror(&xep); 11570Sstevel@tonic-gate rval = -1; 11580Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 11590Sstevel@tonic-gate "Unable to resume rpc.mdcommd.")); 11600Sstevel@tonic-gate } 11610Sstevel@tonic-gate nd = nd->nd_next; 11620Sstevel@tonic-gate } 11630Sstevel@tonic-gate } 11640Sstevel@tonic-gate 11650Sstevel@tonic-gate /* Unlock the set */ 11660Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 11670Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 11680Sstevel@tonic-gate nd = sd->sd_nodelist; 11690Sstevel@tonic-gate while (nd) { 11700Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 11710Sstevel@tonic-gate nd = nd->nd_next; 11720Sstevel@tonic-gate continue; 11730Sstevel@tonic-gate } 11740Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 11750Sstevel@tonic-gate if (rval == 0) 11760Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 11770Sstevel@tonic-gate else 11780Sstevel@tonic-gate mdclrerror(&xep); 11790Sstevel@tonic-gate rval = -1; 11800Sstevel@tonic-gate } 11810Sstevel@tonic-gate nd = nd->nd_next; 11820Sstevel@tonic-gate } 11830Sstevel@tonic-gate } else { 11840Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 11850Sstevel@tonic-gate /* Skip empty slots */ 11860Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 11870Sstevel@tonic-gate continue; 11880Sstevel@tonic-gate 11890Sstevel@tonic-gate if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 11900Sstevel@tonic-gate if (rval == 0) 11910Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 11920Sstevel@tonic-gate rval = -1; 11930Sstevel@tonic-gate } 11940Sstevel@tonic-gate } 11950Sstevel@tonic-gate } 11960Sstevel@tonic-gate 11970Sstevel@tonic-gate /* release signals back to what they were on entry */ 11980Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0) 
11990Sstevel@tonic-gate mdclrerror(&xep); 12000Sstevel@tonic-gate 12010Sstevel@tonic-gate cl_set_setkey(NULL); 12020Sstevel@tonic-gate 12030Sstevel@tonic-gate metaflushsetname(sp); 12040Sstevel@tonic-gate 12050Sstevel@tonic-gate return (rval); 12060Sstevel@tonic-gate } 12070Sstevel@tonic-gate 12080Sstevel@tonic-gate int 12090Sstevel@tonic-gate meta_set_destroy( 12100Sstevel@tonic-gate mdsetname_t *sp, 12110Sstevel@tonic-gate int lock_set, 12120Sstevel@tonic-gate md_error_t *ep 12130Sstevel@tonic-gate ) 12140Sstevel@tonic-gate { 12150Sstevel@tonic-gate int i; 12160Sstevel@tonic-gate med_rec_t medr; 12170Sstevel@tonic-gate md_set_desc *sd; 12180Sstevel@tonic-gate md_drive_desc *dd, *p, *p1; 12190Sstevel@tonic-gate mddrivename_t *dnp; 12200Sstevel@tonic-gate mdname_t *np; 12210Sstevel@tonic-gate mdnamelist_t *nlp = NULL; 12220Sstevel@tonic-gate int num_users = 0; 12230Sstevel@tonic-gate int has_set; 12240Sstevel@tonic-gate side_t mysideno; 12250Sstevel@tonic-gate sigset_t oldsigs; 12260Sstevel@tonic-gate md_error_t xep = mdnullerror; 12270Sstevel@tonic-gate md_setkey_t *cl_sk; 12280Sstevel@tonic-gate int rval = 0; 12290Sstevel@tonic-gate int delete_end = 1; 12300Sstevel@tonic-gate 12310Sstevel@tonic-gate /* Make sure we are blocking all signals */ 12320Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, ep) < 0) 12330Sstevel@tonic-gate return (-1); 12340Sstevel@tonic-gate 12350Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 12360Sstevel@tonic-gate if (! mdisok(ep)) 12370Sstevel@tonic-gate rval = -1; 12380Sstevel@tonic-gate goto out; 12390Sstevel@tonic-gate } 12400Sstevel@tonic-gate 12410Sstevel@tonic-gate /* 12420Sstevel@tonic-gate * meta_set_destroy should not be called for a MN diskset. 12430Sstevel@tonic-gate * This routine destroys a set without communicating this information 12440Sstevel@tonic-gate * to the other nodes which would lead to an inconsistency in 12450Sstevel@tonic-gate * the MN diskset. 
12460Sstevel@tonic-gate */ 12470Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 12480Sstevel@tonic-gate rval = -1; 12490Sstevel@tonic-gate goto out; 12500Sstevel@tonic-gate } 12510Sstevel@tonic-gate 12520Sstevel@tonic-gate /* Continue if a traditional diskset */ 12530Sstevel@tonic-gate 12540Sstevel@tonic-gate /* 12550Sstevel@tonic-gate * Check to see who has the set. If we are not the last user of the 12560Sstevel@tonic-gate * set, we will not touch the replicas. 12570Sstevel@tonic-gate */ 12580Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 12590Sstevel@tonic-gate /* Skip empty slots */ 12600Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 12610Sstevel@tonic-gate continue; 12620Sstevel@tonic-gate 12630Sstevel@tonic-gate has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, 12640Sstevel@tonic-gate ep); 12650Sstevel@tonic-gate 12660Sstevel@tonic-gate if (has_set < 0) { 12670Sstevel@tonic-gate mdclrerror(ep); 12680Sstevel@tonic-gate } else 12690Sstevel@tonic-gate num_users++; 12700Sstevel@tonic-gate } 12710Sstevel@tonic-gate 12720Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) { 12730Sstevel@tonic-gate if (! 
mdisok(ep)) { 12740Sstevel@tonic-gate rval = -1; 12750Sstevel@tonic-gate goto out; 12760Sstevel@tonic-gate } 12770Sstevel@tonic-gate } 12780Sstevel@tonic-gate 12790Sstevel@tonic-gate if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 12800Sstevel@tonic-gate rval = -1; 12810Sstevel@tonic-gate goto out; 12820Sstevel@tonic-gate } 12830Sstevel@tonic-gate 12840Sstevel@tonic-gate if (lock_set == TRUE) { 12850Sstevel@tonic-gate /* Lock the set on our side */ 12860Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep)) { 12870Sstevel@tonic-gate rval = -1; 12880Sstevel@tonic-gate goto out; 12890Sstevel@tonic-gate } 12900Sstevel@tonic-gate } 12910Sstevel@tonic-gate 12920Sstevel@tonic-gate /* 12930Sstevel@tonic-gate * A traditional diskset has no diskset stale information to send 12940Sstevel@tonic-gate * since there can only be one owner node at a time. 12950Sstevel@tonic-gate */ 12960Sstevel@tonic-gate if (snarf_set(sp, FALSE, ep)) 12970Sstevel@tonic-gate mdclrerror(ep); 12980Sstevel@tonic-gate 12990Sstevel@tonic-gate if (dd != NULL) { 13000Sstevel@tonic-gate /* 13010Sstevel@tonic-gate * Make sure that no drives are in use as parts of metadrives 13020Sstevel@tonic-gate * or hot spare pools, this is one of the few error conditions 13030Sstevel@tonic-gate * that will stop this routine, unless the environment has 13040Sstevel@tonic-gate * META_DESTROY_SET_OK set, in which case, the operation will 13050Sstevel@tonic-gate * proceed. 
13060Sstevel@tonic-gate */ 13070Sstevel@tonic-gate if (getenv("META_DESTROY_SET_OK") == NULL) { 13080Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next) { 13090Sstevel@tonic-gate dnp = p->dd_dnp; 13100Sstevel@tonic-gate 13110Sstevel@tonic-gate i = meta_check_drive_inuse(sp, dnp, FALSE, ep); 13120Sstevel@tonic-gate if (i == -1) { 13130Sstevel@tonic-gate /* need xep - wire calls clear error */ 13140Sstevel@tonic-gate i = metaget_setownership(sp, &xep); 13150Sstevel@tonic-gate if (i == -1) { 13160Sstevel@tonic-gate rval = -1; 13170Sstevel@tonic-gate goto out; 13180Sstevel@tonic-gate } 13190Sstevel@tonic-gate 13200Sstevel@tonic-gate mysideno = getmyside(sp, &xep); 13210Sstevel@tonic-gate 13220Sstevel@tonic-gate if (mysideno == MD_SIDEWILD) { 13230Sstevel@tonic-gate rval = -1; 13240Sstevel@tonic-gate goto out; 13250Sstevel@tonic-gate } 13260Sstevel@tonic-gate 13270Sstevel@tonic-gate if (sd->sd_isown[mysideno] == FALSE) 13280Sstevel@tonic-gate if (halt_set(sp, &xep)) { 13290Sstevel@tonic-gate rval = -1; 13300Sstevel@tonic-gate goto out; 13310Sstevel@tonic-gate } 13320Sstevel@tonic-gate 13330Sstevel@tonic-gate rval = -1; 13340Sstevel@tonic-gate goto out; 13350Sstevel@tonic-gate } 13360Sstevel@tonic-gate } 13370Sstevel@tonic-gate } 13380Sstevel@tonic-gate 13390Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 13400Sstevel@tonic-gate /* Skip empty slots */ 13410Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 13420Sstevel@tonic-gate continue; 13430Sstevel@tonic-gate 13440Sstevel@tonic-gate /* Skip non local nodes */ 13450Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 13460Sstevel@tonic-gate continue; 13470Sstevel@tonic-gate 13480Sstevel@tonic-gate if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep)) 13490Sstevel@tonic-gate mdclrerror(ep); 13500Sstevel@tonic-gate } 13510Sstevel@tonic-gate 13520Sstevel@tonic-gate /* 13530Sstevel@tonic-gate * Go thru each drive and individually delete the replicas. 
13540Sstevel@tonic-gate * This way we can ignore individual errors. 13550Sstevel@tonic-gate */ 13560Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next) { 13570Sstevel@tonic-gate uint_t rep_slice; 13580Sstevel@tonic-gate 13590Sstevel@tonic-gate dnp = p->dd_dnp; 13600Sstevel@tonic-gate if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 13610Sstevel@tonic-gate (((np = metaslicename(dnp, rep_slice, ep)) 13620Sstevel@tonic-gate == NULL) && 13630Sstevel@tonic-gate ((np = metaslicename(dnp, MD_SLICE0, ep)) 13640Sstevel@tonic-gate == NULL))) { 13650Sstevel@tonic-gate rval = -1; 13660Sstevel@tonic-gate goto out; 13670Sstevel@tonic-gate } 13680Sstevel@tonic-gate 13690Sstevel@tonic-gate if ((np = metaslicename(dnp, 13700Sstevel@tonic-gate rep_slice, ep)) == NULL) { 13710Sstevel@tonic-gate if ((np = metaslicename(dnp, 13720Sstevel@tonic-gate MD_SLICE0, ep)) == NULL) { 13730Sstevel@tonic-gate rval = -1; 13740Sstevel@tonic-gate goto out; 13750Sstevel@tonic-gate } 13760Sstevel@tonic-gate mdclrerror(ep); 13770Sstevel@tonic-gate } 13780Sstevel@tonic-gate 13790Sstevel@tonic-gate /* Yes this is UGLY!!! 
*/ 13800Sstevel@tonic-gate p1 = p->dd_next; 13810Sstevel@tonic-gate p->dd_next = NULL; 13820Sstevel@tonic-gate if (rel_own_bydd(sp, p, FALSE, ep)) 13830Sstevel@tonic-gate mdclrerror(ep); 13840Sstevel@tonic-gate p->dd_next = p1; 13850Sstevel@tonic-gate 13860Sstevel@tonic-gate if (p->dd_dbcnt == 0) 13870Sstevel@tonic-gate continue; 13880Sstevel@tonic-gate 13890Sstevel@tonic-gate /* 13900Sstevel@tonic-gate * Skip the replica removal if we are not the last user 13910Sstevel@tonic-gate */ 13920Sstevel@tonic-gate if (num_users != 1) 13930Sstevel@tonic-gate continue; 13940Sstevel@tonic-gate 13950Sstevel@tonic-gate nlp = NULL; 13960Sstevel@tonic-gate (void) metanamelist_append(&nlp, np); 13970Sstevel@tonic-gate if (meta_db_detach(sp, nlp, 13980Sstevel@tonic-gate (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep)) 13990Sstevel@tonic-gate mdclrerror(ep); 14000Sstevel@tonic-gate metafreenamelist(nlp); 14010Sstevel@tonic-gate } 14020Sstevel@tonic-gate } 14030Sstevel@tonic-gate 14040Sstevel@tonic-gate if (halt_set(sp, ep)) { 14050Sstevel@tonic-gate rval = -1; 14060Sstevel@tonic-gate goto out; 14070Sstevel@tonic-gate } 14080Sstevel@tonic-gate 14090Sstevel@tonic-gate /* Setup the mediator record */ 14100Sstevel@tonic-gate (void) memset(&medr, '\0', sizeof (med_rec_t)); 14110Sstevel@tonic-gate medr.med_rec_mag = MED_REC_MAGIC; 14120Sstevel@tonic-gate medr.med_rec_rev = MED_REC_REV; 14130Sstevel@tonic-gate medr.med_rec_fl = 0; 14140Sstevel@tonic-gate medr.med_rec_sn = sp->setno; 14150Sstevel@tonic-gate (void) strcpy(medr.med_rec_snm, sp->setname); 14160Sstevel@tonic-gate medr.med_rec_meds = sd->sd_med; /* structure assigment */ 14170Sstevel@tonic-gate (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t)); 14180Sstevel@tonic-gate medr.med_rec_foff = 0; 14190Sstevel@tonic-gate 14200Sstevel@tonic-gate /* 14210Sstevel@tonic-gate * If we are the last remaining user, then remove the mediator hosts 14220Sstevel@tonic-gate */ 14230Sstevel@tonic-gate if (num_users == 1) { 
14240Sstevel@tonic-gate for (i = 0; i < MED_MAX_HOSTS; i++) { 14250Sstevel@tonic-gate if (medr.med_rec_meds.n_lst[i].a_cnt != 0) 14260Sstevel@tonic-gate SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, 14270Sstevel@tonic-gate SVM_TAG_MEDIATOR, sp->setno, i); 14280Sstevel@tonic-gate (void) memset(&medr.med_rec_meds.n_lst[i], '\0', 14290Sstevel@tonic-gate sizeof (md_h_t)); 14300Sstevel@tonic-gate } 14310Sstevel@tonic-gate medr.med_rec_meds.n_cnt = 0; 14320Sstevel@tonic-gate } else { /* Remove this host from the mediator node list. */ 14330Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 14340Sstevel@tonic-gate /* Skip empty slots */ 14350Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 14360Sstevel@tonic-gate continue; 14370Sstevel@tonic-gate 14380Sstevel@tonic-gate /* Copy non local node */ 14390Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0) { 14400Sstevel@tonic-gate (void) strcpy(medr.med_rec_nodes[i], 14410Sstevel@tonic-gate sd->sd_nodes[i]); 14420Sstevel@tonic-gate continue; 14430Sstevel@tonic-gate } 14440Sstevel@tonic-gate 14450Sstevel@tonic-gate /* Clear local node */ 14460Sstevel@tonic-gate (void) memset(&medr.med_rec_nodes[i], '\0', 14470Sstevel@tonic-gate sizeof (md_node_nm_t)); 14480Sstevel@tonic-gate } 14490Sstevel@tonic-gate } 14500Sstevel@tonic-gate 14510Sstevel@tonic-gate crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); 14520Sstevel@tonic-gate 14530Sstevel@tonic-gate /* 14540Sstevel@tonic-gate * If the client is part of a cluster put the DCS service 14550Sstevel@tonic-gate * into a deleteing state. 
14560Sstevel@tonic-gate */ 14570Sstevel@tonic-gate if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 14580Sstevel@tonic-gate if (metad_isautotakebyname(sp->setname)) { 14590Sstevel@tonic-gate delete_end = 0; 14600Sstevel@tonic-gate } else { 14610Sstevel@tonic-gate mdclrerror(ep); 14620Sstevel@tonic-gate goto out; 14630Sstevel@tonic-gate } 14640Sstevel@tonic-gate } 14650Sstevel@tonic-gate 14660Sstevel@tonic-gate /* Inform the mediator hosts of the new information */ 14670Sstevel@tonic-gate for (i = 0; i < MED_MAX_HOSTS; i++) { 14680Sstevel@tonic-gate if (sd->sd_med.n_lst[i].a_cnt == 0) 14690Sstevel@tonic-gate continue; 14700Sstevel@tonic-gate 14710Sstevel@tonic-gate if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) 14720Sstevel@tonic-gate mdclrerror(ep); 14730Sstevel@tonic-gate } 14740Sstevel@tonic-gate 14750Sstevel@tonic-gate /* Delete the set locally */ 14760Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 14770Sstevel@tonic-gate /* Skip empty slots */ 14780Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 14790Sstevel@tonic-gate continue; 14800Sstevel@tonic-gate 14810Sstevel@tonic-gate /* Skip non local nodes */ 14820Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 14830Sstevel@tonic-gate continue; 14840Sstevel@tonic-gate 14850Sstevel@tonic-gate if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) 14860Sstevel@tonic-gate mdclrerror(ep); 14870Sstevel@tonic-gate } 14880Sstevel@tonic-gate if (delete_end && 14890Sstevel@tonic-gate sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) 14900Sstevel@tonic-gate rval = -1; 14910Sstevel@tonic-gate 14920Sstevel@tonic-gate out: 14930Sstevel@tonic-gate /* release signals back to what they were on entry */ 14940Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0) { 14950Sstevel@tonic-gate if (rval == 0) 14960Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 14970Sstevel@tonic-gate rval = -1; 14980Sstevel@tonic-gate } 14990Sstevel@tonic-gate 15000Sstevel@tonic-gate if (lock_set == TRUE) 
{ 15010Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 15020Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep)) { 15030Sstevel@tonic-gate if (rval == 0) 15040Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 15050Sstevel@tonic-gate rval = -1; 15060Sstevel@tonic-gate } 15070Sstevel@tonic-gate cl_set_setkey(NULL); 15080Sstevel@tonic-gate } 15090Sstevel@tonic-gate 15100Sstevel@tonic-gate metaflushsetname(sp); 15110Sstevel@tonic-gate return (rval); 15120Sstevel@tonic-gate } 15130Sstevel@tonic-gate 15140Sstevel@tonic-gate int 15150Sstevel@tonic-gate meta_set_purge( 15160Sstevel@tonic-gate mdsetname_t *sp, 15170Sstevel@tonic-gate int bypass_cluster, 15180Sstevel@tonic-gate int forceflg, 15190Sstevel@tonic-gate md_error_t *ep 15200Sstevel@tonic-gate ) 15210Sstevel@tonic-gate { 15220Sstevel@tonic-gate char *thishost = mynode(); 15230Sstevel@tonic-gate md_set_desc *sd; 15240Sstevel@tonic-gate md_setkey_t *cl_sk; 15250Sstevel@tonic-gate md_error_t xep = mdnullerror; 15260Sstevel@tonic-gate int rval = 0; 15270Sstevel@tonic-gate int i, num_hosts = 0; 15280Sstevel@tonic-gate int has_set = 0; 15290Sstevel@tonic-gate int max_node = 0; 15300Sstevel@tonic-gate int delete_end = 1; 15310Sstevel@tonic-gate md_mnnode_desc *nd; 15320Sstevel@tonic-gate 15330Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 15340Sstevel@tonic-gate /* unable to find set description */ 15350Sstevel@tonic-gate rval = 1; 15360Sstevel@tonic-gate return (rval); 15370Sstevel@tonic-gate } 15380Sstevel@tonic-gate 15390Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 15400Sstevel@tonic-gate /* 15410Sstevel@tonic-gate * Get a count of the hosts in the set and also lock the set 15420Sstevel@tonic-gate * on those hosts that know about it. 
15430Sstevel@tonic-gate */ 15440Sstevel@tonic-gate nd = sd->sd_nodelist; 15450Sstevel@tonic-gate while (nd) { 15460Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 15470Sstevel@tonic-gate nd = nd->nd_next; 15480Sstevel@tonic-gate continue; 15490Sstevel@tonic-gate } 15500Sstevel@tonic-gate has_set = nodehasset(sp, nd->nd_nodename, 15510Sstevel@tonic-gate NHS_NST_EQ, ep); 15520Sstevel@tonic-gate 15530Sstevel@tonic-gate /* 15540Sstevel@tonic-gate * The host is not aware of this set (has_set < 0) or 15550Sstevel@tonic-gate * the set does not match (has_set == 0). This check 15560Sstevel@tonic-gate * prevents the code getting confused by an apparent 15570Sstevel@tonic-gate * inconsistancy in the set's state, this is in the 15580Sstevel@tonic-gate * purge code so something is broken in any case and 15590Sstevel@tonic-gate * this is just trying to fix the brokeness. 15600Sstevel@tonic-gate */ 15610Sstevel@tonic-gate if (has_set <= 0) { 15620Sstevel@tonic-gate mdclrerror(ep); 15630Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_NOSET; 15640Sstevel@tonic-gate } else { 15650Sstevel@tonic-gate num_hosts++; 15660Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 15670Sstevel@tonic-gate /* 15680Sstevel@tonic-gate * If the force flag is set then 15690Sstevel@tonic-gate * ignore any RPC failures because we 15700Sstevel@tonic-gate * are only really interested with 15710Sstevel@tonic-gate * the set on local node. 15720Sstevel@tonic-gate */ 15730Sstevel@tonic-gate if (forceflg && mdanyrpcerror(ep)) { 15740Sstevel@tonic-gate mdclrerror(ep); 15750Sstevel@tonic-gate } else { 15760Sstevel@tonic-gate /* 15770Sstevel@tonic-gate * set max_node so that in the 15780Sstevel@tonic-gate * unlock code nodes in the 15790Sstevel@tonic-gate * set that have not been 15800Sstevel@tonic-gate * locked are not unlocked. 
15810Sstevel@tonic-gate */ 15820Sstevel@tonic-gate max_node = nd->nd_nodeid; 15830Sstevel@tonic-gate rval = 2; 15840Sstevel@tonic-gate goto out1; 15850Sstevel@tonic-gate } 15860Sstevel@tonic-gate } 15870Sstevel@tonic-gate 15880Sstevel@tonic-gate } 15890Sstevel@tonic-gate nd = nd->nd_next; 15900Sstevel@tonic-gate } 15910Sstevel@tonic-gate max_node = 0; 15920Sstevel@tonic-gate } else { 15930Sstevel@tonic-gate /* 15940Sstevel@tonic-gate * Get a count of the hosts in the set and also lock the set 15950Sstevel@tonic-gate * on those hosts that know about it. 15960Sstevel@tonic-gate */ 15970Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 15980Sstevel@tonic-gate /* Skip empty slots */ 15990Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 16000Sstevel@tonic-gate continue; 16010Sstevel@tonic-gate 16020Sstevel@tonic-gate has_set = nodehasset(sp, sd->sd_nodes[i], 16030Sstevel@tonic-gate NHS_NST_EQ, ep); 16040Sstevel@tonic-gate 16050Sstevel@tonic-gate /* 16060Sstevel@tonic-gate * The host is not aware of this set (has_set < 0) or 16070Sstevel@tonic-gate * the set does not match (has_set == 0). This check 16080Sstevel@tonic-gate * prevents the code getting confused by an apparent 16090Sstevel@tonic-gate * inconsistancy in the set's state, this is in the 16100Sstevel@tonic-gate * purge code so something is broken in any case and 16110Sstevel@tonic-gate * this is just trying to fix the brokeness. 16120Sstevel@tonic-gate */ 16130Sstevel@tonic-gate if (has_set <= 0) { 16140Sstevel@tonic-gate mdclrerror(ep); 16150Sstevel@tonic-gate /* 16160Sstevel@tonic-gate * set the node to NULL to prevent further 16170Sstevel@tonic-gate * requests to this unresponsive node. 
16180Sstevel@tonic-gate */ 16190Sstevel@tonic-gate sd->sd_nodes[i][0] = '\0'; 16200Sstevel@tonic-gate } else { 16210Sstevel@tonic-gate num_hosts++; 16220Sstevel@tonic-gate if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 16230Sstevel@tonic-gate /* 16240Sstevel@tonic-gate * If the force flag is set then 16250Sstevel@tonic-gate * ignore any RPC failures because we 16260Sstevel@tonic-gate * are only really interested with 16270Sstevel@tonic-gate * the set on local node. 16280Sstevel@tonic-gate */ 16290Sstevel@tonic-gate if (forceflg && mdanyrpcerror(ep)) { 16300Sstevel@tonic-gate mdclrerror(ep); 16310Sstevel@tonic-gate } else { 16320Sstevel@tonic-gate rval = 2; 16330Sstevel@tonic-gate /* 16340Sstevel@tonic-gate * set max_node so that in the 16350Sstevel@tonic-gate * unlock code nodes in the 16360Sstevel@tonic-gate * set that have not been 16370Sstevel@tonic-gate * locked are not unlocked. 16380Sstevel@tonic-gate */ 16390Sstevel@tonic-gate max_node = i; 16400Sstevel@tonic-gate goto out1; 16410Sstevel@tonic-gate } 16420Sstevel@tonic-gate } 16430Sstevel@tonic-gate } 16440Sstevel@tonic-gate } 16450Sstevel@tonic-gate max_node = i; /* now MD_MAXSIDES */ 16460Sstevel@tonic-gate } 16470Sstevel@tonic-gate if (!bypass_cluster) { 16480Sstevel@tonic-gate /* 16490Sstevel@tonic-gate * If there is only one host associated with the 16500Sstevel@tonic-gate * set then remove the set from the cluster. 
16510Sstevel@tonic-gate */ 16520Sstevel@tonic-gate if (num_hosts == 1) { 16530Sstevel@tonic-gate if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 16540Sstevel@tonic-gate if (metad_isautotakebyname(sp->setname)) { 16550Sstevel@tonic-gate delete_end = 0; 16560Sstevel@tonic-gate } else { 16570Sstevel@tonic-gate mdclrerror(ep); 16580Sstevel@tonic-gate rval = 3; 16590Sstevel@tonic-gate goto out1; 16600Sstevel@tonic-gate } 16610Sstevel@tonic-gate } 16620Sstevel@tonic-gate } 16630Sstevel@tonic-gate } 16640Sstevel@tonic-gate 16650Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 16660Sstevel@tonic-gate /* 16670Sstevel@tonic-gate * Get a count of the hosts in the set and also lock the set 16680Sstevel@tonic-gate * on those hosts that know about it. 16690Sstevel@tonic-gate */ 16700Sstevel@tonic-gate nd = sd->sd_nodelist; 16710Sstevel@tonic-gate while (nd) { 16720Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 16730Sstevel@tonic-gate nd = nd->nd_next; 16740Sstevel@tonic-gate continue; 16750Sstevel@tonic-gate } 16760Sstevel@tonic-gate if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) { 16770Sstevel@tonic-gate /* 16780Sstevel@tonic-gate * Tell the remote node to remove this node 16790Sstevel@tonic-gate */ 16800Sstevel@tonic-gate if (clnt_delhosts(nd->nd_nodename, sp, 1, 16810Sstevel@tonic-gate &thishost, ep) == -1) { 16820Sstevel@tonic-gate /* 16830Sstevel@tonic-gate * If we fail to delete ourselves 16840Sstevel@tonic-gate * from the remote host it does not 16850Sstevel@tonic-gate * really matter because the set is 16860Sstevel@tonic-gate * being "purged" from this node. The 16870Sstevel@tonic-gate * set can be purged from the other 16880Sstevel@tonic-gate * node at a later time. 
16890Sstevel@tonic-gate */ 16900Sstevel@tonic-gate mdclrerror(ep); 16910Sstevel@tonic-gate } 16920Sstevel@tonic-gate nd = nd->nd_next; 16930Sstevel@tonic-gate continue; 16940Sstevel@tonic-gate } 16950Sstevel@tonic-gate /* remove the set from this host */ 16960Sstevel@tonic-gate if (clnt_delset(nd->nd_nodename, sp, ep) == -1) { 16970Sstevel@tonic-gate md_perror(dgettext(TEXT_DOMAIN, "delset")); 16980Sstevel@tonic-gate if (!bypass_cluster && num_hosts == 1) 16990Sstevel@tonic-gate (void) sdssc_delete_end(sp->setname, 17000Sstevel@tonic-gate SDSSC_CLEANUP); 17010Sstevel@tonic-gate mdclrerror(ep); 17020Sstevel@tonic-gate goto out1; 17030Sstevel@tonic-gate } 17040Sstevel@tonic-gate nd = nd->nd_next; 17050Sstevel@tonic-gate } 17060Sstevel@tonic-gate } else { 17070Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) { 17080Sstevel@tonic-gate /* Skip empty slots */ 17090Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 17100Sstevel@tonic-gate continue; 17110Sstevel@tonic-gate if (strcmp(thishost, sd->sd_nodes[i]) != 0) { 17120Sstevel@tonic-gate /* 17130Sstevel@tonic-gate * Tell the remote node to remove this node 17140Sstevel@tonic-gate */ 17150Sstevel@tonic-gate if (clnt_delhosts(sd->sd_nodes[i], sp, 1, 17160Sstevel@tonic-gate &thishost, ep) == -1) { 17170Sstevel@tonic-gate /* 17180Sstevel@tonic-gate * If we fail to delete ourselves 17190Sstevel@tonic-gate * from the remote host it does not 17200Sstevel@tonic-gate * really matter because the set is 17210Sstevel@tonic-gate * being "purged" from this node. The 17220Sstevel@tonic-gate * set can be purged from the other 17230Sstevel@tonic-gate * node at a later time. 
17240Sstevel@tonic-gate */ 17250Sstevel@tonic-gate mdclrerror(ep); 17260Sstevel@tonic-gate } 17270Sstevel@tonic-gate continue; 17280Sstevel@tonic-gate } 17290Sstevel@tonic-gate 17300Sstevel@tonic-gate /* remove the set from this host */ 17310Sstevel@tonic-gate if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) { 17320Sstevel@tonic-gate md_perror(dgettext(TEXT_DOMAIN, "delset")); 17330Sstevel@tonic-gate if (!bypass_cluster && num_hosts == 1) 17340Sstevel@tonic-gate (void) sdssc_delete_end(sp->setname, 17350Sstevel@tonic-gate SDSSC_CLEANUP); 17360Sstevel@tonic-gate mdclrerror(ep); 17370Sstevel@tonic-gate goto out1; 17380Sstevel@tonic-gate } 17390Sstevel@tonic-gate } 17400Sstevel@tonic-gate } 17410Sstevel@tonic-gate 17420Sstevel@tonic-gate if (!bypass_cluster && num_hosts == 1) { 17430Sstevel@tonic-gate if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == 17440Sstevel@tonic-gate SDSSC_ERROR) { 17450Sstevel@tonic-gate rval = 4; 17460Sstevel@tonic-gate } 17470Sstevel@tonic-gate } 17480Sstevel@tonic-gate 17490Sstevel@tonic-gate out1: 17500Sstevel@tonic-gate 17510Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 17520Sstevel@tonic-gate 17530Sstevel@tonic-gate /* 17540Sstevel@tonic-gate * Remove the set lock on those nodes that had the set locked 17550Sstevel@tonic-gate * max_node will either be MD_MAXSIDES or array index of the last 17560Sstevel@tonic-gate * node contacted (or rather failed to contact) for traditional 17570Sstevel@tonic-gate * diskset. For a MN diskset, max_node is the node_id of the node 17580Sstevel@tonic-gate * that failed the lock. 
17590Sstevel@tonic-gate */ 17600Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 17610Sstevel@tonic-gate nd = sd->sd_nodelist; 17620Sstevel@tonic-gate while (nd) { 17630Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 17640Sstevel@tonic-gate nd = nd->nd_next; 17650Sstevel@tonic-gate continue; 17660Sstevel@tonic-gate } 17670Sstevel@tonic-gate if (nd->nd_nodeid == max_node) 17680Sstevel@tonic-gate break; 17690Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 17700Sstevel@tonic-gate if (forceflg && mdanyrpcerror(&xep)) { 17710Sstevel@tonic-gate mdclrerror(&xep); 17720Sstevel@tonic-gate nd = nd->nd_next; 17730Sstevel@tonic-gate continue; 17740Sstevel@tonic-gate } 17750Sstevel@tonic-gate if (rval == 0) 17760Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 17770Sstevel@tonic-gate rval = 5; 17780Sstevel@tonic-gate } 17790Sstevel@tonic-gate nd = nd->nd_next; 17800Sstevel@tonic-gate } 17810Sstevel@tonic-gate } else { 17820Sstevel@tonic-gate for (i = 0; i < max_node; i++) { 17830Sstevel@tonic-gate /* Skip empty slots */ 17840Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') 17850Sstevel@tonic-gate continue; 17860Sstevel@tonic-gate 17870Sstevel@tonic-gate if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 17880Sstevel@tonic-gate if (forceflg && mdanyrpcerror(&xep)) { 17890Sstevel@tonic-gate mdclrerror(&xep); 17900Sstevel@tonic-gate continue; 17910Sstevel@tonic-gate } 17920Sstevel@tonic-gate if (rval == 0) 17930Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 17940Sstevel@tonic-gate rval = 5; 17950Sstevel@tonic-gate } 17960Sstevel@tonic-gate } 17970Sstevel@tonic-gate } 17980Sstevel@tonic-gate 17990Sstevel@tonic-gate cl_set_setkey(NULL); 18000Sstevel@tonic-gate 18010Sstevel@tonic-gate return (rval); 18020Sstevel@tonic-gate } 18030Sstevel@tonic-gate 18040Sstevel@tonic-gate int 18050Sstevel@tonic-gate meta_set_query( 18060Sstevel@tonic-gate mdsetname_t *sp, 18070Sstevel@tonic-gate mddb_dtag_lst_t **dtlpp, 18080Sstevel@tonic-gate md_error_t *ep 
18090Sstevel@tonic-gate ) 18100Sstevel@tonic-gate { 18110Sstevel@tonic-gate mddb_dtag_get_parm_t dtgp; 18120Sstevel@tonic-gate 18130Sstevel@tonic-gate (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); 18140Sstevel@tonic-gate dtgp.dtgp_setno = sp->setno; 18150Sstevel@tonic-gate 18160Sstevel@tonic-gate /*CONSTCOND*/ 18170Sstevel@tonic-gate while (1) { 18180Sstevel@tonic-gate if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) 18190Sstevel@tonic-gate if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || 18200Sstevel@tonic-gate *dtlpp == NULL) 18210Sstevel@tonic-gate return (mdstealerror(ep, &dtgp.dtgp_mde)); 18220Sstevel@tonic-gate else 18230Sstevel@tonic-gate break; 18240Sstevel@tonic-gate 18250Sstevel@tonic-gate /* 18260Sstevel@tonic-gate * Run to the end of the list 18270Sstevel@tonic-gate */ 18280Sstevel@tonic-gate for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) 18290Sstevel@tonic-gate /* void */; 18300Sstevel@tonic-gate 18310Sstevel@tonic-gate *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); 18320Sstevel@tonic-gate 18330Sstevel@tonic-gate (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, 18340Sstevel@tonic-gate sizeof (mddb_dtag_t)); 18350Sstevel@tonic-gate 18360Sstevel@tonic-gate dtgp.dtgp_dt.dt_id++; 18370Sstevel@tonic-gate } 18380Sstevel@tonic-gate return (0); 18390Sstevel@tonic-gate } 18400Sstevel@tonic-gate 18410Sstevel@tonic-gate /* 18420Sstevel@tonic-gate * return drivename get by key 18430Sstevel@tonic-gate */ 18440Sstevel@tonic-gate mddrivename_t * 18450Sstevel@tonic-gate metadrivename_withdrkey( 18460Sstevel@tonic-gate mdsetname_t *sp, 18470Sstevel@tonic-gate side_t sideno, 18480Sstevel@tonic-gate mdkey_t key, 18490Sstevel@tonic-gate int flags, 18500Sstevel@tonic-gate md_error_t *ep 18510Sstevel@tonic-gate ) 18520Sstevel@tonic-gate { 18530Sstevel@tonic-gate char *nm; 18540Sstevel@tonic-gate mdname_t *np; 18550Sstevel@tonic-gate mddrivename_t *dnp; 18560Sstevel@tonic-gate ddi_devid_t devidp; 18570Sstevel@tonic-gate md_set_desc 
*sd; 18580Sstevel@tonic-gate 18590Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 18600Sstevel@tonic-gate return (NULL); 18610Sstevel@tonic-gate } 18620Sstevel@tonic-gate 18630Sstevel@tonic-gate 18640Sstevel@tonic-gate /* 18650Sstevel@tonic-gate * Get the devid associated with the key. 18660Sstevel@tonic-gate * 18670Sstevel@tonic-gate * If a devid was returned, it MUST be valid even in 18680Sstevel@tonic-gate * the case where a device id has been "updated". The 18690Sstevel@tonic-gate * "update" of the device id may have occured due to 18700Sstevel@tonic-gate * a firmware upgrade. 18710Sstevel@tonic-gate */ 18720Sstevel@tonic-gate if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) 18730Sstevel@tonic-gate != NULL) { 1874*1945Sjeanm /* 1875*1945Sjeanm * Look for the correct dnp using the devid for comparison. 1876*1945Sjeanm */ 1877*1945Sjeanm dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 18780Sstevel@tonic-gate free(devidp); 1879*1945Sjeanm dnp->side_names_key = key; 18800Sstevel@tonic-gate } else { 18810Sstevel@tonic-gate /* 1882*1945Sjeanm * We didn't get a devid. We'll try for a dnp using the 1883*1945Sjeanm * name. If we have a MN diskset or if the dnp is a did 1884*1945Sjeanm * device, we're done because then we don't have devids. 1885*1945Sjeanm * Otherwise we'll try to set the devid 1886*1945Sjeanm * and get the dnp via devid again. 1887*1945Sjeanm * We also need to clear the ep structure. When the 1888*1945Sjeanm * above call to meta_getdidbykey returned a null, it 1889*1945Sjeanm * also put an error code into ep. In this case, the null 1890*1945Sjeanm * return is actually OK and any errors can be ignored. The 1891*1945Sjeanm * reason it is OK is because this could be a MN set or 1892*1945Sjeanm * we could be running without devids (ex cluster). 
1893*1945Sjeanm */ 1894*1945Sjeanm mdclrerror(ep); 1895*1945Sjeanm 1896*1945Sjeanm if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key, 1897*1945Sjeanm ep)) == NULL) 1898*1945Sjeanm return (NULL); 1899*1945Sjeanm /* get device name */ 1900*1945Sjeanm if (flags & PRINT_FAST) { 1901*1945Sjeanm if ((np = metaname_fast(&sp, nm, 1902*1945Sjeanm LOGICAL_DEVICE, ep)) == NULL) { 1903*1945Sjeanm Free(nm); 1904*1945Sjeanm return (NULL); 1905*1945Sjeanm } 1906*1945Sjeanm } else { 1907*1945Sjeanm if ((np = metaname(&sp, nm, LOGICAL_DEVICE, 1908*1945Sjeanm ep)) == NULL) { 1909*1945Sjeanm Free(nm); 1910*1945Sjeanm return (NULL); 1911*1945Sjeanm } 1912*1945Sjeanm } 1913*1945Sjeanm Free(nm); 1914*1945Sjeanm /* make sure it's OK */ 1915*1945Sjeanm if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, 1916*1945Sjeanm ep) != 0)) 1917*1945Sjeanm return (NULL); 1918*1945Sjeanm 1919*1945Sjeanm /* get drivename */ 1920*1945Sjeanm dnp = np->drivenamep; 1921*1945Sjeanm dnp->side_names_key = key; 1922*1945Sjeanm /* 1923*1945Sjeanm * Skip the devid set/check for the following cases: 1924*1945Sjeanm * 1) If MN diskset, there are no devid's 1925*1945Sjeanm * 2) if dnp is did device 1926*1945Sjeanm * The device id is disabled for did device due to the 1927*1945Sjeanm * lack of minor name support in the did driver. 
The following 1928*1945Sjeanm * devid code path can set and propagate the error and 1929*1945Sjeanm * eventually prevent did disks from being added to the 1930*1945Sjeanm * diskset under SunCluster systems 1931*1945Sjeanm */ 1932*1945Sjeanm if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) 1933*1945Sjeanm == 0) || (MD_MNSET_DESC(sd))) 1934*1945Sjeanm goto out; 1935*1945Sjeanm 1936*1945Sjeanm /* 19370Sstevel@tonic-gate * It is okay if replica is not in devid mode 19380Sstevel@tonic-gate */ 19390Sstevel@tonic-gate if (mdissyserror(ep, MDDB_F_NODEVID)) { 19400Sstevel@tonic-gate mdclrerror(ep); 19410Sstevel@tonic-gate goto out; 19420Sstevel@tonic-gate } 19430Sstevel@tonic-gate 19440Sstevel@tonic-gate /* 1945*1945Sjeanm * We're not MN or did devices but 19460Sstevel@tonic-gate * devid is missing so this means that we have 19470Sstevel@tonic-gate * just upgraded from a configuration where 19480Sstevel@tonic-gate * devid's were not used so try to add in 1949*1945Sjeanm * the devid and requery. If the devid still isn't there, 1950*1945Sjeanm * that's OK. dnp->devid will be null as it is in any 1951*1945Sjeanm * configuration with no devids. 19520Sstevel@tonic-gate */ 19530Sstevel@tonic-gate if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, 19540Sstevel@tonic-gate ep) < 0) 19550Sstevel@tonic-gate return (NULL); 19560Sstevel@tonic-gate if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, 1957*1945Sjeanm sideno+SKEW, key, ep)) != NULL) { 1958*1945Sjeanm /* 1959*1945Sjeanm * Found a devid so look for the dnp using the 1960*1945Sjeanm * devid as the search mechanism. 
1961*1945Sjeanm */ 1962*1945Sjeanm dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1963*1945Sjeanm free(devidp); 1964*1945Sjeanm dnp->side_names_key = key; 1965*1945Sjeanm } 19660Sstevel@tonic-gate } 19670Sstevel@tonic-gate 1968*1945Sjeanm 1969*1945Sjeanm 19700Sstevel@tonic-gate out: 19710Sstevel@tonic-gate if (flags & MD_BYPASS_DAEMON) 19720Sstevel@tonic-gate return (dnp); 19730Sstevel@tonic-gate 19740Sstevel@tonic-gate if (get_sidenmlist(sp, dnp, ep)) 19750Sstevel@tonic-gate return (NULL); 19760Sstevel@tonic-gate 19770Sstevel@tonic-gate /* return success */ 19780Sstevel@tonic-gate return (dnp); 19790Sstevel@tonic-gate } 19800Sstevel@tonic-gate 19810Sstevel@tonic-gate void 19820Sstevel@tonic-gate metafreedrivedesc(md_drive_desc **dd) 19830Sstevel@tonic-gate { 19840Sstevel@tonic-gate md_drive_desc *p, *next = NULL; 19850Sstevel@tonic-gate 19860Sstevel@tonic-gate for (p = *dd; p != NULL; p = next) { 19870Sstevel@tonic-gate next = p->dd_next; 19880Sstevel@tonic-gate Free(p); 19890Sstevel@tonic-gate } 19900Sstevel@tonic-gate *dd = NULL; 19910Sstevel@tonic-gate } 19920Sstevel@tonic-gate 19930Sstevel@tonic-gate md_drive_desc * 19940Sstevel@tonic-gate metaget_drivedesc( 19950Sstevel@tonic-gate mdsetname_t *sp, 19960Sstevel@tonic-gate int flags, 19970Sstevel@tonic-gate md_error_t *ep 19980Sstevel@tonic-gate ) 19990Sstevel@tonic-gate { 20000Sstevel@tonic-gate side_t sideno = MD_SIDEWILD; 20010Sstevel@tonic-gate 20020Sstevel@tonic-gate assert(! 
(flags & MD_BYPASS_DAEMON)); 20030Sstevel@tonic-gate 20040Sstevel@tonic-gate if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 20050Sstevel@tonic-gate return (NULL); 20060Sstevel@tonic-gate 20070Sstevel@tonic-gate return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); 20080Sstevel@tonic-gate } 20090Sstevel@tonic-gate 20100Sstevel@tonic-gate md_drive_desc * 20110Sstevel@tonic-gate metaget_drivedesc_fromnamelist( 20120Sstevel@tonic-gate mdsetname_t *sp, 20130Sstevel@tonic-gate mdnamelist_t *nlp, 20140Sstevel@tonic-gate md_error_t *ep 20150Sstevel@tonic-gate ) 20160Sstevel@tonic-gate { 20170Sstevel@tonic-gate md_set_desc *sd; 20180Sstevel@tonic-gate mdnamelist_t *p; 20190Sstevel@tonic-gate md_drive_desc *dd = NULL; 20200Sstevel@tonic-gate 20210Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 20220Sstevel@tonic-gate return (NULL); 20230Sstevel@tonic-gate 20240Sstevel@tonic-gate for (p = nlp; p != NULL; p = p->next) 20250Sstevel@tonic-gate (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, 20260Sstevel@tonic-gate sd->sd_ctime, sd->sd_genid, MD_DR_ADD); 20270Sstevel@tonic-gate 20280Sstevel@tonic-gate return (dd); 20290Sstevel@tonic-gate } 20300Sstevel@tonic-gate 20310Sstevel@tonic-gate md_drive_desc * 20320Sstevel@tonic-gate metaget_drivedesc_sideno( 20330Sstevel@tonic-gate mdsetname_t *sp, 20340Sstevel@tonic-gate side_t sideno, 20350Sstevel@tonic-gate int flags, 20360Sstevel@tonic-gate md_error_t *ep 20370Sstevel@tonic-gate ) 20380Sstevel@tonic-gate { 20390Sstevel@tonic-gate md_set_desc *sd = NULL; 20400Sstevel@tonic-gate 20410Sstevel@tonic-gate assert(! 
(flags & MD_BYPASS_DAEMON)); 20420Sstevel@tonic-gate 20430Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 20440Sstevel@tonic-gate return (NULL); 20450Sstevel@tonic-gate 20460Sstevel@tonic-gate if (sd->sd_drvs) 20470Sstevel@tonic-gate return (sd->sd_drvs); 20480Sstevel@tonic-gate 20490Sstevel@tonic-gate if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) 20500Sstevel@tonic-gate return (NULL); 20510Sstevel@tonic-gate 20520Sstevel@tonic-gate return (sd->sd_drvs); 20530Sstevel@tonic-gate } 20540Sstevel@tonic-gate 20550Sstevel@tonic-gate int 20560Sstevel@tonic-gate metaget_setownership( 20570Sstevel@tonic-gate mdsetname_t *sp, 20580Sstevel@tonic-gate md_error_t *ep 20590Sstevel@tonic-gate ) 20600Sstevel@tonic-gate { 20610Sstevel@tonic-gate md_set_desc *sd; 20620Sstevel@tonic-gate int bool; 20630Sstevel@tonic-gate int i; 20640Sstevel@tonic-gate md_mnnode_desc *nd; 20650Sstevel@tonic-gate 20660Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) 20670Sstevel@tonic-gate return (-1); 20680Sstevel@tonic-gate 20690Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) { 20700Sstevel@tonic-gate nd = sd->sd_nodelist; 20710Sstevel@tonic-gate while (nd) { 20720Sstevel@tonic-gate /* If node isn't alive, can't own diskset */ 20730Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 20740Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 20750Sstevel@tonic-gate nd = nd->nd_next; 20760Sstevel@tonic-gate continue; 20770Sstevel@tonic-gate } 20780Sstevel@tonic-gate /* 20790Sstevel@tonic-gate * If can't communicate with rpc.metad, then mark 20800Sstevel@tonic-gate * this node as not an owner. That node may 20810Sstevel@tonic-gate * in fact, be an owner, but without rpc.metad running 20820Sstevel@tonic-gate * that node can't do much. 
20830Sstevel@tonic-gate */ 20840Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) { 20850Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 20860Sstevel@tonic-gate } else if (bool == TRUE) { 20870Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 20880Sstevel@tonic-gate } else { 20890Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 20900Sstevel@tonic-gate } 20910Sstevel@tonic-gate nd = nd->nd_next; 20920Sstevel@tonic-gate } 20930Sstevel@tonic-gate return (0); 20940Sstevel@tonic-gate } 20950Sstevel@tonic-gate 20960Sstevel@tonic-gate /* Rest of code handles traditional disksets */ 20970Sstevel@tonic-gate 20980Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) 20990Sstevel@tonic-gate sd->sd_isown[i] = 0; 21000Sstevel@tonic-gate 21010Sstevel@tonic-gate if (clnt_ownset(mynode(), sp, &bool, ep) == -1) 21020Sstevel@tonic-gate return (-1); 21030Sstevel@tonic-gate 21040Sstevel@tonic-gate if (bool == TRUE) 21050Sstevel@tonic-gate sd->sd_isown[getmyside(sp, ep)] = 1; 21060Sstevel@tonic-gate 21070Sstevel@tonic-gate return (0); 21080Sstevel@tonic-gate } 21090Sstevel@tonic-gate 21100Sstevel@tonic-gate char * 21110Sstevel@tonic-gate mynode(void) 21120Sstevel@tonic-gate { 21130Sstevel@tonic-gate static struct utsname myuname; 21140Sstevel@tonic-gate static int done = 0; 21150Sstevel@tonic-gate 21160Sstevel@tonic-gate if (! 
done) { 21170Sstevel@tonic-gate if (uname(&myuname) == -1) { 21180Sstevel@tonic-gate md_perror(dgettext(TEXT_DOMAIN, "uname")); 21190Sstevel@tonic-gate assert(0); 21200Sstevel@tonic-gate } 21210Sstevel@tonic-gate done = 1; 21220Sstevel@tonic-gate } 21230Sstevel@tonic-gate return (myuname.nodename); 21240Sstevel@tonic-gate } 21250Sstevel@tonic-gate 21260Sstevel@tonic-gate int 21270Sstevel@tonic-gate strinlst(char *str, int cnt, char **lst) 21280Sstevel@tonic-gate { 21290Sstevel@tonic-gate int i; 21300Sstevel@tonic-gate 21310Sstevel@tonic-gate for (i = 0; i < cnt; i++) 21320Sstevel@tonic-gate if (strcmp(lst[i], str) == 0) 21330Sstevel@tonic-gate return (TRUE); 21340Sstevel@tonic-gate 21350Sstevel@tonic-gate return (FALSE); 21360Sstevel@tonic-gate } 21370Sstevel@tonic-gate 21380Sstevel@tonic-gate /* 21390Sstevel@tonic-gate * meta_get_reserved_names 21400Sstevel@tonic-gate * returns an mdnamelist_t of reserved slices 21410Sstevel@tonic-gate * reserved slices are those that are used but don't necessarily 21420Sstevel@tonic-gate * show up as metadevices (ex. 
reserved slice for db in sets, logs) 21430Sstevel@tonic-gate */ 21440Sstevel@tonic-gate 21450Sstevel@tonic-gate /*ARGSUSED*/ 21460Sstevel@tonic-gate int 21470Sstevel@tonic-gate meta_get_reserved_names( 21480Sstevel@tonic-gate mdsetname_t *sp, 21490Sstevel@tonic-gate mdnamelist_t **nlpp, 21500Sstevel@tonic-gate int options, 21510Sstevel@tonic-gate md_error_t *ep) 21520Sstevel@tonic-gate { 21530Sstevel@tonic-gate int count = 0; 21540Sstevel@tonic-gate mdname_t *np = NULL; 21550Sstevel@tonic-gate mdnamelist_t *transnlp = NULL; 21560Sstevel@tonic-gate mdnamelist_t **tailpp = nlpp; 21570Sstevel@tonic-gate mdnamelist_t *nlp; 21580Sstevel@tonic-gate md_drive_desc *dd, *di; 21590Sstevel@tonic-gate 21600Sstevel@tonic-gate if (metaislocalset(sp)) 21610Sstevel@tonic-gate goto out; 21620Sstevel@tonic-gate 21630Sstevel@tonic-gate if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) { 21640Sstevel@tonic-gate count = -1; 21650Sstevel@tonic-gate goto out; 21660Sstevel@tonic-gate } 21670Sstevel@tonic-gate 21680Sstevel@tonic-gate /* db in for sets on reserved slice */ 21690Sstevel@tonic-gate for (di = dd; di && count >= 0; di = di->dd_next) { 21700Sstevel@tonic-gate uint_t rep_slice; 21710Sstevel@tonic-gate 21720Sstevel@tonic-gate /* 21730Sstevel@tonic-gate * Add the name struct to the end of the 21740Sstevel@tonic-gate * namelist but keep a pointer to the last 21750Sstevel@tonic-gate * element so that we don't incur the overhead 21760Sstevel@tonic-gate * of traversing the list each time 21770Sstevel@tonic-gate */ 21780Sstevel@tonic-gate if (di->dd_dnp && 21790Sstevel@tonic-gate (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) && 21800Sstevel@tonic-gate (np = metaslicename(di->dd_dnp, rep_slice, ep)) && 21810Sstevel@tonic-gate (tailpp = meta_namelist_append_wrapper(tailpp, np))) 21820Sstevel@tonic-gate count++; 21830Sstevel@tonic-gate else 21840Sstevel@tonic-gate count = -1; 21850Sstevel@tonic-gate } 21860Sstevel@tonic-gate 21870Sstevel@tonic-gate /* now find 
logs */ 21880Sstevel@tonic-gate if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) { 21890Sstevel@tonic-gate count = -1; 21900Sstevel@tonic-gate goto out; 21910Sstevel@tonic-gate } 21920Sstevel@tonic-gate 21930Sstevel@tonic-gate for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) { 21940Sstevel@tonic-gate mdname_t *transnp = nlp->namep; 21950Sstevel@tonic-gate md_trans_t *transp; 21960Sstevel@tonic-gate 21970Sstevel@tonic-gate if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) { 21980Sstevel@tonic-gate count = -1; 21990Sstevel@tonic-gate goto out; 22000Sstevel@tonic-gate } 22010Sstevel@tonic-gate if (transp->lognamep) { 22020Sstevel@tonic-gate /* 22030Sstevel@tonic-gate * Add the name struct to the end of the 22040Sstevel@tonic-gate * namelist but keep a pointer to the last 22050Sstevel@tonic-gate * element so that we don't incur the overhead 22060Sstevel@tonic-gate * of traversing the list each time 22070Sstevel@tonic-gate */ 22080Sstevel@tonic-gate tailpp = meta_namelist_append_wrapper( 22090Sstevel@tonic-gate tailpp, transp->lognamep); 22100Sstevel@tonic-gate } 22110Sstevel@tonic-gate } 22120Sstevel@tonic-gate out: 22130Sstevel@tonic-gate metafreenamelist(transnlp); 22140Sstevel@tonic-gate return (count); 22150Sstevel@tonic-gate } 22160Sstevel@tonic-gate 22170Sstevel@tonic-gate /* 22180Sstevel@tonic-gate * Entry point to join a node to MultiNode diskset. 22190Sstevel@tonic-gate * 22200Sstevel@tonic-gate * Validate host in diskset. 22210Sstevel@tonic-gate * - Should be in membership list from API 22220Sstevel@tonic-gate * - Should not already be joined into diskset. 22230Sstevel@tonic-gate * - Set must have drives 22240Sstevel@tonic-gate * Assume valid configuration is stored in the set/drive/node records 22250Sstevel@tonic-gate * in the local mddb since no node or drive can be added to the MNset 22260Sstevel@tonic-gate * unless all drives and nodes are available. 
Reconfig steps will 22270Sstevel@tonic-gate * resync all ALIVE nodes in case of panic in critical areas. 22280Sstevel@tonic-gate * 22290Sstevel@tonic-gate * Lock down the set. 22300Sstevel@tonic-gate * Verify host is a member of this diskset. 22310Sstevel@tonic-gate * If drives exist in the configuration, load the mddbs. 22320Sstevel@tonic-gate * Set this node to active by notifying master if one exists. 22330Sstevel@tonic-gate * If this is the first node active in the diskset, this node 22340Sstevel@tonic-gate * becomes the master. 22350Sstevel@tonic-gate * Unlock the set. 22360Sstevel@tonic-gate * 22370Sstevel@tonic-gate * Mirror Resync: 22380Sstevel@tonic-gate * If this node is the last node to join the set and clustering 22390Sstevel@tonic-gate * isn't running, then start the 'metasync -r' type resync 22400Sstevel@tonic-gate * on all mirrors in this diskset. 22410Sstevel@tonic-gate * If clustering is running, this resync operation will 22420Sstevel@tonic-gate * be handled by the reconfig steps and should NOT 22430Sstevel@tonic-gate * be handled during a join operation. 22440Sstevel@tonic-gate * 22450Sstevel@tonic-gate * There are multiple return values in order to assist 22460Sstevel@tonic-gate * the join operation of all sets in the metaset command. 22470Sstevel@tonic-gate * 22480Sstevel@tonic-gate * Return values: 22490Sstevel@tonic-gate * 0 - Node successfully joined to set. 22500Sstevel@tonic-gate * -1 - Join attempted but failed 22510Sstevel@tonic-gate * - any failure from libmeta calls 22520Sstevel@tonic-gate * - node not in the member list 22530Sstevel@tonic-gate * -2 - Join not attempted since 22540Sstevel@tonic-gate * - this set had no drives in set 22550Sstevel@tonic-gate * - this node already joined to set 22560Sstevel@tonic-gate * - set is not a multinode set 22570Sstevel@tonic-gate * -3 - Node joined to STALE set. 
22580Sstevel@tonic-gate */ 22590Sstevel@tonic-gate extern int 22600Sstevel@tonic-gate meta_set_join( 22610Sstevel@tonic-gate mdsetname_t *sp, 22620Sstevel@tonic-gate md_error_t *ep 22630Sstevel@tonic-gate ) 22640Sstevel@tonic-gate { 22650Sstevel@tonic-gate md_set_desc *sd; 22660Sstevel@tonic-gate md_drive_desc *dd; 22670Sstevel@tonic-gate md_mnnode_desc *nd, *nd2, my_nd; 22680Sstevel@tonic-gate int rval = 0; 22690Sstevel@tonic-gate md_setkey_t *cl_sk; 22700Sstevel@tonic-gate md_error_t xep = mdnullerror; 22710Sstevel@tonic-gate md_error_t ep_snarf = mdnullerror; 22720Sstevel@tonic-gate int master_flag = 0; 22730Sstevel@tonic-gate md_mnset_record *mas_mnsr = NULL; 22740Sstevel@tonic-gate int clear_nr_flags = 0; 22750Sstevel@tonic-gate md_mnnode_record *nr; 22760Sstevel@tonic-gate int stale_set = 0; 22770Sstevel@tonic-gate int rb_flags = 0; 22780Sstevel@tonic-gate int stale_bool = FALSE; 22790Sstevel@tonic-gate int suspendall_flag = 0; 22800Sstevel@tonic-gate int suspend1_flag = 0; 22810Sstevel@tonic-gate sigset_t oldsigs; 22820Sstevel@tonic-gate int send_reinit = 0; 22830Sstevel@tonic-gate 22840Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 22850Sstevel@tonic-gate return (-1); 22860Sstevel@tonic-gate } 22870Sstevel@tonic-gate 22880Sstevel@tonic-gate /* Must be a multinode diskset */ 22890Sstevel@tonic-gate if (!MD_MNSET_DESC(sd)) { 22900Sstevel@tonic-gate (void) mderror(ep, MDE_NOT_MN, sp->setname); 22910Sstevel@tonic-gate return (-2); 22920Sstevel@tonic-gate } 22930Sstevel@tonic-gate 22940Sstevel@tonic-gate /* Verify that the node is ALIVE (i.e. 
is in the API membership list) */ 22950Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { 22960Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, 22970Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, NULL, 22980Sstevel@tonic-gate sp->setname); 22990Sstevel@tonic-gate return (-1); 23000Sstevel@tonic-gate } 23010Sstevel@tonic-gate 23020Sstevel@tonic-gate /* Make sure we are blocking all signals */ 23030Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0) 23040Sstevel@tonic-gate mdclrerror(&xep); 23050Sstevel@tonic-gate 23060Sstevel@tonic-gate /* 23070Sstevel@tonic-gate * Lock the set on current set members. 23080Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against 23090Sstevel@tonic-gate * other meta* commands running on the other nodes. 23100Sstevel@tonic-gate */ 23110Sstevel@tonic-gate nd = sd->sd_nodelist; 23120Sstevel@tonic-gate while (nd) { 23130Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 23140Sstevel@tonic-gate nd = nd->nd_next; 23150Sstevel@tonic-gate continue; 23160Sstevel@tonic-gate } 23170Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 23180Sstevel@tonic-gate rval = -1; 23190Sstevel@tonic-gate goto out; 23200Sstevel@tonic-gate } 23210Sstevel@tonic-gate nd = nd->nd_next; 23220Sstevel@tonic-gate } 23230Sstevel@tonic-gate 23240Sstevel@tonic-gate /* 23250Sstevel@tonic-gate * Lock out other meta* commands by suspending 23260Sstevel@tonic-gate * class 1 messages across the diskset. 
23270Sstevel@tonic-gate */ 23280Sstevel@tonic-gate nd = sd->sd_nodelist; 23290Sstevel@tonic-gate while (nd) { 23300Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 23310Sstevel@tonic-gate nd = nd->nd_next; 23320Sstevel@tonic-gate continue; 23330Sstevel@tonic-gate } 23340Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 23350Sstevel@tonic-gate sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 23360Sstevel@tonic-gate rval = -1; 23370Sstevel@tonic-gate goto out; 23380Sstevel@tonic-gate } 23390Sstevel@tonic-gate suspend1_flag = 1; 23400Sstevel@tonic-gate nd = nd->nd_next; 23410Sstevel@tonic-gate } 23420Sstevel@tonic-gate 23430Sstevel@tonic-gate /* 23440Sstevel@tonic-gate * Verify that this host is a member (in the host list) of the set. 23450Sstevel@tonic-gate */ 23460Sstevel@tonic-gate nd = sd->sd_nodelist; 23470Sstevel@tonic-gate while (nd) { 23480Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) { 23490Sstevel@tonic-gate break; 23500Sstevel@tonic-gate } 23510Sstevel@tonic-gate nd = nd->nd_next; 23520Sstevel@tonic-gate } 23530Sstevel@tonic-gate if (!nd) { 23540Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 23550Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, NULL, 23560Sstevel@tonic-gate sp->setname); 23570Sstevel@tonic-gate rval = -1; 23580Sstevel@tonic-gate goto out; 23590Sstevel@tonic-gate } 23600Sstevel@tonic-gate 23610Sstevel@tonic-gate /* 23620Sstevel@tonic-gate * Need to return failure if host is already 'joined' 23630Sstevel@tonic-gate * into the set. This is done so that if later the user 23640Sstevel@tonic-gate * issues a command to join all sets and a failure is 23650Sstevel@tonic-gate * encountered - that the resulting cleanup effort 23660Sstevel@tonic-gate * (withdrawing from all sets that were joined 23670Sstevel@tonic-gate * during that command) won't withdraw from this set. 
23680Sstevel@tonic-gate */ 23690Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 23700Sstevel@tonic-gate rval = -2; 23710Sstevel@tonic-gate goto out2; 23720Sstevel@tonic-gate } 23730Sstevel@tonic-gate 23740Sstevel@tonic-gate /* 23750Sstevel@tonic-gate * Call metaget_setownership that calls each node in diskset and 23760Sstevel@tonic-gate * marks in set descriptor if node is an owner of the set or not. 23770Sstevel@tonic-gate * metaget_setownership checks to see if a node is an owner by 23780Sstevel@tonic-gate * checking to see if that node's kernel has the mddb loaded. 23790Sstevel@tonic-gate * If a node had panic'd during a reconfig or an 23800Sstevel@tonic-gate * add/delete/join/withdraw operation, the other nodes' node 23810Sstevel@tonic-gate * records may not reflect the current state of the diskset, 23820Sstevel@tonic-gate * so calling metaget_setownership is the safest thing to do. 23830Sstevel@tonic-gate */ 23840Sstevel@tonic-gate if (metaget_setownership(sp, ep) == -1) { 23850Sstevel@tonic-gate rval = -1; 23860Sstevel@tonic-gate goto out; 23870Sstevel@tonic-gate } 23880Sstevel@tonic-gate 23890Sstevel@tonic-gate /* If first active member of diskset, become the master. */ 23900Sstevel@tonic-gate nd = sd->sd_nodelist; 23910Sstevel@tonic-gate while (nd) { 23920Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) 23930Sstevel@tonic-gate break; 23940Sstevel@tonic-gate nd = nd->nd_next; 23950Sstevel@tonic-gate } 23960Sstevel@tonic-gate if (nd == NULL) 23970Sstevel@tonic-gate master_flag = 1; 23980Sstevel@tonic-gate 23990Sstevel@tonic-gate /* 24000Sstevel@tonic-gate * If not first active member of diskset, then get the 24010Sstevel@tonic-gate * master information from a node that is already joined 24020Sstevel@tonic-gate * and set the master information for this node. Be sure 24030Sstevel@tonic-gate * that this node (the already joined node) has its own 24040Sstevel@tonic-gate * join flag set. 
If not, then this diskset isn't currently 24050Sstevel@tonic-gate * consistent and shouldn't allow a node to join. This diskset 24060Sstevel@tonic-gate * inconsistency should only occur when a node has panic'd in 24070Sstevel@tonic-gate * the set while doing a metaset operation and the sysadmin is 24080Sstevel@tonic-gate * attempting to join a node into the set. This inconsistency 24090Sstevel@tonic-gate * will be fixed during a reconfig cycle which should be occurring 24100Sstevel@tonic-gate * soon since a node panic'd. 24110Sstevel@tonic-gate * 24120Sstevel@tonic-gate * If unable to get this information from an owning node, then 24130Sstevel@tonic-gate * this diskset isn't currently consistent and shouldn't 24140Sstevel@tonic-gate * allow a node to join. 24150Sstevel@tonic-gate */ 24160Sstevel@tonic-gate if (!master_flag) { 24170Sstevel@tonic-gate /* get master information from an owner (joined) node */ 24180Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 24190Sstevel@tonic-gate sp->setno, &mas_mnsr, ep) == -1) { 24200Sstevel@tonic-gate rval = -1; 24210Sstevel@tonic-gate goto out; 24220Sstevel@tonic-gate } 24230Sstevel@tonic-gate 24240Sstevel@tonic-gate /* Verify that owner (joined) node has its own JOIN flag set */ 24250Sstevel@tonic-gate nr = mas_mnsr->sr_nodechain; 24260Sstevel@tonic-gate while (nr) { 24270Sstevel@tonic-gate if ((nd->nd_nodeid == nr->nr_nodeid) && 24280Sstevel@tonic-gate ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { 24290Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOSET, 24300Sstevel@tonic-gate sp->setno, nd->nd_nodename, NULL, 24310Sstevel@tonic-gate nd->nd_nodename); 24320Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr); 24330Sstevel@tonic-gate rval = -1; 24340Sstevel@tonic-gate goto out; 24350Sstevel@tonic-gate } 24360Sstevel@tonic-gate nr = nr->nr_next; 24370Sstevel@tonic-gate } 24380Sstevel@tonic-gate 24390Sstevel@tonic-gate /* 24400Sstevel@tonic-gate * Does master have set marked as STALE? 
24410Sstevel@tonic-gate * If so, need to pass this down to kernel when 24420Sstevel@tonic-gate * this node snarfs the set. 24430Sstevel@tonic-gate */ 24440Sstevel@tonic-gate if (clnt_mn_is_stale(nd->nd_nodename, sp, 24450Sstevel@tonic-gate &stale_bool, ep) == -1) { 24460Sstevel@tonic-gate rval = -1; 24470Sstevel@tonic-gate goto out; 24480Sstevel@tonic-gate } 24490Sstevel@tonic-gate 24500Sstevel@tonic-gate /* set master information in my rpc.metad's set record */ 24510Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, 24520Sstevel@tonic-gate mas_mnsr->sr_master_nodeid, ep)) { 24530Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr); 24540Sstevel@tonic-gate rval = -1; 24550Sstevel@tonic-gate goto out; 24560Sstevel@tonic-gate } 24570Sstevel@tonic-gate 24580Sstevel@tonic-gate /* set master information in my cached set desc */ 24590Sstevel@tonic-gate (void) strcpy(sd->sd_mn_master_nodenm, 24600Sstevel@tonic-gate mas_mnsr->sr_master_nodenm); 24610Sstevel@tonic-gate sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; 24620Sstevel@tonic-gate nd2 = sd->sd_nodelist; 24630Sstevel@tonic-gate while (nd2) { 24640Sstevel@tonic-gate if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { 24650Sstevel@tonic-gate sd->sd_mn_masternode = nd2; 24660Sstevel@tonic-gate break; 24670Sstevel@tonic-gate } 24680Sstevel@tonic-gate nd2 = nd2->nd_next; 24690Sstevel@tonic-gate } 24700Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr); 24710Sstevel@tonic-gate 24720Sstevel@tonic-gate /* 24730Sstevel@tonic-gate * Set the node flags in mynode's rpc.metad node records for 24740Sstevel@tonic-gate * the nodes that are in the diskset. Can use my sd 24750Sstevel@tonic-gate * since earlier call to metaget_setownership set the 24760Sstevel@tonic-gate * owner flags based on whether that node had snarfed 24770Sstevel@tonic-gate * the MN diskset mddb. 
Reconfig steps guarantee that 24780Sstevel@tonic-gate * return of metaget_setownership will match the owning 24790Sstevel@tonic-gate * node's owner list except in the case where a node 24800Sstevel@tonic-gate * has just panic'd and in this case, a reconfig will 24810Sstevel@tonic-gate * be starting immediately and the owner lists will 24820Sstevel@tonic-gate * be sync'd up by the reconfig. 24830Sstevel@tonic-gate * 24840Sstevel@tonic-gate * Flag of SET means to take no action except to 24850Sstevel@tonic-gate * set the node flags as given in the nodelist linked list. 24860Sstevel@tonic-gate */ 24870Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 24880Sstevel@tonic-gate MD_NR_SET, NULL, ep)) { 24890Sstevel@tonic-gate rval = -1; 24900Sstevel@tonic-gate goto out; 24910Sstevel@tonic-gate } 24920Sstevel@tonic-gate } 24930Sstevel@tonic-gate 24940Sstevel@tonic-gate /* 24950Sstevel@tonic-gate * Read in the mddb if there are drives in the set. 24960Sstevel@tonic-gate */ 24970Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 24980Sstevel@tonic-gate ep)) == NULL) { 24990Sstevel@tonic-gate /* No drives in list */ 25000Sstevel@tonic-gate if (! mdisok(ep)) { 25010Sstevel@tonic-gate rval = -1; 25020Sstevel@tonic-gate goto out; 25030Sstevel@tonic-gate } 25040Sstevel@tonic-gate rval = -2; 25050Sstevel@tonic-gate goto out; 25060Sstevel@tonic-gate } 25070Sstevel@tonic-gate 25080Sstevel@tonic-gate /* 25090Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change. 25100Sstevel@tonic-gate * Start by suspending rpc.mdcommd (which drains it of all messages), 25110Sstevel@tonic-gate * then change the nodelist followed by a reinit and resume. 
25120Sstevel@tonic-gate */ 25130Sstevel@tonic-gate nd = sd->sd_nodelist; 25140Sstevel@tonic-gate while (nd) { 25150Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 25160Sstevel@tonic-gate nd = nd->nd_next; 25170Sstevel@tonic-gate continue; 25180Sstevel@tonic-gate } 25190Sstevel@tonic-gate 25200Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, 25210Sstevel@tonic-gate MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 25220Sstevel@tonic-gate rval = -1; 25230Sstevel@tonic-gate goto out; 25240Sstevel@tonic-gate } 25250Sstevel@tonic-gate suspendall_flag = 1; 25260Sstevel@tonic-gate nd = nd->nd_next; 25270Sstevel@tonic-gate } 25280Sstevel@tonic-gate 25290Sstevel@tonic-gate /* Set master in my set record in rpc.metad */ 25300Sstevel@tonic-gate if (master_flag) { 25310Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, 25320Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, 25330Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodeid, ep)) { 25340Sstevel@tonic-gate rval = -1; 25350Sstevel@tonic-gate goto out; 25360Sstevel@tonic-gate } 25370Sstevel@tonic-gate } 2538650Sskamm /* 2539650Sskamm * Causes mddbs to be loaded into the kernel. 2540650Sskamm * Set the force flag so that replica locations can be 2541650Sskamm * loaded into the kernel even if a mediator node was 2542650Sskamm * unavailable. This allows a node to join an MO 2543650Sskamm * diskset when there are sufficient replicas available, 2544650Sskamm * but a mediator node in unavailable. 2545650Sskamm */ 2546650Sskamm if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 25470Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 25480Sstevel@tonic-gate "Host not able to start diskset.")); 25490Sstevel@tonic-gate rval = -1; 25500Sstevel@tonic-gate goto out; 25510Sstevel@tonic-gate } 25520Sstevel@tonic-gate 25530Sstevel@tonic-gate if (! 
mdisok(ep)) { 25540Sstevel@tonic-gate rval = -1; 25550Sstevel@tonic-gate goto out; 25560Sstevel@tonic-gate } 25570Sstevel@tonic-gate 25580Sstevel@tonic-gate /* 25590Sstevel@tonic-gate * Set rollback flags to 1 so that halt_set is called if a failure 25600Sstevel@tonic-gate * is seen after this point. If snarf_set fails, still need to 25610Sstevel@tonic-gate * call halt_set to cleanup the diskset. 25620Sstevel@tonic-gate */ 25630Sstevel@tonic-gate rb_flags = 1; 25640Sstevel@tonic-gate 25650Sstevel@tonic-gate /* Starts the set */ 25660Sstevel@tonic-gate if (snarf_set(sp, stale_bool, ep) != 0) { 25670Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_STALE)) { 25680Sstevel@tonic-gate /* 25690Sstevel@tonic-gate * Don't fail join, STALE means that set has 25700Sstevel@tonic-gate * < 50% mddbs. 25710Sstevel@tonic-gate */ 25720Sstevel@tonic-gate (void) mdstealerror(&ep_snarf, ep); 25730Sstevel@tonic-gate stale_set = 1; 25740Sstevel@tonic-gate } else if (mdisok(ep)) { 25750Sstevel@tonic-gate /* If snarf failed, but no error was set - set it */ 257662Sjeanm (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 25770Sstevel@tonic-gate sp->setno, 0, NULL); 25780Sstevel@tonic-gate rval = -1; 25790Sstevel@tonic-gate goto out; 25800Sstevel@tonic-gate } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { 25810Sstevel@tonic-gate /* 25820Sstevel@tonic-gate * Don't fail join if ACCOK; ACCOK means that mediator 25830Sstevel@tonic-gate * provided extra vote. 25840Sstevel@tonic-gate */ 25850Sstevel@tonic-gate rval = -1; 25860Sstevel@tonic-gate goto out; 25870Sstevel@tonic-gate } 25880Sstevel@tonic-gate } 25890Sstevel@tonic-gate 25900Sstevel@tonic-gate /* Did set really get snarfed? 
*/ 25910Sstevel@tonic-gate if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { 25920Sstevel@tonic-gate if (mdisok(ep)) { 25930Sstevel@tonic-gate /* If snarf failed, but no error was set - set it */ 259462Sjeanm (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 25950Sstevel@tonic-gate sp->setno, 0, NULL); 25960Sstevel@tonic-gate } 25970Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 25980Sstevel@tonic-gate "Host not able to start diskset.")); 25990Sstevel@tonic-gate rval = -1; 26000Sstevel@tonic-gate goto out; 26010Sstevel@tonic-gate } 26020Sstevel@tonic-gate 26030Sstevel@tonic-gate /* Change to nodelist so need to send reinit to rpc.mdcommd */ 26040Sstevel@tonic-gate send_reinit = 1; 26050Sstevel@tonic-gate 26060Sstevel@tonic-gate /* If first node to enter set, setup master and clear change log */ 26070Sstevel@tonic-gate if (master_flag) { 26080Sstevel@tonic-gate /* Set master in my locally cached set descriptor */ 26090Sstevel@tonic-gate (void) strcpy(sd->sd_mn_master_nodenm, 26100Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename); 26110Sstevel@tonic-gate sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; 26120Sstevel@tonic-gate sd->sd_mn_am_i_master = 1; 26130Sstevel@tonic-gate 26140Sstevel@tonic-gate /* 26150Sstevel@tonic-gate * If first node to join set, then clear out change log 26160Sstevel@tonic-gate * entries. Change log entries are only needed when a 26170Sstevel@tonic-gate * change of master is occurring in a diskset that has 26180Sstevel@tonic-gate * multiple owners. Since this node is the first owner 26190Sstevel@tonic-gate * of the diskset, clear the entries. 26200Sstevel@tonic-gate * 26210Sstevel@tonic-gate * Only do this if we are in a single node non-SC3.x 26220Sstevel@tonic-gate * situation. 
26230Sstevel@tonic-gate */ 26240Sstevel@tonic-gate if (meta_mn_singlenode() && 26250Sstevel@tonic-gate mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { 26260Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 26270Sstevel@tonic-gate "Unable to reset changelog.")); 26280Sstevel@tonic-gate rval = -1; 26290Sstevel@tonic-gate goto out; 26300Sstevel@tonic-gate } 26310Sstevel@tonic-gate } 26320Sstevel@tonic-gate 26330Sstevel@tonic-gate /* Set my locally cached flag */ 26340Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 26350Sstevel@tonic-gate 26360Sstevel@tonic-gate /* 26370Sstevel@tonic-gate * Set this node's own flag on all joined nodes in the set 26380Sstevel@tonic-gate * (including my node). 26390Sstevel@tonic-gate */ 26400Sstevel@tonic-gate clear_nr_flags = 1; 26410Sstevel@tonic-gate 26420Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode); 26430Sstevel@tonic-gate my_nd.nd_next = NULL; 26440Sstevel@tonic-gate nd = sd->sd_nodelist; 26450Sstevel@tonic-gate while (nd) { 26460Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 26470Sstevel@tonic-gate nd = nd->nd_next; 26480Sstevel@tonic-gate continue; 26490Sstevel@tonic-gate } 26500Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 26510Sstevel@tonic-gate MD_NR_JOIN, NULL, ep)) { 26520Sstevel@tonic-gate rval = -1; 26530Sstevel@tonic-gate goto out; 26540Sstevel@tonic-gate } 26550Sstevel@tonic-gate nd = nd->nd_next; 26560Sstevel@tonic-gate } 26570Sstevel@tonic-gate 26580Sstevel@tonic-gate out: 26590Sstevel@tonic-gate if (rval != NULL) { 26600Sstevel@tonic-gate /* 26610Sstevel@tonic-gate * If rollback flag is 1, then node was joined to set. 26620Sstevel@tonic-gate * Since an error occurred, withdraw node from set in 26630Sstevel@tonic-gate * order to rollback to before command was run. 26640Sstevel@tonic-gate * Need to preserve ep so that calling function can 26650Sstevel@tonic-gate * get error information. 
26660Sstevel@tonic-gate */ 26670Sstevel@tonic-gate if (rb_flags == 1) { 26680Sstevel@tonic-gate if (halt_set(sp, &xep)) { 26690Sstevel@tonic-gate mdclrerror(&xep); 26700Sstevel@tonic-gate } 26710Sstevel@tonic-gate } 26720Sstevel@tonic-gate 26730Sstevel@tonic-gate /* 26740Sstevel@tonic-gate * If error, reset master to INVALID. 26750Sstevel@tonic-gate * Ignore error since (next) first node to successfully join 26760Sstevel@tonic-gate * will set master on all nodes. 26770Sstevel@tonic-gate */ 26780Sstevel@tonic-gate (void) clnt_mnsetmaster(mynode(), sp, "", 26790Sstevel@tonic-gate MD_MN_INVALID_NID, &xep); 26800Sstevel@tonic-gate mdclrerror(&xep); 26810Sstevel@tonic-gate /* Reset master in my locally cached set descriptor */ 26820Sstevel@tonic-gate sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; 26830Sstevel@tonic-gate sd->sd_mn_am_i_master = 0; 26840Sstevel@tonic-gate 26850Sstevel@tonic-gate /* 26860Sstevel@tonic-gate * If nr flags set on other nodes, reset them. 26870Sstevel@tonic-gate */ 26880Sstevel@tonic-gate if (clear_nr_flags) { 26890Sstevel@tonic-gate nd = sd->sd_nodelist; 26900Sstevel@tonic-gate while (nd) { 26910Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 26920Sstevel@tonic-gate nd = nd->nd_next; 26930Sstevel@tonic-gate continue; 26940Sstevel@tonic-gate } 26950Sstevel@tonic-gate (void) clnt_upd_nr_flags(nd->nd_nodename, sp, 26960Sstevel@tonic-gate &my_nd, MD_NR_WITHDRAW, NULL, &xep); 26970Sstevel@tonic-gate mdclrerror(&xep); 26980Sstevel@tonic-gate nd = nd->nd_next; 26990Sstevel@tonic-gate } 27000Sstevel@tonic-gate /* Reset my locally cached flag */ 27010Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; 27020Sstevel@tonic-gate } 27030Sstevel@tonic-gate } 27040Sstevel@tonic-gate 27050Sstevel@tonic-gate /* 27060Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change. 27070Sstevel@tonic-gate * Send reinit command to mdcommd which forces it to get 27080Sstevel@tonic-gate * fresh set description. 
27090Sstevel@tonic-gate */ 27100Sstevel@tonic-gate if (send_reinit) { 27110Sstevel@tonic-gate /* Send reinit */ 27120Sstevel@tonic-gate nd = sd->sd_nodelist; 27130Sstevel@tonic-gate while (nd) { 27140Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 27150Sstevel@tonic-gate nd = nd->nd_next; 27160Sstevel@tonic-gate continue; 27170Sstevel@tonic-gate } 27180Sstevel@tonic-gate 27190Sstevel@tonic-gate /* Class is ignored for REINIT */ 27200Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 27210Sstevel@tonic-gate sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 27220Sstevel@tonic-gate /* 27230Sstevel@tonic-gate * We are here because we failed to resume 27240Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 27250Sstevel@tonic-gate * an error from the previous call 27260Sstevel@tonic-gate * If the previous call did fail, we capture 27270Sstevel@tonic-gate * that error and generate a perror with 27280Sstevel@tonic-gate * the string, "Unable to resume...". 27290Sstevel@tonic-gate * Setting rval to -1 ensures that in the 27300Sstevel@tonic-gate * next iteration of the loop, ep is not 27310Sstevel@tonic-gate * clobbered. 27320Sstevel@tonic-gate */ 27330Sstevel@tonic-gate if (rval == 0) 27340Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 27350Sstevel@tonic-gate else 27360Sstevel@tonic-gate mdclrerror(&xep); 27370Sstevel@tonic-gate rval = -1; 27380Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 27390Sstevel@tonic-gate "Unable to reinit rpc.mdcommd.")); 27400Sstevel@tonic-gate } 27410Sstevel@tonic-gate nd = nd->nd_next; 27420Sstevel@tonic-gate } 27430Sstevel@tonic-gate 27440Sstevel@tonic-gate } 27450Sstevel@tonic-gate 27460Sstevel@tonic-gate out2: 27470Sstevel@tonic-gate /* 27480Sstevel@tonic-gate * Unlock diskset by resuming messages across the diskset. 27490Sstevel@tonic-gate * Just resume all classes so that resume is the same whether 27500Sstevel@tonic-gate * just one class was locked or all classes were locked. 
27510Sstevel@tonic-gate */ 27520Sstevel@tonic-gate if ((suspend1_flag) || (suspendall_flag)) { 27530Sstevel@tonic-gate nd = sd->sd_nodelist; 27540Sstevel@tonic-gate while (nd) { 27550Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 27560Sstevel@tonic-gate nd = nd->nd_next; 27570Sstevel@tonic-gate continue; 27580Sstevel@tonic-gate } 27590Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 27600Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 27610Sstevel@tonic-gate /* 27620Sstevel@tonic-gate * We are here because we failed to resume 27630Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 27640Sstevel@tonic-gate * an error from the previous call 27650Sstevel@tonic-gate * If the previous call did fail, we capture 27660Sstevel@tonic-gate * that error and generate a perror with 27670Sstevel@tonic-gate * the string, "Unable to resume...". 27680Sstevel@tonic-gate * Setting rval to -1 ensures that in the 27690Sstevel@tonic-gate * next iteration of the loop, ep is not 27700Sstevel@tonic-gate * clobbered. 27710Sstevel@tonic-gate */ 27720Sstevel@tonic-gate if (rval == 0) 27730Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 27740Sstevel@tonic-gate else 27750Sstevel@tonic-gate mdclrerror(&xep); 27760Sstevel@tonic-gate rval = -1; 27770Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 27780Sstevel@tonic-gate "Unable to resume rpc.mdcommd.")); 27790Sstevel@tonic-gate } 27800Sstevel@tonic-gate nd = nd->nd_next; 27810Sstevel@tonic-gate } 27820Sstevel@tonic-gate meta_ping_mnset(sp->setno); 27830Sstevel@tonic-gate } 27840Sstevel@tonic-gate 27850Sstevel@tonic-gate /* 27860Sstevel@tonic-gate * Unlock set. This flushes the caches on the servers. 
27870Sstevel@tonic-gate */ 27880Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 27890Sstevel@tonic-gate nd = sd->sd_nodelist; 27900Sstevel@tonic-gate while (nd) { 27910Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 27920Sstevel@tonic-gate nd = nd->nd_next; 27930Sstevel@tonic-gate continue; 27940Sstevel@tonic-gate } 27950Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 27960Sstevel@tonic-gate if (rval == 0) 27970Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 27980Sstevel@tonic-gate else 27990Sstevel@tonic-gate mdclrerror(&xep); 28000Sstevel@tonic-gate rval = -1; 28010Sstevel@tonic-gate } 28020Sstevel@tonic-gate nd = nd->nd_next; 28030Sstevel@tonic-gate } 28040Sstevel@tonic-gate 28050Sstevel@tonic-gate /* 28060Sstevel@tonic-gate * If this node is the last to join the diskset and clustering isn't 28070Sstevel@tonic-gate * running, then resync the mirrors in the diskset. We have to wait 28080Sstevel@tonic-gate * until all nodes are joined so that the status gets propagated to 28090Sstevel@tonic-gate * all of the members of the set. 28100Sstevel@tonic-gate * Ignore any error from the resync as the join function shouldn't fail 28110Sstevel@tonic-gate * because the mirror resync had a problem. 28120Sstevel@tonic-gate * 28130Sstevel@tonic-gate * Don't start resync if set is stale. 28140Sstevel@tonic-gate */ 28150Sstevel@tonic-gate if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && 28160Sstevel@tonic-gate (stale_set != 1)) { 28170Sstevel@tonic-gate nd = sd->sd_nodelist; 28180Sstevel@tonic-gate while (nd) { 28190Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) 28200Sstevel@tonic-gate break; 28210Sstevel@tonic-gate nd = nd->nd_next; 28220Sstevel@tonic-gate } 28230Sstevel@tonic-gate /* 28240Sstevel@tonic-gate * nd set to NULL means that we have no nodes in the set that 28250Sstevel@tonic-gate * haven't joined. In this case we start the resync. 
28260Sstevel@tonic-gate */ 28270Sstevel@tonic-gate if (nd == NULL) { 28280Sstevel@tonic-gate (void) meta_mirror_resync_all(sp, 0, &xep); 28290Sstevel@tonic-gate mdclrerror(&xep); 28300Sstevel@tonic-gate } 28310Sstevel@tonic-gate } 28320Sstevel@tonic-gate 28330Sstevel@tonic-gate /* Update ABR state for all soft partitions */ 28340Sstevel@tonic-gate (void) meta_sp_update_abr(sp, &xep); 28350Sstevel@tonic-gate mdclrerror(&xep); 28360Sstevel@tonic-gate 28370Sstevel@tonic-gate /* 28380Sstevel@tonic-gate * call metaflushsetnames to reset local cache for master and 28390Sstevel@tonic-gate * node information. 28400Sstevel@tonic-gate */ 28410Sstevel@tonic-gate metaflushsetname(sp); 28420Sstevel@tonic-gate 28430Sstevel@tonic-gate /* release signals back to what they were on entry */ 28440Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0) 28450Sstevel@tonic-gate mdclrerror(&xep); 28460Sstevel@tonic-gate 28470Sstevel@tonic-gate /* 28480Sstevel@tonic-gate * If no error and stale_set is set, then set ep back 28490Sstevel@tonic-gate * to ep from snarf_set call and return -3. If another error 28500Sstevel@tonic-gate * occurred and rval is not 0, then that error would have 28510Sstevel@tonic-gate * caused the node to be withdrawn from the set and would 28520Sstevel@tonic-gate * have set ep to that error information. 28530Sstevel@tonic-gate */ 28540Sstevel@tonic-gate if ((rval == 0) && (stale_set)) { 28550Sstevel@tonic-gate (void) mdstealerror(ep, &ep_snarf); 28560Sstevel@tonic-gate return (-3); 28570Sstevel@tonic-gate } 28580Sstevel@tonic-gate 28590Sstevel@tonic-gate return (rval); 28600Sstevel@tonic-gate } 28610Sstevel@tonic-gate 28620Sstevel@tonic-gate /* 28630Sstevel@tonic-gate * Entry point to withdraw a node from MultiNode diskset. 28640Sstevel@tonic-gate * 28650Sstevel@tonic-gate * Validate host in diskset. 28660Sstevel@tonic-gate * - Should be joined into diskset. 
 * Assume valid configuration is stored in the set/drive/node records
 * in the local mddb since no node or drive can be added to the MNset
 * unless all drives and nodes are available.  Reconfig steps will
 * resync all ALIVE nodes in case of panic in critical areas.
 *
 * Lock down the set.
 * Verify that drives exist in configuration.
 * Verify host is a member of this diskset.
 * Verify host is an owner of the diskset (host is joined to diskset).
 * Only allow withdrawal of master node if master node is the only joined
 * node in the diskset.
 * Halt the diskset on this node.
 * Reset Master on this node.
 * Update node flags to show that this node has withdrawn.
 * Unlock the set.
 *
 * Return values:
 *	0  - Node successfully withdrew from set.
28850Sstevel@tonic-gate * -1 - Withdrawal attempted but failed 28860Sstevel@tonic-gate * - any failure from libmeta calls 28870Sstevel@tonic-gate * - node not in the member list 28880Sstevel@tonic-gate * -2 - Withdrawal not attempted since 28890Sstevel@tonic-gate * - this set had no drives in set 28900Sstevel@tonic-gate * - this node not joined to set 28910Sstevel@tonic-gate * - set is not a multinode set 28920Sstevel@tonic-gate */ 28930Sstevel@tonic-gate extern int 28940Sstevel@tonic-gate meta_set_withdraw( 28950Sstevel@tonic-gate mdsetname_t *sp, 28960Sstevel@tonic-gate md_error_t *ep 28970Sstevel@tonic-gate ) 28980Sstevel@tonic-gate { 28990Sstevel@tonic-gate md_set_desc *sd; 29000Sstevel@tonic-gate md_drive_desc *dd = 0; 29010Sstevel@tonic-gate md_mnnode_desc *nd, my_nd; 29020Sstevel@tonic-gate int rval = 0; 29030Sstevel@tonic-gate md_setkey_t *cl_sk; 29040Sstevel@tonic-gate md_error_t xep = mdnullerror; 29050Sstevel@tonic-gate int set_halted = 0; 29060Sstevel@tonic-gate int suspendall_flag = 0; 29070Sstevel@tonic-gate int suspend1_flag = 0; 29080Sstevel@tonic-gate bool_t stale_bool = FALSE; 29090Sstevel@tonic-gate mddb_config_t c; 29100Sstevel@tonic-gate int node_id_list[1]; 29110Sstevel@tonic-gate sigset_t oldsigs; 29120Sstevel@tonic-gate int send_reinit = 0; 29130Sstevel@tonic-gate 29140Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 29150Sstevel@tonic-gate return (-1); 29160Sstevel@tonic-gate } 29170Sstevel@tonic-gate 29180Sstevel@tonic-gate /* Must be a multinode diskset */ 29190Sstevel@tonic-gate if (!MD_MNSET_DESC(sd)) { 29200Sstevel@tonic-gate (void) mderror(ep, MDE_NOT_MN, sp->setname); 29210Sstevel@tonic-gate return (-1); 29220Sstevel@tonic-gate } 29230Sstevel@tonic-gate 29240Sstevel@tonic-gate /* Make sure we are blocking all signals */ 29250Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0) 29260Sstevel@tonic-gate mdclrerror(&xep); 29270Sstevel@tonic-gate 29280Sstevel@tonic-gate /* 29290Sstevel@tonic-gate * Lock the set on 
current set members. 29300Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against 29310Sstevel@tonic-gate * other meta* commands running on the other nodes. 29320Sstevel@tonic-gate */ 29330Sstevel@tonic-gate nd = sd->sd_nodelist; 29340Sstevel@tonic-gate while (nd) { 29350Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 29360Sstevel@tonic-gate nd = nd->nd_next; 29370Sstevel@tonic-gate continue; 29380Sstevel@tonic-gate } 29390Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 29400Sstevel@tonic-gate rval = -1; 29410Sstevel@tonic-gate goto out; 29420Sstevel@tonic-gate } 29430Sstevel@tonic-gate nd = nd->nd_next; 29440Sstevel@tonic-gate } 29450Sstevel@tonic-gate /* 29460Sstevel@tonic-gate * Lock out other meta* commands by suspending 29470Sstevel@tonic-gate * class 1 messages across the diskset. 29480Sstevel@tonic-gate */ 29490Sstevel@tonic-gate nd = sd->sd_nodelist; 29500Sstevel@tonic-gate while (nd) { 29510Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 29520Sstevel@tonic-gate nd = nd->nd_next; 29530Sstevel@tonic-gate continue; 29540Sstevel@tonic-gate } 29550Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 29560Sstevel@tonic-gate sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 29570Sstevel@tonic-gate rval = -1; 29580Sstevel@tonic-gate goto out; 29590Sstevel@tonic-gate } 29600Sstevel@tonic-gate suspend1_flag = 1; 29610Sstevel@tonic-gate nd = nd->nd_next; 29620Sstevel@tonic-gate } 29630Sstevel@tonic-gate 29640Sstevel@tonic-gate /* Get list of drives - needed in case of failure */ 29650Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 29660Sstevel@tonic-gate ep)) == NULL) { 29670Sstevel@tonic-gate /* Error getting drives in list */ 29680Sstevel@tonic-gate if (! 
mdisok(ep)) { 29690Sstevel@tonic-gate rval = -1; 29700Sstevel@tonic-gate goto out2; 29710Sstevel@tonic-gate } 29720Sstevel@tonic-gate /* no drives in list */ 29730Sstevel@tonic-gate rval = -2; 29740Sstevel@tonic-gate goto out2; 29750Sstevel@tonic-gate } 29760Sstevel@tonic-gate 29770Sstevel@tonic-gate /* 29780Sstevel@tonic-gate * Verify that this host is a member (in the host list) of the set. 29790Sstevel@tonic-gate */ 29800Sstevel@tonic-gate nd = sd->sd_nodelist; 29810Sstevel@tonic-gate while (nd) { 29820Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) { 29830Sstevel@tonic-gate break; 29840Sstevel@tonic-gate } 29850Sstevel@tonic-gate nd = nd->nd_next; 29860Sstevel@tonic-gate } 29870Sstevel@tonic-gate if (!nd) { 29880Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 29890Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, NULL, 29900Sstevel@tonic-gate sp->setname); 29910Sstevel@tonic-gate rval = -1; 29920Sstevel@tonic-gate goto out2; 29930Sstevel@tonic-gate } 29940Sstevel@tonic-gate 29950Sstevel@tonic-gate /* 29960Sstevel@tonic-gate * Call metaget_setownership that calls each node in diskset and 29970Sstevel@tonic-gate * marks in set descriptor if node is an owner of the set or not. 29980Sstevel@tonic-gate * metaget_setownership checks to see if a node is an owner by 29990Sstevel@tonic-gate * checking to see if that node's kernel has the mddb loaded. 30000Sstevel@tonic-gate * If a node had panic'd during a reconfig or an 30010Sstevel@tonic-gate * add/delete/join/withdraw operation, the other nodes' node 30020Sstevel@tonic-gate * records may not reflect the current state of the diskset, 30030Sstevel@tonic-gate * so calling metaget_setownership is the safest thing to do. 
30040Sstevel@tonic-gate */ 30050Sstevel@tonic-gate if (metaget_setownership(sp, ep) == -1) { 30060Sstevel@tonic-gate rval = -1; 30070Sstevel@tonic-gate goto out2; 30080Sstevel@tonic-gate } 30090Sstevel@tonic-gate 30100Sstevel@tonic-gate /* 30110Sstevel@tonic-gate * Verify that this node is joined 30120Sstevel@tonic-gate * to diskset (i.e. is an owner of the diskset). 30130Sstevel@tonic-gate */ 30140Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 30150Sstevel@tonic-gate rval = -2; 30160Sstevel@tonic-gate goto out2; 30170Sstevel@tonic-gate } 30180Sstevel@tonic-gate 30190Sstevel@tonic-gate /* 30200Sstevel@tonic-gate * For a MN diskset, only withdraw master if it is 30210Sstevel@tonic-gate * the only joined node. 30220Sstevel@tonic-gate */ 30230Sstevel@tonic-gate if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { 30240Sstevel@tonic-gate nd = sd->sd_nodelist; 30250Sstevel@tonic-gate while (nd) { 30260Sstevel@tonic-gate /* Skip my node since checking for other owners */ 30270Sstevel@tonic-gate if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { 30280Sstevel@tonic-gate nd = nd->nd_next; 30290Sstevel@tonic-gate continue; 30300Sstevel@tonic-gate } 30310Sstevel@tonic-gate /* If another owner node if found, error */ 30320Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 30330Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, 30340Sstevel@tonic-gate sp->setno, 30350Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename, NULL, 30360Sstevel@tonic-gate sp->setname); 30370Sstevel@tonic-gate rval = -1; 30380Sstevel@tonic-gate goto out2; 30390Sstevel@tonic-gate } 30400Sstevel@tonic-gate nd = nd->nd_next; 30410Sstevel@tonic-gate } 30420Sstevel@tonic-gate } 30430Sstevel@tonic-gate 30440Sstevel@tonic-gate /* 30450Sstevel@tonic-gate * Is current set STALE? 
30460Sstevel@tonic-gate */ 30470Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c)); 30480Sstevel@tonic-gate c.c_id = 0; 30490Sstevel@tonic-gate c.c_setno = sp->setno; 30500Sstevel@tonic-gate if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 30510Sstevel@tonic-gate (void) mdstealerror(ep, &c.c_mde); 30520Sstevel@tonic-gate rval = -1; 30530Sstevel@tonic-gate goto out; 30540Sstevel@tonic-gate } 30550Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) { 30560Sstevel@tonic-gate stale_bool = TRUE; 30570Sstevel@tonic-gate } 30580Sstevel@tonic-gate 30590Sstevel@tonic-gate /* 30600Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change. 30610Sstevel@tonic-gate * Start by suspending rpc.mdcommd (which drains it of all messages), 30620Sstevel@tonic-gate * then change the nodelist followed by a reinit and resume. 30630Sstevel@tonic-gate */ 30640Sstevel@tonic-gate nd = sd->sd_nodelist; 30650Sstevel@tonic-gate while (nd) { 30660Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 30670Sstevel@tonic-gate nd = nd->nd_next; 30680Sstevel@tonic-gate continue; 30690Sstevel@tonic-gate } 30700Sstevel@tonic-gate 30710Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 30720Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 30730Sstevel@tonic-gate rval = -1; 30740Sstevel@tonic-gate goto out; 30750Sstevel@tonic-gate } 30760Sstevel@tonic-gate suspendall_flag = 1; 30770Sstevel@tonic-gate nd = nd->nd_next; 30780Sstevel@tonic-gate } 30790Sstevel@tonic-gate 30800Sstevel@tonic-gate /* 30810Sstevel@tonic-gate * Withdraw the set - halt set. 30820Sstevel@tonic-gate * This will fail if any I/O is occuring to any metadevice which 30830Sstevel@tonic-gate * includes a resync to a mirror metadevice. 30840Sstevel@tonic-gate */ 30850Sstevel@tonic-gate set_halted = 1; 30860Sstevel@tonic-gate if (halt_set(sp, ep)) { 30870Sstevel@tonic-gate /* Was set actually halted? 
*/ 30880Sstevel@tonic-gate if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { 30890Sstevel@tonic-gate set_halted = 0; 30900Sstevel@tonic-gate } 30910Sstevel@tonic-gate rval = -1; 30920Sstevel@tonic-gate goto out; 30930Sstevel@tonic-gate } 30940Sstevel@tonic-gate 30950Sstevel@tonic-gate /* Change to nodelist so need to send reinit to rpc.mdcommd */ 30960Sstevel@tonic-gate send_reinit = 1; 30970Sstevel@tonic-gate 30980Sstevel@tonic-gate /* Reset master on withdrawn node */ 30990Sstevel@tonic-gate if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", 31000Sstevel@tonic-gate MD_MN_INVALID_NID, ep)) { 31010Sstevel@tonic-gate rval = -1; 31020Sstevel@tonic-gate goto out; 31030Sstevel@tonic-gate } 31040Sstevel@tonic-gate 31050Sstevel@tonic-gate /* Mark my node as withdrawn and send to other nodes */ 31060Sstevel@tonic-gate nd = sd->sd_nodelist; 31070Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode); /* structure copy */ 31080Sstevel@tonic-gate my_nd.nd_next = NULL; 31090Sstevel@tonic-gate while (nd) { 31100Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 31110Sstevel@tonic-gate nd = nd->nd_next; 31120Sstevel@tonic-gate continue; 31130Sstevel@tonic-gate } 31140Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 31150Sstevel@tonic-gate MD_NR_WITHDRAW, NULL, ep)) { 31160Sstevel@tonic-gate rval = -1; 31170Sstevel@tonic-gate goto out; 31180Sstevel@tonic-gate } 31190Sstevel@tonic-gate nd = nd->nd_next; 31200Sstevel@tonic-gate } 31210Sstevel@tonic-gate 31220Sstevel@tonic-gate /* 31230Sstevel@tonic-gate * If withdrawn node is a mirror owner, reset mirror owner 31240Sstevel@tonic-gate * to NULL. If an error occurs, print a warning and continue. 31250Sstevel@tonic-gate * Don't fail metaset because of mirror owner reset problem since 31260Sstevel@tonic-gate * next node to grab mirror will resolve this issue. 
31270Sstevel@tonic-gate * Before next node grabs mirrors, metaset will show the withdrawn 31280Sstevel@tonic-gate * node as owner which is why an attempt to reset the mirror owner 31290Sstevel@tonic-gate * is made. 31300Sstevel@tonic-gate */ 31310Sstevel@tonic-gate node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ 31320Sstevel@tonic-gate nd = sd->sd_nodelist; 31330Sstevel@tonic-gate while (nd) { 31340Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 31350Sstevel@tonic-gate nd = nd->nd_next; 31360Sstevel@tonic-gate continue; 31370Sstevel@tonic-gate } 31380Sstevel@tonic-gate if (clnt_reset_mirror_owner(nd->nd_nodename, sp, 31390Sstevel@tonic-gate 1, &node_id_list[0], &xep) == 01) { 31400Sstevel@tonic-gate mde_perror(&xep, dgettext(TEXT_DOMAIN, 31410Sstevel@tonic-gate "Unable to reset mirror owner on node %s"), 31420Sstevel@tonic-gate nd->nd_nodename); 31430Sstevel@tonic-gate mdclrerror(&xep); 31440Sstevel@tonic-gate } 31450Sstevel@tonic-gate nd = nd->nd_next; 31460Sstevel@tonic-gate } 31470Sstevel@tonic-gate 31480Sstevel@tonic-gate out: 31490Sstevel@tonic-gate if (rval == -1) { 31500Sstevel@tonic-gate /* Rejoin node - Mark node as joined and send to other nodes */ 31510Sstevel@tonic-gate nd = sd->sd_nodelist; 31520Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode); /* structure copy */ 31530Sstevel@tonic-gate my_nd.nd_next = NULL; 31540Sstevel@tonic-gate while (nd) { 31550Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 31560Sstevel@tonic-gate nd = nd->nd_next; 31570Sstevel@tonic-gate continue; 31580Sstevel@tonic-gate } 31590Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 31600Sstevel@tonic-gate MD_NR_JOIN, NULL, &xep)) { 31610Sstevel@tonic-gate mdclrerror(&xep); 31620Sstevel@tonic-gate } 31630Sstevel@tonic-gate nd = nd->nd_next; 31640Sstevel@tonic-gate } 31650Sstevel@tonic-gate 31660Sstevel@tonic-gate /* Set master on withdrawn node */ 31670Sstevel@tonic-gate if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, 
sp, 31680Sstevel@tonic-gate sd->sd_mn_master_nodenm, 31690Sstevel@tonic-gate sd->sd_mn_master_nodeid, &xep)) { 31700Sstevel@tonic-gate mdclrerror(&xep); 31710Sstevel@tonic-gate } 31720Sstevel@tonic-gate 31730Sstevel@tonic-gate /* Join set if halt_set had succeeded */ 31740Sstevel@tonic-gate if (set_halted) { 3175650Sskamm /* 3176650Sskamm * Causes mddbs to be loaded into the kernel. 3177650Sskamm * Set the force flag so that replica locations can be 3178650Sskamm * loaded into the kernel even if a mediator node was 3179650Sskamm * unavailable. This allows a node to join an MO 3180650Sskamm * diskset when there are sufficient replicas available, 3181650Sskamm * but a mediator node in unavailable. 3182650Sskamm */ 3183650Sskamm if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) { 31840Sstevel@tonic-gate mdclrerror(&xep); 31850Sstevel@tonic-gate } 31860Sstevel@tonic-gate /* If set previously stale - make it so at re-join */ 31870Sstevel@tonic-gate if (snarf_set(sp, stale_bool, &xep) != 0) { 31880Sstevel@tonic-gate mdclrerror(&xep); 31890Sstevel@tonic-gate (void) halt_set(sp, &xep); 31900Sstevel@tonic-gate mdclrerror(&xep); 31910Sstevel@tonic-gate } 31920Sstevel@tonic-gate } 31930Sstevel@tonic-gate } 31940Sstevel@tonic-gate 31950Sstevel@tonic-gate /* 31960Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change. 31970Sstevel@tonic-gate * Send reinit command to mdcommd which forces it to get 31980Sstevel@tonic-gate * fresh set description. 
31990Sstevel@tonic-gate */ 32000Sstevel@tonic-gate if (send_reinit) { 32010Sstevel@tonic-gate /* Send reinit */ 32020Sstevel@tonic-gate nd = sd->sd_nodelist; 32030Sstevel@tonic-gate while (nd) { 32040Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 32050Sstevel@tonic-gate nd = nd->nd_next; 32060Sstevel@tonic-gate continue; 32070Sstevel@tonic-gate } 32080Sstevel@tonic-gate 32090Sstevel@tonic-gate /* Class is ignored for REINIT */ 32100Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 32110Sstevel@tonic-gate sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 32120Sstevel@tonic-gate /* 32130Sstevel@tonic-gate * We are here because we failed to resume 32140Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 32150Sstevel@tonic-gate * an error from the previous call. 32160Sstevel@tonic-gate * If the previous call did fail, we 32170Sstevel@tonic-gate * capture that error and generate a perror 32180Sstevel@tonic-gate * withthe string, "Unable to resume...". 32190Sstevel@tonic-gate * Setting rval to -1 ensures that in the 32200Sstevel@tonic-gate * next iteration of the loop, ep is not 32210Sstevel@tonic-gate * clobbered. 32220Sstevel@tonic-gate */ 32230Sstevel@tonic-gate if (rval == 0) 32240Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 32250Sstevel@tonic-gate else 32260Sstevel@tonic-gate mdclrerror(&xep); 32270Sstevel@tonic-gate rval = -1; 32280Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 32290Sstevel@tonic-gate "Unable to reinit rpc.mdcommd.")); 32300Sstevel@tonic-gate } 32310Sstevel@tonic-gate nd = nd->nd_next; 32320Sstevel@tonic-gate } 32330Sstevel@tonic-gate } 32340Sstevel@tonic-gate 32350Sstevel@tonic-gate out2: 32360Sstevel@tonic-gate /* 32370Sstevel@tonic-gate * Unlock diskset by resuming messages across the diskset. 32380Sstevel@tonic-gate * Just resume all classes so that resume is the same whether 32390Sstevel@tonic-gate * just one class was locked or all classes were locked. 
32400Sstevel@tonic-gate */ 32410Sstevel@tonic-gate if ((suspend1_flag) || (suspendall_flag)) { 32420Sstevel@tonic-gate nd = sd->sd_nodelist; 32430Sstevel@tonic-gate while (nd) { 32440Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 32450Sstevel@tonic-gate nd = nd->nd_next; 32460Sstevel@tonic-gate continue; 32470Sstevel@tonic-gate } 32480Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 32490Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 32500Sstevel@tonic-gate /* 32510Sstevel@tonic-gate * We are here because we failed to resume 32520Sstevel@tonic-gate * rpc.mdcommd. However we potentially have 32530Sstevel@tonic-gate * an error from the previous call 32540Sstevel@tonic-gate * If the previous call did fail, we capture 32550Sstevel@tonic-gate * that error and generate a perror with 32560Sstevel@tonic-gate * the string, "Unable to resume...". 32570Sstevel@tonic-gate * Setting rval to -1 ensures that in the 32580Sstevel@tonic-gate * next iteration of the loop, ep is not 32590Sstevel@tonic-gate * clobbered. 32600Sstevel@tonic-gate */ 32610Sstevel@tonic-gate if (rval == 0) 32620Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 32630Sstevel@tonic-gate else 32640Sstevel@tonic-gate mdclrerror(&xep); 32650Sstevel@tonic-gate rval = -1; 32660Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 32670Sstevel@tonic-gate "Unable to resume rpc.mdcommd.")); 32680Sstevel@tonic-gate } 32690Sstevel@tonic-gate nd = nd->nd_next; 32700Sstevel@tonic-gate } 32710Sstevel@tonic-gate meta_ping_mnset(sp->setno); 32720Sstevel@tonic-gate } 32730Sstevel@tonic-gate 32740Sstevel@tonic-gate /* 32750Sstevel@tonic-gate * Unlock set. This flushes the caches on the servers. 
32760Sstevel@tonic-gate */ 32770Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 32780Sstevel@tonic-gate nd = sd->sd_nodelist; 32790Sstevel@tonic-gate while (nd) { 32800Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 32810Sstevel@tonic-gate nd = nd->nd_next; 32820Sstevel@tonic-gate continue; 32830Sstevel@tonic-gate } 32840Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 32850Sstevel@tonic-gate if (rval == 0) 32860Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 32870Sstevel@tonic-gate else 32880Sstevel@tonic-gate mdclrerror(&xep); 32890Sstevel@tonic-gate rval = -1; 32900Sstevel@tonic-gate } 32910Sstevel@tonic-gate nd = nd->nd_next; 32920Sstevel@tonic-gate } 32930Sstevel@tonic-gate 32940Sstevel@tonic-gate /* 32950Sstevel@tonic-gate * call metaflushsetnames to reset local cache for master and 32960Sstevel@tonic-gate * node information. 32970Sstevel@tonic-gate */ 32980Sstevel@tonic-gate metaflushsetname(sp); 32990Sstevel@tonic-gate 33000Sstevel@tonic-gate /* release signals back to what they were on entry */ 33010Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0) 33020Sstevel@tonic-gate mdclrerror(&xep); 33030Sstevel@tonic-gate 33040Sstevel@tonic-gate return (rval); 33050Sstevel@tonic-gate 33060Sstevel@tonic-gate } 33070Sstevel@tonic-gate 33080Sstevel@tonic-gate /* 33090Sstevel@tonic-gate * Update nodelist with cluster member information. 33100Sstevel@tonic-gate * A node not in the member list will be marked 33110Sstevel@tonic-gate * as not ALIVE and not OWN. 33120Sstevel@tonic-gate * A node in the member list will be marked ALIVE, but 33130Sstevel@tonic-gate * the OWN bit will not be changed. 33140Sstevel@tonic-gate * 33150Sstevel@tonic-gate * If mynode isn't in the membership list, fail causing 33160Sstevel@tonic-gate * another reconfig cycle to be started since a non-member 33170Sstevel@tonic-gate * node shouldn't be taking part in the reconfig cycle. 
33180Sstevel@tonic-gate * 33190Sstevel@tonic-gate * Return values: 33200Sstevel@tonic-gate * 0 - No problem. 33210Sstevel@tonic-gate * 1 - Any failure including RPC failure to my node. 33220Sstevel@tonic-gate */ 33230Sstevel@tonic-gate int 33240Sstevel@tonic-gate meta_reconfig_update_nodelist( 33250Sstevel@tonic-gate mdsetname_t *sp, 33260Sstevel@tonic-gate mndiskset_membershiplist_t *nl, 33270Sstevel@tonic-gate md_set_desc *sd, 33280Sstevel@tonic-gate md_error_t *ep 33290Sstevel@tonic-gate ) 33300Sstevel@tonic-gate { 33310Sstevel@tonic-gate mndiskset_membershiplist_t *nl2; 33320Sstevel@tonic-gate md_mnnode_desc *nd; 33330Sstevel@tonic-gate md_error_t xep = mdnullerror; 33340Sstevel@tonic-gate int rval = 0; 33350Sstevel@tonic-gate 33360Sstevel@tonic-gate /* 33370Sstevel@tonic-gate * Walk through nodelist, checking to see if each 33380Sstevel@tonic-gate * node is in the member list. 33390Sstevel@tonic-gate * If node is not a member, reset ALIVE and OWN node flag. 33400Sstevel@tonic-gate * If node is a member, set ALIVE. 33410Sstevel@tonic-gate * If mynode's OWN flag gets reset, then halt the diskset on this node. 
33420Sstevel@tonic-gate */ 33430Sstevel@tonic-gate nd = sd->sd_nodelist; 33440Sstevel@tonic-gate while (nd) { 33450Sstevel@tonic-gate nl2 = nl; 33460Sstevel@tonic-gate while (nl2) { 33470Sstevel@tonic-gate /* If node is in member list, set ALIVE */ 33480Sstevel@tonic-gate if (nl2->msl_node_id == nd->nd_nodeid) { 33490Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_ALIVE; 33500Sstevel@tonic-gate break; 33510Sstevel@tonic-gate } else { 33520Sstevel@tonic-gate nl2 = nl2->next; 33530Sstevel@tonic-gate } 33540Sstevel@tonic-gate /* node is not in member list, mark !ALIVE and !OWN */ 33550Sstevel@tonic-gate if (nl2 == NULL) { 33560Sstevel@tonic-gate /* If node is mynode, then halt set if needed */ 33570Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) { 33580Sstevel@tonic-gate /* 33590Sstevel@tonic-gate * This shouldn't happen, but just 33600Sstevel@tonic-gate * in case... Any node not in the 33610Sstevel@tonic-gate * membership list should be dead and 33620Sstevel@tonic-gate * not running reconfig step1. 33630Sstevel@tonic-gate */ 33640Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 33650Sstevel@tonic-gate if (halt_set(sp, &xep)) { 33660Sstevel@tonic-gate mde_perror(&xep, ""); 33670Sstevel@tonic-gate mdclrerror(&xep); 33680Sstevel@tonic-gate } 33690Sstevel@tonic-gate } 33700Sstevel@tonic-gate /* 33710Sstevel@tonic-gate * Return failure since this node 33720Sstevel@tonic-gate * (mynode) is not in the membership 33730Sstevel@tonic-gate * list, but process the rest of the 33740Sstevel@tonic-gate * nodelist first so that rpc.metad 33750Sstevel@tonic-gate * can be updated with the latest 33760Sstevel@tonic-gate * membership information. 
33770Sstevel@tonic-gate */ 33780Sstevel@tonic-gate (void) mddserror(ep, 33790Sstevel@tonic-gate MDE_DS_NOTINMEMBERLIST, 33800Sstevel@tonic-gate sp->setno, nd->nd_nodename, NULL, 33810Sstevel@tonic-gate sp->setname); 33820Sstevel@tonic-gate rval = 1; 33830Sstevel@tonic-gate } 33840Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ALIVE; 33850Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 33860Sstevel@tonic-gate } 33870Sstevel@tonic-gate } 33880Sstevel@tonic-gate nd = nd->nd_next; 33890Sstevel@tonic-gate } 33900Sstevel@tonic-gate 33910Sstevel@tonic-gate /* Send this information to rpc.metad */ 33920Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 33930Sstevel@tonic-gate MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { 33940Sstevel@tonic-gate /* Return failure if can't send node flags to rpc.metad */ 33950Sstevel@tonic-gate if (rval == 0) { 33960Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 33970Sstevel@tonic-gate rval = 1; 33980Sstevel@tonic-gate } 33990Sstevel@tonic-gate } 34000Sstevel@tonic-gate return (rval); 34010Sstevel@tonic-gate } 34020Sstevel@tonic-gate 34030Sstevel@tonic-gate /* 34040Sstevel@tonic-gate * Choose master determines the master for a diskset. 34050Sstevel@tonic-gate * Each node determines the master on its own and 34060Sstevel@tonic-gate * adds this information to its local rpc.metad nodelist 34070Sstevel@tonic-gate * and also sends it to the kernel. 34080Sstevel@tonic-gate * 34090Sstevel@tonic-gate * Nodelist in set descriptor (sd) is sorted in 34100Sstevel@tonic-gate * monotonically increasing sequence of nodeid. 34110Sstevel@tonic-gate * 34120Sstevel@tonic-gate * Return values: 34130Sstevel@tonic-gate * 0 - No problem. 34140Sstevel@tonic-gate * 205 - There was an RPC problem to another node. 34150Sstevel@tonic-gate * -1 - There was an error. This could be an RPC error to my node. 34160Sstevel@tonic-gate * This is a catastrophic failure causing node to panic. 
34170Sstevel@tonic-gate */ 34180Sstevel@tonic-gate int 34190Sstevel@tonic-gate meta_reconfig_choose_master_for_set( 34200Sstevel@tonic-gate mdsetname_t *sp, 34210Sstevel@tonic-gate md_set_desc *sd, 34220Sstevel@tonic-gate md_error_t *ep 34230Sstevel@tonic-gate ) 34240Sstevel@tonic-gate { 34250Sstevel@tonic-gate int is_owner; 34260Sstevel@tonic-gate md_mnset_record *mnsr = NULL; 34270Sstevel@tonic-gate int lowest_alive_nodeid = 0; 34280Sstevel@tonic-gate uint_t master_nodeid; 34290Sstevel@tonic-gate md_mnnode_desc *nd, *nd2; 34300Sstevel@tonic-gate md_mnnode_record *nr; 34310Sstevel@tonic-gate md_drive_desc *dd; 34320Sstevel@tonic-gate md_setkey_t *cl_sk; 34330Sstevel@tonic-gate int rval = 0; 34340Sstevel@tonic-gate md_error_t xep = mdnullerror; 34350Sstevel@tonic-gate mddb_setflags_config_t sf; 34360Sstevel@tonic-gate 34370Sstevel@tonic-gate /* 34380Sstevel@tonic-gate * Is current node joined to diskset? 34390Sstevel@tonic-gate * Don't trust flags, really check to see if mddb is snarfed. 34400Sstevel@tonic-gate */ 34410Sstevel@tonic-gate if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 34420Sstevel@tonic-gate /* 34430Sstevel@tonic-gate * If a node is joined to the diskset, this node checks 34440Sstevel@tonic-gate * to see if the current master of the diskset is valid and 34450Sstevel@tonic-gate * is still in the membership list (ALIVE) and is 34460Sstevel@tonic-gate * still joined (OWN). Need to verify if master is 34470Sstevel@tonic-gate * really joined - don't trust the flags. (Can trust 34480Sstevel@tonic-gate * ALIVE since set during earlier part of reconfig cycle.) 34490Sstevel@tonic-gate * If the current master is valid, still in the membership 34500Sstevel@tonic-gate * list and joined, then master is not changed on this node. 34510Sstevel@tonic-gate * Just return. 34520Sstevel@tonic-gate * 34530Sstevel@tonic-gate * Verify that nodeid is valid before accessing masternode. 
34540Sstevel@tonic-gate */ 34550Sstevel@tonic-gate if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && 34560Sstevel@tonic-gate (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { 34570Sstevel@tonic-gate if (clnt_ownset(sd->sd_mn_master_nodenm, sp, 34580Sstevel@tonic-gate &is_owner, ep) == -1) { 34590Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 34600Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 34610Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 34620Sstevel@tonic-gate sd->sd_mn_master_nodeid)) { 34630Sstevel@tonic-gate return (205); 34640Sstevel@tonic-gate } else { 34650Sstevel@tonic-gate /* Any other failure */ 34660Sstevel@tonic-gate return (-1); 34670Sstevel@tonic-gate } 34680Sstevel@tonic-gate } else { 34690Sstevel@tonic-gate if (is_owner == TRUE) { 34700Sstevel@tonic-gate 34710Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext( 34720Sstevel@tonic-gate TEXT_DOMAIN, "Set %s previous " 34730Sstevel@tonic-gate "master chosen %s (%d): %s"), 34740Sstevel@tonic-gate sp->setname, 34750Sstevel@tonic-gate sd->sd_mn_master_nodenm, 34760Sstevel@tonic-gate sd->sd_mn_master_nodeid, 34770Sstevel@tonic-gate meta_print_hrtime(gethrtime() - 34780Sstevel@tonic-gate start_time)); 34790Sstevel@tonic-gate 34800Sstevel@tonic-gate /* Previous master is ok - done */ 34810Sstevel@tonic-gate return (0); 34820Sstevel@tonic-gate } 34830Sstevel@tonic-gate } 34840Sstevel@tonic-gate } 34850Sstevel@tonic-gate 34860Sstevel@tonic-gate /* 34870Sstevel@tonic-gate * If current master is no longer in the membership list or 34880Sstevel@tonic-gate * is no longer joined, then this node uses the following 34890Sstevel@tonic-gate * algorithm: 34900Sstevel@tonic-gate * - node calls RPC routine clnt_ownset to get latest 34910Sstevel@tonic-gate * information on which nodes are owners of diskset. 34920Sstevel@tonic-gate * clnt_ownset checks on each node to see if its kernel 34930Sstevel@tonic-gate * has that diskset snarfed. 
34940Sstevel@tonic-gate */ 34950Sstevel@tonic-gate nd = sd->sd_nodelist; 34960Sstevel@tonic-gate while (nd) { 34970Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 34980Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 34990Sstevel@tonic-gate nd = nd->nd_next; 35000Sstevel@tonic-gate continue; 35010Sstevel@tonic-gate } 35020Sstevel@tonic-gate 35030Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp, 35040Sstevel@tonic-gate &is_owner, ep) == -1) { 35050Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 35060Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 35070Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 35080Sstevel@tonic-gate nd->nd_nodeid)) { 35090Sstevel@tonic-gate return (205); 35100Sstevel@tonic-gate } else { 35110Sstevel@tonic-gate /* Any other failure */ 35120Sstevel@tonic-gate return (-1); 35130Sstevel@tonic-gate } 35140Sstevel@tonic-gate } 35150Sstevel@tonic-gate 35160Sstevel@tonic-gate /* 35170Sstevel@tonic-gate * Set owner flag for each node based on whether 35180Sstevel@tonic-gate * that node really has a diskset mddb snarfed in 35190Sstevel@tonic-gate * or not. 35200Sstevel@tonic-gate */ 35210Sstevel@tonic-gate if (is_owner == TRUE) 35220Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 35230Sstevel@tonic-gate else 35240Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 35250Sstevel@tonic-gate 35260Sstevel@tonic-gate nd = nd->nd_next; 35270Sstevel@tonic-gate } 35280Sstevel@tonic-gate 35290Sstevel@tonic-gate /* 35300Sstevel@tonic-gate * - node walks through nodelist looking for nodes that are 35310Sstevel@tonic-gate * owners of the diskset that are in the membership list. 35320Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_getset to 35330Sstevel@tonic-gate * see if that node has its node record set to OK. 35340Sstevel@tonic-gate * - If so, master is chosen to be this owner node. 
35350Sstevel@tonic-gate */ 35360Sstevel@tonic-gate nd = sd->sd_nodelist; 35370Sstevel@tonic-gate while (nd) { 35380Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 35390Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 35400Sstevel@tonic-gate nd = nd->nd_next; 35410Sstevel@tonic-gate continue; 35420Sstevel@tonic-gate } 35430Sstevel@tonic-gate 35440Sstevel@tonic-gate /* Don't consider a node that isn't an owner */ 35450Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 35460Sstevel@tonic-gate nd = nd->nd_next; 35470Sstevel@tonic-gate continue; 35480Sstevel@tonic-gate } 35490Sstevel@tonic-gate 35500Sstevel@tonic-gate /* Does node has its own node record set to OK? */ 35510Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 35520Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 35530Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 35540Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 35550Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 35560Sstevel@tonic-gate nd->nd_nodeid)) { 35570Sstevel@tonic-gate return (205); 35580Sstevel@tonic-gate } else { 35590Sstevel@tonic-gate /* Any other failure */ 35600Sstevel@tonic-gate return (-1); 35610Sstevel@tonic-gate } 35620Sstevel@tonic-gate } 35630Sstevel@tonic-gate nr = mnsr->sr_nodechain; 35640Sstevel@tonic-gate while (nr) { 35650Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) { 35660Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) { 35670Sstevel@tonic-gate /* Found a master */ 35680Sstevel@tonic-gate free_sr( 35690Sstevel@tonic-gate (md_set_record *)mnsr); 35700Sstevel@tonic-gate goto found_master; 35710Sstevel@tonic-gate } 35720Sstevel@tonic-gate } 35730Sstevel@tonic-gate nr = nr->nr_next; 35740Sstevel@tonic-gate } 35750Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 35760Sstevel@tonic-gate nd = nd->nd_next; 35770Sstevel@tonic-gate } 35780Sstevel@tonic-gate 35790Sstevel@tonic-gate /* 35800Sstevel@tonic-gate * - If no owner node has its 
own node record on its own node 35810Sstevel@tonic-gate * set to OK, then this node checks all of the non-owner 35820Sstevel@tonic-gate * nodes that are in the membership list. 35830Sstevel@tonic-gate * - for each non-owner, node calls RPC routine clnt_getset to 35840Sstevel@tonic-gate * see if that node has its node record set to OK. 35850Sstevel@tonic-gate * - If set doesn't exist, don't choose node for master. 35860Sstevel@tonic-gate * - If so, master is chosen to be this non-owner node. 35870Sstevel@tonic-gate * 35880Sstevel@tonic-gate */ 35890Sstevel@tonic-gate nd = sd->sd_nodelist; 35900Sstevel@tonic-gate while (nd) { 35910Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 35920Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 35930Sstevel@tonic-gate nd = nd->nd_next; 35940Sstevel@tonic-gate continue; 35950Sstevel@tonic-gate } 35960Sstevel@tonic-gate 35970Sstevel@tonic-gate /* Only checking non-owner nodes this time around */ 35980Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 35990Sstevel@tonic-gate nd = nd->nd_next; 36000Sstevel@tonic-gate continue; 36010Sstevel@tonic-gate } 36020Sstevel@tonic-gate 36030Sstevel@tonic-gate /* Does node has its own node record set to OK? */ 36040Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 36050Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 36060Sstevel@tonic-gate /* 36070Sstevel@tonic-gate * If set doesn't exist on non-owner node, 36080Sstevel@tonic-gate * don't consider this node for master. 
36090Sstevel@tonic-gate */ 36100Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 36110Sstevel@tonic-gate nd = nd->nd_next; 36120Sstevel@tonic-gate continue; 36130Sstevel@tonic-gate } else if ((mdanyrpcerror(ep)) && 36140Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 36150Sstevel@tonic-gate nd->nd_nodeid)) { 36160Sstevel@tonic-gate /* RPC failure to another node */ 36170Sstevel@tonic-gate return (205); 36180Sstevel@tonic-gate } else { 36190Sstevel@tonic-gate /* Any other failure */ 36200Sstevel@tonic-gate return (-1); 36210Sstevel@tonic-gate } 36220Sstevel@tonic-gate } 36230Sstevel@tonic-gate nr = mnsr->sr_nodechain; 36240Sstevel@tonic-gate while (nr) { 36250Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) { 36260Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) { 36270Sstevel@tonic-gate /* Found a master */ 36280Sstevel@tonic-gate free_sr( 36290Sstevel@tonic-gate (md_set_record *)mnsr); 36300Sstevel@tonic-gate goto found_master; 36310Sstevel@tonic-gate } 36320Sstevel@tonic-gate } 36330Sstevel@tonic-gate nr = nr->nr_next; 36340Sstevel@tonic-gate } 36350Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 36360Sstevel@tonic-gate nd = nd->nd_next; 36370Sstevel@tonic-gate } 36380Sstevel@tonic-gate 36390Sstevel@tonic-gate /* 36400Sstevel@tonic-gate * - If no node can be found that has its own node record on 36410Sstevel@tonic-gate * its node to be set to OK, then all alive nodes 36420Sstevel@tonic-gate * were in the process of being added to or deleted 36430Sstevel@tonic-gate * from set. Each alive node will remove all 36440Sstevel@tonic-gate * information pertaining to this set from its node. 36450Sstevel@tonic-gate * 36460Sstevel@tonic-gate * If all nodes in set are ALIVE, then call sdssc end routines 36470Sstevel@tonic-gate * since set was truly being initially created or destroyed. 
36480Sstevel@tonic-gate */ 36490Sstevel@tonic-gate goto delete_set; 36500Sstevel@tonic-gate } else { 36510Sstevel@tonic-gate 36520Sstevel@tonic-gate /* 36530Sstevel@tonic-gate * If node is not joined to diskset, then this 36540Sstevel@tonic-gate * node uses the following algorithm: 36550Sstevel@tonic-gate * - If unjoined node doesn't have a node record for itself, 36560Sstevel@tonic-gate * just delete the diskset since diskset was in the 36570Sstevel@tonic-gate * process of being created. 36580Sstevel@tonic-gate * - node needs to find master of diskset before 36590Sstevel@tonic-gate * reconfig cycle, if a master existed. 36600Sstevel@tonic-gate * - node calls RPC routine clnt_ownset to get latest 36610Sstevel@tonic-gate * information on which nodes are owners of diskset. 36620Sstevel@tonic-gate * clnt_ownset checks on each node to see if its 36630Sstevel@tonic-gate * kernel has that diskset snarfed. 36640Sstevel@tonic-gate */ 36650Sstevel@tonic-gate 36660Sstevel@tonic-gate /* 36670Sstevel@tonic-gate * Is my node in the set description? 36680Sstevel@tonic-gate * If not, delete the set from this node. 36690Sstevel@tonic-gate * sr2setdesc sets sd_mn_mynode pointer to the node 36700Sstevel@tonic-gate * descriptor for this node if there was a node 36710Sstevel@tonic-gate * record for this node. 
36720Sstevel@tonic-gate * 36730Sstevel@tonic-gate */ 36740Sstevel@tonic-gate if (sd->sd_mn_mynode == NULL) { 36750Sstevel@tonic-gate goto delete_set; 36760Sstevel@tonic-gate } 36770Sstevel@tonic-gate 36780Sstevel@tonic-gate nd = sd->sd_nodelist; 36790Sstevel@tonic-gate while (nd) { 36800Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 36810Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 36820Sstevel@tonic-gate nd = nd->nd_next; 36830Sstevel@tonic-gate continue; 36840Sstevel@tonic-gate } 36850Sstevel@tonic-gate 36860Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp, 36870Sstevel@tonic-gate &is_owner, ep) == -1) { 36880Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 36890Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 36900Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 36910Sstevel@tonic-gate nd->nd_nodeid)) { 36920Sstevel@tonic-gate return (205); 36930Sstevel@tonic-gate } else { 36940Sstevel@tonic-gate /* Any other failure */ 36950Sstevel@tonic-gate return (-1); 36960Sstevel@tonic-gate } 36970Sstevel@tonic-gate } 36980Sstevel@tonic-gate 36990Sstevel@tonic-gate /* 37000Sstevel@tonic-gate * Set owner flag for each node based on whether 37010Sstevel@tonic-gate * that node really has a diskset mddb snarfed in 37020Sstevel@tonic-gate * or not. 37030Sstevel@tonic-gate */ 37040Sstevel@tonic-gate if (is_owner == TRUE) 37050Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 37060Sstevel@tonic-gate else 37070Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 37080Sstevel@tonic-gate 37090Sstevel@tonic-gate nd = nd->nd_next; 37100Sstevel@tonic-gate } 37110Sstevel@tonic-gate 37120Sstevel@tonic-gate /* 37130Sstevel@tonic-gate * - node walks through nodelist looking for nodes that 37140Sstevel@tonic-gate * are owners of the diskset that are in 37150Sstevel@tonic-gate * the membership list. 
37160Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_getset to 37170Sstevel@tonic-gate * see if that node has a master set and to get the 37180Sstevel@tonic-gate * diskset description. 37190Sstevel@tonic-gate * - If the owner node has a set description that doesn't 37200Sstevel@tonic-gate * include the non-joined node in the nodelist, this node 37210Sstevel@tonic-gate * removes its set description of that diskset 37220Sstevel@tonic-gate * (i.e. removes the set from its local mddbs). This is 37230Sstevel@tonic-gate * handling the case of when a node was removed from a 37240Sstevel@tonic-gate * diskset while it was not in the cluster membership 37250Sstevel@tonic-gate * list. 37260Sstevel@tonic-gate * - If that node has a master set and the master is in the 37270Sstevel@tonic-gate * membership list and is an owner, then either this was 37280Sstevel@tonic-gate * the master from before the reconfig cycle or this 37290Sstevel@tonic-gate * node has already chosen a new master - either way, 37300Sstevel@tonic-gate * the master value is valid as long as it is in the 37310Sstevel@tonic-gate * membership list and is an owner 37320Sstevel@tonic-gate * - master is chosen to be owner node's master 37330Sstevel@tonic-gate */ 37340Sstevel@tonic-gate nd = sd->sd_nodelist; 37350Sstevel@tonic-gate while (nd) { 37360Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 37370Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 37380Sstevel@tonic-gate nd = nd->nd_next; 37390Sstevel@tonic-gate continue; 37400Sstevel@tonic-gate } 37410Sstevel@tonic-gate 37420Sstevel@tonic-gate /* Don't consider a node that isn't an owner */ 37430Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 37440Sstevel@tonic-gate nd = nd->nd_next; 37450Sstevel@tonic-gate continue; 37460Sstevel@tonic-gate } 37470Sstevel@tonic-gate 37480Sstevel@tonic-gate /* Get owner node's set record */ 37490Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 
37500Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 37510Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 37520Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 37530Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 37540Sstevel@tonic-gate nd->nd_nodeid)) { 37550Sstevel@tonic-gate return (205); 37560Sstevel@tonic-gate } else { 37570Sstevel@tonic-gate /* Any other failure */ 37580Sstevel@tonic-gate return (-1); 37590Sstevel@tonic-gate } 37600Sstevel@tonic-gate } 37610Sstevel@tonic-gate 37620Sstevel@tonic-gate /* Is this node in the owner node's set record */ 37630Sstevel@tonic-gate nr = mnsr->sr_nodechain; 37640Sstevel@tonic-gate while (nr) { 37650Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == 37660Sstevel@tonic-gate nr->nr_nodeid) { 37670Sstevel@tonic-gate break; 37680Sstevel@tonic-gate } 37690Sstevel@tonic-gate nr = nr->nr_next; 37700Sstevel@tonic-gate } 37710Sstevel@tonic-gate if (nr == NULL) { 37720Sstevel@tonic-gate /* my node not found - delete set */ 37730Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 37740Sstevel@tonic-gate goto delete_set; 37750Sstevel@tonic-gate } 37760Sstevel@tonic-gate 37770Sstevel@tonic-gate /* Is owner's node's master valid? 
*/ 37780Sstevel@tonic-gate master_nodeid = mnsr->sr_master_nodeid; 37790Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 37800Sstevel@tonic-gate if (master_nodeid == MD_MN_INVALID_NID) { 37810Sstevel@tonic-gate nd = nd->nd_next; 37820Sstevel@tonic-gate continue; 37830Sstevel@tonic-gate } 37840Sstevel@tonic-gate 37850Sstevel@tonic-gate nd2 = sd->sd_nodelist; 37860Sstevel@tonic-gate while (nd2) { 37870Sstevel@tonic-gate if ((nd2->nd_nodeid == master_nodeid) && 37880Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_ALIVE) && 37890Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_OWN)) { 37900Sstevel@tonic-gate nd = nd2; 37910Sstevel@tonic-gate goto found_master; 37920Sstevel@tonic-gate } 37930Sstevel@tonic-gate nd2 = nd2->nd_next; 37940Sstevel@tonic-gate } 37950Sstevel@tonic-gate nd = nd->nd_next; 37960Sstevel@tonic-gate } 37970Sstevel@tonic-gate 37980Sstevel@tonic-gate /* 37990Sstevel@tonic-gate * - If no owner node has a valid master, then follow 38000Sstevel@tonic-gate * algorithm of when a node is joined to the diskset. 38010Sstevel@tonic-gate * - node walks through nodelist looking for nodes that are 38020Sstevel@tonic-gate * owners of the diskset that are in the membership list. 38030Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_getset to 38040Sstevel@tonic-gate * see if that node has its node record set to OK. 38050Sstevel@tonic-gate * - If so, master is chosen to be this owner node. 
38060Sstevel@tonic-gate */ 38070Sstevel@tonic-gate nd = sd->sd_nodelist; 38080Sstevel@tonic-gate while (nd) { 38090Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 38100Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 38110Sstevel@tonic-gate nd = nd->nd_next; 38120Sstevel@tonic-gate continue; 38130Sstevel@tonic-gate } 38140Sstevel@tonic-gate 38150Sstevel@tonic-gate /* Don't consider a node that isn't an owner */ 38160Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 38170Sstevel@tonic-gate nd = nd->nd_next; 38180Sstevel@tonic-gate continue; 38190Sstevel@tonic-gate } 38200Sstevel@tonic-gate 38210Sstevel@tonic-gate /* Does node has its own node record set to OK? */ 38220Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 38230Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 38240Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 38250Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 38260Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 38270Sstevel@tonic-gate nd->nd_nodeid)) { 38280Sstevel@tonic-gate return (205); 38290Sstevel@tonic-gate } else { 38300Sstevel@tonic-gate /* Any other failure */ 38310Sstevel@tonic-gate return (-1); 38320Sstevel@tonic-gate } 38330Sstevel@tonic-gate } 38340Sstevel@tonic-gate nr = mnsr->sr_nodechain; 38350Sstevel@tonic-gate while (nr) { 38360Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) { 38370Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) { 38380Sstevel@tonic-gate /* Found a master */ 38390Sstevel@tonic-gate free_sr( 38400Sstevel@tonic-gate (md_set_record *)mnsr); 38410Sstevel@tonic-gate goto found_master; 38420Sstevel@tonic-gate } 38430Sstevel@tonic-gate } 38440Sstevel@tonic-gate nr = nr->nr_next; 38450Sstevel@tonic-gate } 38460Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 38470Sstevel@tonic-gate nd = nd->nd_next; 38480Sstevel@tonic-gate } 38490Sstevel@tonic-gate 38500Sstevel@tonic-gate /* 38510Sstevel@tonic-gate * - If no owner node has its 
own node record on its own node 38520Sstevel@tonic-gate * set to OK, then this node checks all of the non-owner 38530Sstevel@tonic-gate * nodes that are in the membership list. 38540Sstevel@tonic-gate * - for each non-owner, node calls RPC routine clnt_getset to 38550Sstevel@tonic-gate * see if that node has its node record set to OK. 38560Sstevel@tonic-gate * - If set doesn't exist, don't choose node for master. 38570Sstevel@tonic-gate * - If this node doesn't exist in the nodelist on any of the 38580Sstevel@tonic-gate * non-owner nodes, this node removes its set description 38590Sstevel@tonic-gate * of that diskset (i.e. removes the set from its local 38600Sstevel@tonic-gate * mddbs). This is handling the case of when a node was 38610Sstevel@tonic-gate * removed from a diskset while it was not in the 38620Sstevel@tonic-gate * cluster membership list. 38630Sstevel@tonic-gate * - If non-owner node has its node record set to OK and if 38640Sstevel@tonic-gate * this node hasn't removed this diskset (step directly 38650Sstevel@tonic-gate * before this one), then the master is chosen to be this 38660Sstevel@tonic-gate * non-owner node. 
38670Sstevel@tonic-gate */ 38680Sstevel@tonic-gate nd = sd->sd_nodelist; 38690Sstevel@tonic-gate while (nd) { 38700Sstevel@tonic-gate /* Don't consider node that isn't in member list */ 38710Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 38720Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 38730Sstevel@tonic-gate nd = nd->nd_next; 38740Sstevel@tonic-gate continue; 38750Sstevel@tonic-gate } 38760Sstevel@tonic-gate 38770Sstevel@tonic-gate /* Don't consider owner nodes since none are OK */ 38780Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 38790Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 38800Sstevel@tonic-gate nd = nd->nd_next; 38810Sstevel@tonic-gate continue; 38820Sstevel@tonic-gate } 38830Sstevel@tonic-gate 38840Sstevel@tonic-gate /* 38850Sstevel@tonic-gate * Don't need to get nodelist from my node since 38860Sstevel@tonic-gate * this is where sd_nodelist was obtained. 38870Sstevel@tonic-gate */ 38880Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 38890Sstevel@tonic-gate nd = nd->nd_next; 38900Sstevel@tonic-gate continue; 38910Sstevel@tonic-gate } 38920Sstevel@tonic-gate 38930Sstevel@tonic-gate /* 38940Sstevel@tonic-gate * If node has already been decided against for 38950Sstevel@tonic-gate * master, then skip it. 38960Sstevel@tonic-gate */ 38970Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_DEL) { 38980Sstevel@tonic-gate nd = nd->nd_next; 38990Sstevel@tonic-gate continue; 39000Sstevel@tonic-gate } 39010Sstevel@tonic-gate 39020Sstevel@tonic-gate /* 39030Sstevel@tonic-gate * Does node in my nodelist have its own node 39040Sstevel@tonic-gate * record marked OK on its node? And does node 39050Sstevel@tonic-gate * in my nodelist exist on all other nodes? 39060Sstevel@tonic-gate * Don't want to choose a node for master unless 39070Sstevel@tonic-gate * that node is marked OK on its own node and that 39080Sstevel@tonic-gate * node exists on all other alive nodes. 
39090Sstevel@tonic-gate * 39100Sstevel@tonic-gate * This is guarding against the case when several 39110Sstevel@tonic-gate * nodes are down and one of the downed nodes is 39120Sstevel@tonic-gate * deleted from the diskset. When the down nodes 39130Sstevel@tonic-gate * are rebooted into the cluster, you don't want 39140Sstevel@tonic-gate * any node to pick the deleted node as the master. 39150Sstevel@tonic-gate */ 39160Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 39170Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 39180Sstevel@tonic-gate /* 39190Sstevel@tonic-gate * If set doesn't exist on non-owner node, 39200Sstevel@tonic-gate * don't consider this node for master. 39210Sstevel@tonic-gate */ 39220Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 39230Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 39240Sstevel@tonic-gate nd = nd->nd_next; 39250Sstevel@tonic-gate continue; 39260Sstevel@tonic-gate } else if (mdanyrpcerror(ep)) { 39270Sstevel@tonic-gate /* RPC failure to another node */ 39280Sstevel@tonic-gate return (205); 39290Sstevel@tonic-gate } else { 39300Sstevel@tonic-gate /* Any other failure */ 39310Sstevel@tonic-gate return (-1); 39320Sstevel@tonic-gate } 39330Sstevel@tonic-gate } 39340Sstevel@tonic-gate /* 39350Sstevel@tonic-gate * Is my node in the nodelist gotten from the other 39360Sstevel@tonic-gate * node? If not, then remove the set from my node 39370Sstevel@tonic-gate * since set was deleted from my node while my node 39380Sstevel@tonic-gate * was out of the cluster. 
39390Sstevel@tonic-gate */ 39400Sstevel@tonic-gate nr = mnsr->sr_nodechain; 39410Sstevel@tonic-gate while (nr) { 39420Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == 39430Sstevel@tonic-gate nr->nr_nodeid) { 39440Sstevel@tonic-gate break; 39450Sstevel@tonic-gate } 39460Sstevel@tonic-gate nr = nr->nr_next; 39470Sstevel@tonic-gate } 39480Sstevel@tonic-gate if (nr == NULL) { 39490Sstevel@tonic-gate /* my node not found - delete set */ 39500Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 39510Sstevel@tonic-gate goto delete_set; 39520Sstevel@tonic-gate } 39530Sstevel@tonic-gate 39540Sstevel@tonic-gate /* Is node being checked marked OK on its own node? */ 39550Sstevel@tonic-gate nr = mnsr->sr_nodechain; 39560Sstevel@tonic-gate while (nr) { 39570Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) { 39580Sstevel@tonic-gate if (!(nr->nr_flags & MD_MN_NODE_OK)) { 39590Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 39600Sstevel@tonic-gate } 39610Sstevel@tonic-gate break; 39620Sstevel@tonic-gate } 39630Sstevel@tonic-gate nr = nr->nr_next; 39640Sstevel@tonic-gate } 39650Sstevel@tonic-gate /* 39660Sstevel@tonic-gate * If node being checked doesn't exist on its 39670Sstevel@tonic-gate * own node - don't choose it as master. 39680Sstevel@tonic-gate */ 39690Sstevel@tonic-gate if (nr == NULL) { 39700Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 39710Sstevel@tonic-gate } 39720Sstevel@tonic-gate 39730Sstevel@tonic-gate /* 39740Sstevel@tonic-gate * Check every node in my node's nodelist against 39750Sstevel@tonic-gate * the nodelist gotten from the other node. 39760Sstevel@tonic-gate * If a node in my node's nodelist is not found in the 39770Sstevel@tonic-gate * other node's nodelist, then set the DEL flag. 
39780Sstevel@tonic-gate */ 39790Sstevel@tonic-gate nd2 = sd->sd_nodelist; 39800Sstevel@tonic-gate while (nd2) { 39810Sstevel@tonic-gate nr = mnsr->sr_nodechain; 39820Sstevel@tonic-gate while (nr) { 39830Sstevel@tonic-gate if (nd2->nd_nodeid == nr->nr_nodeid) { 39840Sstevel@tonic-gate break; 39850Sstevel@tonic-gate } 39860Sstevel@tonic-gate nr = nr->nr_next; 39870Sstevel@tonic-gate } 39880Sstevel@tonic-gate /* nd2 not found in other node's nodelist */ 39890Sstevel@tonic-gate if (nr == NULL) { 39900Sstevel@tonic-gate nd2->nd_flags |= MD_MN_NODE_DEL; 39910Sstevel@tonic-gate } 39920Sstevel@tonic-gate nd2 = nd2->nd_next; 39930Sstevel@tonic-gate } 39940Sstevel@tonic-gate 39950Sstevel@tonic-gate free_sr((md_set_record *)mnsr); 39960Sstevel@tonic-gate nd = nd->nd_next; 39970Sstevel@tonic-gate } 39980Sstevel@tonic-gate 39990Sstevel@tonic-gate /* 40000Sstevel@tonic-gate * Rescan list look for node that has not been marked DEL. 40010Sstevel@tonic-gate * First node found is the master. 40020Sstevel@tonic-gate */ 40030Sstevel@tonic-gate nd = sd->sd_nodelist; 40040Sstevel@tonic-gate while (nd) { 40050Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 40060Sstevel@tonic-gate break; 40070Sstevel@tonic-gate } 40080Sstevel@tonic-gate nd = nd->nd_next; 40090Sstevel@tonic-gate continue; 40100Sstevel@tonic-gate } 40110Sstevel@tonic-gate if (nd) { 40120Sstevel@tonic-gate /* Found a master */ 40130Sstevel@tonic-gate goto found_master; 40140Sstevel@tonic-gate } 40150Sstevel@tonic-gate 40160Sstevel@tonic-gate /* 40170Sstevel@tonic-gate * - If no node can be found that has its own node record on 40180Sstevel@tonic-gate * its node to be set to OK, then all alive nodes 40190Sstevel@tonic-gate * were in the process of being added to or deleted 40200Sstevel@tonic-gate * from set. Each alive node will remove all 40210Sstevel@tonic-gate * information pertaining to this set from its node. 
40220Sstevel@tonic-gate * 40230Sstevel@tonic-gate * If all nodes in set are ALIVE, then call sdssc end routines 40240Sstevel@tonic-gate * since set was truly being initially created or destroyed. 40250Sstevel@tonic-gate */ 40260Sstevel@tonic-gate goto delete_set; 40270Sstevel@tonic-gate } 40280Sstevel@tonic-gate 40290Sstevel@tonic-gate found_master: 40300Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 40310Sstevel@tonic-gate "Set %s master chosen %s (%d): %s"), 40320Sstevel@tonic-gate sp->setname, nd->nd_nodename, nd->nd_nodeid, 40330Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 40340Sstevel@tonic-gate 40350Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep) == -1) { 40360Sstevel@tonic-gate return (-1); 40370Sstevel@tonic-gate } 40380Sstevel@tonic-gate 40390Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 40400Sstevel@tonic-gate 40410Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, 40420Sstevel@tonic-gate nd->nd_nodename, nd->nd_nodeid, ep)) { 40430Sstevel@tonic-gate rval = -1; 40440Sstevel@tonic-gate } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 40450Sstevel@tonic-gate /* If this node is new master, set flag in this node's kernel */ 40460Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 40470Sstevel@tonic-gate sf.sf_setno = sp->setno; 40480Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 40490Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. 
*/ 40500Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 40510Sstevel@tonic-gate sf.sf_flags = MDDB_NM_SET; 40520Sstevel@tonic-gate 40530Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 40540Sstevel@tonic-gate "Setting new master flag for set %s: %s"), 40550Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 40560Sstevel@tonic-gate 40570Sstevel@tonic-gate /* 40580Sstevel@tonic-gate * Fail reconfig cycle if ioctl fails since it is critical 40590Sstevel@tonic-gate * to set new master flag. 40600Sstevel@tonic-gate */ 40610Sstevel@tonic-gate if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, 40620Sstevel@tonic-gate NULL) != NULL) { 40630Sstevel@tonic-gate (void) mdstealerror(ep, &sf.sf_mde); 40640Sstevel@tonic-gate rval = -1; 40650Sstevel@tonic-gate } 40660Sstevel@tonic-gate } 40670Sstevel@tonic-gate 40680Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 40690Sstevel@tonic-gate if (rval == 0) { 40700Sstevel@tonic-gate (void) mdstealerror(ep, &xep); 40710Sstevel@tonic-gate rval = -1; 40720Sstevel@tonic-gate } 40730Sstevel@tonic-gate } 40740Sstevel@tonic-gate 40750Sstevel@tonic-gate cl_set_setkey(NULL); 40760Sstevel@tonic-gate 40770Sstevel@tonic-gate metaflushsetname(sp); 40780Sstevel@tonic-gate 40790Sstevel@tonic-gate return (rval); 40800Sstevel@tonic-gate 40810Sstevel@tonic-gate delete_set: 40820Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 40830Sstevel@tonic-gate "Master not chosen, deleting set %s: %s"), 40840Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 40850Sstevel@tonic-gate 40860Sstevel@tonic-gate /* 40870Sstevel@tonic-gate * Remove all set information from this node: 40880Sstevel@tonic-gate * - node records for this set 40890Sstevel@tonic-gate * - drive records for this set 40900Sstevel@tonic-gate * - set record for this set 40910Sstevel@tonic-gate * (Only do this on this node since each node 40920Sstevel@tonic-gate * will do it for its own local 
mddb.) 40930Sstevel@tonic-gate * 40940Sstevel@tonic-gate * If all nodes in set are ALIVE, then 40950Sstevel@tonic-gate * the lowest numbered ALIVE nodeid in set 40960Sstevel@tonic-gate * (irregardless of whether an owner node or not) will 40970Sstevel@tonic-gate * call the DCS service to cleanup for create/delete of set. 40980Sstevel@tonic-gate * sdssc_create_end(cleanup) if set was being created or 40990Sstevel@tonic-gate * sdssc_delete_end(cleanup) if set was being deleted. 41000Sstevel@tonic-gate * A node record with flag ADD denotes a set being 41010Sstevel@tonic-gate * created. A node record with flag DEL denotes a 41020Sstevel@tonic-gate * set being deleted. 41030Sstevel@tonic-gate */ 41040Sstevel@tonic-gate nd = sd->sd_nodelist; 41050Sstevel@tonic-gate while (nd) { 41060Sstevel@tonic-gate /* Found a node that isn't alive */ 41070Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) 41080Sstevel@tonic-gate break; 41090Sstevel@tonic-gate 41100Sstevel@tonic-gate /* Is my node the lowest numbered ALIVE node? */ 41110Sstevel@tonic-gate if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { 41120Sstevel@tonic-gate break; 41130Sstevel@tonic-gate } 41140Sstevel@tonic-gate nd = nd->nd_next; 41150Sstevel@tonic-gate } 41160Sstevel@tonic-gate if (nd == NULL) { 41170Sstevel@tonic-gate /* All nodes ALIVE and this is the lowest nodeid */ 41180Sstevel@tonic-gate lowest_alive_nodeid = 1; 41190Sstevel@tonic-gate } 41200Sstevel@tonic-gate 41210Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep) == -1) { 41220Sstevel@tonic-gate return (-1); 41230Sstevel@tonic-gate } 41240Sstevel@tonic-gate 41250Sstevel@tonic-gate 41260Sstevel@tonic-gate /* 41270Sstevel@tonic-gate * If this node had been joined, withdraw and reset master. 
41280Sstevel@tonic-gate * 41290Sstevel@tonic-gate * This could happen if a node was being added to or removed 41300Sstevel@tonic-gate * from a diskset and the node doing the add/delete operation and 41310Sstevel@tonic-gate * all other nodes in the diskset have left the cluster. 41320Sstevel@tonic-gate */ 41330Sstevel@tonic-gate if (sd->sd_mn_mynode) { 41340Sstevel@tonic-gate nd = sd->sd_mn_mynode; 41350Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) { 41360Sstevel@tonic-gate if (clnt_withdrawset(mynode(), sp, ep)) { 41370Sstevel@tonic-gate rval = -1; 41380Sstevel@tonic-gate goto out; 41390Sstevel@tonic-gate } 41400Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, "", 41410Sstevel@tonic-gate MD_MN_INVALID_NID, ep)) { 41420Sstevel@tonic-gate rval = -1; 41430Sstevel@tonic-gate goto out; 41440Sstevel@tonic-gate } 41450Sstevel@tonic-gate } 41460Sstevel@tonic-gate } 41470Sstevel@tonic-gate 41480Sstevel@tonic-gate /* 41490Sstevel@tonic-gate * Remove side records for this node (side) from local mddb 41500Sstevel@tonic-gate * (clnt_deldrvs does this) if there are drives in the set. 41510Sstevel@tonic-gate * 41520Sstevel@tonic-gate * Don't need to mark this node as DEL since already marked as 41530Sstevel@tonic-gate * ADD or DEL (or this node would have been chosen as master). 41540Sstevel@tonic-gate * Don't need to mark other node records, drive records or 41550Sstevel@tonic-gate * set records as DEL. If a panic occurs during clnt_delset, 41560Sstevel@tonic-gate * these records will be deleted the next time this node 41570Sstevel@tonic-gate * becomes a member and goes through the reconfig cycle. 41580Sstevel@tonic-gate */ 41590Sstevel@tonic-gate /* Get the drive descriptors for this set */ 41600Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 41610Sstevel@tonic-gate ep)) == NULL) { 41620Sstevel@tonic-gate if (! 
mdisok(ep)) { 41630Sstevel@tonic-gate /* 41640Sstevel@tonic-gate * Ignore and clear out any failures from 41650Sstevel@tonic-gate * metaget_drivedesc since a panic could have 41660Sstevel@tonic-gate * occurred when a node was partially added to a set. 41670Sstevel@tonic-gate */ 41680Sstevel@tonic-gate mdclrerror(ep); 41690Sstevel@tonic-gate } 41700Sstevel@tonic-gate } else { 41710Sstevel@tonic-gate if (clnt_deldrvs(mynode(), sp, dd, ep)) { 41720Sstevel@tonic-gate rval = -1; 41730Sstevel@tonic-gate goto out; 41740Sstevel@tonic-gate } 41750Sstevel@tonic-gate } 41760Sstevel@tonic-gate 41770Sstevel@tonic-gate /* 41780Sstevel@tonic-gate * Now, delete the set - this removes the node, drive 41790Sstevel@tonic-gate * and set records from the local mddb. 41800Sstevel@tonic-gate */ 41810Sstevel@tonic-gate if (clnt_delset(mynode(), sp, ep)) { 41820Sstevel@tonic-gate rval = -1; 41830Sstevel@tonic-gate goto out; 41840Sstevel@tonic-gate } 41850Sstevel@tonic-gate 41860Sstevel@tonic-gate out: 41870Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 41880Sstevel@tonic-gate 41890Sstevel@tonic-gate /* 41900Sstevel@tonic-gate * Ignore errors from unlock of set since set is no longer 41910Sstevel@tonic-gate * known (if clnt_delset worked). 41920Sstevel@tonic-gate */ 41930Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 41940Sstevel@tonic-gate mdclrerror(&xep); 41950Sstevel@tonic-gate } 41960Sstevel@tonic-gate 41970Sstevel@tonic-gate cl_set_setkey(NULL); 41980Sstevel@tonic-gate 41990Sstevel@tonic-gate metaflushsetname(sp); 42000Sstevel@tonic-gate 42010Sstevel@tonic-gate /* 42020Sstevel@tonic-gate * If this node is the lowest numbered nodeid then 42030Sstevel@tonic-gate * call sdssc_create/delete_end depending on whether 42040Sstevel@tonic-gate * this node is marked as ADD or DEL in the node record. 
42050Sstevel@tonic-gate */ 42060Sstevel@tonic-gate if (lowest_alive_nodeid) { 42070Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_ADD) 42080Sstevel@tonic-gate sdssc_create_end(sp->setname, SDSSC_CLEANUP); 42090Sstevel@tonic-gate else if (nd->nd_flags & MD_MN_NODE_DEL) 42100Sstevel@tonic-gate sdssc_delete_end(sp->setname, SDSSC_CLEANUP); 42110Sstevel@tonic-gate } 42120Sstevel@tonic-gate 42130Sstevel@tonic-gate /* Finished with this set -- return */ 42140Sstevel@tonic-gate return (rval); 42150Sstevel@tonic-gate } 42160Sstevel@tonic-gate 42170Sstevel@tonic-gate /* 42180Sstevel@tonic-gate * Reconfig step to choose a new master for all MN disksets. 42190Sstevel@tonic-gate * Return values: 42200Sstevel@tonic-gate * 0 - Everything is great. 42210Sstevel@tonic-gate * 1 - This node failed to reconfig. 42220Sstevel@tonic-gate * 205 - Cause another reconfig due to a nodelist problem 42230Sstevel@tonic-gate * or RPC failure to another node 42240Sstevel@tonic-gate */ 42250Sstevel@tonic-gate int 42260Sstevel@tonic-gate meta_reconfig_choose_master( 42270Sstevel@tonic-gate md_error_t *ep 42280Sstevel@tonic-gate ) 42290Sstevel@tonic-gate { 42300Sstevel@tonic-gate set_t max_sets, setno; 42310Sstevel@tonic-gate int nodecnt; 42320Sstevel@tonic-gate mndiskset_membershiplist_t *nl; 42330Sstevel@tonic-gate md_set_desc *sd; 42340Sstevel@tonic-gate mdsetname_t *sp; 42350Sstevel@tonic-gate int rval = 0; 42360Sstevel@tonic-gate mddb_setflags_config_t sf; 42370Sstevel@tonic-gate int start_node_delayed = 0; 42380Sstevel@tonic-gate 42390Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) { 42400Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 42410Sstevel@tonic-gate "Unable to get number of sets")); 42420Sstevel@tonic-gate return (1); 42430Sstevel@tonic-gate } 42440Sstevel@tonic-gate 42450Sstevel@tonic-gate /* 42460Sstevel@tonic-gate * Get membershiplist from API routine. If there's 42470Sstevel@tonic-gate * an error, return a 205 to cause another reconfig. 
42480Sstevel@tonic-gate */ 42490Sstevel@tonic-gate if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { 42500Sstevel@tonic-gate mde_perror(ep, ""); 42510Sstevel@tonic-gate return (205); 42520Sstevel@tonic-gate } 42530Sstevel@tonic-gate 42540Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) { 42550Sstevel@tonic-gate if ((sp = metasetnosetname(setno, ep)) == NULL) { 42560Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 42570Sstevel@tonic-gate /* No set for this setno - continue */ 42580Sstevel@tonic-gate mdclrerror(ep); 42590Sstevel@tonic-gate continue; 42600Sstevel@tonic-gate } else { 42610Sstevel@tonic-gate /* 42620Sstevel@tonic-gate * If encountered an RPC error from my node, 42630Sstevel@tonic-gate * then immediately fail. 42640Sstevel@tonic-gate */ 42650Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 42660Sstevel@tonic-gate mde_perror(ep, ""); 42670Sstevel@tonic-gate return (1); 42680Sstevel@tonic-gate } 42690Sstevel@tonic-gate /* Can't get set information */ 42700Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 42710Sstevel@tonic-gate "Unable to get information for " 42720Sstevel@tonic-gate "set number %d"), setno); 42730Sstevel@tonic-gate mdclrerror(ep); 42740Sstevel@tonic-gate continue; 42750Sstevel@tonic-gate } 42760Sstevel@tonic-gate } 42770Sstevel@tonic-gate 42780Sstevel@tonic-gate /* If setname is there, set desc should exist. */ 42790Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 42800Sstevel@tonic-gate /* 42810Sstevel@tonic-gate * If encountered an RPC error from my node, 42820Sstevel@tonic-gate * then immediately fail. 
42830Sstevel@tonic-gate */ 42840Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 42850Sstevel@tonic-gate mde_perror(ep, ""); 42860Sstevel@tonic-gate return (1); 42870Sstevel@tonic-gate } 42880Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 42890Sstevel@tonic-gate "Unable to get set %s desc information"), 42900Sstevel@tonic-gate sp->setname); 42910Sstevel@tonic-gate mdclrerror(ep); 42920Sstevel@tonic-gate continue; 42930Sstevel@tonic-gate } 42940Sstevel@tonic-gate 42950Sstevel@tonic-gate /* Only reconfig MN disksets */ 42960Sstevel@tonic-gate if (!MD_MNSET_DESC(sd)) { 42970Sstevel@tonic-gate continue; 42980Sstevel@tonic-gate } 42990Sstevel@tonic-gate 43000Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 43010Sstevel@tonic-gate "Begin choose master for set %s: %s"), 43020Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 43030Sstevel@tonic-gate 43040Sstevel@tonic-gate /* Update nodelist with member information. */ 43050Sstevel@tonic-gate if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) { 43060Sstevel@tonic-gate /* 43070Sstevel@tonic-gate * If encountered an RPC error from my node, 43080Sstevel@tonic-gate * then immediately fail. 43090Sstevel@tonic-gate */ 43100Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 43110Sstevel@tonic-gate mde_perror(ep, ""); 43120Sstevel@tonic-gate return (1); 43130Sstevel@tonic-gate } 43140Sstevel@tonic-gate mde_perror(ep, ""); 43150Sstevel@tonic-gate mdclrerror(ep); 43160Sstevel@tonic-gate continue; 43170Sstevel@tonic-gate } 43180Sstevel@tonic-gate 43190Sstevel@tonic-gate /* 43200Sstevel@tonic-gate * If all nodes in a cluster are starting, then 43210Sstevel@tonic-gate * all nodes will attempt to contact all other nodes 43220Sstevel@tonic-gate * to determine a master node. 
This can lead to a 43230Sstevel@tonic-gate * problem where node 1 is trying to contact the rpc.metad 43240Sstevel@tonic-gate * node 2 and node 2 is trying to contact the rpc.metad 43250Sstevel@tonic-gate * on node 1 -- and this causes the rpc call to fail 43260Sstevel@tonic-gate * on both nodes and causes a new reconfig cycle. 43270Sstevel@tonic-gate * 43280Sstevel@tonic-gate * In order to break this problem, a newly starting node 43290Sstevel@tonic-gate * will delay a small amount of time (nodeid mod 4 seconds) 43300Sstevel@tonic-gate * and will then run the code to choose a master for the 43310Sstevel@tonic-gate * first set. Delay will only be done once regardless of the 43320Sstevel@tonic-gate * number of sets. 43330Sstevel@tonic-gate */ 43340Sstevel@tonic-gate if (start_node_delayed == 0) { 43350Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 43360Sstevel@tonic-gate sf.sf_setno = sp->setno; 43370Sstevel@tonic-gate sf.sf_flags = MDDB_NM_GET; 43380Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. 
*/ 43390Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 43400Sstevel@tonic-gate if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, 43410Sstevel@tonic-gate &sf.sf_mde, NULL) == 0) && 43420Sstevel@tonic-gate ((sf.sf_setflags & MD_SET_MN_START_RC) == 43430Sstevel@tonic-gate MD_SET_MN_START_RC)) { 43440Sstevel@tonic-gate (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4); 43450Sstevel@tonic-gate } 43460Sstevel@tonic-gate start_node_delayed = 1; 43470Sstevel@tonic-gate } 43480Sstevel@tonic-gate 43490Sstevel@tonic-gate /* Choose master for this set */ 43500Sstevel@tonic-gate rval = meta_reconfig_choose_master_for_set(sp, sd, ep); 43510Sstevel@tonic-gate if (rval == -1) { 43520Sstevel@tonic-gate mde_perror(ep, ""); 43530Sstevel@tonic-gate return (1); 43540Sstevel@tonic-gate } else if (rval == 205) { 43550Sstevel@tonic-gate mde_perror(ep, ""); 43560Sstevel@tonic-gate return (205); 43570Sstevel@tonic-gate } 43580Sstevel@tonic-gate 43590Sstevel@tonic-gate /* Send new nodelist to rpc.mdcommd */ 43600Sstevel@tonic-gate (void) mdmn_reinit_set(sp->setno); 43610Sstevel@tonic-gate 43620Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 43630Sstevel@tonic-gate "Choose master for set %s completed: %s"), 43640Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 43650Sstevel@tonic-gate } 43660Sstevel@tonic-gate 43670Sstevel@tonic-gate /* 43680Sstevel@tonic-gate * Each node turns on I/Os for all MN disksets. 43690Sstevel@tonic-gate * This is to recover from the situation where the master died 43700Sstevel@tonic-gate * during a previous reconfig cycle when I/Os were suspended 43710Sstevel@tonic-gate * for a MN diskset. 43720Sstevel@tonic-gate * If a failure occurs return a 1 which will force this node to 43730Sstevel@tonic-gate * panic. Cannot leave node in the situation where I/Os are 43740Sstevel@tonic-gate * not resumed. 
43750Sstevel@tonic-gate */ 43760Sstevel@tonic-gate setno = 0; /* 0 means all MN sets */ 43770Sstevel@tonic-gate if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) { 43780Sstevel@tonic-gate mde_perror(ep, ""); 43790Sstevel@tonic-gate return (1); 43800Sstevel@tonic-gate } 43810Sstevel@tonic-gate 43820Sstevel@tonic-gate /* Free the nodelist */ 43830Sstevel@tonic-gate if (nodecnt) 43840Sstevel@tonic-gate meta_free_nodelist(nl); 43850Sstevel@tonic-gate 43860Sstevel@tonic-gate return (0); 43870Sstevel@tonic-gate } 43880Sstevel@tonic-gate 43890Sstevel@tonic-gate /* 43900Sstevel@tonic-gate * meta_mnsync_user_records will synchronize the diskset user records across 43910Sstevel@tonic-gate * all nodes in the diskset. The diskset user records are stored in 43920Sstevel@tonic-gate * each node's local set mddb. 43930Sstevel@tonic-gate * 43940Sstevel@tonic-gate * This needs to be done even if there is no master change during the 43950Sstevel@tonic-gate * reconfig cycle since this routine should clean up any mess left by 43960Sstevel@tonic-gate * the untimely termination of a metaset or metadb command (due to a 43970Sstevel@tonic-gate * node panic or to user intervention). 43980Sstevel@tonic-gate * 43990Sstevel@tonic-gate * Caller is the Master node. 44000Sstevel@tonic-gate * 44010Sstevel@tonic-gate * Returns 0 - Success 44020Sstevel@tonic-gate * 205 - Failure during RPC to another node 44030Sstevel@tonic-gate * -1 - Any other failure and ep is filled in. 
44040Sstevel@tonic-gate */ 44050Sstevel@tonic-gate int 44060Sstevel@tonic-gate meta_mnsync_user_records( 44070Sstevel@tonic-gate mdsetname_t *sp, 44080Sstevel@tonic-gate md_error_t *ep 44090Sstevel@tonic-gate ) 44100Sstevel@tonic-gate { 44110Sstevel@tonic-gate md_set_desc *sd; 44120Sstevel@tonic-gate md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; 44130Sstevel@tonic-gate md_mnset_record *mnsr; 44140Sstevel@tonic-gate md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; 44150Sstevel@tonic-gate md_mnnode_record *nr; 44160Sstevel@tonic-gate md_drive_record *dr; 44170Sstevel@tonic-gate int dr_cnt, dd_cnt; 44180Sstevel@tonic-gate int found_my_nr; 44190Sstevel@tonic-gate md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; 44200Sstevel@tonic-gate int all_drives_ok; 44210Sstevel@tonic-gate int rval = 0; 44220Sstevel@tonic-gate int max_genid = 0; 44230Sstevel@tonic-gate int num_alive_nodes, num_alive_nodes_del = 0; 44240Sstevel@tonic-gate int set_locked = 0; 44250Sstevel@tonic-gate md_setkey_t *cl_sk; 44260Sstevel@tonic-gate md_error_t xep = mdnullerror; 44270Sstevel@tonic-gate char *anode[1]; 44280Sstevel@tonic-gate mddb_setflags_config_t sf; 44290Sstevel@tonic-gate 44300Sstevel@tonic-gate /* 44310Sstevel@tonic-gate * Sync up node records first. 44320Sstevel@tonic-gate * Construct a master nodelist using the nodelist from this 44330Sstevel@tonic-gate * node's rpc.metad node records and then setting the state of each 44340Sstevel@tonic-gate * node following these rules: 44350Sstevel@tonic-gate * - If a node record is marked OK on its node, mark it OK 44360Sstevel@tonic-gate * in the master nodelist (and later OK on all nodes) 44370Sstevel@tonic-gate * If a node record is also marked OWN on its node, 44380Sstevel@tonic-gate * mark it OWN in the master nodelist. 
44390Sstevel@tonic-gate * - If a node record is not marked OK on its node, then mark 44400Sstevel@tonic-gate * it as DEL in the master list (later deleting it) 44410Sstevel@tonic-gate * - If node record doesn't exist on that node, then mark it DEL 44420Sstevel@tonic-gate * (later deleting it) 44430Sstevel@tonic-gate * - If set record doesn't exist on that node, mark node as DEL 44440Sstevel@tonic-gate * - If a node record doesn't exist on all nodes, then mark it DEL 44450Sstevel@tonic-gate * - If a node is not ALIVE, then 44460Sstevel@tonic-gate * - If that node marked DEL on any node - mark it DEL 44470Sstevel@tonic-gate * in master list but leave in nodelist 44480Sstevel@tonic-gate * - If that node is marked as ADD on any node, mark it 44490Sstevel@tonic-gate * ADD in the master list but leave in nodelist 44500Sstevel@tonic-gate * - When that node returns to the living, the DEL 44510Sstevel@tonic-gate * node record will be removed and the ADD node 44520Sstevel@tonic-gate * record may be removed if marked ADD on that 44530Sstevel@tonic-gate * node. 44540Sstevel@tonic-gate * The key rule is to not remove a node from the nodelist until 44550Sstevel@tonic-gate * that node record is removed from its own node. Do not want to 44560Sstevel@tonic-gate * remove a node's record from all other nodes and then have 44570Sstevel@tonic-gate * that node have its own record marked OK so that a node will pick 44580Sstevel@tonic-gate * a different master than the other nodes. 44590Sstevel@tonic-gate * 44600Sstevel@tonic-gate * Next, 44610Sstevel@tonic-gate * If node is ALIVE and node record is marked DEL in master nodelist, 44620Sstevel@tonic-gate * remove node from set. 44630Sstevel@tonic-gate * If node is ALIVE and node record is marked OK in master nodelist, 44640Sstevel@tonic-gate * mark it OK on all other nodes. 44650Sstevel@tonic-gate * If node is not ALIVE and node record is marked DEL in master 44660Sstevel@tonic-gate * nodelist, mark it DEL on all other nodes. 
44670Sstevel@tonic-gate * If node is not ALIVE and node record is marked ADD in master, 44680Sstevel@tonic-gate * nodelist, mark it ADD on all other nodes. 44690Sstevel@tonic-gate */ 44700Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 44710Sstevel@tonic-gate return (-1); 44720Sstevel@tonic-gate } 44730Sstevel@tonic-gate master_nodelist = sd->sd_nodelist; 44740Sstevel@tonic-gate 44750Sstevel@tonic-gate /* 44760Sstevel@tonic-gate * Walk through nodelist creating a master nodelist. 44770Sstevel@tonic-gate */ 44780Sstevel@tonic-gate num_alive_nodes = 0; 44790Sstevel@tonic-gate nd = master_nodelist; 44800Sstevel@tonic-gate while (nd) { 44810Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 44820Sstevel@tonic-gate nd = nd->nd_next; 44830Sstevel@tonic-gate continue; 44840Sstevel@tonic-gate } 44850Sstevel@tonic-gate num_alive_nodes++; 44860Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname, 44870Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) { 44880Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) { 44890Sstevel@tonic-gate /* set doesn't exist, mark node as DEL */ 44900Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 44910Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD; 44920Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 44930Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_NOSET; 44940Sstevel@tonic-gate nd = nd->nd_next; 44950Sstevel@tonic-gate continue; 44960Sstevel@tonic-gate } else { 44970Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 44980Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 44990Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 45000Sstevel@tonic-gate nd->nd_nodeid)) { 45010Sstevel@tonic-gate rval = 205; 45020Sstevel@tonic-gate } else { 45030Sstevel@tonic-gate /* Any other failure */ 45040Sstevel@tonic-gate rval = -1; 45050Sstevel@tonic-gate } 45060Sstevel@tonic-gate goto out; 45070Sstevel@tonic-gate } 45080Sstevel@tonic-gate } 45090Sstevel@tonic-gate /* Find biggest genid in records 
for this diskset */ 45100Sstevel@tonic-gate if (mnsr->sr_genid > max_genid) 45110Sstevel@tonic-gate max_genid = mnsr->sr_genid; 45120Sstevel@tonic-gate 45130Sstevel@tonic-gate dr = mnsr->sr_drivechain; 45140Sstevel@tonic-gate while (dr) { 45150Sstevel@tonic-gate /* Find biggest genid in records for this diskset */ 45160Sstevel@tonic-gate if (dr->dr_genid > max_genid) { 45170Sstevel@tonic-gate max_genid = dr->dr_genid; 45180Sstevel@tonic-gate } 45190Sstevel@tonic-gate dr = dr->dr_next; 45200Sstevel@tonic-gate } 45210Sstevel@tonic-gate 45220Sstevel@tonic-gate found_my_nr = 0; 45230Sstevel@tonic-gate nr = mnsr->sr_nodechain; 45240Sstevel@tonic-gate /* nr is the list of node recs from nd_nodename node */ 45250Sstevel@tonic-gate while (nr) { 45260Sstevel@tonic-gate /* Find biggest genid in records for this diskset */ 45270Sstevel@tonic-gate if (nr->nr_genid > max_genid) 45280Sstevel@tonic-gate max_genid = nr->nr_genid; 45290Sstevel@tonic-gate nd2 = master_nodelist; 45300Sstevel@tonic-gate ndtail = NULL; 45310Sstevel@tonic-gate /* For each node record, is it in master list? */ 45320Sstevel@tonic-gate while (nd2) { 45330Sstevel@tonic-gate if (nd2->nd_nodeid == nr->nr_nodeid) 45340Sstevel@tonic-gate break; 45350Sstevel@tonic-gate if (nd2->nd_next == NULL) 45360Sstevel@tonic-gate ndtail = nd2; 45370Sstevel@tonic-gate nd2 = nd2->nd_next; 45380Sstevel@tonic-gate } 45390Sstevel@tonic-gate /* 45400Sstevel@tonic-gate * Found node record not in master list -- add it 45410Sstevel@tonic-gate * to list marking it as DEL since node record 45420Sstevel@tonic-gate * should exist on all nodes unless a panic occurred 45430Sstevel@tonic-gate * during addition or deletion of host to diskset. 
45440Sstevel@tonic-gate */ 45450Sstevel@tonic-gate if (nd2 == NULL) { 45460Sstevel@tonic-gate nd2 = Zalloc(sizeof (*nd2)); 45470Sstevel@tonic-gate (void) strcpy(nd2->nd_nodename, 45480Sstevel@tonic-gate nr->nr_nodename); 45490Sstevel@tonic-gate nd2->nd_flags = nr->nr_flags; 45500Sstevel@tonic-gate nd2->nd_flags |= MD_MN_NODE_DEL; 45510Sstevel@tonic-gate nd2->nd_nodeid = nr->nr_nodeid; 45520Sstevel@tonic-gate nd2->nd_next = NULL; 45530Sstevel@tonic-gate ndtail->nd_next = nd2; 45540Sstevel@tonic-gate nd2 = NULL; 45550Sstevel@tonic-gate nr = nr->nr_next; 45560Sstevel@tonic-gate continue; 45570Sstevel@tonic-gate } 45580Sstevel@tonic-gate /* 45590Sstevel@tonic-gate * Is this the node record for the node that 45600Sstevel@tonic-gate * we requested the set desc from? 45610Sstevel@tonic-gate * If so, check if node has its own node record 45620Sstevel@tonic-gate * marked OK. If marked OK, check for the OWN bit. 45630Sstevel@tonic-gate */ 45640Sstevel@tonic-gate if (nr->nr_nodeid == nd->nd_nodeid) { 45650Sstevel@tonic-gate found_my_nr = 1; 45660Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) { 45670Sstevel@tonic-gate /* 45680Sstevel@tonic-gate * If node record is marked OK 45690Sstevel@tonic-gate * on its own node, then mark it OK 45700Sstevel@tonic-gate * in the master list. Node record 45710Sstevel@tonic-gate * would have to exist on all nodes 45720Sstevel@tonic-gate * in the ADD state before it could 45730Sstevel@tonic-gate * be put into the OK state. 45740Sstevel@tonic-gate */ 45750Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OK; 45760Sstevel@tonic-gate nd->nd_flags &= 45770Sstevel@tonic-gate ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); 45780Sstevel@tonic-gate /* 45790Sstevel@tonic-gate * Mark own in master list as marked 45800Sstevel@tonic-gate * on own node. 
45810Sstevel@tonic-gate */ 45820Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OWN) 45830Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 45840Sstevel@tonic-gate else 45850Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN; 45860Sstevel@tonic-gate } else { 45870Sstevel@tonic-gate /* Otherwise, mark node as DEL */ 45880Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 45890Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD; 45900Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 45910Sstevel@tonic-gate } 45920Sstevel@tonic-gate } 45930Sstevel@tonic-gate /* 45940Sstevel@tonic-gate * If node is not ALIVE and marked DEL 45950Sstevel@tonic-gate * on any node, make it DEL in master list. 45960Sstevel@tonic-gate * If node is not ALIVE and marked ADD 45970Sstevel@tonic-gate * on any node, make it ADD in master list 45980Sstevel@tonic-gate * unless node record has already been marked DEL. 45990Sstevel@tonic-gate */ 46000Sstevel@tonic-gate if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { 46010Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_ADD) { 46020Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 46030Sstevel@tonic-gate /* If not DEL - mark it ADD */ 46040Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_ADD; 46050Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 46060Sstevel@tonic-gate } 46070Sstevel@tonic-gate } 46080Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_DEL) { 46090Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 46100Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 46110Sstevel@tonic-gate /* Could already be ADD - make it DEL */ 46120Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD; 46130Sstevel@tonic-gate } 46140Sstevel@tonic-gate } 46150Sstevel@tonic-gate nr = nr->nr_next; 46160Sstevel@tonic-gate } 46170Sstevel@tonic-gate /* 46180Sstevel@tonic-gate * If a node record doesn't exist on its own node, 46190Sstevel@tonic-gate * then mark node as DEL. 
46200Sstevel@tonic-gate */ 46210Sstevel@tonic-gate if (found_my_nr == 0) { 46220Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK; 46230Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL; 46240Sstevel@tonic-gate } 46250Sstevel@tonic-gate 46260Sstevel@tonic-gate /* 46270Sstevel@tonic-gate * If node is OK - put mnsr onto master_mnsr_node list for 46280Sstevel@tonic-gate * later use when syncing up the drive records in the set. 46290Sstevel@tonic-gate */ 46300Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OK) { 46310Sstevel@tonic-gate mnsr_node = Zalloc(sizeof (*mnsr_node)); 46320Sstevel@tonic-gate mnsr_node->mmn_mnsr = mnsr; 46330Sstevel@tonic-gate (void) strncpy(mnsr_node->mmn_nodename, 46340Sstevel@tonic-gate nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); 46350Sstevel@tonic-gate mnsr_node->mmn_next = master_mnsr_node; 46360Sstevel@tonic-gate master_mnsr_node = mnsr_node; 46370Sstevel@tonic-gate } else { 46380Sstevel@tonic-gate free_sr((struct md_set_record *)mnsr); 46390Sstevel@tonic-gate } 46400Sstevel@tonic-gate 46410Sstevel@tonic-gate nd = nd->nd_next; 46420Sstevel@tonic-gate } 46430Sstevel@tonic-gate 46440Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 46450Sstevel@tonic-gate "Master nodelist created for set %s: %s"), 46460Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 46470Sstevel@tonic-gate 46480Sstevel@tonic-gate /* 46490Sstevel@tonic-gate * Send master nodelist to the rpc.metad on all nodes (including 46500Sstevel@tonic-gate * myself) and each node will update itself. This will set the 46510Sstevel@tonic-gate * ADD and DEL flags on each node as setup in the master nodelist. 46520Sstevel@tonic-gate * Don't send nodelist to node where set doesn't exist. 
46530Sstevel@tonic-gate */ 46540Sstevel@tonic-gate nd = master_nodelist; 46550Sstevel@tonic-gate while (nd) { 46560Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 46570Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) { 46580Sstevel@tonic-gate nd = nd->nd_next; 46590Sstevel@tonic-gate continue; 46600Sstevel@tonic-gate } 46610Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, 46620Sstevel@tonic-gate master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 46630Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 46640Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 46650Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 46660Sstevel@tonic-gate nd->nd_nodeid)) { 46670Sstevel@tonic-gate rval = 205; 46680Sstevel@tonic-gate } else { 46690Sstevel@tonic-gate /* Any other failure */ 46700Sstevel@tonic-gate rval = -1; 46710Sstevel@tonic-gate } 46720Sstevel@tonic-gate goto out; 46730Sstevel@tonic-gate } 46740Sstevel@tonic-gate nd = nd->nd_next; 46750Sstevel@tonic-gate } 46760Sstevel@tonic-gate 46770Sstevel@tonic-gate /* 46780Sstevel@tonic-gate * Now, delete nodes that need to be deleted. 46790Sstevel@tonic-gate */ 46800Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 46810Sstevel@tonic-gate ep)) == NULL) { 46820Sstevel@tonic-gate if (! mdisok(ep)) { 46830Sstevel@tonic-gate rval = -1; 46840Sstevel@tonic-gate goto out; 46850Sstevel@tonic-gate } 46860Sstevel@tonic-gate } 46870Sstevel@tonic-gate 46880Sstevel@tonic-gate /* 46890Sstevel@tonic-gate * May be doing lots of RPC commands to the nodes, so lock the 46900Sstevel@tonic-gate * ALIVE members of the set since most of the rpc.metad routines 46910Sstevel@tonic-gate * require this for security reasons. 
46920Sstevel@tonic-gate */ 46930Sstevel@tonic-gate nd = master_nodelist; 46940Sstevel@tonic-gate while (nd) { 46950Sstevel@tonic-gate /* Skip non-alive nodes and node without set */ 46960Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 46970Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) { 46980Sstevel@tonic-gate nd = nd->nd_next; 46990Sstevel@tonic-gate continue; 47000Sstevel@tonic-gate } 47010Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 47020Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 47030Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 47040Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 47050Sstevel@tonic-gate nd->nd_nodeid)) { 47060Sstevel@tonic-gate rval = 205; 47070Sstevel@tonic-gate } else { 47080Sstevel@tonic-gate /* Any other failure */ 47090Sstevel@tonic-gate rval = -1; 47100Sstevel@tonic-gate } 47110Sstevel@tonic-gate goto out; 47120Sstevel@tonic-gate } 47130Sstevel@tonic-gate set_locked = 1; 47140Sstevel@tonic-gate nd = nd->nd_next; 47150Sstevel@tonic-gate } 47160Sstevel@tonic-gate 47170Sstevel@tonic-gate nd = master_nodelist; 47180Sstevel@tonic-gate while (nd) { 47190Sstevel@tonic-gate /* Skip non-alive nodes */ 47200Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 47210Sstevel@tonic-gate nd = nd->nd_next; 47220Sstevel@tonic-gate continue; 47230Sstevel@tonic-gate } 47240Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_DEL) { 47250Sstevel@tonic-gate num_alive_nodes_del++; 47260Sstevel@tonic-gate /* 47270Sstevel@tonic-gate * Delete this node rec from all ALIVE nodes in diskset. 
47280Sstevel@tonic-gate */ 47290Sstevel@tonic-gate nd2 = master_nodelist; 47300Sstevel@tonic-gate while (nd2) { 47310Sstevel@tonic-gate /* Skip non-alive nodes and node without set */ 47320Sstevel@tonic-gate if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || 47330Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_NOSET)) { 47340Sstevel@tonic-gate nd2 = nd2->nd_next; 47350Sstevel@tonic-gate continue; 47360Sstevel@tonic-gate } 47370Sstevel@tonic-gate 47380Sstevel@tonic-gate /* This is a node being deleted from set */ 47390Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid) { 47400Sstevel@tonic-gate /* Mark set record as DEL */ 47410Sstevel@tonic-gate if (clnt_upd_sr_flags(nd->nd_nodename, 47420Sstevel@tonic-gate sp, MD_SR_DEL, ep)) { 47430Sstevel@tonic-gate /* RPC failure to !my node */ 47440Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 47450Sstevel@tonic-gate (sd->sd_mn_mynode-> 47460Sstevel@tonic-gate nd_nodeid 47470Sstevel@tonic-gate != nd->nd_nodeid)) { 47480Sstevel@tonic-gate rval = 205; 47490Sstevel@tonic-gate } else { 47500Sstevel@tonic-gate /* Any other failure */ 47510Sstevel@tonic-gate rval = -1; 47520Sstevel@tonic-gate } 47530Sstevel@tonic-gate goto out; 47540Sstevel@tonic-gate } 47550Sstevel@tonic-gate if (clnt_deldrvs(nd->nd_nodename, sp, 47560Sstevel@tonic-gate dd, ep)) { 47570Sstevel@tonic-gate /* RPC failure to !my node */ 47580Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 47590Sstevel@tonic-gate (sd->sd_mn_mynode-> 47600Sstevel@tonic-gate nd_nodeid 47610Sstevel@tonic-gate != nd->nd_nodeid)) { 47620Sstevel@tonic-gate rval = 205; 47630Sstevel@tonic-gate } else { 47640Sstevel@tonic-gate /* Any other failure */ 47650Sstevel@tonic-gate rval = -1; 47660Sstevel@tonic-gate } 47670Sstevel@tonic-gate goto out; 47680Sstevel@tonic-gate } 47690Sstevel@tonic-gate if (clnt_delset(nd->nd_nodename, sp, 47700Sstevel@tonic-gate ep) == -1) { 47710Sstevel@tonic-gate /* RPC failure to !my node */ 47720Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 47730Sstevel@tonic-gate 
(sd->sd_mn_mynode-> 47740Sstevel@tonic-gate nd_nodeid 47750Sstevel@tonic-gate != nd->nd_nodeid)) { 47760Sstevel@tonic-gate rval = 205; 47770Sstevel@tonic-gate } else { 47780Sstevel@tonic-gate /* Any other failure */ 47790Sstevel@tonic-gate rval = -1; 47800Sstevel@tonic-gate } 47810Sstevel@tonic-gate goto out; 47820Sstevel@tonic-gate } 47830Sstevel@tonic-gate } else { 47840Sstevel@tonic-gate /* 47850Sstevel@tonic-gate * Delete host from sets on hosts 47860Sstevel@tonic-gate * not being deleted. 47870Sstevel@tonic-gate */ 47880Sstevel@tonic-gate anode[0] = Strdup(nd->nd_nodename); 47890Sstevel@tonic-gate if (clnt_delhosts(nd2->nd_nodename, sp, 47900Sstevel@tonic-gate 1, anode, ep) == -1) { 47910Sstevel@tonic-gate Free(anode[0]); 47920Sstevel@tonic-gate /* RPC failure to !my node */ 47930Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 47940Sstevel@tonic-gate (sd->sd_mn_mynode-> 47950Sstevel@tonic-gate nd_nodeid 47960Sstevel@tonic-gate != nd2->nd_nodeid)) { 47970Sstevel@tonic-gate rval = 205; 47980Sstevel@tonic-gate } else { 47990Sstevel@tonic-gate /* Any other failure */ 48000Sstevel@tonic-gate rval = -1; 48010Sstevel@tonic-gate } 48020Sstevel@tonic-gate goto out; 48030Sstevel@tonic-gate } 48040Sstevel@tonic-gate 48050Sstevel@tonic-gate meta_mc_log(MC_LOG5, 48060Sstevel@tonic-gate dgettext(TEXT_DOMAIN, 48070Sstevel@tonic-gate "Deleted node %s (%d) on node %s " 48080Sstevel@tonic-gate "from set %s: %s"), 48090Sstevel@tonic-gate nd->nd_nodename, nd->nd_nodeid, 48100Sstevel@tonic-gate nd2->nd_nodename, 48110Sstevel@tonic-gate sp->setname, 48120Sstevel@tonic-gate meta_print_hrtime( 48130Sstevel@tonic-gate gethrtime() - start_time)); 48140Sstevel@tonic-gate 48150Sstevel@tonic-gate Free(anode[0]); 48160Sstevel@tonic-gate } 48170Sstevel@tonic-gate nd2 = nd2->nd_next; 48180Sstevel@tonic-gate } 48190Sstevel@tonic-gate } 48200Sstevel@tonic-gate nd = nd->nd_next; 48210Sstevel@tonic-gate } 48220Sstevel@tonic-gate 48230Sstevel@tonic-gate nd = master_nodelist; 48240Sstevel@tonic-gate 
cl_sk = cl_get_setkey(sp->setno, sp->setname); 48250Sstevel@tonic-gate while (nd) { 48260Sstevel@tonic-gate /* Skip non-alive nodes and node without set */ 48270Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 48280Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) { 48290Sstevel@tonic-gate nd = nd->nd_next; 48300Sstevel@tonic-gate continue; 48310Sstevel@tonic-gate } 48320Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 48330Sstevel@tonic-gate /* If RPC failure to another node return 205 */ 48340Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 48350Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 48360Sstevel@tonic-gate nd->nd_nodeid)) { 48370Sstevel@tonic-gate rval = 205; 48380Sstevel@tonic-gate } else { 48390Sstevel@tonic-gate /* Any other failure */ 48400Sstevel@tonic-gate rval = -1; 48410Sstevel@tonic-gate } 48420Sstevel@tonic-gate goto out; 48430Sstevel@tonic-gate } 48440Sstevel@tonic-gate nd = nd->nd_next; 48450Sstevel@tonic-gate } 48460Sstevel@tonic-gate cl_set_setkey(NULL); 48470Sstevel@tonic-gate set_locked = 0; 48480Sstevel@tonic-gate 48490Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 48500Sstevel@tonic-gate "Nodelist syncronization complete for set %s: %s"), 48510Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 48520Sstevel@tonic-gate 48530Sstevel@tonic-gate metaflushsetname(sp); 48540Sstevel@tonic-gate 48550Sstevel@tonic-gate /* 48560Sstevel@tonic-gate * If all alive nodes have been deleted from set, just 48570Sstevel@tonic-gate * return since nothing else can be done until non-alive 48580Sstevel@tonic-gate * nodes (if there are any) rejoin the cluster. 48590Sstevel@tonic-gate */ 48600Sstevel@tonic-gate if (num_alive_nodes == num_alive_nodes_del) { 48610Sstevel@tonic-gate rval = 0; 48620Sstevel@tonic-gate goto out; 48630Sstevel@tonic-gate } 48640Sstevel@tonic-gate 48650Sstevel@tonic-gate /* 48660Sstevel@tonic-gate * Sync up drive records. 
48670Sstevel@tonic-gate * 48680Sstevel@tonic-gate * If a node panic'd (or metaset command was killed) during the 48690Sstevel@tonic-gate * addition or deletion of a drive to the diskset, the nodes 48700Sstevel@tonic-gate * may have a different view of the drive list. During cleanup 48710Sstevel@tonic-gate * of the drive list during reconfig, a drive will be deleted 48720Sstevel@tonic-gate * from the list if the master node sees that the drive has been 48730Sstevel@tonic-gate * marked in the ADD state on any node or is marked in the DEL state 48740Sstevel@tonic-gate * on all nodes. 48750Sstevel@tonic-gate * This cleanup must occur even if all nodes in the cluster are 48760Sstevel@tonic-gate * not part of the cluster so that all nodes have the same view 48770Sstevel@tonic-gate * of the drivelist. 48780Sstevel@tonic-gate * Then if the entire cluster goes down and comes back up, the 48790Sstevel@tonic-gate * new master node could be a node that wasn't in the cluster when 48800Sstevel@tonic-gate * the node was deleted. This could lead to a situation where the 48810Sstevel@tonic-gate * master node thinks that a drive is OK, but this drive isn't 48820Sstevel@tonic-gate * known to the other nodes. 48830Sstevel@tonic-gate * This situation can also occur during the addition of a drive 48840Sstevel@tonic-gate * where a node has the drive marked OK, but the node executing the 48850Sstevel@tonic-gate * metaset command encountered a failure before marking that drive OK 48860Sstevel@tonic-gate * on the rest of the nodes. If the node with the OK drive then 48870Sstevel@tonic-gate * panics, then the rest of the nodes will remove that drive marked ADD 48880Sstevel@tonic-gate * and when the node with the OK drive rejoins the cluster, it will 48890Sstevel@tonic-gate * have a drive marked OK that is unknown by the other nodes.
48900Sstevel@tonic-gate * 48910Sstevel@tonic-gate * There are 2 situations to consider: 48920Sstevel@tonic-gate * A) Master knows about a drive that other nodes don't know about. 48930Sstevel@tonic-gate * B) At least one slave node knows about a drive that the master 48940Sstevel@tonic-gate * node doesn't know about. 48950Sstevel@tonic-gate * 48960Sstevel@tonic-gate * To handle these situations the following steps are followed: 48970Sstevel@tonic-gate * 1) Count number of drives known by this master node and the 48980Sstevel@tonic-gate * other slave nodes. 48990Sstevel@tonic-gate * If all nodes have the same number of drives and the master has 49000Sstevel@tonic-gate * all drives marked OK, then skip to step 4. 49010Sstevel@tonic-gate * 49020Sstevel@tonic-gate * 2) If a node has fewer drives listed than the master, the master 49030Sstevel@tonic-gate * must get the drive descriptor list from that node so that 49040Sstevel@tonic-gate * master can determine which drive it needs to delete from that 49050Sstevel@tonic-gate * node. Master must get the drive descriptor list since the 49060Sstevel@tonic-gate * drive record list does not contain the name of the drive, but 49070Sstevel@tonic-gate * only a key and the key can only be interpreted on that other 49080Sstevel@tonic-gate * node. 49090Sstevel@tonic-gate * 49100Sstevel@tonic-gate * 3) The master will then create the master drive list by doing: 49110Sstevel@tonic-gate * - Master starts with drive list known by master. 49120Sstevel@tonic-gate * - Any drive marked ADD will be removed from the list. 49130Sstevel@tonic-gate * - Any drive not known by another node (from step 2) will be 49140Sstevel@tonic-gate * removed from the drive list. 49150Sstevel@tonic-gate * - If a drive is marked DEL on the master, the master must 49160Sstevel@tonic-gate * verify that the drive record is marked DEL on all nodes. 49170Sstevel@tonic-gate * If any node has the drive record marked OK, mark it OK 49180Sstevel@tonic-gate * on the master.
(The reason why is described below). 49190Sstevel@tonic-gate * 49200Sstevel@tonic-gate * 4) The master sends out the master drive list and the slave 49210Sstevel@tonic-gate * nodes will force their drive lists to match the master 49220Sstevel@tonic-gate * drive list by deleting drives, if necessary and by changing 49230Sstevel@tonic-gate * the drive record states from ADD->OK if master has drive 49240Sstevel@tonic-gate * marked OK and slave has drive marked ADD. 49250Sstevel@tonic-gate * 49260Sstevel@tonic-gate * Interesting scenarios: 49270Sstevel@tonic-gate * 49280Sstevel@tonic-gate * 1) System has 4 nodes with node 1 as the master. Node 3 starts 49290Sstevel@tonic-gate * to delete a drive record (drive record on node 1 is marked DEL), 49300Sstevel@tonic-gate * but is stopped when node 3 panics. Node 1 also panics. 49310Sstevel@tonic-gate * During reconfig cycle, node 2 is picked as master and the drive 49320Sstevel@tonic-gate * record is left alone since all nodes in the cluster have it 49330Sstevel@tonic-gate * marked OK. User now sees drive as part of diskset. 49340Sstevel@tonic-gate * Now, entire cluster is rebooted and node 1 rejoins the cluster. 49350Sstevel@tonic-gate * Node 1 is picked as the master and node 1 has drive record 49360Sstevel@tonic-gate * marked DEL. Node 1 contacts all other nodes in the cluster 49370Sstevel@tonic-gate * and since at least one node has the drive record marked OK, 49380Sstevel@tonic-gate * the master marks the drive record OK. 49390Sstevel@tonic-gate * User continues to see the drive as part of the diskset. 
49400Sstevel@tonic-gate */ 49410Sstevel@tonic-gate 49420Sstevel@tonic-gate /* Reget set descriptor since flushed above */ 49430Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 49440Sstevel@tonic-gate rval = -1; 49450Sstevel@tonic-gate goto out; 49460Sstevel@tonic-gate } 49470Sstevel@tonic-gate 49480Sstevel@tonic-gate /* Has side effect of setting sd->sd_drvs to same as master_dd */ 49490Sstevel@tonic-gate if ((master_dd = metaget_drivedesc_sideno(sp, 49500Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodeid, 49510Sstevel@tonic-gate (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { 49520Sstevel@tonic-gate /* No drives in list */ 49530Sstevel@tonic-gate if (!mdisok(ep)) { 49540Sstevel@tonic-gate /* 49550Sstevel@tonic-gate * Can't get drive list for this node, so 49560Sstevel@tonic-gate * return -1 causing this node to be removed 49570Sstevel@tonic-gate * cluster config and fixed. 49580Sstevel@tonic-gate */ 49590Sstevel@tonic-gate rval = -1; 49600Sstevel@tonic-gate goto out; 49610Sstevel@tonic-gate } 49620Sstevel@tonic-gate } 49630Sstevel@tonic-gate 49640Sstevel@tonic-gate /* Count the number of drives for all nodes */ 49650Sstevel@tonic-gate mnsr_node = master_mnsr_node; 49660Sstevel@tonic-gate while (mnsr_node) { 49670Sstevel@tonic-gate dr_cnt = 0; 49680Sstevel@tonic-gate dr = mnsr_node->mmn_mnsr->sr_drivechain; 49690Sstevel@tonic-gate while (dr) { 49700Sstevel@tonic-gate dr_cnt++; 49710Sstevel@tonic-gate dr = dr->dr_next; 49720Sstevel@tonic-gate } 49730Sstevel@tonic-gate mnsr_node->mmn_numdrives = dr_cnt; 49740Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next; 49750Sstevel@tonic-gate } 49760Sstevel@tonic-gate 49770Sstevel@tonic-gate /* Count the number of drives for the master; also check flags */ 49780Sstevel@tonic-gate all_drives_ok = 1; 49790Sstevel@tonic-gate dd_cnt = 0; 49800Sstevel@tonic-gate dd = master_dd; 49810Sstevel@tonic-gate while (dd) { 49820Sstevel@tonic-gate dd_cnt++; 49830Sstevel@tonic-gate if (!(dd->dd_flags & MD_DR_OK)) 
49840Sstevel@tonic-gate all_drives_ok = 0; 49850Sstevel@tonic-gate dd = dd->dd_next; 49860Sstevel@tonic-gate } 49870Sstevel@tonic-gate 49880Sstevel@tonic-gate /* If all drives are ok, do quick check against number of drives */ 49890Sstevel@tonic-gate if (all_drives_ok) { 49900Sstevel@tonic-gate /* If all nodes have same number of drives, almost done */ 49910Sstevel@tonic-gate mnsr_node = master_mnsr_node; 49920Sstevel@tonic-gate while (mnsr_node) { 49930Sstevel@tonic-gate if (mnsr_node->mmn_numdrives != dd_cnt) 49940Sstevel@tonic-gate break; 49950Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next; 49960Sstevel@tonic-gate } 49970Sstevel@tonic-gate /* All nodes have same number of drives, just send flags */ 49980Sstevel@tonic-gate if (mnsr_node == NULL) { 49990Sstevel@tonic-gate goto send_drive_list; 50000Sstevel@tonic-gate } 50010Sstevel@tonic-gate } 50020Sstevel@tonic-gate 50030Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 50040Sstevel@tonic-gate "Begin detailed drive synchronization for set %s: %s"), 50050Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 50060Sstevel@tonic-gate 50070Sstevel@tonic-gate /* Detailed check required */ 50080Sstevel@tonic-gate mnsr_node = master_mnsr_node; 50090Sstevel@tonic-gate while (mnsr_node) { 50100Sstevel@tonic-gate /* Does slave node have less drives than master? 
*/ 50110Sstevel@tonic-gate if (mnsr_node->mmn_numdrives < dd_cnt) { 50120Sstevel@tonic-gate /* Yes - must determine which drive is missing */ 50130Sstevel@tonic-gate if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, 50140Sstevel@tonic-gate &other_dd, ep)) { 50150Sstevel@tonic-gate /* RPC failure to !my node */ 50160Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 50170Sstevel@tonic-gate (strcmp(mynode(), mnsr_node->mmn_nodename) 50180Sstevel@tonic-gate != 0)) { 50190Sstevel@tonic-gate rval = 205; 50200Sstevel@tonic-gate } else { 50210Sstevel@tonic-gate /* Any other failure */ 50220Sstevel@tonic-gate rval = -1; 50230Sstevel@tonic-gate } 50240Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 50250Sstevel@tonic-gate "Master node %s unable to " 50260Sstevel@tonic-gate "retrieve drive list from node %s"), 50270Sstevel@tonic-gate mynode(), mnsr_node->mmn_nodename); 50280Sstevel@tonic-gate goto out; 50290Sstevel@tonic-gate } 50300Sstevel@tonic-gate mnsr_node->mmn_dd = other_dd; 50310Sstevel@tonic-gate dd = master_dd; 50320Sstevel@tonic-gate while (dd) { 50330Sstevel@tonic-gate if (!(dd->dd_flags & MD_DR_OK)) { 50340Sstevel@tonic-gate dd = dd->dd_next; 50350Sstevel@tonic-gate continue; 50360Sstevel@tonic-gate } 50370Sstevel@tonic-gate other_dd = mnsr_node->mmn_dd; 50380Sstevel@tonic-gate while (other_dd) { 50390Sstevel@tonic-gate /* Convert to devids, when available */ 50400Sstevel@tonic-gate if (strcmp(other_dd->dd_dnp->cname, 50410Sstevel@tonic-gate dd->dd_dnp->cname) == 0) { 50420Sstevel@tonic-gate break; 50430Sstevel@tonic-gate } 50440Sstevel@tonic-gate other_dd = other_dd->dd_next; 50450Sstevel@tonic-gate } 50460Sstevel@tonic-gate /* 50470Sstevel@tonic-gate * dd not found on slave so mark it 50480Sstevel@tonic-gate * ADD for later deletion (drives in ADD 50490Sstevel@tonic-gate * state are deleted later in this routine). 
50500Sstevel@tonic-gate */ 50510Sstevel@tonic-gate if (other_dd == NULL) { 50520Sstevel@tonic-gate dd->dd_flags = MD_DR_ADD; 50530Sstevel@tonic-gate } 50540Sstevel@tonic-gate dd = dd->dd_next; 50550Sstevel@tonic-gate } 50560Sstevel@tonic-gate 50570Sstevel@tonic-gate } 50580Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next; 50590Sstevel@tonic-gate } 50600Sstevel@tonic-gate 50610Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 50620Sstevel@tonic-gate "Drive check completed for set %s: %s"), 50630Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 50640Sstevel@tonic-gate 50650Sstevel@tonic-gate dd = master_dd; 50660Sstevel@tonic-gate dd_prev = 0; 50670Sstevel@tonic-gate while (dd) { 50680Sstevel@tonic-gate /* Remove any ADD drives from list */ 50690Sstevel@tonic-gate if (dd->dd_flags & MD_DR_ADD) { 50700Sstevel@tonic-gate if (dd_prev) { 50710Sstevel@tonic-gate dd_prev->dd_next = dd->dd_next; 50720Sstevel@tonic-gate dd->dd_next = NULL; 50730Sstevel@tonic-gate metafreedrivedesc(&dd); 50740Sstevel@tonic-gate dd = dd_prev->dd_next; 50750Sstevel@tonic-gate } else { 50760Sstevel@tonic-gate /* 50770Sstevel@tonic-gate * If removing drive descriptor from head 50780Sstevel@tonic-gate * of linked list, also change sd->sd_drvs. 50790Sstevel@tonic-gate */ 50800Sstevel@tonic-gate master_dd = sd->sd_drvs = dd->dd_next; 50810Sstevel@tonic-gate dd->dd_next = NULL; 50820Sstevel@tonic-gate metafreedrivedesc(&dd); 50830Sstevel@tonic-gate dd = master_dd; 50840Sstevel@tonic-gate } 50850Sstevel@tonic-gate /* dd setup in if/else above */ 50860Sstevel@tonic-gate continue; 50870Sstevel@tonic-gate } 50880Sstevel@tonic-gate /* 50890Sstevel@tonic-gate * If drive is marked DEL, check all other nodes. 50900Sstevel@tonic-gate * If drive on another node is marked OK, mark drive OK 50910Sstevel@tonic-gate * in master list. If drive is marked DEL or doesn't exist 50920Sstevel@tonic-gate * on all nodes, remove drive from list. 
50930Sstevel@tonic-gate */ 50940Sstevel@tonic-gate if (dd->dd_flags & MD_DR_DEL) { 50950Sstevel@tonic-gate mnsr_node = master_mnsr_node; 50960Sstevel@tonic-gate while (mnsr_node) { 50970Sstevel@tonic-gate if (mnsr_node->mmn_dd == NULL) { 50980Sstevel@tonic-gate if (clnt_getdrivedesc( 50990Sstevel@tonic-gate mnsr_node->mmn_nodename, sp, 51000Sstevel@tonic-gate &other_dd, ep)) { 51010Sstevel@tonic-gate /* RPC failure to !my node */ 51020Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 51030Sstevel@tonic-gate (strcmp(mynode(), 51040Sstevel@tonic-gate mnsr_node->mmn_nodename) 51050Sstevel@tonic-gate != 0)) { 51060Sstevel@tonic-gate rval = 205; 51070Sstevel@tonic-gate } else { 51080Sstevel@tonic-gate /* Any other failure */ 51090Sstevel@tonic-gate rval = -1; 51100Sstevel@tonic-gate } 51110Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 51120Sstevel@tonic-gate "Master node %s unable " 51130Sstevel@tonic-gate "to retrieve drive list from " 51140Sstevel@tonic-gate "node %s"), mynode(), 51150Sstevel@tonic-gate mnsr_node->mmn_nodename); 51160Sstevel@tonic-gate goto out; 51170Sstevel@tonic-gate } 51180Sstevel@tonic-gate mnsr_node->mmn_dd = other_dd; 51190Sstevel@tonic-gate } 51200Sstevel@tonic-gate other_dd = mnsr_node->mmn_dd; 51210Sstevel@tonic-gate while (other_dd) { 51220Sstevel@tonic-gate /* Found drive (OK) from other node */ 51230Sstevel@tonic-gate if (strcmp(dd->dd_dnp->cname, 51240Sstevel@tonic-gate other_dd->dd_dnp->cname) 51250Sstevel@tonic-gate == 0) { 51260Sstevel@tonic-gate /* Drive marked OK */ 51270Sstevel@tonic-gate if (other_dd->dd_flags & 51280Sstevel@tonic-gate MD_DR_OK) { 51290Sstevel@tonic-gate dd->dd_flags = MD_DR_OK; 51300Sstevel@tonic-gate } 51310Sstevel@tonic-gate break; 51320Sstevel@tonic-gate } 51330Sstevel@tonic-gate other_dd = other_dd->dd_next; 51340Sstevel@tonic-gate } 51350Sstevel@tonic-gate if (dd->dd_flags == MD_DR_OK) 51360Sstevel@tonic-gate break; 51370Sstevel@tonic-gate 51380Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next; 
51390Sstevel@tonic-gate } 51400Sstevel@tonic-gate /* 51410Sstevel@tonic-gate * If no node had this drive marked OK, delete it. 51420Sstevel@tonic-gate */ 51430Sstevel@tonic-gate if (dd->dd_flags & MD_DR_DEL) { 51440Sstevel@tonic-gate if (dd_prev) { 51450Sstevel@tonic-gate dd_prev->dd_next = dd->dd_next; 51460Sstevel@tonic-gate dd->dd_next = NULL; 51470Sstevel@tonic-gate metafreedrivedesc(&dd); 51480Sstevel@tonic-gate dd = dd_prev->dd_next; 51490Sstevel@tonic-gate } else { 51500Sstevel@tonic-gate /* 51510Sstevel@tonic-gate * If removing drive descriptor from 51520Sstevel@tonic-gate * head of linked list, also change 51530Sstevel@tonic-gate * sd->sd_drvs. 51540Sstevel@tonic-gate */ 51550Sstevel@tonic-gate master_dd = sd->sd_drvs = dd->dd_next; 51560Sstevel@tonic-gate dd->dd_next = NULL; 51570Sstevel@tonic-gate metafreedrivedesc(&dd); 51580Sstevel@tonic-gate dd = master_dd; 51590Sstevel@tonic-gate } 51600Sstevel@tonic-gate /* dd setup in if/else above */ 51610Sstevel@tonic-gate continue; 51620Sstevel@tonic-gate } 51630Sstevel@tonic-gate } 51640Sstevel@tonic-gate dd_prev = dd; 51650Sstevel@tonic-gate dd = dd->dd_next; 51660Sstevel@tonic-gate } 51670Sstevel@tonic-gate 51680Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 51690Sstevel@tonic-gate "Setting drive states completed for set %s: %s"), 51700Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 51710Sstevel@tonic-gate 51720Sstevel@tonic-gate send_drive_list: 51730Sstevel@tonic-gate /* 51740Sstevel@tonic-gate * Set genid on all drives to be the highest value seen. 51750Sstevel@tonic-gate */ 51760Sstevel@tonic-gate dd = master_dd; 51770Sstevel@tonic-gate while (dd) { 51780Sstevel@tonic-gate dd->dd_genid = max_genid; 51790Sstevel@tonic-gate dd = dd->dd_next; 51800Sstevel@tonic-gate } 51810Sstevel@tonic-gate /* 51820Sstevel@tonic-gate * Send updated drive list to all alive nodes. 
51830Sstevel@tonic-gate * Will also set genid on set and node records to have same 51840Sstevel@tonic-gate * as the drive records. 51850Sstevel@tonic-gate */ 51860Sstevel@tonic-gate nd = sd->sd_nodelist; 51870Sstevel@tonic-gate while (nd) { 51880Sstevel@tonic-gate /* Skip non-alive nodes */ 51890Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 51900Sstevel@tonic-gate nd = nd->nd_next; 51910Sstevel@tonic-gate continue; 51920Sstevel@tonic-gate } 51930Sstevel@tonic-gate if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { 51940Sstevel@tonic-gate /* RPC failure to another node */ 51950Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 51960Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { 51970Sstevel@tonic-gate rval = 205; 51980Sstevel@tonic-gate } else { 51990Sstevel@tonic-gate /* Any other failure */ 52000Sstevel@tonic-gate rval = -1; 52010Sstevel@tonic-gate } 52020Sstevel@tonic-gate goto out; 52030Sstevel@tonic-gate } 52040Sstevel@tonic-gate nd = nd->nd_next; 52050Sstevel@tonic-gate } 52060Sstevel@tonic-gate 52070Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 52080Sstevel@tonic-gate "Sent drive list to all nodes for set %s: %s"), 52090Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 52100Sstevel@tonic-gate 52110Sstevel@tonic-gate /* 52120Sstevel@tonic-gate * If no drive records left in set and nodes had been joined, 52130Sstevel@tonic-gate * withdraw the nodes. Always reset the master and mark 52140Sstevel@tonic-gate * all nodes as withdrawn on all nodes. 
52150Sstevel@tonic-gate */ 52160Sstevel@tonic-gate if (master_dd == NULL) { 52170Sstevel@tonic-gate /* Reset new master flag since no longer master */ 52180Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 52190Sstevel@tonic-gate sf.sf_setno = sp->setno; 52200Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 52210Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET; 52220Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 52230Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 52240Sstevel@tonic-gate /* Ignore failure, failure to reset flag isn't catastrophic */ 52250Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 52260Sstevel@tonic-gate &sf.sf_mde, NULL); 52270Sstevel@tonic-gate 52280Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 52290Sstevel@tonic-gate "Reset new master flag for " "set %s: %s"), 52300Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 52310Sstevel@tonic-gate 52320Sstevel@tonic-gate nd = sd->sd_nodelist; 52330Sstevel@tonic-gate while (nd) { 52340Sstevel@tonic-gate /* Skip non-alive nodes */ 52350Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 52360Sstevel@tonic-gate nd = nd->nd_next; 52370Sstevel@tonic-gate continue; 52380Sstevel@tonic-gate } 52390Sstevel@tonic-gate 52400Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 52410Sstevel@tonic-gate /* RPC failure to another node */ 52420Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 52430Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 52440Sstevel@tonic-gate nd->nd_nodeid)) { 52450Sstevel@tonic-gate rval = 205; 52460Sstevel@tonic-gate } else { 52470Sstevel@tonic-gate /* Any other failure */ 52480Sstevel@tonic-gate rval = -1; 52490Sstevel@tonic-gate } 52500Sstevel@tonic-gate goto out; 52510Sstevel@tonic-gate } 52520Sstevel@tonic-gate set_locked = 1; 52530Sstevel@tonic-gate 52540Sstevel@tonic-gate /* Withdraw node from set if owner */ 52550Sstevel@tonic-gate if ((nd->nd_flags & 
MD_MN_NODE_OWN) && 52560Sstevel@tonic-gate (clnt_withdrawset(nd->nd_nodename, sp, ep))) { 52570Sstevel@tonic-gate /* RPC failure to another node */ 52580Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 52590Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 52600Sstevel@tonic-gate nd->nd_nodeid)) { 52610Sstevel@tonic-gate rval = 205; 52620Sstevel@tonic-gate } else { 52630Sstevel@tonic-gate /* Any other failure */ 52640Sstevel@tonic-gate rval = -1; 52650Sstevel@tonic-gate } 52660Sstevel@tonic-gate goto out; 52670Sstevel@tonic-gate } 52680Sstevel@tonic-gate 52690Sstevel@tonic-gate /* Mark all nodes as withdrawn on this node */ 52700Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, 52710Sstevel@tonic-gate sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { 52720Sstevel@tonic-gate /* RPC failure to another node */ 52730Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 52740Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 52750Sstevel@tonic-gate nd->nd_nodeid)) { 52760Sstevel@tonic-gate rval = 205; 52770Sstevel@tonic-gate } else { 52780Sstevel@tonic-gate /* Any other failure */ 52790Sstevel@tonic-gate rval = -1; 52800Sstevel@tonic-gate } 52810Sstevel@tonic-gate goto out; 52820Sstevel@tonic-gate } 52830Sstevel@tonic-gate 52840Sstevel@tonic-gate /* Resets master to no-master on this node */ 52850Sstevel@tonic-gate if (clnt_mnsetmaster(nd->nd_nodename, sp, 52860Sstevel@tonic-gate "", MD_MN_INVALID_NID, ep)) { 52870Sstevel@tonic-gate /* RPC failure to another node */ 52880Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 52890Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 52900Sstevel@tonic-gate nd->nd_nodeid)) { 52910Sstevel@tonic-gate rval = 205; 52920Sstevel@tonic-gate } else { 52930Sstevel@tonic-gate /* Any other failure */ 52940Sstevel@tonic-gate rval = -1; 52950Sstevel@tonic-gate } 52960Sstevel@tonic-gate goto out; 52970Sstevel@tonic-gate } 52980Sstevel@tonic-gate 52990Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 53000Sstevel@tonic-gate if 
(clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 53010Sstevel@tonic-gate /* RPC failure to another node */ 53020Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 53030Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 53040Sstevel@tonic-gate nd->nd_nodeid)) { 53050Sstevel@tonic-gate rval = 205; 53060Sstevel@tonic-gate } else { 53070Sstevel@tonic-gate /* Any other failure */ 53080Sstevel@tonic-gate rval = -1; 53090Sstevel@tonic-gate } 53100Sstevel@tonic-gate goto out; 53110Sstevel@tonic-gate } 53120Sstevel@tonic-gate set_locked = 0; 53130Sstevel@tonic-gate nd = nd->nd_next; 53140Sstevel@tonic-gate } 53150Sstevel@tonic-gate } 53160Sstevel@tonic-gate 53170Sstevel@tonic-gate out: 53180Sstevel@tonic-gate /* 53190Sstevel@tonic-gate * If got here and set is still locked, then an error has 53200Sstevel@tonic-gate * occurred and master_nodelist is still valid. 53210Sstevel@tonic-gate * If error is not an RPC error, then unlock. 53220Sstevel@tonic-gate * If error is an RPC error, skip unlocks since this could cause 53230Sstevel@tonic-gate * yet another RPC timeout if a node has failed. 53240Sstevel@tonic-gate * Ignore failures in unlock since unlock is just trying to 53250Sstevel@tonic-gate * clean things up. 53260Sstevel@tonic-gate */ 53270Sstevel@tonic-gate if ((set_locked) && !(mdanyrpcerror(ep))) { 53280Sstevel@tonic-gate nd = master_nodelist; 53290Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname); 53300Sstevel@tonic-gate while (nd) { 53310Sstevel@tonic-gate /* Skip non-alive nodes */ 53320Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 53330Sstevel@tonic-gate nd = nd->nd_next; 53340Sstevel@tonic-gate continue; 53350Sstevel@tonic-gate } 53360Sstevel@tonic-gate /* 53370Sstevel@tonic-gate * If clnt_unlock fails, just break out since next 53380Sstevel@tonic-gate * reconfig cycle will reset the locks anyway. 
53390Sstevel@tonic-gate */ 53400Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 53410Sstevel@tonic-gate break; 53420Sstevel@tonic-gate } 53430Sstevel@tonic-gate nd = nd->nd_next; 53440Sstevel@tonic-gate } 53450Sstevel@tonic-gate cl_set_setkey(NULL); 53460Sstevel@tonic-gate } 53470Sstevel@tonic-gate /* Free master_mnsr and drive descs */ 53480Sstevel@tonic-gate mnsr_node = master_mnsr_node; 53490Sstevel@tonic-gate while (mnsr_node) { 53500Sstevel@tonic-gate master_mnsr_node = mnsr_node->mmn_next; 53510Sstevel@tonic-gate free_sr((md_set_record *)mnsr_node->mmn_mnsr); 53520Sstevel@tonic-gate free_rem_dd(mnsr_node->mmn_dd); 53530Sstevel@tonic-gate Free(mnsr_node); 53540Sstevel@tonic-gate mnsr_node = master_mnsr_node; 53550Sstevel@tonic-gate } 53560Sstevel@tonic-gate 53570Sstevel@tonic-gate /* Frees sd->sd_drvs (which is also master_dd) */ 53580Sstevel@tonic-gate metaflushsetname(sp); 53590Sstevel@tonic-gate return (rval); 53600Sstevel@tonic-gate } 53610Sstevel@tonic-gate 53620Sstevel@tonic-gate /* 53630Sstevel@tonic-gate * meta_mnsync_diskset_mddbs 53640Sstevel@tonic-gate * Calling node is guaranteed to be an owner node. 53650Sstevel@tonic-gate * Calling node is the master node. 53660Sstevel@tonic-gate * 53670Sstevel@tonic-gate * Master node verifies that ondisk mddb format matches its incore format. 53680Sstevel@tonic-gate * If no nodes are joined to set, remove the change log entries. 53690Sstevel@tonic-gate * If a node is joined to set, play the change log. 53700Sstevel@tonic-gate * 53710Sstevel@tonic-gate * Returns 0 - Success 53720Sstevel@tonic-gate * 1 - Master unable to join to set. 53730Sstevel@tonic-gate * 205 - Failure during RPC to another node 53740Sstevel@tonic-gate * -1 - Any other failure and ep is filled in. 53750Sstevel@tonic-gate * -1 return will eventually cause node to panic 53760Sstevel@tonic-gate * in a SunCluster environment. 
53770Sstevel@tonic-gate */ 53780Sstevel@tonic-gate int 53790Sstevel@tonic-gate meta_mnsync_diskset_mddbs( 53800Sstevel@tonic-gate mdsetname_t *sp, 53810Sstevel@tonic-gate md_error_t *ep 53820Sstevel@tonic-gate ) 53830Sstevel@tonic-gate { 53840Sstevel@tonic-gate md_set_desc *sd; 53850Sstevel@tonic-gate mddb_config_t c; 53860Sstevel@tonic-gate md_mn_msgclass_t class; 53870Sstevel@tonic-gate mddb_setflags_config_t sf; 53880Sstevel@tonic-gate md_mnnode_desc *nd, *nd2; 53890Sstevel@tonic-gate md_error_t xep = mdnullerror; 53900Sstevel@tonic-gate int stale_set = 0; 53910Sstevel@tonic-gate 53920Sstevel@tonic-gate /* If setname is there, set desc should exist. */ 53930Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 53940Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 53950Sstevel@tonic-gate "Unable to get set %s desc information"), sp->setname); 53960Sstevel@tonic-gate return (-1); 53970Sstevel@tonic-gate } 53980Sstevel@tonic-gate 53990Sstevel@tonic-gate /* Are there drives in the set? */ 54000Sstevel@tonic-gate if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 54010Sstevel@tonic-gate ep) == NULL) { 54020Sstevel@tonic-gate if (! mdisok(ep)) { 54030Sstevel@tonic-gate return (-1); 54040Sstevel@tonic-gate } 54050Sstevel@tonic-gate /* No drives in set -- nothing to sync up */ 54060Sstevel@tonic-gate return (0); 54070Sstevel@tonic-gate } 54080Sstevel@tonic-gate 54090Sstevel@tonic-gate /* 54100Sstevel@tonic-gate * Is master node (which is this node) joined to set? 54110Sstevel@tonic-gate * If master node isn't joined (which means that no nodes 54120Sstevel@tonic-gate * are joined to diskset), remove the change log entries 54130Sstevel@tonic-gate * since no need to replay them - all nodes will have same 54140Sstevel@tonic-gate * view of mddbs since all nodes are reading in the mddbs 54150Sstevel@tonic-gate * from disk. 
54160Sstevel@tonic-gate * There is also no need to sync up the master and ondisk mddbs 54170Sstevel@tonic-gate * since master has no incore knowledge. 54180Sstevel@tonic-gate * Need to join master to set in order to flush the change 54190Sstevel@tonic-gate * log entries. Don't need to block I/O during join of master 54200Sstevel@tonic-gate * to set since no other nodes are joined to set and so no I/O 54210Sstevel@tonic-gate * can be occurring. 54220Sstevel@tonic-gate */ 54230Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 54240Sstevel@tonic-gate /* Join master to set */ 54250Sstevel@tonic-gate if (clnt_joinset(mynode(), sp, 54260Sstevel@tonic-gate MNSET_IN_RECONFIG, ep)) { 54270Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_STALE)) { 54280Sstevel@tonic-gate /* 54290Sstevel@tonic-gate * If STALE, print message and continue on. 54300Sstevel@tonic-gate * Don't do any writes or reads to mddbs 54310Sstevel@tonic-gate * so don't clear change log. 54320Sstevel@tonic-gate */ 54330Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 54340Sstevel@tonic-gate "Join of master node to STALE set %s"), 54350Sstevel@tonic-gate sp->setname); 54360Sstevel@tonic-gate stale_set = 1; 54370Sstevel@tonic-gate mdclrerror(ep); 54380Sstevel@tonic-gate } else if (mdismddberror(ep, MDE_DB_ACCOK)) { 54390Sstevel@tonic-gate /* ACCOK means mediator provided extra vote */ 54400Sstevel@tonic-gate mdclrerror(ep); 54410Sstevel@tonic-gate } else { 54420Sstevel@tonic-gate /* 54430Sstevel@tonic-gate * If master is unable to join set, print an 54440Sstevel@tonic-gate * error message. Don't return failure or node 54450Sstevel@tonic-gate * will panic during cluster reconfig cycle. 54460Sstevel@tonic-gate * Also, withdraw node from set in order to 54470Sstevel@tonic-gate * cleanup from failed join attempt. 
54480Sstevel@tonic-gate */ 54490Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 54500Sstevel@tonic-gate "Join of master node in set %s failed"), 54510Sstevel@tonic-gate sp->setname); 54520Sstevel@tonic-gate if (clnt_withdrawset(mynode(), sp, &xep)) 54530Sstevel@tonic-gate mdclrerror(&xep); 54540Sstevel@tonic-gate return (1); 54550Sstevel@tonic-gate } 54560Sstevel@tonic-gate } 54570Sstevel@tonic-gate /* 54580Sstevel@tonic-gate * Master node successfully joined. 54590Sstevel@tonic-gate * Set local copy of flags to OWN and 54600Sstevel@tonic-gate * send owner flag to rpc.metad. If not stale, 54610Sstevel@tonic-gate * flush the change log. 54620Sstevel@tonic-gate */ 54630Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 54640Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, 54650Sstevel@tonic-gate MNSET_IN_RECONFIG, ep)) { 54660Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 54670Sstevel@tonic-gate "Flag update of master node join in set %s failed"), 54680Sstevel@tonic-gate sp->setname); 54690Sstevel@tonic-gate return (-1); 54700Sstevel@tonic-gate } 54710Sstevel@tonic-gate 54720Sstevel@tonic-gate if (!stale_set) { 54730Sstevel@tonic-gate if (mdmn_reset_changelog(sp, ep, 54740Sstevel@tonic-gate MDMN_CLF_RESETLOG) != 0) { 54750Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 54760Sstevel@tonic-gate "Unable to reset changelog.")); 54770Sstevel@tonic-gate return (-1); 54780Sstevel@tonic-gate } 54790Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 54800Sstevel@tonic-gate "Removed changelog entries for set %s: %s"), 54810Sstevel@tonic-gate sp->setname, 54820Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 54830Sstevel@tonic-gate } 54840Sstevel@tonic-gate /* Reset new master flag before return */ 54850Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 54860Sstevel@tonic-gate sf.sf_setno = sp->setno; 54870Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 
54880Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET; 54890Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 54900Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 54910Sstevel@tonic-gate /* Ignore failure, failure to reset flag isn't catastrophic */ 54920Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 54930Sstevel@tonic-gate &sf.sf_mde, NULL); 54940Sstevel@tonic-gate 54950Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 54960Sstevel@tonic-gate "Reset new master flag for set %s: %s"), 54970Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 54980Sstevel@tonic-gate 54990Sstevel@tonic-gate return (0); 55000Sstevel@tonic-gate } 55010Sstevel@tonic-gate 55020Sstevel@tonic-gate /* 55030Sstevel@tonic-gate * Is master already joined to STALE set (< 50% mddbs avail)? 55040Sstevel@tonic-gate * If so, can make no config changes to mddbs so don't check or play 55050Sstevel@tonic-gate * changelog and don't sync master node to ondisk mddbs. 55060Sstevel@tonic-gate * To get out of the stale state all nodes must be withdrawn 55070Sstevel@tonic-gate * from set. Then as nodes are re-joined, all nodes will 55080Sstevel@tonic-gate * have same view of mddbs since all nodes are reading the 55090Sstevel@tonic-gate * mddbs from disk. 
55100Sstevel@tonic-gate */ 55110Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c)); 55120Sstevel@tonic-gate c.c_id = 0; 55130Sstevel@tonic-gate c.c_setno = sp->setno; 55140Sstevel@tonic-gate if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 55150Sstevel@tonic-gate (void) mdstealerror(ep, &c.c_mde); 55160Sstevel@tonic-gate return (-1); 55170Sstevel@tonic-gate } 55180Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) { 55190Sstevel@tonic-gate return (0); 55200Sstevel@tonic-gate } 55210Sstevel@tonic-gate 55220Sstevel@tonic-gate /* 55230Sstevel@tonic-gate * If this node is NOT a newly chosen master, then there's 55240Sstevel@tonic-gate * nothing else to do since the change log should be empty and 55250Sstevel@tonic-gate * the ondisk and incore mddbs are already consistent. 55260Sstevel@tonic-gate * 55270Sstevel@tonic-gate * A newly chosen master is a node that was not the master 55280Sstevel@tonic-gate * at the beginning of the reconfig cycle. If a node is a new 55290Sstevel@tonic-gate * master, then the new master state is reset after the ondisk 55300Sstevel@tonic-gate * and incore mddbs are consistent and the change log has 55310Sstevel@tonic-gate * been replayed. 55320Sstevel@tonic-gate */ 55330Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 55340Sstevel@tonic-gate sf.sf_setno = sp->setno; 55350Sstevel@tonic-gate sf.sf_flags = MDDB_NM_GET; 55360Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 55370Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 55380Sstevel@tonic-gate if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && 55390Sstevel@tonic-gate ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { 55400Sstevel@tonic-gate return (0); 55410Sstevel@tonic-gate } 55420Sstevel@tonic-gate 55430Sstevel@tonic-gate /* 55440Sstevel@tonic-gate * Now, sync up incore master view to ondisk mddbs. 
55450Sstevel@tonic-gate * This is needed in the case where a master node 55460Sstevel@tonic-gate * had made a change to the mddb, but this change 55470Sstevel@tonic-gate * may not have been relayed to the slaves yet. 55480Sstevel@tonic-gate * So, the new master needs to verify that the ondisk 55490Sstevel@tonic-gate * mddbs match what the new master has incore - 55500Sstevel@tonic-gate * if different, new master rewrites all of the mddbs. 55510Sstevel@tonic-gate * Then the new master will replay the changelog and the 55520Sstevel@tonic-gate * new master will then execute what the old master had 55530Sstevel@tonic-gate * done. 55540Sstevel@tonic-gate * 55550Sstevel@tonic-gate * Block all I/Os to disks in this diskset on all nodes in 55560Sstevel@tonic-gate * the diskset. This will allow the rewriting of the mddbs 55570Sstevel@tonic-gate * (if needed), to proceed in a timely manner. 55580Sstevel@tonic-gate * 55590Sstevel@tonic-gate * If block of I/Os fail, return a -1. 55600Sstevel@tonic-gate */ 55610Sstevel@tonic-gate 55620Sstevel@tonic-gate nd = sd->sd_nodelist; 55630Sstevel@tonic-gate while (nd) { 55640Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 55650Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 55660Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 55670Sstevel@tonic-gate nd = nd->nd_next; 55680Sstevel@tonic-gate continue; 55690Sstevel@tonic-gate } 55700Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 55710Sstevel@tonic-gate MN_SUSP_IO, ep)) { 55720Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 55730Sstevel@tonic-gate "Unable to suspend I/O on node %s in set %s"), 55740Sstevel@tonic-gate nd->nd_nodename, sp->setname); 55750Sstevel@tonic-gate 55760Sstevel@tonic-gate /* 55770Sstevel@tonic-gate * Resume all other nodes that had been suspended. 55780Sstevel@tonic-gate * (Reconfig return step also resumes I/Os 55790Sstevel@tonic-gate * for all sets.) 
55800Sstevel@tonic-gate */ 55810Sstevel@tonic-gate nd2 = sd->sd_nodelist; 55820Sstevel@tonic-gate while (nd2) { 55830Sstevel@tonic-gate /* Stop when reaching failed node */ 55840Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid) 55850Sstevel@tonic-gate break; 55860Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 55870Sstevel@tonic-gate if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 55880Sstevel@tonic-gate (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 55890Sstevel@tonic-gate nd2 = nd2->nd_next; 55900Sstevel@tonic-gate continue; 55910Sstevel@tonic-gate } 55920Sstevel@tonic-gate (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 55930Sstevel@tonic-gate sp->setno, MN_RES_IO, &xep)); 55940Sstevel@tonic-gate nd2 = nd2->nd_next; 55950Sstevel@tonic-gate } 55960Sstevel@tonic-gate 55970Sstevel@tonic-gate /* 55980Sstevel@tonic-gate * If an RPC failure on another node, return a 205. 55990Sstevel@tonic-gate * Otherwise, exit with failure. 56000Sstevel@tonic-gate */ 56010Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 56020Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 56030Sstevel@tonic-gate nd->nd_nodeid)) { 56040Sstevel@tonic-gate return (205); 56050Sstevel@tonic-gate } else { 56060Sstevel@tonic-gate return (-1); 56070Sstevel@tonic-gate } 56080Sstevel@tonic-gate 56090Sstevel@tonic-gate } 56100Sstevel@tonic-gate nd = nd->nd_next; 56110Sstevel@tonic-gate } 56120Sstevel@tonic-gate 56130Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c)); 56140Sstevel@tonic-gate c.c_id = 0; 56150Sstevel@tonic-gate c.c_setno = sp->setno; 56160Sstevel@tonic-gate /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ 56170Sstevel@tonic-gate if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) 56180Sstevel@tonic-gate return (-1); 56190Sstevel@tonic-gate 56200Sstevel@tonic-gate /* 56210Sstevel@tonic-gate * Resume I/Os that were suspended above. 
56220Sstevel@tonic-gate */ 56230Sstevel@tonic-gate nd = sd->sd_nodelist; 56240Sstevel@tonic-gate while (nd) { 56250Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 56260Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 56270Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 56280Sstevel@tonic-gate nd = nd->nd_next; 56290Sstevel@tonic-gate continue; 56300Sstevel@tonic-gate } 56310Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 56320Sstevel@tonic-gate MN_RES_IO, ep)) { 56330Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 56340Sstevel@tonic-gate "Unable to resume I/O on node %s in set %s"), 56350Sstevel@tonic-gate nd->nd_nodename, sp->setname); 56360Sstevel@tonic-gate 56370Sstevel@tonic-gate /* 56380Sstevel@tonic-gate * If an RPC failure then don't do any 56390Sstevel@tonic-gate * more RPC calls, since one timeout is enough 56400Sstevel@tonic-gate * to endure. If RPC failure to another node, return 56410Sstevel@tonic-gate * 205. If RPC failure to my node, return -1. 56420Sstevel@tonic-gate * If not an RPC failure, continue resuming the 56430Sstevel@tonic-gate * rest of the nodes and then return -1. 56440Sstevel@tonic-gate */ 56450Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 56460Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == 56470Sstevel@tonic-gate nd->nd_nodeid) { 56480Sstevel@tonic-gate return (-1); 56490Sstevel@tonic-gate } else { 56500Sstevel@tonic-gate return (205); 56510Sstevel@tonic-gate } 56520Sstevel@tonic-gate } 56530Sstevel@tonic-gate 56540Sstevel@tonic-gate /* 56550Sstevel@tonic-gate * If not an RPC error, continue resuming rest of 56560Sstevel@tonic-gate * nodes, ignoring any failures except for an 56570Sstevel@tonic-gate * RPC failure which constitutes an immediate exit. 56580Sstevel@tonic-gate * Start in middle of list with failing node. 
56590Sstevel@tonic-gate */ 56600Sstevel@tonic-gate nd2 = nd->nd_next; 56610Sstevel@tonic-gate while (nd2) { 56620Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 56630Sstevel@tonic-gate if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 56640Sstevel@tonic-gate (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 56650Sstevel@tonic-gate nd2 = nd2->nd_next; 56660Sstevel@tonic-gate continue; 56670Sstevel@tonic-gate } 56680Sstevel@tonic-gate (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 56690Sstevel@tonic-gate sp->setno, MN_RES_IO, &xep)); 56700Sstevel@tonic-gate if (mdanyrpcerror(&xep)) { 56710Sstevel@tonic-gate return (-1); 56720Sstevel@tonic-gate } 56730Sstevel@tonic-gate nd2 = nd2->nd_next; 56740Sstevel@tonic-gate } 56750Sstevel@tonic-gate } 56760Sstevel@tonic-gate nd = nd->nd_next; 56770Sstevel@tonic-gate } 56780Sstevel@tonic-gate 56790Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " 56800Sstevel@tonic-gate "checking/writing the mddb for set %s: %s"), sp->setname, 56810Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time)); 56820Sstevel@tonic-gate 56830Sstevel@tonic-gate /* 56840Sstevel@tonic-gate * Send (aka replay) all messages we find in the changelog. 56850Sstevel@tonic-gate * Flag the messages with 56860Sstevel@tonic-gate * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them 56870Sstevel@tonic-gate * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 
56880Sstevel@tonic-gate */ 56890Sstevel@tonic-gate for (class = MD_MN_NCLASSES - 1; class > 0; class--) { 56900Sstevel@tonic-gate mdmn_changelog_record_t *lr; 56910Sstevel@tonic-gate md_error_t xep = mdnullerror; 56920Sstevel@tonic-gate md_mn_result_t *resultp = NULL; 56930Sstevel@tonic-gate int ret; 56940Sstevel@tonic-gate 56950Sstevel@tonic-gate lr = mdmn_get_changelogrec(sp->setno, class); 56960Sstevel@tonic-gate if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { 56970Sstevel@tonic-gate /* no entry for this class */ 56980Sstevel@tonic-gate continue; 56990Sstevel@tonic-gate } 57000Sstevel@tonic-gate 57010Sstevel@tonic-gate meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 57020Sstevel@tonic-gate "replaying message ID=(%d, 0x%llx-%d)\n"), 57030Sstevel@tonic-gate MSGID_ELEMS(lr->lr_msg.msg_msgid)); 57040Sstevel@tonic-gate 57050Sstevel@tonic-gate ret = mdmn_send_message_with_msgid( 57060Sstevel@tonic-gate lr->lr_msg.msg_setno, 57070Sstevel@tonic-gate lr->lr_msg.msg_type, 57080Sstevel@tonic-gate lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | 57090Sstevel@tonic-gate MD_MSGF_OVERRIDE_SUSPEND, 57100Sstevel@tonic-gate lr->lr_msg.msg_event_data, 57110Sstevel@tonic-gate lr->lr_msg.msg_event_size, 57120Sstevel@tonic-gate &resultp, 57130Sstevel@tonic-gate &lr->lr_msg.msg_msgid, 57140Sstevel@tonic-gate &xep); 57150Sstevel@tonic-gate 57160Sstevel@tonic-gate meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 57170Sstevel@tonic-gate "mdmn_send_message returned %d\n"), ret); 57180Sstevel@tonic-gate 57190Sstevel@tonic-gate if (resultp) 57200Sstevel@tonic-gate free_result(resultp); 57210Sstevel@tonic-gate } 57220Sstevel@tonic-gate 57230Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 57240Sstevel@tonic-gate "Playing changelog completed for set %s: %s"), 57250Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 57260Sstevel@tonic-gate 57270Sstevel@tonic-gate /* 57280Sstevel@tonic-gate * Now that new master has ondisk and incore mddbs in sync, reset 
57290Sstevel@tonic-gate * this node's new master kernel flag (for this set). If this node 57300Sstevel@tonic-gate * re-enters another reconfig cycle before the completion of this 57310Sstevel@tonic-gate * reconfig cycle, this master node won't need to check if the ondisk 57320Sstevel@tonic-gate * and incore mddbs are in sync since this node won't be considered 57330Sstevel@tonic-gate * a new master (since this flag is being reset here in the middle of 57340Sstevel@tonic-gate * step2). This will save time during any subsequent reconfig 57350Sstevel@tonic-gate * cycles as long as this node continues to be master. 57360Sstevel@tonic-gate */ 57370Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf)); 57380Sstevel@tonic-gate sf.sf_setno = sp->setno; 57390Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 57400Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET; 57410Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */ 57420Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC; 57430Sstevel@tonic-gate /* Ignore failure, since failure to reset flag isn't catastrophic */ 57440Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); 57450Sstevel@tonic-gate 57460Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 57470Sstevel@tonic-gate "Reset new master flag for set %s: %s"), 57480Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 57490Sstevel@tonic-gate 57500Sstevel@tonic-gate return (0); 57510Sstevel@tonic-gate } 57520Sstevel@tonic-gate 57530Sstevel@tonic-gate /* 57540Sstevel@tonic-gate * meta_mnjoin_all will join all starting nodes in the diskset. 57550Sstevel@tonic-gate * A starting node is considered to be any node that is not 57560Sstevel@tonic-gate * an owner of the set but is a member of the cluster. 57570Sstevel@tonic-gate * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). 57580Sstevel@tonic-gate * 57590Sstevel@tonic-gate * Caller is the Master node. 
57600Sstevel@tonic-gate * 57610Sstevel@tonic-gate * Returns 0 - Success 57620Sstevel@tonic-gate * 205 - Failure during RPC to another node 57630Sstevel@tonic-gate * -1 - Any other failure and ep is filled in. 57640Sstevel@tonic-gate */ 57650Sstevel@tonic-gate int 57660Sstevel@tonic-gate meta_mnjoin_all( 57670Sstevel@tonic-gate mdsetname_t *sp, 57680Sstevel@tonic-gate md_error_t *ep 57690Sstevel@tonic-gate ) 57700Sstevel@tonic-gate { 57710Sstevel@tonic-gate md_set_desc *sd; 57720Sstevel@tonic-gate md_mnnode_desc *nd, *nd2; 57730Sstevel@tonic-gate int rval = 0; 57740Sstevel@tonic-gate int stale_flag = 0; 57750Sstevel@tonic-gate mddb_config_t c; 57760Sstevel@tonic-gate int susp_res_flag = 0; 57770Sstevel@tonic-gate md_error_t xep = mdnullerror; 57780Sstevel@tonic-gate 57790Sstevel@tonic-gate /* If setname is there, set desc should exist. */ 57800Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) { 57810Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 57820Sstevel@tonic-gate "Unable to get set %s desc information"), sp->setname); 57830Sstevel@tonic-gate return (-1); 57840Sstevel@tonic-gate } 57850Sstevel@tonic-gate 57860Sstevel@tonic-gate /* Are there drives in the set? */ 57870Sstevel@tonic-gate if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 57880Sstevel@tonic-gate ep) == NULL) { 57890Sstevel@tonic-gate if (! mdisok(ep)) { 57900Sstevel@tonic-gate return (-1); 57910Sstevel@tonic-gate } 57920Sstevel@tonic-gate /* No drives in set -- nothing to join */ 57930Sstevel@tonic-gate return (0); 57940Sstevel@tonic-gate } 57950Sstevel@tonic-gate 57960Sstevel@tonic-gate /* 57970Sstevel@tonic-gate * Is set currently stale? 
57980Sstevel@tonic-gate */ 57990Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c)); 58000Sstevel@tonic-gate c.c_id = 0; 58010Sstevel@tonic-gate c.c_setno = sp->setno; 58020Sstevel@tonic-gate /* Ignore failure since master node may not be joined yet */ 58030Sstevel@tonic-gate (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 58040Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) { 58050Sstevel@tonic-gate stale_flag = MNSET_IS_STALE; 58060Sstevel@tonic-gate } 58070Sstevel@tonic-gate 58080Sstevel@tonic-gate /* 58090Sstevel@tonic-gate * If any nodes are going to be joined to diskset, then 58100Sstevel@tonic-gate * suspend I/O to all disks in diskset so that nodes can join 58110Sstevel@tonic-gate * (read in mddbs) in a reasonable amount of time even under 58120Sstevel@tonic-gate * high I/O load. Don't need to do this if set is STALE since 58130Sstevel@tonic-gate * no I/O can be occurring to a STALE set. 58140Sstevel@tonic-gate */ 58150Sstevel@tonic-gate if (stale_flag != MNSET_IS_STALE) { 58160Sstevel@tonic-gate nd = sd->sd_nodelist; 58170Sstevel@tonic-gate while (nd) { 58180Sstevel@tonic-gate /* Found a node that will be joined to diskset */ 58190Sstevel@tonic-gate if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 58200Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 58210Sstevel@tonic-gate /* Set flag that diskset should be suspended */ 58220Sstevel@tonic-gate susp_res_flag = 1; 58230Sstevel@tonic-gate break; 58240Sstevel@tonic-gate } 58250Sstevel@tonic-gate nd = nd->nd_next; 58260Sstevel@tonic-gate } 58270Sstevel@tonic-gate } 58280Sstevel@tonic-gate 58290Sstevel@tonic-gate if (susp_res_flag) { 58300Sstevel@tonic-gate /* 58310Sstevel@tonic-gate * Block all I/Os to disks in this diskset on all joined 58320Sstevel@tonic-gate * nodes in the diskset. 58330Sstevel@tonic-gate * If block of I/Os fails due to an RPC failure on another 58340Sstevel@tonic-gate * node, return 205; otherwise, return -1. 
58350Sstevel@tonic-gate */ 58360Sstevel@tonic-gate nd = sd->sd_nodelist; 58370Sstevel@tonic-gate while (nd) { 58380Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */ 58390Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 58400Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 58410Sstevel@tonic-gate nd = nd->nd_next; 58420Sstevel@tonic-gate continue; 58430Sstevel@tonic-gate } 58440Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 58450Sstevel@tonic-gate MN_SUSP_IO, ep)) { 58460Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 58470Sstevel@tonic-gate "Unable to suspend I/O on node %s" 58480Sstevel@tonic-gate " in set %s"), nd->nd_nodename, 58490Sstevel@tonic-gate sp->setname); 58500Sstevel@tonic-gate /* 58510Sstevel@tonic-gate * Resume other nodes that had been suspended. 58520Sstevel@tonic-gate * (Reconfig return step also resumes I/Os 58530Sstevel@tonic-gate * for all sets.) 58540Sstevel@tonic-gate */ 58550Sstevel@tonic-gate nd2 = sd->sd_nodelist; 58560Sstevel@tonic-gate while (nd2) { 58570Sstevel@tonic-gate /* Stop when reaching failed node */ 58580Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid) 58590Sstevel@tonic-gate break; 58600Sstevel@tonic-gate /* Skip non-alive/non-owner nodes */ 58610Sstevel@tonic-gate if ((!(nd2->nd_flags & 58620Sstevel@tonic-gate MD_MN_NODE_ALIVE)) || 58630Sstevel@tonic-gate (!(nd2->nd_flags & 58640Sstevel@tonic-gate MD_MN_NODE_OWN))) { 58650Sstevel@tonic-gate nd2 = nd2->nd_next; 58660Sstevel@tonic-gate continue; 58670Sstevel@tonic-gate } 58680Sstevel@tonic-gate (void) (clnt_mn_susp_res_io( 58690Sstevel@tonic-gate nd2->nd_nodename, sp->setno, 58700Sstevel@tonic-gate MN_RES_IO, &xep)); 58710Sstevel@tonic-gate nd2 = nd2->nd_next; 58720Sstevel@tonic-gate } 58730Sstevel@tonic-gate 58740Sstevel@tonic-gate /* 58750Sstevel@tonic-gate * If the suspend failed due to an 58760Sstevel@tonic-gate * RPC failure on another node, return 58770Sstevel@tonic-gate * a 205. 
58780Sstevel@tonic-gate * Otherwise, exit with failure. 58790Sstevel@tonic-gate * The return reconfig step will resume 58800Sstevel@tonic-gate * I/Os for all disksets. 58810Sstevel@tonic-gate */ 58820Sstevel@tonic-gate if ((mdanyrpcerror(ep)) && 58830Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != 58840Sstevel@tonic-gate nd->nd_nodeid)) { 58850Sstevel@tonic-gate return (205); 58860Sstevel@tonic-gate } else { 58870Sstevel@tonic-gate return (-1); 58880Sstevel@tonic-gate } 58890Sstevel@tonic-gate } 58900Sstevel@tonic-gate nd = nd->nd_next; 58910Sstevel@tonic-gate } 58920Sstevel@tonic-gate } 58930Sstevel@tonic-gate 58940Sstevel@tonic-gate nd = sd->sd_nodelist; 58950Sstevel@tonic-gate while (nd) { 58960Sstevel@tonic-gate /* 58970Sstevel@tonic-gate * If a node is in the membership list but isn't joined 58980Sstevel@tonic-gate * to the set, try to join the node. 58990Sstevel@tonic-gate */ 59000Sstevel@tonic-gate if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 59010Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 59020Sstevel@tonic-gate if (clnt_joinset(nd->nd_nodename, sp, 59030Sstevel@tonic-gate (MNSET_IN_RECONFIG | stale_flag), ep)) { 59040Sstevel@tonic-gate /* 59050Sstevel@tonic-gate * If RPC failure to another node 59060Sstevel@tonic-gate * then exit without attempting anything else. 59070Sstevel@tonic-gate * (Reconfig return step will resume I/Os 59080Sstevel@tonic-gate * for all sets.) 59090Sstevel@tonic-gate */ 59100Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 59110Sstevel@tonic-gate mde_perror(ep, ""); 59120Sstevel@tonic-gate return (205); 59130Sstevel@tonic-gate } 59140Sstevel@tonic-gate /* 59150Sstevel@tonic-gate * STALE and ACCOK failures aren't true 59160Sstevel@tonic-gate * failures. STALE means that <50% mddbs 59170Sstevel@tonic-gate * are available. ACCOK means that the 59180Sstevel@tonic-gate * mediator provided the extra vote. 
59190Sstevel@tonic-gate * If a true failure, then print messasge 59200Sstevel@tonic-gate * and withdraw node from set in order to 59210Sstevel@tonic-gate * cleanup from failed join attempt. 59220Sstevel@tonic-gate */ 59230Sstevel@tonic-gate if ((!mdismddberror(ep, MDE_DB_STALE)) && 59240Sstevel@tonic-gate (!mdismddberror(ep, MDE_DB_ACCOK))) { 59250Sstevel@tonic-gate mde_perror(ep, 59260Sstevel@tonic-gate "WARNING: Unable to join node %s " 59270Sstevel@tonic-gate "to set %s", nd->nd_nodename, 59280Sstevel@tonic-gate sp->setname); 59290Sstevel@tonic-gate mdclrerror(ep); 59300Sstevel@tonic-gate if (clnt_withdrawset(nd->nd_nodename, 59310Sstevel@tonic-gate sp, &xep)) 59320Sstevel@tonic-gate mdclrerror(&xep); 59330Sstevel@tonic-gate nd = nd->nd_next; 59340Sstevel@tonic-gate continue; 59350Sstevel@tonic-gate } 59360Sstevel@tonic-gate } 59370Sstevel@tonic-gate /* Set owner flag even if STALE or ACCOK */ 59380Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN; 59390Sstevel@tonic-gate } 59400Sstevel@tonic-gate nd = nd->nd_next; 59410Sstevel@tonic-gate } 59420Sstevel@tonic-gate /* 59430Sstevel@tonic-gate * Resume I/Os if suspended above. 59440Sstevel@tonic-gate */ 59450Sstevel@tonic-gate if (susp_res_flag) { 59460Sstevel@tonic-gate nd = sd->sd_nodelist; 59470Sstevel@tonic-gate while (nd) { 59480Sstevel@tonic-gate /* 59490Sstevel@tonic-gate * Skip non-alive and non-owner nodes 59500Sstevel@tonic-gate * (this list doesn't include any of 59510Sstevel@tonic-gate * the nodes that were joined). 
59520Sstevel@tonic-gate */ 59530Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 59540Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) { 59550Sstevel@tonic-gate nd = nd->nd_next; 59560Sstevel@tonic-gate continue; 59570Sstevel@tonic-gate } 59580Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 59590Sstevel@tonic-gate MN_RES_IO, ep)) { 59600Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN, 59610Sstevel@tonic-gate "Unable to resume I/O on node %s" 59620Sstevel@tonic-gate " in set %s"), nd->nd_nodename, 59630Sstevel@tonic-gate sp->setname); 59640Sstevel@tonic-gate 59650Sstevel@tonic-gate /* 59660Sstevel@tonic-gate * If an RPC failure then don't do any 59670Sstevel@tonic-gate * more RPC calls, since one timeout is enough 59680Sstevel@tonic-gate * to endure. If RPC failure to another node, 59690Sstevel@tonic-gate * return 205. If RPC failure to my node, 59700Sstevel@tonic-gate * return -1. 59710Sstevel@tonic-gate * (Reconfig return step will resume I/Os 59720Sstevel@tonic-gate * for all sets.) 59730Sstevel@tonic-gate * If not an RPC failure, continue resuming the 59740Sstevel@tonic-gate * rest of the nodes and then return -1. 59750Sstevel@tonic-gate */ 59760Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 59770Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == 59780Sstevel@tonic-gate nd->nd_nodeid) { 59790Sstevel@tonic-gate return (-1); 59800Sstevel@tonic-gate } else { 59810Sstevel@tonic-gate return (205); 59820Sstevel@tonic-gate } 59830Sstevel@tonic-gate } 59840Sstevel@tonic-gate 59850Sstevel@tonic-gate /* 59860Sstevel@tonic-gate * If not an RPC error, continue resuming rest 59870Sstevel@tonic-gate * of nodes, ignoring any failures except for 59880Sstevel@tonic-gate * an RPC failure which constitutes an 59890Sstevel@tonic-gate * immediate exit. 59900Sstevel@tonic-gate * Start in middle of list with failing node. 
59910Sstevel@tonic-gate */ 59920Sstevel@tonic-gate nd2 = nd->nd_next; 59930Sstevel@tonic-gate while (nd2) { 59940Sstevel@tonic-gate /* Skip non-owner nodes */ 59950Sstevel@tonic-gate if ((!(nd2->nd_flags & 59960Sstevel@tonic-gate MD_MN_NODE_ALIVE)) || 59970Sstevel@tonic-gate (!(nd2->nd_flags & 59980Sstevel@tonic-gate MD_MN_NODE_OWN))) { 59990Sstevel@tonic-gate nd2 = nd2->nd_next; 60000Sstevel@tonic-gate continue; 60010Sstevel@tonic-gate } 60020Sstevel@tonic-gate (void) (clnt_mn_susp_res_io( 60030Sstevel@tonic-gate nd2->nd_nodename, sp->setno, 60040Sstevel@tonic-gate MN_RES_IO, &xep)); 60050Sstevel@tonic-gate if (mdanyrpcerror(&xep)) { 60060Sstevel@tonic-gate return (-1); 60070Sstevel@tonic-gate } 60080Sstevel@tonic-gate nd2 = nd2->nd_next; 60090Sstevel@tonic-gate } 60100Sstevel@tonic-gate } 60110Sstevel@tonic-gate nd = nd->nd_next; 60120Sstevel@tonic-gate } 60130Sstevel@tonic-gate } 60140Sstevel@tonic-gate 60150Sstevel@tonic-gate nd = sd->sd_nodelist; 60160Sstevel@tonic-gate while (nd) { 60170Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 60180Sstevel@tonic-gate nd = nd->nd_next; 60190Sstevel@tonic-gate continue; 60200Sstevel@tonic-gate } 60210Sstevel@tonic-gate /* 60220Sstevel@tonic-gate * If 1 node fails - go ahead and update the rest except 60230Sstevel@tonic-gate * in the case of an RPC failure, fail immediately. 
60240Sstevel@tonic-gate */ 60250Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, 60260Sstevel@tonic-gate sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 60270Sstevel@tonic-gate /* RPC failure to another node */ 60280Sstevel@tonic-gate if (mdanyrpcerror(ep)) { 60290Sstevel@tonic-gate return (205); 60300Sstevel@tonic-gate } 60310Sstevel@tonic-gate nd = nd->nd_next; 60320Sstevel@tonic-gate rval = -1; 60330Sstevel@tonic-gate continue; 60340Sstevel@tonic-gate } 60350Sstevel@tonic-gate nd = nd->nd_next; 60360Sstevel@tonic-gate } 60370Sstevel@tonic-gate 60380Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 60390Sstevel@tonic-gate "Join of all nodes completed for set %s: %s"), 60400Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time)); 60410Sstevel@tonic-gate 60420Sstevel@tonic-gate return (rval); 60430Sstevel@tonic-gate } 6044