/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
250Sstevel@tonic-gate
/*
 * Just in case we're not in a build environment, make sure that
 * TEXT_DOMAIN gets set to something.
 */
300Sstevel@tonic-gate #if !defined(TEXT_DOMAIN)
310Sstevel@tonic-gate #define TEXT_DOMAIN "SYS_TEST"
320Sstevel@tonic-gate #endif
330Sstevel@tonic-gate
/*
 * Metadevice diskset interfaces
 */
370Sstevel@tonic-gate
380Sstevel@tonic-gate #include "meta_set_prv.h"
390Sstevel@tonic-gate #include <meta.h>
400Sstevel@tonic-gate #include <metad.h>
410Sstevel@tonic-gate #include <mdmn_changelog.h>
420Sstevel@tonic-gate #include <sys/lvm/md_crc.h>
430Sstevel@tonic-gate #include <sys/utsname.h>
440Sstevel@tonic-gate #include <sdssc.h>
450Sstevel@tonic-gate
460Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h>
470Sstevel@tonic-gate #include <sys/sysevent/svm.h>
480Sstevel@tonic-gate extern char *blkname(char *);
490Sstevel@tonic-gate
500Sstevel@tonic-gate static md_drive_desc *
dr2drivedesc(mdsetname_t * sp,side_t sideno,int flags,md_error_t * ep)510Sstevel@tonic-gate dr2drivedesc(
520Sstevel@tonic-gate mdsetname_t *sp,
530Sstevel@tonic-gate side_t sideno,
540Sstevel@tonic-gate int flags,
550Sstevel@tonic-gate md_error_t *ep
560Sstevel@tonic-gate )
570Sstevel@tonic-gate {
580Sstevel@tonic-gate md_set_record *sr;
590Sstevel@tonic-gate md_drive_record *dr;
600Sstevel@tonic-gate mddrivename_t *dnp;
610Sstevel@tonic-gate md_drive_desc *dd_head = NULL;
620Sstevel@tonic-gate md_set_desc *sd;
630Sstevel@tonic-gate
640Sstevel@tonic-gate if (flags & MD_BYPASS_DAEMON) {
650Sstevel@tonic-gate if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
660Sstevel@tonic-gate return (NULL);
670Sstevel@tonic-gate sd = metaget_setdesc(sp, ep);
680Sstevel@tonic-gate sideno = getnodeside(mynode(), sd);
690Sstevel@tonic-gate sp = metafakesetname(sp->setno, sr->sr_setname);
700Sstevel@tonic-gate } else {
710Sstevel@tonic-gate if ((sr = getsetbyname(sp->setname, ep)) == NULL)
720Sstevel@tonic-gate return (NULL);
730Sstevel@tonic-gate }
740Sstevel@tonic-gate
750Sstevel@tonic-gate assert(sideno != MD_SIDEWILD);
760Sstevel@tonic-gate
770Sstevel@tonic-gate /*
780Sstevel@tonic-gate * WARNING:
790Sstevel@tonic-gate * The act of getting the dnp from the namespace means that we
800Sstevel@tonic-gate * will get the devid of the disk as recorded in the namespace.
810Sstevel@tonic-gate * This devid has the potential to be stale if the disk is being
820Sstevel@tonic-gate * replaced via a rebind, this means that any code that relies
830Sstevel@tonic-gate * on any of the dnp information should take the appropriate action
840Sstevel@tonic-gate * to preserve that information. For example in the rebind code the
850Sstevel@tonic-gate * devid of the new disk is saved off and then copied back in once
860Sstevel@tonic-gate * the code that has called this function has completed.
870Sstevel@tonic-gate */
880Sstevel@tonic-gate for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
890Sstevel@tonic-gate if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
900Sstevel@tonic-gate flags, ep)) == NULL) {
910Sstevel@tonic-gate if (!(flags & MD_BYPASS_DAEMON))
920Sstevel@tonic-gate free_sr(sr);
930Sstevel@tonic-gate metafreedrivedesc(&dd_head);
940Sstevel@tonic-gate return (NULL);
950Sstevel@tonic-gate }
960Sstevel@tonic-gate
970Sstevel@tonic-gate (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
980Sstevel@tonic-gate dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
990Sstevel@tonic-gate }
1000Sstevel@tonic-gate
1010Sstevel@tonic-gate if (!(flags & MD_BYPASS_DAEMON)) {
1020Sstevel@tonic-gate free_sr(sr);
1030Sstevel@tonic-gate }
1040Sstevel@tonic-gate return (dd_head);
1050Sstevel@tonic-gate }
1060Sstevel@tonic-gate
1070Sstevel@tonic-gate static int
get_sidenmlist(mdsetname_t * sp,mddrivename_t * dnp,md_error_t * ep)1080Sstevel@tonic-gate get_sidenmlist(
1090Sstevel@tonic-gate mdsetname_t *sp,
1100Sstevel@tonic-gate mddrivename_t *dnp,
1110Sstevel@tonic-gate md_error_t *ep
1120Sstevel@tonic-gate )
1130Sstevel@tonic-gate {
1140Sstevel@tonic-gate md_set_desc *sd;
1150Sstevel@tonic-gate mdsidenames_t *sn, **sn_next;
1160Sstevel@tonic-gate int i;
1170Sstevel@tonic-gate
1180Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
1190Sstevel@tonic-gate return (-1);
1200Sstevel@tonic-gate
1210Sstevel@tonic-gate metaflushsidenames(dnp);
1220Sstevel@tonic-gate sn_next = &dnp->side_names;
1230Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
1240Sstevel@tonic-gate /*
1250Sstevel@tonic-gate * Only get sidenames for this node since
1260Sstevel@tonic-gate * that is the only side information stored in
1270Sstevel@tonic-gate * the local mddb for a multi-node diskset.
1280Sstevel@tonic-gate */
1290Sstevel@tonic-gate if (sd->sd_mn_mynode) {
1300Sstevel@tonic-gate sn = Zalloc(sizeof (*sn));
1310Sstevel@tonic-gate sn->sideno = sd->sd_mn_mynode->nd_nodeid;
1320Sstevel@tonic-gate if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
1330Sstevel@tonic-gate sn->sideno, dnp->side_names_key, &sn->dname,
1340Sstevel@tonic-gate &sn->mnum, NULL, ep)) == NULL) {
1350Sstevel@tonic-gate if (sn->dname != NULL)
1360Sstevel@tonic-gate Free(sn->dname);
1370Sstevel@tonic-gate Free(sn);
1380Sstevel@tonic-gate return (-1);
1390Sstevel@tonic-gate }
1400Sstevel@tonic-gate
1410Sstevel@tonic-gate /* Add to the end of the linked list */
1420Sstevel@tonic-gate assert(*sn_next == NULL);
1430Sstevel@tonic-gate *sn_next = sn;
1440Sstevel@tonic-gate sn_next = &sn->next;
1450Sstevel@tonic-gate }
1460Sstevel@tonic-gate } else {
1470Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
1480Sstevel@tonic-gate /* Skip empty slots */
1490Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
1500Sstevel@tonic-gate continue;
1510Sstevel@tonic-gate
1520Sstevel@tonic-gate sn = Zalloc(sizeof (*sn));
1530Sstevel@tonic-gate sn->sideno = i;
1540Sstevel@tonic-gate if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
1550Sstevel@tonic-gate i+SKEW, dnp->side_names_key, &sn->dname,
1560Sstevel@tonic-gate &sn->mnum, NULL, ep)) == NULL) {
1570Sstevel@tonic-gate /*
1580Sstevel@tonic-gate * It is possible that during the add of a
1590Sstevel@tonic-gate * host to have a 'missing' side as the side
1600Sstevel@tonic-gate * for this disk will be added later. So ignore
1610Sstevel@tonic-gate * the error. The 'missing' side will be added
1620Sstevel@tonic-gate * once the addhosts process has completed.
1630Sstevel@tonic-gate */
1640Sstevel@tonic-gate if (mdissyserror(ep, ENOENT)) {
1650Sstevel@tonic-gate mdclrerror(ep);
1660Sstevel@tonic-gate Free(sn);
1670Sstevel@tonic-gate continue;
1680Sstevel@tonic-gate }
1690Sstevel@tonic-gate
1700Sstevel@tonic-gate if (sn->dname != NULL)
1710Sstevel@tonic-gate Free(sn->dname);
1720Sstevel@tonic-gate Free(sn);
1730Sstevel@tonic-gate return (-1);
1740Sstevel@tonic-gate }
1750Sstevel@tonic-gate
1760Sstevel@tonic-gate /* Add to the end of the linked list */
1770Sstevel@tonic-gate assert(*sn_next == NULL);
1780Sstevel@tonic-gate *sn_next = sn;
1790Sstevel@tonic-gate sn_next = &sn->next;
1800Sstevel@tonic-gate }
1810Sstevel@tonic-gate }
1820Sstevel@tonic-gate
1830Sstevel@tonic-gate return (0);
1840Sstevel@tonic-gate }
1850Sstevel@tonic-gate
1860Sstevel@tonic-gate static md_drive_desc *
rl_to_dd(mdsetname_t * sp,md_replicalist_t * rlp,md_error_t * ep)1870Sstevel@tonic-gate rl_to_dd(
1880Sstevel@tonic-gate mdsetname_t *sp,
1890Sstevel@tonic-gate md_replicalist_t *rlp,
1900Sstevel@tonic-gate md_error_t *ep
1910Sstevel@tonic-gate )
1920Sstevel@tonic-gate {
1930Sstevel@tonic-gate md_replicalist_t *rl;
1940Sstevel@tonic-gate md_replica_t *r;
1950Sstevel@tonic-gate md_drive_desc *dd = NULL;
1960Sstevel@tonic-gate md_drive_desc *d;
1970Sstevel@tonic-gate int found;
1980Sstevel@tonic-gate md_set_desc *sd;
1990Sstevel@tonic-gate daddr_t nblks = 0;
2000Sstevel@tonic-gate
2010Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
2020Sstevel@tonic-gate return (NULL);
2030Sstevel@tonic-gate
2040Sstevel@tonic-gate /* find the smallest existing replica */
2050Sstevel@tonic-gate for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2060Sstevel@tonic-gate r = rl->rl_repp;
2070Sstevel@tonic-gate nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2080Sstevel@tonic-gate }
2090Sstevel@tonic-gate
2100Sstevel@tonic-gate if (nblks <= 0)
2110Sstevel@tonic-gate nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
2120Sstevel@tonic-gate
2130Sstevel@tonic-gate for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2140Sstevel@tonic-gate r = rl->rl_repp;
2150Sstevel@tonic-gate
2160Sstevel@tonic-gate found = 0;
2170Sstevel@tonic-gate for (d = dd; d != NULL; d = d->dd_next) {
2180Sstevel@tonic-gate if (strcmp(r->r_namep->drivenamep->cname,
2190Sstevel@tonic-gate d->dd_dnp->cname) == 0) {
2200Sstevel@tonic-gate found = 1;
2210Sstevel@tonic-gate dd->dd_dbcnt++;
2220Sstevel@tonic-gate break;
2230Sstevel@tonic-gate }
2240Sstevel@tonic-gate }
2250Sstevel@tonic-gate
2260Sstevel@tonic-gate if (! found)
2270Sstevel@tonic-gate (void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
2280Sstevel@tonic-gate 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
2290Sstevel@tonic-gate }
2300Sstevel@tonic-gate
2310Sstevel@tonic-gate return (dd);
2320Sstevel@tonic-gate }
2330Sstevel@tonic-gate
/*
 * Exported Entry Points
 */
2370Sstevel@tonic-gate
2380Sstevel@tonic-gate set_t
get_max_sets(md_error_t * ep)2390Sstevel@tonic-gate get_max_sets(md_error_t *ep)
2400Sstevel@tonic-gate {
2410Sstevel@tonic-gate
2420Sstevel@tonic-gate static set_t max_sets = 0;
2430Sstevel@tonic-gate
2440Sstevel@tonic-gate if (max_sets == 0)
2450Sstevel@tonic-gate if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
2460Sstevel@tonic-gate return (0);
2470Sstevel@tonic-gate
2480Sstevel@tonic-gate return (max_sets);
2490Sstevel@tonic-gate }
2500Sstevel@tonic-gate
2510Sstevel@tonic-gate int
get_max_meds(md_error_t * ep)2520Sstevel@tonic-gate get_max_meds(md_error_t *ep)
2530Sstevel@tonic-gate {
2540Sstevel@tonic-gate static int max_meds = 0;
2550Sstevel@tonic-gate
2560Sstevel@tonic-gate if (max_meds == 0)
2570Sstevel@tonic-gate if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
2580Sstevel@tonic-gate return (0);
2590Sstevel@tonic-gate
2600Sstevel@tonic-gate return (max_meds);
2610Sstevel@tonic-gate }
2620Sstevel@tonic-gate
2630Sstevel@tonic-gate side_t
getmyside(mdsetname_t * sp,md_error_t * ep)2640Sstevel@tonic-gate getmyside(mdsetname_t *sp, md_error_t *ep)
2650Sstevel@tonic-gate {
2660Sstevel@tonic-gate md_set_desc *sd;
2670Sstevel@tonic-gate char *node = NULL;
2680Sstevel@tonic-gate side_t sideno;
2690Sstevel@tonic-gate
2700Sstevel@tonic-gate if (sp->setno == 0)
2710Sstevel@tonic-gate return (0);
2720Sstevel@tonic-gate
2730Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
2740Sstevel@tonic-gate return (MD_SIDEWILD);
2750Sstevel@tonic-gate
2760Sstevel@tonic-gate node = mynode();
2770Sstevel@tonic-gate
2780Sstevel@tonic-gate assert(node != NULL);
2790Sstevel@tonic-gate
2800Sstevel@tonic-gate sideno = getnodeside(node, sd);
2810Sstevel@tonic-gate
2820Sstevel@tonic-gate if (sideno != MD_SIDEWILD)
2830Sstevel@tonic-gate return (sideno);
2840Sstevel@tonic-gate
2850Sstevel@tonic-gate return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
2860Sstevel@tonic-gate }
2870Sstevel@tonic-gate
2880Sstevel@tonic-gate /*
2890Sstevel@tonic-gate * get set info from name
2900Sstevel@tonic-gate */
2910Sstevel@tonic-gate md_set_record *
getsetbyname(char * setname,md_error_t * ep)2920Sstevel@tonic-gate getsetbyname(char *setname, md_error_t *ep)
2930Sstevel@tonic-gate {
2940Sstevel@tonic-gate md_set_record *sr = NULL;
2950Sstevel@tonic-gate md_mnset_record *mnsr = NULL;
2960Sstevel@tonic-gate char *p;
2970Sstevel@tonic-gate size_t len;
2980Sstevel@tonic-gate
2990Sstevel@tonic-gate /* get set info from daemon */
3000Sstevel@tonic-gate if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
3010Sstevel@tonic-gate return (NULL);
3020Sstevel@tonic-gate if (sr != NULL) {
3030Sstevel@tonic-gate /*
3040Sstevel@tonic-gate * Returned record could be for a multi-node set or a
3050Sstevel@tonic-gate * non-multi-node set.
3060Sstevel@tonic-gate */
3070Sstevel@tonic-gate if (MD_MNSET_REC(sr)) {
3080Sstevel@tonic-gate /*
3090Sstevel@tonic-gate * Record is for a multi-node set. Reissue call
3100Sstevel@tonic-gate * to get mnset information. Need to free
3110Sstevel@tonic-gate * record as if a non-multi-node set record since
3120Sstevel@tonic-gate * that is what clnt_getset gave us. If in
3130Sstevel@tonic-gate * the daemon, don't free since this is a pointer
3140Sstevel@tonic-gate * into the setrecords array.
3150Sstevel@tonic-gate */
3160Sstevel@tonic-gate if (! md_in_daemon) {
3170Sstevel@tonic-gate sr->sr_flags &= ~MD_SR_MN;
3180Sstevel@tonic-gate free_sr(sr);
3190Sstevel@tonic-gate }
3200Sstevel@tonic-gate if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
3210Sstevel@tonic-gate ep) == -1)
3220Sstevel@tonic-gate return (NULL);
3230Sstevel@tonic-gate if (mnsr != NULL)
3240Sstevel@tonic-gate return ((struct md_set_record *)mnsr);
3250Sstevel@tonic-gate } else {
3260Sstevel@tonic-gate return (sr);
3270Sstevel@tonic-gate }
3280Sstevel@tonic-gate }
3290Sstevel@tonic-gate
3300Sstevel@tonic-gate /* no such set */
3310Sstevel@tonic-gate len = strlen(setname) + 30;
3320Sstevel@tonic-gate p = Malloc(len);
3330Sstevel@tonic-gate (void) snprintf(p, len, "setname \"%s\"", setname);
3340Sstevel@tonic-gate (void) mderror(ep, MDE_NO_SET, p);
3350Sstevel@tonic-gate Free(p);
3360Sstevel@tonic-gate return (NULL);
3370Sstevel@tonic-gate }
3380Sstevel@tonic-gate
3390Sstevel@tonic-gate /*
3400Sstevel@tonic-gate * get set info from number
3410Sstevel@tonic-gate */
3420Sstevel@tonic-gate md_set_record *
getsetbynum(set_t setno,md_error_t * ep)3430Sstevel@tonic-gate getsetbynum(set_t setno, md_error_t *ep)
3440Sstevel@tonic-gate {
3450Sstevel@tonic-gate md_set_record *sr;
3460Sstevel@tonic-gate md_mnset_record *mnsr = NULL;
3470Sstevel@tonic-gate char buf[100];
3480Sstevel@tonic-gate
3490Sstevel@tonic-gate if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
3500Sstevel@tonic-gate return (NULL);
3510Sstevel@tonic-gate
3520Sstevel@tonic-gate if (sr != NULL) {
3530Sstevel@tonic-gate /*
3540Sstevel@tonic-gate * Record is for a multi-node set. Reissue call
3550Sstevel@tonic-gate * to get mnset information. Need to free
3560Sstevel@tonic-gate * record as if a non-multi-node set record since
3570Sstevel@tonic-gate * that is what clnt_getset gave us. If in
3580Sstevel@tonic-gate * the daemon, don't free since this is a pointer
3590Sstevel@tonic-gate * into the setrecords array.
3600Sstevel@tonic-gate */
3610Sstevel@tonic-gate if (MD_MNSET_REC(sr)) {
3620Sstevel@tonic-gate /*
3630Sstevel@tonic-gate * Record is for a multi-node set. Reissue call
3640Sstevel@tonic-gate * to get mnset information.
3650Sstevel@tonic-gate */
3660Sstevel@tonic-gate if (! md_in_daemon) {
3670Sstevel@tonic-gate sr->sr_flags &= ~MD_SR_MN;
3680Sstevel@tonic-gate free_sr(sr);
3690Sstevel@tonic-gate }
3700Sstevel@tonic-gate if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
3710Sstevel@tonic-gate ep) == -1)
3720Sstevel@tonic-gate return (NULL);
3730Sstevel@tonic-gate if (mnsr != NULL)
3740Sstevel@tonic-gate return ((struct md_set_record *)mnsr);
3750Sstevel@tonic-gate } else {
3760Sstevel@tonic-gate return (sr);
3770Sstevel@tonic-gate }
3780Sstevel@tonic-gate }
3790Sstevel@tonic-gate
3800Sstevel@tonic-gate (void) sprintf(buf, "setno %u", setno);
3810Sstevel@tonic-gate (void) mderror(ep, MDE_NO_SET, buf);
3820Sstevel@tonic-gate return (NULL);
3830Sstevel@tonic-gate }
3840Sstevel@tonic-gate
3850Sstevel@tonic-gate int
meta_check_drive_inuse(mdsetname_t * sp,mddrivename_t * dnp,int check_db,md_error_t * ep)3860Sstevel@tonic-gate meta_check_drive_inuse(
3870Sstevel@tonic-gate mdsetname_t *sp,
3880Sstevel@tonic-gate mddrivename_t *dnp,
3890Sstevel@tonic-gate int check_db,
3900Sstevel@tonic-gate md_error_t *ep
3910Sstevel@tonic-gate )
3920Sstevel@tonic-gate {
3930Sstevel@tonic-gate mdnamelist_t *nlp = NULL;
3940Sstevel@tonic-gate mdnamelist_t *p;
3950Sstevel@tonic-gate int rval = 0;
3960Sstevel@tonic-gate
3970Sstevel@tonic-gate /* get all underlying partitions */
3980Sstevel@tonic-gate if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
3990Sstevel@tonic-gate return (-1);
4000Sstevel@tonic-gate
4010Sstevel@tonic-gate /* search for drive */
4020Sstevel@tonic-gate for (p = nlp; (p != NULL); p = p->next) {
4030Sstevel@tonic-gate mdname_t *np = p->namep;
4040Sstevel@tonic-gate
4050Sstevel@tonic-gate if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
4060Sstevel@tonic-gate rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
4070Sstevel@tonic-gate NULL, dnp->cname, sp->setname));
4080Sstevel@tonic-gate break;
4090Sstevel@tonic-gate }
4100Sstevel@tonic-gate }
4110Sstevel@tonic-gate
4120Sstevel@tonic-gate /* cleanup, return success */
4130Sstevel@tonic-gate metafreenamelist(nlp);
4140Sstevel@tonic-gate return (rval);
4150Sstevel@tonic-gate }
4160Sstevel@tonic-gate
4170Sstevel@tonic-gate /*
4180Sstevel@tonic-gate * simple check for ownership
4190Sstevel@tonic-gate */
4200Sstevel@tonic-gate int
meta_check_ownership(mdsetname_t * sp,md_error_t * ep)4210Sstevel@tonic-gate meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
4220Sstevel@tonic-gate {
4230Sstevel@tonic-gate int ownset;
4240Sstevel@tonic-gate md_set_desc *sd;
4250Sstevel@tonic-gate md_drive_desc *dd;
4260Sstevel@tonic-gate md_replicalist_t *rlp = NULL;
4270Sstevel@tonic-gate md_error_t xep = mdnullerror;
4280Sstevel@tonic-gate
4290Sstevel@tonic-gate if (metaislocalset(sp))
4300Sstevel@tonic-gate return (0);
4310Sstevel@tonic-gate
4320Sstevel@tonic-gate ownset = own_set(sp, NULL, TRUE, ep);
4330Sstevel@tonic-gate if (! mdisok(ep))
4340Sstevel@tonic-gate return (-1);
4350Sstevel@tonic-gate
4360Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
4370Sstevel@tonic-gate return (-1);
4380Sstevel@tonic-gate
4390Sstevel@tonic-gate dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
4400Sstevel@tonic-gate if (! mdisok(ep))
4410Sstevel@tonic-gate return (-1);
4420Sstevel@tonic-gate
4430Sstevel@tonic-gate /* If we have no drive descriptors, check for no ownership */
4440Sstevel@tonic-gate if (dd == NULL) {
4450Sstevel@tonic-gate if (ownset == MD_SETOWNER_NONE)
4460Sstevel@tonic-gate return (0);
4470Sstevel@tonic-gate
4480Sstevel@tonic-gate /* If ownership somehow has come to exist, we must clean up */
4490Sstevel@tonic-gate
4500Sstevel@tonic-gate if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
4510Sstevel@tonic-gate &xep) < 0)
4520Sstevel@tonic-gate mdclrerror(&xep);
4530Sstevel@tonic-gate
4540Sstevel@tonic-gate if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
4550Sstevel@tonic-gate if (! mdisok(&xep))
4560Sstevel@tonic-gate mdclrerror(&xep);
4570Sstevel@tonic-gate
4580Sstevel@tonic-gate if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
4590Sstevel@tonic-gate if (rel_own_bydd(sp, dd, TRUE, &xep))
4600Sstevel@tonic-gate mdclrerror(&xep);
4610Sstevel@tonic-gate }
4620Sstevel@tonic-gate
4630Sstevel@tonic-gate if (halt_set(sp, &xep))
4640Sstevel@tonic-gate mdclrerror(&xep);
4650Sstevel@tonic-gate
4660Sstevel@tonic-gate metafreereplicalist(rlp);
4670Sstevel@tonic-gate
4680Sstevel@tonic-gate metafreedrivedesc(&dd);
4690Sstevel@tonic-gate
4700Sstevel@tonic-gate return (0);
4710Sstevel@tonic-gate }
4720Sstevel@tonic-gate
4730Sstevel@tonic-gate metafreedrivedesc(&sd->sd_drvs);
4740Sstevel@tonic-gate
4750Sstevel@tonic-gate if (ownset == MD_SETOWNER_YES)
4760Sstevel@tonic-gate return (0);
4770Sstevel@tonic-gate
4780Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
4790Sstevel@tonic-gate sp->setname));
4800Sstevel@tonic-gate }
4810Sstevel@tonic-gate
4820Sstevel@tonic-gate /*
4830Sstevel@tonic-gate * simple check for ownership
4840Sstevel@tonic-gate */
4850Sstevel@tonic-gate int
meta_check_ownership_on_host(mdsetname_t * sp,char * hostname,md_error_t * ep)4860Sstevel@tonic-gate meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
4870Sstevel@tonic-gate {
4880Sstevel@tonic-gate md_set_desc *sd;
4890Sstevel@tonic-gate md_drive_desc *dd;
4900Sstevel@tonic-gate int bool;
4910Sstevel@tonic-gate
4920Sstevel@tonic-gate if (metaislocalset(sp))
4930Sstevel@tonic-gate return (0);
4940Sstevel@tonic-gate
4950Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
4960Sstevel@tonic-gate return (-1);
4970Sstevel@tonic-gate
4980Sstevel@tonic-gate if (getnodeside(hostname, sd) == MD_SIDEWILD)
4990Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
5000Sstevel@tonic-gate hostname, NULL, sp->setname));
5010Sstevel@tonic-gate
5020Sstevel@tonic-gate dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
5030Sstevel@tonic-gate if (! mdisok(ep))
5040Sstevel@tonic-gate return (-1);
5050Sstevel@tonic-gate
5060Sstevel@tonic-gate if (clnt_ownset(hostname, sp, &bool, ep) == -1)
5070Sstevel@tonic-gate return (-1);
5080Sstevel@tonic-gate
5090Sstevel@tonic-gate if (dd == NULL)
5100Sstevel@tonic-gate return (0);
5110Sstevel@tonic-gate
5120Sstevel@tonic-gate metafreedrivedesc(&sd->sd_drvs);
5130Sstevel@tonic-gate
5140Sstevel@tonic-gate if (bool == TRUE)
5150Sstevel@tonic-gate return (0);
5160Sstevel@tonic-gate
5170Sstevel@tonic-gate return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
5180Sstevel@tonic-gate sp->setname));
5190Sstevel@tonic-gate }
5200Sstevel@tonic-gate
5210Sstevel@tonic-gate /*
5220Sstevel@tonic-gate * Function that determines if a node is in the multinode diskset
5230Sstevel@tonic-gate * membership list. Calling node passes in node to be checked and
5240Sstevel@tonic-gate * the nodelist as returned from meta_read_nodelist. This routine
5250Sstevel@tonic-gate * anticipates being called many times using the same diskset membership
5260Sstevel@tonic-gate * list which is why the alloc and free of the diskset membership list
5270Sstevel@tonic-gate * is left to the calling routine.
5280Sstevel@tonic-gate * Returns:
5290Sstevel@tonic-gate * 1 - if a member
5300Sstevel@tonic-gate * 0 - not a member
5310Sstevel@tonic-gate */
5320Sstevel@tonic-gate int
meta_is_member(char * node_name,md_mn_nodeid_t node_id,mndiskset_membershiplist_t * nl)5330Sstevel@tonic-gate meta_is_member(
5340Sstevel@tonic-gate char *node_name,
5350Sstevel@tonic-gate md_mn_nodeid_t node_id,
5360Sstevel@tonic-gate mndiskset_membershiplist_t *nl
5370Sstevel@tonic-gate )
5380Sstevel@tonic-gate {
5390Sstevel@tonic-gate mndiskset_membershiplist_t *nl2;
5400Sstevel@tonic-gate int flag_check_name;
5410Sstevel@tonic-gate
5420Sstevel@tonic-gate if (node_id != 0)
5430Sstevel@tonic-gate flag_check_name = 0;
5440Sstevel@tonic-gate else if (node_name != NULL)
5450Sstevel@tonic-gate flag_check_name = 1;
5460Sstevel@tonic-gate else
5470Sstevel@tonic-gate return (0);
5480Sstevel@tonic-gate
5490Sstevel@tonic-gate nl2 = nl;
5500Sstevel@tonic-gate while (nl2) {
5510Sstevel@tonic-gate if (flag_check_name) {
5520Sstevel@tonic-gate /* Compare given name against name in member list */
5530Sstevel@tonic-gate if (strcmp(nl2->msl_node_name, node_name) == 0)
5540Sstevel@tonic-gate break;
5550Sstevel@tonic-gate } else {
5560Sstevel@tonic-gate /* Compare given nodeid against nodeid in member list */
5570Sstevel@tonic-gate if (nl2->msl_node_id == node_id)
5580Sstevel@tonic-gate break;
5590Sstevel@tonic-gate }
5600Sstevel@tonic-gate nl2 = nl2->next;
5610Sstevel@tonic-gate }
5620Sstevel@tonic-gate /* No match found in member list */
5630Sstevel@tonic-gate if (nl2 == NULL) {
5640Sstevel@tonic-gate return (0);
5650Sstevel@tonic-gate }
5660Sstevel@tonic-gate /* Return 1 if node is in member list */
5670Sstevel@tonic-gate return (1);
5680Sstevel@tonic-gate }
5690Sstevel@tonic-gate
5700Sstevel@tonic-gate /*
5710Sstevel@tonic-gate * meta_getnext_devinfo should go to the host that
5720Sstevel@tonic-gate * has the device, to return the device name, driver name, minor num.
5730Sstevel@tonic-gate * We can take the big cheat for now, since it is a requirement
5740Sstevel@tonic-gate * that the device names and device numbers are the same, and
5750Sstevel@tonic-gate * just get the info locally.
5760Sstevel@tonic-gate *
5770Sstevel@tonic-gate * This routine is very similar to meta_getnextside_devinfo except
5780Sstevel@tonic-gate * that the specific side to be used is being passed in.
5790Sstevel@tonic-gate *
5800Sstevel@tonic-gate * Exit status:
5810Sstevel@tonic-gate * 0 - No more side info to return
5820Sstevel@tonic-gate * 1 - More side info's to return
5830Sstevel@tonic-gate * -1 - An error has been detected
5840Sstevel@tonic-gate */
5850Sstevel@tonic-gate /*ARGSUSED*/
5860Sstevel@tonic-gate int
meta_getside_devinfo(mdsetname_t * sp,char * bname,side_t sideno,char ** ret_bname,char ** ret_dname,minor_t * ret_mnum,md_error_t * ep)5870Sstevel@tonic-gate meta_getside_devinfo(
5880Sstevel@tonic-gate mdsetname_t *sp, /* for this set */
5890Sstevel@tonic-gate char *bname, /* local block name (myside) */
5900Sstevel@tonic-gate side_t sideno, /* sideno */
5910Sstevel@tonic-gate char **ret_bname, /* block device name of returned side */
5920Sstevel@tonic-gate char **ret_dname, /* driver name of returned side */
5930Sstevel@tonic-gate minor_t *ret_mnum, /* minor number of returned side */
5940Sstevel@tonic-gate md_error_t *ep
5950Sstevel@tonic-gate )
5960Sstevel@tonic-gate {
5970Sstevel@tonic-gate mdname_t *np;
5980Sstevel@tonic-gate
5990Sstevel@tonic-gate if (ret_bname != NULL)
6000Sstevel@tonic-gate *ret_bname = NULL;
6010Sstevel@tonic-gate if (ret_dname != NULL)
6020Sstevel@tonic-gate *ret_dname = NULL;
6030Sstevel@tonic-gate if (ret_mnum != NULL)
6040Sstevel@tonic-gate *ret_mnum = NODEV32;
6050Sstevel@tonic-gate
6060Sstevel@tonic-gate
6071623Stw21770 if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
6080Sstevel@tonic-gate return (-1);
6090Sstevel@tonic-gate
6100Sstevel@tonic-gate /*
6110Sstevel@tonic-gate * NOTE (future) - There will be more work here once devids are integrated
6120Sstevel@tonic-gate * into disksets. Then the side should be used to find the correct
6130Sstevel@tonic-gate * host and the b/d names should be gotten from that host.
6140Sstevel@tonic-gate */
6150Sstevel@tonic-gate
6160Sstevel@tonic-gate /*
6170Sstevel@tonic-gate * Return the side info.
6180Sstevel@tonic-gate */
6190Sstevel@tonic-gate if (ret_bname != NULL)
6200Sstevel@tonic-gate *ret_bname = Strdup(np->bname);
6210Sstevel@tonic-gate
6220Sstevel@tonic-gate if (ret_dname != NULL) {
6230Sstevel@tonic-gate mdcinfo_t *cinfo;
6240Sstevel@tonic-gate
6250Sstevel@tonic-gate if ((cinfo = metagetcinfo(np, ep)) == NULL)
6260Sstevel@tonic-gate return (-1);
6270Sstevel@tonic-gate
6280Sstevel@tonic-gate *ret_dname = Strdup(cinfo->dname);
6290Sstevel@tonic-gate }
6300Sstevel@tonic-gate
6310Sstevel@tonic-gate if (ret_mnum != NULL)
6320Sstevel@tonic-gate *ret_mnum = meta_getminor(np->dev);
6330Sstevel@tonic-gate
6340Sstevel@tonic-gate return (1);
6350Sstevel@tonic-gate }
6360Sstevel@tonic-gate
/*
 * Get the information on the device from the remote node using the devid
 * of the disk.
 *
 * Exit status:
 *	 0 - No more side info to return
 *	 1 - More side info to return
 *	-1 - An error has been detected
 */
6460Sstevel@tonic-gate int
meta_getnextside_devinfo(mdsetname_t * sp,char * bname,side_t * sideno,char ** ret_bname,char ** ret_dname,minor_t * ret_mnum,md_error_t * ep)6470Sstevel@tonic-gate meta_getnextside_devinfo(
6480Sstevel@tonic-gate mdsetname_t *sp, /* for this set */
6490Sstevel@tonic-gate char *bname, /* local block name (myside) */
6500Sstevel@tonic-gate side_t *sideno, /* previous sideno & returned sideno */
6510Sstevel@tonic-gate char **ret_bname, /* block device name of returned side */
6520Sstevel@tonic-gate char **ret_dname, /* driver name of returned side */
6530Sstevel@tonic-gate minor_t *ret_mnum, /* minor number of returned side */
6540Sstevel@tonic-gate md_error_t *ep
6550Sstevel@tonic-gate )
6560Sstevel@tonic-gate {
6570Sstevel@tonic-gate md_set_desc *sd;
6580Sstevel@tonic-gate int i;
6590Sstevel@tonic-gate mdname_t *np;
6600Sstevel@tonic-gate mddrivename_t *dnp;
6610Sstevel@tonic-gate char *devidstr = NULL;
6620Sstevel@tonic-gate int devidstrlen;
6630Sstevel@tonic-gate md_dev64_t retdev = NODEV64;
6640Sstevel@tonic-gate char *ret_devname = NULL;
6650Sstevel@tonic-gate char *ret_blkdevname = NULL;
6660Sstevel@tonic-gate char *ret_driver = NULL;
6670Sstevel@tonic-gate char *nodename;
6680Sstevel@tonic-gate int fd;
6690Sstevel@tonic-gate int ret = -1;
6700Sstevel@tonic-gate char *minor_name = NULL;
6710Sstevel@tonic-gate md_mnnode_desc *nd;
6720Sstevel@tonic-gate
6730Sstevel@tonic-gate
6740Sstevel@tonic-gate if (ret_bname != NULL)
6750Sstevel@tonic-gate *ret_bname = NULL;
6760Sstevel@tonic-gate if (ret_dname != NULL)
6770Sstevel@tonic-gate *ret_dname = NULL;
6780Sstevel@tonic-gate if (ret_mnum != NULL)
6790Sstevel@tonic-gate *ret_mnum = NODEV32;
6800Sstevel@tonic-gate
6810Sstevel@tonic-gate if (metaislocalset(sp)) {
6820Sstevel@tonic-gate /* no more sides - we are done */
6830Sstevel@tonic-gate if (*sideno != MD_SIDEWILD)
6840Sstevel@tonic-gate return (0);
6850Sstevel@tonic-gate
6860Sstevel@tonic-gate /* First time through - set up return sideno */
6870Sstevel@tonic-gate *sideno = 0;
6880Sstevel@tonic-gate } else {
6890Sstevel@tonic-gate
6900Sstevel@tonic-gate /*
6910Sstevel@tonic-gate * Find the next sideno, starting after the one given.
6920Sstevel@tonic-gate */
6930Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
6940Sstevel@tonic-gate return (-1);
6950Sstevel@tonic-gate
6960Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
6970Sstevel@tonic-gate nd = sd->sd_nodelist;
6980Sstevel@tonic-gate if ((*sideno == MD_SIDEWILD) &&
6990Sstevel@tonic-gate (nd != (struct md_mnnode_desc *)NULL)) {
7000Sstevel@tonic-gate *sideno = nd->nd_nodeid;
7010Sstevel@tonic-gate } else {
7020Sstevel@tonic-gate while (nd) {
7030Sstevel@tonic-gate /*
7040Sstevel@tonic-gate * Found given sideno, now find
7050Sstevel@tonic-gate * next sideno, if there is one.
7060Sstevel@tonic-gate */
7070Sstevel@tonic-gate if ((*sideno == nd->nd_nodeid) &&
7080Sstevel@tonic-gate (nd->nd_next !=
7090Sstevel@tonic-gate (struct md_mnnode_desc *)NULL)) {
7100Sstevel@tonic-gate *sideno =
7110Sstevel@tonic-gate nd->nd_next->nd_nodeid;
7120Sstevel@tonic-gate break;
7130Sstevel@tonic-gate }
7140Sstevel@tonic-gate nd = nd->nd_next;
7150Sstevel@tonic-gate }
7160Sstevel@tonic-gate if (nd == NULL) {
7170Sstevel@tonic-gate return (0);
7180Sstevel@tonic-gate }
7190Sstevel@tonic-gate }
7200Sstevel@tonic-gate if (*sideno == MD_SIDEWILD)
7210Sstevel@tonic-gate return (0);
7220Sstevel@tonic-gate } else {
7230Sstevel@tonic-gate for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
7240Sstevel@tonic-gate /* Find next full slot */
7250Sstevel@tonic-gate if (sd->sd_nodes[i][0] != '\0')
7260Sstevel@tonic-gate break;
7270Sstevel@tonic-gate
7280Sstevel@tonic-gate /* No more sides - we are done */
7290Sstevel@tonic-gate if (i == MD_MAXSIDES)
7300Sstevel@tonic-gate return (0);
7310Sstevel@tonic-gate
7320Sstevel@tonic-gate /* Set up the return sideno */
7330Sstevel@tonic-gate *sideno = i;
7340Sstevel@tonic-gate nodename = (char *)sd->sd_nodes[i];
7350Sstevel@tonic-gate }
7360Sstevel@tonic-gate }
7370Sstevel@tonic-gate
7380Sstevel@tonic-gate /*
7390Sstevel@tonic-gate * Need to pass the node the devid of the disk and get it to
7400Sstevel@tonic-gate * send back the details of the disk from that side.
7410Sstevel@tonic-gate */
7421623Stw21770 if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
7430Sstevel@tonic-gate return (-1);
7440Sstevel@tonic-gate
7450Sstevel@tonic-gate dnp = np->drivenamep;
7460Sstevel@tonic-gate
7470Sstevel@tonic-gate /*
7480Sstevel@tonic-gate * By default, set up the parameters so that they are copied out.
7490Sstevel@tonic-gate */
7500Sstevel@tonic-gate if (ret_bname != NULL)
7510Sstevel@tonic-gate *ret_bname = Strdup(np->bname);
7520Sstevel@tonic-gate
7530Sstevel@tonic-gate if (ret_dname != NULL) {
7540Sstevel@tonic-gate mdcinfo_t *cinfo;
7550Sstevel@tonic-gate
7560Sstevel@tonic-gate if ((cinfo = metagetcinfo(np, ep)) == NULL)
7570Sstevel@tonic-gate return (-1);
7580Sstevel@tonic-gate
7590Sstevel@tonic-gate *ret_dname = Strdup(cinfo->dname);
7600Sstevel@tonic-gate }
7610Sstevel@tonic-gate
7620Sstevel@tonic-gate if (ret_mnum != NULL)
7630Sstevel@tonic-gate *ret_mnum = meta_getminor(np->dev);
7640Sstevel@tonic-gate
7650Sstevel@tonic-gate /*
7660Sstevel@tonic-gate * Try some optimization. If this is the local set or the device
7670Sstevel@tonic-gate * is a metadevice then just copy the information. If the device
7680Sstevel@tonic-gate * does not have a devid (due to not having a minor name) then
7690Sstevel@tonic-gate * fall back to the pre-devid behaviour of copying the information
7700Sstevel@tonic-gate * on the device: this is okay because the sanity checks before this
7710Sstevel@tonic-gate * call would have found any issues with the device. If it's a
7720Sstevel@tonic-gate * multi-node diskset also just return ie. copy.
7730Sstevel@tonic-gate */
7740Sstevel@tonic-gate if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
7750Sstevel@tonic-gate (MD_MNSET_DESC(sd)))
7760Sstevel@tonic-gate return (1);
7770Sstevel@tonic-gate
7780Sstevel@tonic-gate if (np->minor_name == (char *)NULL) {
7790Sstevel@tonic-gate /*
7800Sstevel@tonic-gate * Have to get the minor name then. The slice should exist
7810Sstevel@tonic-gate * on the disk because it will have already been repartitioned
7820Sstevel@tonic-gate * up prior to getting to this point.
7830Sstevel@tonic-gate */
7840Sstevel@tonic-gate if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
7850Sstevel@tonic-gate (void) mdsyserror(ep, errno, np->bname);
7860Sstevel@tonic-gate return (-1);
7870Sstevel@tonic-gate }
7880Sstevel@tonic-gate (void) devid_get_minor_name(fd, &minor_name);
7890Sstevel@tonic-gate np->minor_name = Strdup(minor_name);
7900Sstevel@tonic-gate devid_str_free(minor_name);
7910Sstevel@tonic-gate (void) close(fd);
7920Sstevel@tonic-gate }
7930Sstevel@tonic-gate
7940Sstevel@tonic-gate /* allocate extra space for "/" and NULL hence +2 */
7950Sstevel@tonic-gate devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
7960Sstevel@tonic-gate devidstr = (char *)Malloc(devidstrlen);
7970Sstevel@tonic-gate
7980Sstevel@tonic-gate /*
7990Sstevel@tonic-gate * As a minor name is supplied then the ret_devname will be
8000Sstevel@tonic-gate * appropriate to that minor_name and in this case it will be
8010Sstevel@tonic-gate * a block device ie /dev/dsk.
8020Sstevel@tonic-gate */
8030Sstevel@tonic-gate (void) snprintf(devidstr, devidstrlen,
8044932Spetede "%s/%s", dnp->devid, np->minor_name);
8050Sstevel@tonic-gate
8060Sstevel@tonic-gate ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
8070Sstevel@tonic-gate np->bname, &ret_devname, &ret_driver, ep);
8080Sstevel@tonic-gate
8090Sstevel@tonic-gate Free(devidstr);
8100Sstevel@tonic-gate
8110Sstevel@tonic-gate /*
8120Sstevel@tonic-gate * If the other side is not running device id in disksets,
8130Sstevel@tonic-gate * 'ret' is set to ENOTSUP in which case we fallback to
8140Sstevel@tonic-gate * the existing behaviour
8150Sstevel@tonic-gate */
8160Sstevel@tonic-gate if (ret == ENOTSUP)
8170Sstevel@tonic-gate return (1);
8180Sstevel@tonic-gate else if (ret == -1)
8190Sstevel@tonic-gate return (-1);
8200Sstevel@tonic-gate
8210Sstevel@tonic-gate /*
8220Sstevel@tonic-gate * ret_devname comes from the rpc call and is a
8230Sstevel@tonic-gate * raw device name. We need to make this into a
8240Sstevel@tonic-gate * block device via blkname for further processing.
8250Sstevel@tonic-gate * Unfortunately, when our device id isn't found in
8260Sstevel@tonic-gate * the system, the rpc call will return a " " in
8270Sstevel@tonic-gate * ret_devname in which case we need to fill that in
8280Sstevel@tonic-gate * as ret_blkname because blkname of " " returns NULL.
8290Sstevel@tonic-gate */
8300Sstevel@tonic-gate if (ret_bname != NULL && ret_devname != NULL) {
8310Sstevel@tonic-gate ret_blkdevname = blkname(ret_devname);
8320Sstevel@tonic-gate if (ret_blkdevname == NULL)
8330Sstevel@tonic-gate *ret_bname = Strdup(ret_devname);
8340Sstevel@tonic-gate else
8350Sstevel@tonic-gate *ret_bname = Strdup(ret_blkdevname);
8360Sstevel@tonic-gate }
8370Sstevel@tonic-gate
8380Sstevel@tonic-gate if (ret_dname != NULL && ret_driver != NULL)
8390Sstevel@tonic-gate *ret_dname = Strdup(ret_driver);
8400Sstevel@tonic-gate
8410Sstevel@tonic-gate if (ret_mnum != NULL)
8420Sstevel@tonic-gate *ret_mnum = meta_getminor(retdev);
8430Sstevel@tonic-gate
8440Sstevel@tonic-gate return (1);
8450Sstevel@tonic-gate }
8460Sstevel@tonic-gate
8470Sstevel@tonic-gate int
meta_is_drive_in_anyset(mddrivename_t * dnp,mdsetname_t ** spp,int bypass_daemon,md_error_t * ep)8480Sstevel@tonic-gate meta_is_drive_in_anyset(
8490Sstevel@tonic-gate mddrivename_t *dnp,
8500Sstevel@tonic-gate mdsetname_t **spp,
8510Sstevel@tonic-gate int bypass_daemon,
8520Sstevel@tonic-gate md_error_t *ep
8530Sstevel@tonic-gate )
8540Sstevel@tonic-gate {
8550Sstevel@tonic-gate set_t setno;
8560Sstevel@tonic-gate mdsetname_t *this_sp;
8570Sstevel@tonic-gate int is_it;
8580Sstevel@tonic-gate set_t max_sets;
8590Sstevel@tonic-gate
8600Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0)
8610Sstevel@tonic-gate return (-1);
8620Sstevel@tonic-gate
8630Sstevel@tonic-gate assert(spp != NULL);
8640Sstevel@tonic-gate *spp = NULL;
8650Sstevel@tonic-gate
8660Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) {
8670Sstevel@tonic-gate if (!bypass_daemon) {
8680Sstevel@tonic-gate if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
8690Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_NODB)) {
8700Sstevel@tonic-gate mdclrerror(ep);
8710Sstevel@tonic-gate return (0);
8720Sstevel@tonic-gate }
8730Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) {
8740Sstevel@tonic-gate mdclrerror(ep);
8750Sstevel@tonic-gate continue;
8760Sstevel@tonic-gate }
8770Sstevel@tonic-gate return (-1);
8780Sstevel@tonic-gate }
8790Sstevel@tonic-gate } else
8800Sstevel@tonic-gate this_sp = metafakesetname(setno, NULL);
8810Sstevel@tonic-gate
8820Sstevel@tonic-gate if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
8830Sstevel@tonic-gate bypass_daemon, ep)) == -1) {
8840Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) {
8850Sstevel@tonic-gate mdclrerror(ep);
8860Sstevel@tonic-gate continue;
8870Sstevel@tonic-gate }
8880Sstevel@tonic-gate return (-1);
8890Sstevel@tonic-gate }
8900Sstevel@tonic-gate if (is_it) {
8910Sstevel@tonic-gate *spp = this_sp;
8920Sstevel@tonic-gate return (0);
8930Sstevel@tonic-gate }
8940Sstevel@tonic-gate }
8950Sstevel@tonic-gate return (0);
8960Sstevel@tonic-gate }
8970Sstevel@tonic-gate
8980Sstevel@tonic-gate int
meta_is_drive_in_thisset(mdsetname_t * sp,mddrivename_t * dnp,int bypass_daemon,md_error_t * ep)8990Sstevel@tonic-gate meta_is_drive_in_thisset(
9000Sstevel@tonic-gate mdsetname_t *sp,
9010Sstevel@tonic-gate mddrivename_t *dnp,
9020Sstevel@tonic-gate int bypass_daemon,
9030Sstevel@tonic-gate md_error_t *ep
9040Sstevel@tonic-gate )
9050Sstevel@tonic-gate {
9060Sstevel@tonic-gate md_drive_desc *dd, *p;
9070Sstevel@tonic-gate
9080Sstevel@tonic-gate if (bypass_daemon)
9090Sstevel@tonic-gate dd = dr2drivedesc(sp, MD_SIDEWILD,
9100Sstevel@tonic-gate (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
9110Sstevel@tonic-gate else
9120Sstevel@tonic-gate dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
9130Sstevel@tonic-gate
9140Sstevel@tonic-gate if (dd == NULL) {
9150Sstevel@tonic-gate if (! mdisok(ep))
9160Sstevel@tonic-gate return (-1);
9170Sstevel@tonic-gate return (0);
9180Sstevel@tonic-gate }
9190Sstevel@tonic-gate
9200Sstevel@tonic-gate
9210Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next)
9220Sstevel@tonic-gate if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
9230Sstevel@tonic-gate return (1);
9240Sstevel@tonic-gate return (0);
9250Sstevel@tonic-gate }
9260Sstevel@tonic-gate
9271945Sjeanm /*
9281945Sjeanm * Check to see if devid is in use in any diskset.
9291945Sjeanm * This is used in the case when a partial diskset is being imported
9301945Sjeanm * to make sure that the unvailable drive isn't already in use in an
9311945Sjeanm * already imported partial diskset. Can't check on the cname since the
9321945Sjeanm * unavailable disk's cname is from the previous system and may collide
9331945Sjeanm * with a cname on this system.
9341945Sjeanm * Return values:
9351945Sjeanm * 1: devid has been found in a diskset
9361945Sjeanm * 0: devid not found in any diskset
9371945Sjeanm */
9381945Sjeanm int
meta_is_devid_in_anyset(void * devid,mdsetname_t ** spp,md_error_t * ep)9391945Sjeanm meta_is_devid_in_anyset(
9401945Sjeanm void *devid,
9411945Sjeanm mdsetname_t **spp,
9421945Sjeanm md_error_t *ep
9431945Sjeanm )
9441945Sjeanm {
9451945Sjeanm set_t setno;
9461945Sjeanm mdsetname_t *this_sp;
9471945Sjeanm int is_it;
9481945Sjeanm set_t max_sets;
9491945Sjeanm
9501945Sjeanm if ((max_sets = get_max_sets(ep)) == 0)
9511945Sjeanm return (-1);
9521945Sjeanm
9531945Sjeanm assert(spp != NULL);
9541945Sjeanm *spp = NULL;
9551945Sjeanm
9561945Sjeanm for (setno = 1; setno < max_sets; setno++) {
9571945Sjeanm if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
9581945Sjeanm if (mdismddberror(ep, MDE_DB_NODB)) {
9591945Sjeanm mdclrerror(ep);
9601945Sjeanm return (0);
9611945Sjeanm }
9621945Sjeanm if (mdiserror(ep, MDE_NO_SET)) {
9631945Sjeanm mdclrerror(ep);
9641945Sjeanm continue;
9651945Sjeanm }
9661945Sjeanm return (-1);
9671945Sjeanm }
9681945Sjeanm
9691945Sjeanm if ((is_it = meta_is_devid_in_thisset(this_sp,
9701945Sjeanm devid, ep)) == -1) {
9711945Sjeanm if (mdiserror(ep, MDE_NO_SET)) {
9721945Sjeanm mdclrerror(ep);
9731945Sjeanm continue;
9741945Sjeanm }
9751945Sjeanm return (-1);
9761945Sjeanm }
9771945Sjeanm if (is_it) {
9781945Sjeanm *spp = this_sp;
9791945Sjeanm return (0);
9801945Sjeanm }
9811945Sjeanm }
9821945Sjeanm return (0);
9831945Sjeanm }
9841945Sjeanm
9851945Sjeanm int
meta_is_devid_in_thisset(mdsetname_t * sp,void * devid,md_error_t * ep)9861945Sjeanm meta_is_devid_in_thisset(
9871945Sjeanm mdsetname_t *sp,
9881945Sjeanm void *devid,
9891945Sjeanm md_error_t *ep
9901945Sjeanm )
9911945Sjeanm {
9921945Sjeanm md_drive_desc *dd, *p;
9931945Sjeanm ddi_devid_t dd_devid;
9941945Sjeanm
9951945Sjeanm dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
9961945Sjeanm if (dd == NULL) {
9971945Sjeanm if (! mdisok(ep))
9981945Sjeanm return (-1);
9991945Sjeanm return (0);
10001945Sjeanm }
10011945Sjeanm
10021945Sjeanm for (p = dd; p != NULL; p = p->dd_next) {
10031945Sjeanm if (p->dd_dnp->devid == NULL)
10041945Sjeanm continue;
10051945Sjeanm (void) devid_str_decode(p->dd_dnp->devid,
10061945Sjeanm &dd_devid, NULL);
10071945Sjeanm if (dd_devid == NULL)
10081945Sjeanm continue;
10091945Sjeanm if (devid_compare(devid, dd_devid) == 0) {
10101945Sjeanm devid_free(dd_devid);
10111945Sjeanm return (1);
10121945Sjeanm }
10131945Sjeanm devid_free(dd_devid);
10141945Sjeanm }
10151945Sjeanm return (0);
10161945Sjeanm }
10171945Sjeanm
10180Sstevel@tonic-gate int
meta_set_balance(mdsetname_t * sp,md_error_t * ep)10190Sstevel@tonic-gate meta_set_balance(
10200Sstevel@tonic-gate mdsetname_t *sp,
10210Sstevel@tonic-gate md_error_t *ep
10220Sstevel@tonic-gate )
10230Sstevel@tonic-gate {
10240Sstevel@tonic-gate md_set_desc *sd;
10250Sstevel@tonic-gate md_drive_desc *dd, *curdd;
10260Sstevel@tonic-gate daddr_t dbsize;
10270Sstevel@tonic-gate daddr_t nblks;
10280Sstevel@tonic-gate int i;
10290Sstevel@tonic-gate int rval = 0;
10300Sstevel@tonic-gate sigset_t oldsigs;
10310Sstevel@tonic-gate md_setkey_t *cl_sk;
10320Sstevel@tonic-gate md_error_t xep = mdnullerror;
10330Sstevel@tonic-gate md_mnnode_desc *nd;
10340Sstevel@tonic-gate int suspend1_flag = 0;
10350Sstevel@tonic-gate
10360Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
10370Sstevel@tonic-gate return (-1);
10380Sstevel@tonic-gate
10390Sstevel@tonic-gate dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
10400Sstevel@tonic-gate
10410Sstevel@tonic-gate /* Make sure we own the set */
10420Sstevel@tonic-gate if (meta_check_ownership(sp, ep) != 0)
10430Sstevel@tonic-gate return (-1);
10440Sstevel@tonic-gate
10450Sstevel@tonic-gate /* END CHECK CODE */
10460Sstevel@tonic-gate
10470Sstevel@tonic-gate /*
10480Sstevel@tonic-gate * Get drive descriptors for the drives that are currently in the set.
10490Sstevel@tonic-gate */
10500Sstevel@tonic-gate curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
10510Sstevel@tonic-gate
10520Sstevel@tonic-gate if (! mdisok(ep))
10530Sstevel@tonic-gate return (-1);
10540Sstevel@tonic-gate
10550Sstevel@tonic-gate /* Find the minimum replica size in use is or use the default */
10560Sstevel@tonic-gate if ((nblks = meta_db_minreplica(sp, ep)) < 0)
10570Sstevel@tonic-gate mdclrerror(ep);
10580Sstevel@tonic-gate else
10590Sstevel@tonic-gate dbsize = nblks; /* adjust replica size */
10600Sstevel@tonic-gate
10610Sstevel@tonic-gate /* Make sure we are blocking all signals */
10620Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0)
10630Sstevel@tonic-gate mdclrerror(&xep);
10640Sstevel@tonic-gate
10650Sstevel@tonic-gate /*
10660Sstevel@tonic-gate * Lock the set on current set members.
10670Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against
10680Sstevel@tonic-gate * other meta* commands running on the other nodes.
10690Sstevel@tonic-gate */
10700Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
10710Sstevel@tonic-gate nd = sd->sd_nodelist;
10720Sstevel@tonic-gate while (nd) {
10730Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
10740Sstevel@tonic-gate nd = nd->nd_next;
10750Sstevel@tonic-gate continue;
10760Sstevel@tonic-gate }
10770Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
10780Sstevel@tonic-gate rval = -1;
10790Sstevel@tonic-gate goto out;
10800Sstevel@tonic-gate }
10810Sstevel@tonic-gate nd = nd->nd_next;
10820Sstevel@tonic-gate }
10830Sstevel@tonic-gate /*
10840Sstevel@tonic-gate * Lock out other meta* commands by suspending
10850Sstevel@tonic-gate * class 1 messages across the diskset.
10860Sstevel@tonic-gate */
10870Sstevel@tonic-gate nd = sd->sd_nodelist;
10880Sstevel@tonic-gate while (nd) {
10890Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
10900Sstevel@tonic-gate nd = nd->nd_next;
10910Sstevel@tonic-gate continue;
10920Sstevel@tonic-gate }
10930Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename,
10940Sstevel@tonic-gate COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
10950Sstevel@tonic-gate MD_MSCF_NO_FLAGS, ep)) {
10960Sstevel@tonic-gate rval = -1;
10970Sstevel@tonic-gate goto out;
10980Sstevel@tonic-gate }
10990Sstevel@tonic-gate suspend1_flag = 1;
11000Sstevel@tonic-gate nd = nd->nd_next;
11010Sstevel@tonic-gate }
11020Sstevel@tonic-gate } else {
11030Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
11040Sstevel@tonic-gate /* Skip empty slots */
11050Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0') continue;
11060Sstevel@tonic-gate
11070Sstevel@tonic-gate if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
11080Sstevel@tonic-gate rval = -1;
11090Sstevel@tonic-gate goto out;
11100Sstevel@tonic-gate }
11110Sstevel@tonic-gate }
11120Sstevel@tonic-gate }
11130Sstevel@tonic-gate
11140Sstevel@tonic-gate /* We are not adding or deleting any drives, just balancing */
11150Sstevel@tonic-gate dd = NULL;
11160Sstevel@tonic-gate
11170Sstevel@tonic-gate /*
11180Sstevel@tonic-gate * Balance the DB's according to the list of existing drives and the
11190Sstevel@tonic-gate * list of added drives.
11200Sstevel@tonic-gate */
11210Sstevel@tonic-gate if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
11220Sstevel@tonic-gate goto out;
11230Sstevel@tonic-gate
11240Sstevel@tonic-gate out:
11250Sstevel@tonic-gate /*
11260Sstevel@tonic-gate * Unlock diskset by resuming class 1 messages across the diskset.
11270Sstevel@tonic-gate * Just resume all classes so that resume is the same whether
11280Sstevel@tonic-gate * just one class was locked or all classes were locked.
11290Sstevel@tonic-gate */
11300Sstevel@tonic-gate if (suspend1_flag) {
11310Sstevel@tonic-gate nd = sd->sd_nodelist;
11320Sstevel@tonic-gate while (nd) {
11330Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
11340Sstevel@tonic-gate nd = nd->nd_next;
11350Sstevel@tonic-gate continue;
11360Sstevel@tonic-gate }
11370Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
11384932Spetede sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
11390Sstevel@tonic-gate /*
11400Sstevel@tonic-gate * We are here because we failed to resume
11410Sstevel@tonic-gate * rpc.mdcommd. However we potentially have
11420Sstevel@tonic-gate * an error from the previous call
11430Sstevel@tonic-gate * (meta_db_balance). If the previous call
11440Sstevel@tonic-gate * did fail, we capture that error and
11450Sstevel@tonic-gate * generate a perror withthe string,
11460Sstevel@tonic-gate * "Unable to resume...".
11470Sstevel@tonic-gate * Setting rval to -1 ensures that in the
11480Sstevel@tonic-gate * next iteration of the loop, ep is not
11490Sstevel@tonic-gate * clobbered.
11500Sstevel@tonic-gate */
11510Sstevel@tonic-gate if (rval == 0)
11520Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
11530Sstevel@tonic-gate else
11540Sstevel@tonic-gate mdclrerror(&xep);
11550Sstevel@tonic-gate rval = -1;
11560Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
11570Sstevel@tonic-gate "Unable to resume rpc.mdcommd."));
11580Sstevel@tonic-gate }
11590Sstevel@tonic-gate nd = nd->nd_next;
11600Sstevel@tonic-gate }
11610Sstevel@tonic-gate }
11620Sstevel@tonic-gate
11630Sstevel@tonic-gate /* Unlock the set */
11640Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
11650Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
11660Sstevel@tonic-gate nd = sd->sd_nodelist;
11670Sstevel@tonic-gate while (nd) {
11680Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
11690Sstevel@tonic-gate nd = nd->nd_next;
11700Sstevel@tonic-gate continue;
11710Sstevel@tonic-gate }
11720Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
11730Sstevel@tonic-gate if (rval == 0)
11740Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
11750Sstevel@tonic-gate else
11760Sstevel@tonic-gate mdclrerror(&xep);
11770Sstevel@tonic-gate rval = -1;
11780Sstevel@tonic-gate }
11790Sstevel@tonic-gate nd = nd->nd_next;
11800Sstevel@tonic-gate }
11810Sstevel@tonic-gate } else {
11820Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
11830Sstevel@tonic-gate /* Skip empty slots */
11840Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
11850Sstevel@tonic-gate continue;
11860Sstevel@tonic-gate
11870Sstevel@tonic-gate if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
11880Sstevel@tonic-gate if (rval == 0)
11890Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
11900Sstevel@tonic-gate rval = -1;
11910Sstevel@tonic-gate }
11920Sstevel@tonic-gate }
11930Sstevel@tonic-gate }
11940Sstevel@tonic-gate
11950Sstevel@tonic-gate /* release signals back to what they were on entry */
11960Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0)
11970Sstevel@tonic-gate mdclrerror(&xep);
11980Sstevel@tonic-gate
11990Sstevel@tonic-gate cl_set_setkey(NULL);
12000Sstevel@tonic-gate
12010Sstevel@tonic-gate metaflushsetname(sp);
12020Sstevel@tonic-gate
12030Sstevel@tonic-gate return (rval);
12040Sstevel@tonic-gate }
12050Sstevel@tonic-gate
12060Sstevel@tonic-gate int
meta_set_destroy(mdsetname_t * sp,int lock_set,md_error_t * ep)12070Sstevel@tonic-gate meta_set_destroy(
12080Sstevel@tonic-gate mdsetname_t *sp,
12090Sstevel@tonic-gate int lock_set,
12100Sstevel@tonic-gate md_error_t *ep
12110Sstevel@tonic-gate )
12120Sstevel@tonic-gate {
12130Sstevel@tonic-gate int i;
12140Sstevel@tonic-gate med_rec_t medr;
12150Sstevel@tonic-gate md_set_desc *sd;
12160Sstevel@tonic-gate md_drive_desc *dd, *p, *p1;
12170Sstevel@tonic-gate mddrivename_t *dnp;
12180Sstevel@tonic-gate mdname_t *np;
12190Sstevel@tonic-gate mdnamelist_t *nlp = NULL;
12200Sstevel@tonic-gate int num_users = 0;
12210Sstevel@tonic-gate int has_set;
12220Sstevel@tonic-gate side_t mysideno;
12230Sstevel@tonic-gate sigset_t oldsigs;
12240Sstevel@tonic-gate md_error_t xep = mdnullerror;
12250Sstevel@tonic-gate md_setkey_t *cl_sk;
12260Sstevel@tonic-gate int rval = 0;
12270Sstevel@tonic-gate int delete_end = 1;
12280Sstevel@tonic-gate
12290Sstevel@tonic-gate /* Make sure we are blocking all signals */
12300Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, ep) < 0)
12310Sstevel@tonic-gate return (-1);
12320Sstevel@tonic-gate
12330Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
12340Sstevel@tonic-gate if (! mdisok(ep))
12350Sstevel@tonic-gate rval = -1;
12360Sstevel@tonic-gate goto out;
12370Sstevel@tonic-gate }
12380Sstevel@tonic-gate
12390Sstevel@tonic-gate /*
12400Sstevel@tonic-gate * meta_set_destroy should not be called for a MN diskset.
12410Sstevel@tonic-gate * This routine destroys a set without communicating this information
12420Sstevel@tonic-gate * to the other nodes which would lead to an inconsistency in
12430Sstevel@tonic-gate * the MN diskset.
12440Sstevel@tonic-gate */
12450Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
12460Sstevel@tonic-gate rval = -1;
12470Sstevel@tonic-gate goto out;
12480Sstevel@tonic-gate }
12490Sstevel@tonic-gate
12500Sstevel@tonic-gate /* Continue if a traditional diskset */
12510Sstevel@tonic-gate
12520Sstevel@tonic-gate /*
12530Sstevel@tonic-gate * Check to see who has the set. If we are not the last user of the
12540Sstevel@tonic-gate * set, we will not touch the replicas.
12550Sstevel@tonic-gate */
12560Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
12570Sstevel@tonic-gate /* Skip empty slots */
12580Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
12590Sstevel@tonic-gate continue;
12600Sstevel@tonic-gate
12610Sstevel@tonic-gate has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
12620Sstevel@tonic-gate ep);
12630Sstevel@tonic-gate
12640Sstevel@tonic-gate if (has_set < 0) {
12650Sstevel@tonic-gate mdclrerror(ep);
12660Sstevel@tonic-gate } else
12670Sstevel@tonic-gate num_users++;
12680Sstevel@tonic-gate }
12690Sstevel@tonic-gate
12700Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
12710Sstevel@tonic-gate if (! mdisok(ep)) {
12720Sstevel@tonic-gate rval = -1;
12730Sstevel@tonic-gate goto out;
12740Sstevel@tonic-gate }
12750Sstevel@tonic-gate }
12760Sstevel@tonic-gate
12770Sstevel@tonic-gate if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
12780Sstevel@tonic-gate rval = -1;
12790Sstevel@tonic-gate goto out;
12800Sstevel@tonic-gate }
12810Sstevel@tonic-gate
12820Sstevel@tonic-gate if (lock_set == TRUE) {
12830Sstevel@tonic-gate /* Lock the set on our side */
12840Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep)) {
12850Sstevel@tonic-gate rval = -1;
12860Sstevel@tonic-gate goto out;
12870Sstevel@tonic-gate }
12880Sstevel@tonic-gate }
12890Sstevel@tonic-gate
12900Sstevel@tonic-gate /*
12910Sstevel@tonic-gate * A traditional diskset has no diskset stale information to send
12920Sstevel@tonic-gate * since there can only be one owner node at a time.
12930Sstevel@tonic-gate */
12940Sstevel@tonic-gate if (snarf_set(sp, FALSE, ep))
12950Sstevel@tonic-gate mdclrerror(ep);
12960Sstevel@tonic-gate
12970Sstevel@tonic-gate if (dd != NULL) {
12980Sstevel@tonic-gate /*
12990Sstevel@tonic-gate * Make sure that no drives are in use as parts of metadrives
13000Sstevel@tonic-gate * or hot spare pools, this is one of the few error conditions
13010Sstevel@tonic-gate * that will stop this routine, unless the environment has
13020Sstevel@tonic-gate * META_DESTROY_SET_OK set, in which case, the operation will
13030Sstevel@tonic-gate * proceed.
13040Sstevel@tonic-gate */
13050Sstevel@tonic-gate if (getenv("META_DESTROY_SET_OK") == NULL) {
13060Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next) {
13070Sstevel@tonic-gate dnp = p->dd_dnp;
13080Sstevel@tonic-gate
13090Sstevel@tonic-gate i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
13100Sstevel@tonic-gate if (i == -1) {
13110Sstevel@tonic-gate /* need xep - wire calls clear error */
13120Sstevel@tonic-gate i = metaget_setownership(sp, &xep);
13130Sstevel@tonic-gate if (i == -1) {
13140Sstevel@tonic-gate rval = -1;
13150Sstevel@tonic-gate goto out;
13160Sstevel@tonic-gate }
13170Sstevel@tonic-gate
13180Sstevel@tonic-gate mysideno = getmyside(sp, &xep);
13190Sstevel@tonic-gate
13200Sstevel@tonic-gate if (mysideno == MD_SIDEWILD) {
13210Sstevel@tonic-gate rval = -1;
13220Sstevel@tonic-gate goto out;
13230Sstevel@tonic-gate }
13240Sstevel@tonic-gate
13250Sstevel@tonic-gate if (sd->sd_isown[mysideno] == FALSE)
13260Sstevel@tonic-gate if (halt_set(sp, &xep)) {
13270Sstevel@tonic-gate rval = -1;
13280Sstevel@tonic-gate goto out;
13290Sstevel@tonic-gate }
13300Sstevel@tonic-gate
13310Sstevel@tonic-gate rval = -1;
13320Sstevel@tonic-gate goto out;
13330Sstevel@tonic-gate }
13340Sstevel@tonic-gate }
13350Sstevel@tonic-gate }
13360Sstevel@tonic-gate
13370Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
13380Sstevel@tonic-gate /* Skip empty slots */
13390Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
13400Sstevel@tonic-gate continue;
13410Sstevel@tonic-gate
13420Sstevel@tonic-gate /* Skip non local nodes */
13430Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
13440Sstevel@tonic-gate continue;
13450Sstevel@tonic-gate
13460Sstevel@tonic-gate if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
13470Sstevel@tonic-gate mdclrerror(ep);
13480Sstevel@tonic-gate }
13490Sstevel@tonic-gate
13500Sstevel@tonic-gate /*
13510Sstevel@tonic-gate * Go thru each drive and individually delete the replicas.
13520Sstevel@tonic-gate * This way we can ignore individual errors.
13530Sstevel@tonic-gate */
13540Sstevel@tonic-gate for (p = dd; p != NULL; p = p->dd_next) {
13550Sstevel@tonic-gate uint_t rep_slice;
13560Sstevel@tonic-gate
13570Sstevel@tonic-gate dnp = p->dd_dnp;
13580Sstevel@tonic-gate if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
13590Sstevel@tonic-gate (((np = metaslicename(dnp, rep_slice, ep))
13604932Spetede == NULL) &&
13614932Spetede ((np = metaslicename(dnp, MD_SLICE0, ep))
13624932Spetede == NULL))) {
13630Sstevel@tonic-gate rval = -1;
13640Sstevel@tonic-gate goto out;
13650Sstevel@tonic-gate }
13660Sstevel@tonic-gate
13670Sstevel@tonic-gate if ((np = metaslicename(dnp,
13680Sstevel@tonic-gate rep_slice, ep)) == NULL) {
13690Sstevel@tonic-gate if ((np = metaslicename(dnp,
13700Sstevel@tonic-gate MD_SLICE0, ep)) == NULL) {
13710Sstevel@tonic-gate rval = -1;
13720Sstevel@tonic-gate goto out;
13730Sstevel@tonic-gate }
13740Sstevel@tonic-gate mdclrerror(ep);
13750Sstevel@tonic-gate }
13760Sstevel@tonic-gate
13770Sstevel@tonic-gate /* Yes this is UGLY!!! */
13780Sstevel@tonic-gate p1 = p->dd_next;
13790Sstevel@tonic-gate p->dd_next = NULL;
13800Sstevel@tonic-gate if (rel_own_bydd(sp, p, FALSE, ep))
13810Sstevel@tonic-gate mdclrerror(ep);
13820Sstevel@tonic-gate p->dd_next = p1;
13830Sstevel@tonic-gate
13840Sstevel@tonic-gate if (p->dd_dbcnt == 0)
13850Sstevel@tonic-gate continue;
13860Sstevel@tonic-gate
13870Sstevel@tonic-gate /*
13880Sstevel@tonic-gate * Skip the replica removal if we are not the last user
13890Sstevel@tonic-gate */
13900Sstevel@tonic-gate if (num_users != 1)
13910Sstevel@tonic-gate continue;
13920Sstevel@tonic-gate
13930Sstevel@tonic-gate nlp = NULL;
13940Sstevel@tonic-gate (void) metanamelist_append(&nlp, np);
13950Sstevel@tonic-gate if (meta_db_detach(sp, nlp,
13960Sstevel@tonic-gate (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
13970Sstevel@tonic-gate mdclrerror(ep);
13980Sstevel@tonic-gate metafreenamelist(nlp);
13990Sstevel@tonic-gate }
14000Sstevel@tonic-gate }
14010Sstevel@tonic-gate
14020Sstevel@tonic-gate if (halt_set(sp, ep)) {
14030Sstevel@tonic-gate rval = -1;
14040Sstevel@tonic-gate goto out;
14050Sstevel@tonic-gate }
14060Sstevel@tonic-gate
14070Sstevel@tonic-gate /* Setup the mediator record */
14080Sstevel@tonic-gate (void) memset(&medr, '\0', sizeof (med_rec_t));
14090Sstevel@tonic-gate medr.med_rec_mag = MED_REC_MAGIC;
14100Sstevel@tonic-gate medr.med_rec_rev = MED_REC_REV;
14110Sstevel@tonic-gate medr.med_rec_fl = 0;
14120Sstevel@tonic-gate medr.med_rec_sn = sp->setno;
14130Sstevel@tonic-gate (void) strcpy(medr.med_rec_snm, sp->setname);
14140Sstevel@tonic-gate medr.med_rec_meds = sd->sd_med; /* structure assigment */
14150Sstevel@tonic-gate (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
14160Sstevel@tonic-gate medr.med_rec_foff = 0;
14170Sstevel@tonic-gate
14180Sstevel@tonic-gate /*
14190Sstevel@tonic-gate * If we are the last remaining user, then remove the mediator hosts
14200Sstevel@tonic-gate */
14210Sstevel@tonic-gate if (num_users == 1) {
14220Sstevel@tonic-gate for (i = 0; i < MED_MAX_HOSTS; i++) {
14230Sstevel@tonic-gate if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
14240Sstevel@tonic-gate SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
14250Sstevel@tonic-gate SVM_TAG_MEDIATOR, sp->setno, i);
14260Sstevel@tonic-gate (void) memset(&medr.med_rec_meds.n_lst[i], '\0',
14270Sstevel@tonic-gate sizeof (md_h_t));
14280Sstevel@tonic-gate }
14290Sstevel@tonic-gate medr.med_rec_meds.n_cnt = 0;
14300Sstevel@tonic-gate } else { /* Remove this host from the mediator node list. */
14310Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
14320Sstevel@tonic-gate /* Skip empty slots */
14330Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
14340Sstevel@tonic-gate continue;
14350Sstevel@tonic-gate
14360Sstevel@tonic-gate /* Copy non local node */
14370Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
14380Sstevel@tonic-gate (void) strcpy(medr.med_rec_nodes[i],
14390Sstevel@tonic-gate sd->sd_nodes[i]);
14400Sstevel@tonic-gate continue;
14410Sstevel@tonic-gate }
14420Sstevel@tonic-gate
14430Sstevel@tonic-gate /* Clear local node */
14440Sstevel@tonic-gate (void) memset(&medr.med_rec_nodes[i], '\0',
14450Sstevel@tonic-gate sizeof (md_node_nm_t));
14460Sstevel@tonic-gate }
14470Sstevel@tonic-gate }
14480Sstevel@tonic-gate
14490Sstevel@tonic-gate crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
14500Sstevel@tonic-gate
14510Sstevel@tonic-gate /*
14520Sstevel@tonic-gate * If the client is part of a cluster put the DCS service
14530Sstevel@tonic-gate * into a deleteing state.
14540Sstevel@tonic-gate */
14550Sstevel@tonic-gate if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
14560Sstevel@tonic-gate if (metad_isautotakebyname(sp->setname)) {
14570Sstevel@tonic-gate delete_end = 0;
14580Sstevel@tonic-gate } else {
14590Sstevel@tonic-gate mdclrerror(ep);
14600Sstevel@tonic-gate goto out;
14610Sstevel@tonic-gate }
14620Sstevel@tonic-gate }
14630Sstevel@tonic-gate
14640Sstevel@tonic-gate /* Inform the mediator hosts of the new information */
14650Sstevel@tonic-gate for (i = 0; i < MED_MAX_HOSTS; i++) {
14660Sstevel@tonic-gate if (sd->sd_med.n_lst[i].a_cnt == 0)
14670Sstevel@tonic-gate continue;
14680Sstevel@tonic-gate
14690Sstevel@tonic-gate if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
14700Sstevel@tonic-gate mdclrerror(ep);
14710Sstevel@tonic-gate }
14720Sstevel@tonic-gate
14730Sstevel@tonic-gate /* Delete the set locally */
14740Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
14750Sstevel@tonic-gate /* Skip empty slots */
14760Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
14770Sstevel@tonic-gate continue;
14780Sstevel@tonic-gate
14790Sstevel@tonic-gate /* Skip non local nodes */
14800Sstevel@tonic-gate if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
14810Sstevel@tonic-gate continue;
14820Sstevel@tonic-gate
14830Sstevel@tonic-gate if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
14840Sstevel@tonic-gate mdclrerror(ep);
14850Sstevel@tonic-gate }
14860Sstevel@tonic-gate if (delete_end &&
14870Sstevel@tonic-gate sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
14880Sstevel@tonic-gate rval = -1;
14890Sstevel@tonic-gate
14900Sstevel@tonic-gate out:
14910Sstevel@tonic-gate /* release signals back to what they were on entry */
14920Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0) {
14930Sstevel@tonic-gate if (rval == 0)
14940Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
14950Sstevel@tonic-gate rval = -1;
14960Sstevel@tonic-gate }
14970Sstevel@tonic-gate
14980Sstevel@tonic-gate if (lock_set == TRUE) {
14990Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
15000Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
15010Sstevel@tonic-gate if (rval == 0)
15020Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
15030Sstevel@tonic-gate rval = -1;
15040Sstevel@tonic-gate }
15050Sstevel@tonic-gate cl_set_setkey(NULL);
15060Sstevel@tonic-gate }
15070Sstevel@tonic-gate
15080Sstevel@tonic-gate metaflushsetname(sp);
15090Sstevel@tonic-gate return (rval);
15100Sstevel@tonic-gate }
15110Sstevel@tonic-gate
15120Sstevel@tonic-gate int
meta_set_purge(mdsetname_t * sp,int bypass_cluster,int forceflg,md_error_t * ep)15130Sstevel@tonic-gate meta_set_purge(
15140Sstevel@tonic-gate mdsetname_t *sp,
15150Sstevel@tonic-gate int bypass_cluster,
15160Sstevel@tonic-gate int forceflg,
15170Sstevel@tonic-gate md_error_t *ep
15180Sstevel@tonic-gate )
15190Sstevel@tonic-gate {
15200Sstevel@tonic-gate char *thishost = mynode();
15210Sstevel@tonic-gate md_set_desc *sd;
15220Sstevel@tonic-gate md_setkey_t *cl_sk;
15230Sstevel@tonic-gate md_error_t xep = mdnullerror;
15240Sstevel@tonic-gate int rval = 0;
15250Sstevel@tonic-gate int i, num_hosts = 0;
15260Sstevel@tonic-gate int has_set = 0;
15270Sstevel@tonic-gate int max_node = 0;
15280Sstevel@tonic-gate int delete_end = 1;
15290Sstevel@tonic-gate md_mnnode_desc *nd;
15300Sstevel@tonic-gate
15310Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
15320Sstevel@tonic-gate /* unable to find set description */
15330Sstevel@tonic-gate rval = 1;
15340Sstevel@tonic-gate return (rval);
15350Sstevel@tonic-gate }
15360Sstevel@tonic-gate
15370Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
15380Sstevel@tonic-gate /*
15390Sstevel@tonic-gate * Get a count of the hosts in the set and also lock the set
15400Sstevel@tonic-gate * on those hosts that know about it.
15410Sstevel@tonic-gate */
15420Sstevel@tonic-gate nd = sd->sd_nodelist;
15430Sstevel@tonic-gate while (nd) {
15444932Spetede /*
15454932Spetede * Only deal with those nodes that are members of
15464932Spetede * the set (MD_MN_NODE_ALIVE) or the node on which
15474932Spetede * the purge is being run. We must lock the set
15484932Spetede * on the purging node because the delset call
15494932Spetede * requires the lock to be set.
15504932Spetede */
15514932Spetede if (!(nd->nd_flags & MD_MN_NODE_ALIVE) &&
15524932Spetede nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
15530Sstevel@tonic-gate nd = nd->nd_next;
15540Sstevel@tonic-gate continue;
15550Sstevel@tonic-gate }
15560Sstevel@tonic-gate has_set = nodehasset(sp, nd->nd_nodename,
15574932Spetede NHS_NST_EQ, ep);
15580Sstevel@tonic-gate
15590Sstevel@tonic-gate /*
15600Sstevel@tonic-gate * The host is not aware of this set (has_set < 0) or
15610Sstevel@tonic-gate * the set does not match (has_set == 0). This check
15620Sstevel@tonic-gate * prevents the code getting confused by an apparent
15630Sstevel@tonic-gate * inconsistancy in the set's state, this is in the
15640Sstevel@tonic-gate * purge code so something is broken in any case and
15650Sstevel@tonic-gate * this is just trying to fix the brokeness.
15660Sstevel@tonic-gate */
15670Sstevel@tonic-gate if (has_set <= 0) {
15680Sstevel@tonic-gate mdclrerror(ep);
15690Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_NOSET;
15700Sstevel@tonic-gate } else {
15710Sstevel@tonic-gate num_hosts++;
15720Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
15730Sstevel@tonic-gate /*
15740Sstevel@tonic-gate * If the force flag is set then
15750Sstevel@tonic-gate * ignore any RPC failures because we
15760Sstevel@tonic-gate * are only really interested with
15770Sstevel@tonic-gate * the set on local node.
15780Sstevel@tonic-gate */
15790Sstevel@tonic-gate if (forceflg && mdanyrpcerror(ep)) {
15800Sstevel@tonic-gate mdclrerror(ep);
15810Sstevel@tonic-gate } else {
15820Sstevel@tonic-gate /*
15830Sstevel@tonic-gate * set max_node so that in the
15840Sstevel@tonic-gate * unlock code nodes in the
15850Sstevel@tonic-gate * set that have not been
15860Sstevel@tonic-gate * locked are not unlocked.
15870Sstevel@tonic-gate */
15880Sstevel@tonic-gate max_node = nd->nd_nodeid;
15890Sstevel@tonic-gate rval = 2;
15900Sstevel@tonic-gate goto out1;
15910Sstevel@tonic-gate }
15920Sstevel@tonic-gate }
15930Sstevel@tonic-gate
15940Sstevel@tonic-gate }
15950Sstevel@tonic-gate nd = nd->nd_next;
15960Sstevel@tonic-gate }
15970Sstevel@tonic-gate max_node = 0;
15980Sstevel@tonic-gate } else {
15990Sstevel@tonic-gate /*
16000Sstevel@tonic-gate * Get a count of the hosts in the set and also lock the set
16010Sstevel@tonic-gate * on those hosts that know about it.
16020Sstevel@tonic-gate */
16030Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
16040Sstevel@tonic-gate /* Skip empty slots */
16050Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
16060Sstevel@tonic-gate continue;
16070Sstevel@tonic-gate
16080Sstevel@tonic-gate has_set = nodehasset(sp, sd->sd_nodes[i],
16094932Spetede NHS_NST_EQ, ep);
16100Sstevel@tonic-gate
16110Sstevel@tonic-gate /*
16120Sstevel@tonic-gate * The host is not aware of this set (has_set < 0) or
16130Sstevel@tonic-gate * the set does not match (has_set == 0). This check
16140Sstevel@tonic-gate * prevents the code getting confused by an apparent
16150Sstevel@tonic-gate * inconsistancy in the set's state, this is in the
16160Sstevel@tonic-gate * purge code so something is broken in any case and
16170Sstevel@tonic-gate * this is just trying to fix the brokeness.
16180Sstevel@tonic-gate */
16190Sstevel@tonic-gate if (has_set <= 0) {
16200Sstevel@tonic-gate mdclrerror(ep);
16210Sstevel@tonic-gate /*
16220Sstevel@tonic-gate * set the node to NULL to prevent further
16230Sstevel@tonic-gate * requests to this unresponsive node.
16240Sstevel@tonic-gate */
16250Sstevel@tonic-gate sd->sd_nodes[i][0] = '\0';
16260Sstevel@tonic-gate } else {
16270Sstevel@tonic-gate num_hosts++;
16280Sstevel@tonic-gate if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
16290Sstevel@tonic-gate /*
16300Sstevel@tonic-gate * If the force flag is set then
16310Sstevel@tonic-gate * ignore any RPC failures because we
16320Sstevel@tonic-gate * are only really interested with
16330Sstevel@tonic-gate * the set on local node.
16340Sstevel@tonic-gate */
16350Sstevel@tonic-gate if (forceflg && mdanyrpcerror(ep)) {
16360Sstevel@tonic-gate mdclrerror(ep);
16370Sstevel@tonic-gate } else {
16380Sstevel@tonic-gate rval = 2;
16390Sstevel@tonic-gate /*
16400Sstevel@tonic-gate * set max_node so that in the
16410Sstevel@tonic-gate * unlock code nodes in the
16420Sstevel@tonic-gate * set that have not been
16430Sstevel@tonic-gate * locked are not unlocked.
16440Sstevel@tonic-gate */
16450Sstevel@tonic-gate max_node = i;
16460Sstevel@tonic-gate goto out1;
16470Sstevel@tonic-gate }
16480Sstevel@tonic-gate }
16490Sstevel@tonic-gate }
16500Sstevel@tonic-gate }
16510Sstevel@tonic-gate max_node = i; /* now MD_MAXSIDES */
16520Sstevel@tonic-gate }
16530Sstevel@tonic-gate if (!bypass_cluster) {
16540Sstevel@tonic-gate /*
16550Sstevel@tonic-gate * If there is only one host associated with the
16560Sstevel@tonic-gate * set then remove the set from the cluster.
16570Sstevel@tonic-gate */
16580Sstevel@tonic-gate if (num_hosts == 1) {
16590Sstevel@tonic-gate if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
16600Sstevel@tonic-gate if (metad_isautotakebyname(sp->setname)) {
16610Sstevel@tonic-gate delete_end = 0;
16620Sstevel@tonic-gate } else {
16630Sstevel@tonic-gate mdclrerror(ep);
16640Sstevel@tonic-gate rval = 3;
16650Sstevel@tonic-gate goto out1;
16660Sstevel@tonic-gate }
16670Sstevel@tonic-gate }
16680Sstevel@tonic-gate }
16690Sstevel@tonic-gate }
16700Sstevel@tonic-gate
16710Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
16720Sstevel@tonic-gate nd = sd->sd_nodelist;
16730Sstevel@tonic-gate while (nd) {
16744932Spetede if (nd->nd_nodeid == sd->sd_mn_mynode->nd_nodeid) {
16754932Spetede /*
16764932Spetede * This is the node on which the purge is
16774932Spetede * being run. We do not care if it is
16784932Spetede * alive or not, just want to get rid of
16794932Spetede * the set.
16804932Spetede */
16814932Spetede if (clnt_delset(nd->nd_nodename, sp,
16824932Spetede ep) == -1) {
16834932Spetede md_perror(dgettext(TEXT_DOMAIN,
16844932Spetede "delset"));
16854932Spetede if (!bypass_cluster && num_hosts == 1)
16864932Spetede (void) sdssc_delete_end(
16874932Spetede sp->setname, SDSSC_CLEANUP);
16884932Spetede mdclrerror(ep);
16894932Spetede goto out1;
16904932Spetede }
16914932Spetede nd = nd->nd_next;
16924932Spetede continue;
16934932Spetede }
16944932Spetede
16954932Spetede /*
16964932Spetede * Only contact those nodes that are members of
16974932Spetede * the set.
16984932Spetede */
16990Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
17000Sstevel@tonic-gate nd = nd->nd_next;
17010Sstevel@tonic-gate continue;
17020Sstevel@tonic-gate }
17034932Spetede
17044932Spetede /*
17054932Spetede * Tell the remote node to remove this node
17064932Spetede */
17074932Spetede if (clnt_delhosts(nd->nd_nodename, sp, 1, &thishost,
17084932Spetede ep) == -1) {
17090Sstevel@tonic-gate /*
17104932Spetede * If we fail to delete ourselves
17114932Spetede * from the remote host it does not
17124932Spetede * really matter because the set is
17134932Spetede * being "purged" from this node. The
17144932Spetede * set can be purged from the other
17154932Spetede * node at a later time.
17160Sstevel@tonic-gate */
17170Sstevel@tonic-gate mdclrerror(ep);
17180Sstevel@tonic-gate }
17190Sstevel@tonic-gate nd = nd->nd_next;
17200Sstevel@tonic-gate }
17210Sstevel@tonic-gate } else {
17220Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++) {
17230Sstevel@tonic-gate /* Skip empty slots */
17240Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
17250Sstevel@tonic-gate continue;
17260Sstevel@tonic-gate if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
17270Sstevel@tonic-gate /*
17280Sstevel@tonic-gate * Tell the remote node to remove this node
17290Sstevel@tonic-gate */
17300Sstevel@tonic-gate if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
17310Sstevel@tonic-gate &thishost, ep) == -1) {
17320Sstevel@tonic-gate /*
17330Sstevel@tonic-gate * If we fail to delete ourselves
17340Sstevel@tonic-gate * from the remote host it does not
17350Sstevel@tonic-gate * really matter because the set is
17360Sstevel@tonic-gate * being "purged" from this node. The
17370Sstevel@tonic-gate * set can be purged from the other
17380Sstevel@tonic-gate * node at a later time.
17390Sstevel@tonic-gate */
17400Sstevel@tonic-gate mdclrerror(ep);
17410Sstevel@tonic-gate }
17420Sstevel@tonic-gate continue;
17430Sstevel@tonic-gate }
17440Sstevel@tonic-gate
17450Sstevel@tonic-gate /* remove the set from this host */
17460Sstevel@tonic-gate if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
17470Sstevel@tonic-gate md_perror(dgettext(TEXT_DOMAIN, "delset"));
17480Sstevel@tonic-gate if (!bypass_cluster && num_hosts == 1)
17490Sstevel@tonic-gate (void) sdssc_delete_end(sp->setname,
17500Sstevel@tonic-gate SDSSC_CLEANUP);
17510Sstevel@tonic-gate mdclrerror(ep);
17520Sstevel@tonic-gate goto out1;
17530Sstevel@tonic-gate }
17540Sstevel@tonic-gate }
17550Sstevel@tonic-gate }
17560Sstevel@tonic-gate
17570Sstevel@tonic-gate if (!bypass_cluster && num_hosts == 1) {
17580Sstevel@tonic-gate if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
17590Sstevel@tonic-gate SDSSC_ERROR) {
17600Sstevel@tonic-gate rval = 4;
17610Sstevel@tonic-gate }
17620Sstevel@tonic-gate }
17630Sstevel@tonic-gate
17640Sstevel@tonic-gate out1:
17650Sstevel@tonic-gate
17660Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
17670Sstevel@tonic-gate
17680Sstevel@tonic-gate /*
17690Sstevel@tonic-gate * Remove the set lock on those nodes that had the set locked
17700Sstevel@tonic-gate * max_node will either be MD_MAXSIDES or array index of the last
17710Sstevel@tonic-gate * node contacted (or rather failed to contact) for traditional
17720Sstevel@tonic-gate * diskset. For a MN diskset, max_node is the node_id of the node
17730Sstevel@tonic-gate * that failed the lock.
17740Sstevel@tonic-gate */
17750Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
17760Sstevel@tonic-gate nd = sd->sd_nodelist;
17770Sstevel@tonic-gate while (nd) {
17780Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
17790Sstevel@tonic-gate nd = nd->nd_next;
17800Sstevel@tonic-gate continue;
17810Sstevel@tonic-gate }
17820Sstevel@tonic-gate if (nd->nd_nodeid == max_node)
17830Sstevel@tonic-gate break;
17840Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
17850Sstevel@tonic-gate if (forceflg && mdanyrpcerror(&xep)) {
17860Sstevel@tonic-gate mdclrerror(&xep);
17870Sstevel@tonic-gate nd = nd->nd_next;
17880Sstevel@tonic-gate continue;
17890Sstevel@tonic-gate }
17900Sstevel@tonic-gate if (rval == 0)
17910Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
17920Sstevel@tonic-gate rval = 5;
17930Sstevel@tonic-gate }
17940Sstevel@tonic-gate nd = nd->nd_next;
17950Sstevel@tonic-gate }
17960Sstevel@tonic-gate } else {
17970Sstevel@tonic-gate for (i = 0; i < max_node; i++) {
17980Sstevel@tonic-gate /* Skip empty slots */
17990Sstevel@tonic-gate if (sd->sd_nodes[i][0] == '\0')
18000Sstevel@tonic-gate continue;
18010Sstevel@tonic-gate
18020Sstevel@tonic-gate if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
18030Sstevel@tonic-gate if (forceflg && mdanyrpcerror(&xep)) {
18040Sstevel@tonic-gate mdclrerror(&xep);
18050Sstevel@tonic-gate continue;
18060Sstevel@tonic-gate }
18070Sstevel@tonic-gate if (rval == 0)
18080Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
18090Sstevel@tonic-gate rval = 5;
18100Sstevel@tonic-gate }
18110Sstevel@tonic-gate }
18120Sstevel@tonic-gate }
18130Sstevel@tonic-gate
18140Sstevel@tonic-gate cl_set_setkey(NULL);
18150Sstevel@tonic-gate
18160Sstevel@tonic-gate return (rval);
18170Sstevel@tonic-gate }
18180Sstevel@tonic-gate
18190Sstevel@tonic-gate int
meta_set_query(mdsetname_t * sp,mddb_dtag_lst_t ** dtlpp,md_error_t * ep)18200Sstevel@tonic-gate meta_set_query(
18210Sstevel@tonic-gate mdsetname_t *sp,
18220Sstevel@tonic-gate mddb_dtag_lst_t **dtlpp,
18230Sstevel@tonic-gate md_error_t *ep
18240Sstevel@tonic-gate )
18250Sstevel@tonic-gate {
18260Sstevel@tonic-gate mddb_dtag_get_parm_t dtgp;
18270Sstevel@tonic-gate
18280Sstevel@tonic-gate (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
18290Sstevel@tonic-gate dtgp.dtgp_setno = sp->setno;
18300Sstevel@tonic-gate
18310Sstevel@tonic-gate /*CONSTCOND*/
18320Sstevel@tonic-gate while (1) {
18330Sstevel@tonic-gate if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
18340Sstevel@tonic-gate if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
18350Sstevel@tonic-gate *dtlpp == NULL)
18360Sstevel@tonic-gate return (mdstealerror(ep, &dtgp.dtgp_mde));
18370Sstevel@tonic-gate else
18380Sstevel@tonic-gate break;
18390Sstevel@tonic-gate
18400Sstevel@tonic-gate /*
18410Sstevel@tonic-gate * Run to the end of the list
18420Sstevel@tonic-gate */
18430Sstevel@tonic-gate for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
18440Sstevel@tonic-gate /* void */;
18450Sstevel@tonic-gate
18460Sstevel@tonic-gate *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
18470Sstevel@tonic-gate
18480Sstevel@tonic-gate (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
18490Sstevel@tonic-gate sizeof (mddb_dtag_t));
18500Sstevel@tonic-gate
18510Sstevel@tonic-gate dtgp.dtgp_dt.dt_id++;
18520Sstevel@tonic-gate }
18530Sstevel@tonic-gate return (0);
18540Sstevel@tonic-gate }
18550Sstevel@tonic-gate
18560Sstevel@tonic-gate /*
18570Sstevel@tonic-gate * return drivename get by key
18580Sstevel@tonic-gate */
18590Sstevel@tonic-gate mddrivename_t *
metadrivename_withdrkey(mdsetname_t * sp,side_t sideno,mdkey_t key,int flags,md_error_t * ep)18600Sstevel@tonic-gate metadrivename_withdrkey(
18610Sstevel@tonic-gate mdsetname_t *sp,
18620Sstevel@tonic-gate side_t sideno,
18630Sstevel@tonic-gate mdkey_t key,
18640Sstevel@tonic-gate int flags,
18650Sstevel@tonic-gate md_error_t *ep
18660Sstevel@tonic-gate )
18670Sstevel@tonic-gate {
18680Sstevel@tonic-gate char *nm;
18690Sstevel@tonic-gate mdname_t *np;
18700Sstevel@tonic-gate mddrivename_t *dnp;
18710Sstevel@tonic-gate ddi_devid_t devidp;
18720Sstevel@tonic-gate md_set_desc *sd;
18730Sstevel@tonic-gate
18740Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
18750Sstevel@tonic-gate return (NULL);
18760Sstevel@tonic-gate }
18770Sstevel@tonic-gate
18780Sstevel@tonic-gate /*
18790Sstevel@tonic-gate * Get the devid associated with the key.
18800Sstevel@tonic-gate *
18810Sstevel@tonic-gate * If a devid was returned, it MUST be valid even in
18820Sstevel@tonic-gate * the case where a device id has been "updated". The
18830Sstevel@tonic-gate * "update" of the device id may have occured due to
18840Sstevel@tonic-gate * a firmware upgrade.
18850Sstevel@tonic-gate */
18860Sstevel@tonic-gate if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
18870Sstevel@tonic-gate != NULL) {
18881945Sjeanm /*
18891945Sjeanm * Look for the correct dnp using the devid for comparison.
18901945Sjeanm */
18911945Sjeanm dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
18920Sstevel@tonic-gate free(devidp);
1893*8452SJohn.Wren.Kennedy@Sun.COM
1894*8452SJohn.Wren.Kennedy@Sun.COM /* dnp could be NULL if the devid could not be decoded. */
1895*8452SJohn.Wren.Kennedy@Sun.COM if (dnp == NULL) {
1896*8452SJohn.Wren.Kennedy@Sun.COM return (NULL);
1897*8452SJohn.Wren.Kennedy@Sun.COM }
18981945Sjeanm dnp->side_names_key = key;
18990Sstevel@tonic-gate } else {
19000Sstevel@tonic-gate /*
19011945Sjeanm * We didn't get a devid. We'll try for a dnp using the
19021945Sjeanm * name. If we have a MN diskset or if the dnp is a did
19031945Sjeanm * device, we're done because then we don't have devids.
19041945Sjeanm * Otherwise we'll try to set the devid
19051945Sjeanm * and get the dnp via devid again.
19061945Sjeanm * We also need to clear the ep structure. When the
19071945Sjeanm * above call to meta_getdidbykey returned a null, it
19081945Sjeanm * also put an error code into ep. In this case, the null
19091945Sjeanm * return is actually OK and any errors can be ignored. The
19101945Sjeanm * reason it is OK is because this could be a MN set or
19111945Sjeanm * we could be running without devids (ex cluster).
19121945Sjeanm */
19131945Sjeanm mdclrerror(ep);
19141945Sjeanm
19151945Sjeanm if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key,
19161945Sjeanm ep)) == NULL)
19171945Sjeanm return (NULL);
19181945Sjeanm /* get device name */
19191945Sjeanm if (flags & PRINT_FAST) {
19201945Sjeanm if ((np = metaname_fast(&sp, nm,
19211945Sjeanm LOGICAL_DEVICE, ep)) == NULL) {
19221945Sjeanm Free(nm);
19231945Sjeanm return (NULL);
19241945Sjeanm }
19251945Sjeanm } else {
19261945Sjeanm if ((np = metaname(&sp, nm, LOGICAL_DEVICE,
19271945Sjeanm ep)) == NULL) {
19281945Sjeanm Free(nm);
19291945Sjeanm return (NULL);
19301945Sjeanm }
19311945Sjeanm }
19321945Sjeanm Free(nm);
19331945Sjeanm /* make sure it's OK */
19341945Sjeanm if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np,
19351945Sjeanm ep) != 0))
19361945Sjeanm return (NULL);
19371945Sjeanm
19381945Sjeanm /* get drivename */
19391945Sjeanm dnp = np->drivenamep;
19401945Sjeanm dnp->side_names_key = key;
19411945Sjeanm /*
19421945Sjeanm * Skip the devid set/check for the following cases:
19431945Sjeanm * 1) If MN diskset, there are no devid's
19441945Sjeanm * 2) if dnp is did device
19451945Sjeanm * The device id is disabled for did device due to the
19461945Sjeanm * lack of minor name support in the did driver. The following
19471945Sjeanm * devid code path can set and propagate the error and
19481945Sjeanm * eventually prevent did disks from being added to the
19491945Sjeanm * diskset under SunCluster systems
19503073Sjkennedy *
19513073Sjkennedy * Note that this code can be called through rpc.mdcommd.
19523073Sjkennedy * sdssc_version cannot be used because the library won't
19533073Sjkennedy * be bound.
19541945Sjeanm */
19551945Sjeanm if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/"))
19561945Sjeanm == 0) || (MD_MNSET_DESC(sd)))
19571945Sjeanm goto out;
19581945Sjeanm
19591945Sjeanm /*
19600Sstevel@tonic-gate * It is okay if replica is not in devid mode
19610Sstevel@tonic-gate */
19620Sstevel@tonic-gate if (mdissyserror(ep, MDDB_F_NODEVID)) {
19630Sstevel@tonic-gate mdclrerror(ep);
19640Sstevel@tonic-gate goto out;
19650Sstevel@tonic-gate }
19660Sstevel@tonic-gate
19670Sstevel@tonic-gate /*
19681945Sjeanm * We're not MN or did devices but
19690Sstevel@tonic-gate * devid is missing so this means that we have
19700Sstevel@tonic-gate * just upgraded from a configuration where
19710Sstevel@tonic-gate * devid's were not used so try to add in
19721945Sjeanm * the devid and requery. If the devid still isn't there,
19731945Sjeanm * that's OK. dnp->devid will be null as it is in any
19741945Sjeanm * configuration with no devids.
19750Sstevel@tonic-gate */
19763073Sjkennedy if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0)
19770Sstevel@tonic-gate return (NULL);
19780Sstevel@tonic-gate if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
19791945Sjeanm sideno+SKEW, key, ep)) != NULL) {
19801945Sjeanm /*
19811945Sjeanm * Found a devid so look for the dnp using the
19821945Sjeanm * devid as the search mechanism.
19831945Sjeanm */
19841945Sjeanm dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
19851945Sjeanm free(devidp);
1986*8452SJohn.Wren.Kennedy@Sun.COM if (dnp == NULL) {
1987*8452SJohn.Wren.Kennedy@Sun.COM return (NULL);
1988*8452SJohn.Wren.Kennedy@Sun.COM }
19891945Sjeanm dnp->side_names_key = key;
19901945Sjeanm }
19910Sstevel@tonic-gate }
19920Sstevel@tonic-gate
19931945Sjeanm
19941945Sjeanm
19950Sstevel@tonic-gate out:
19960Sstevel@tonic-gate if (flags & MD_BYPASS_DAEMON)
19970Sstevel@tonic-gate return (dnp);
19980Sstevel@tonic-gate
19990Sstevel@tonic-gate if (get_sidenmlist(sp, dnp, ep))
20000Sstevel@tonic-gate return (NULL);
20010Sstevel@tonic-gate
20020Sstevel@tonic-gate /* return success */
20030Sstevel@tonic-gate return (dnp);
20040Sstevel@tonic-gate }
20050Sstevel@tonic-gate
20060Sstevel@tonic-gate void
metafreedrivedesc(md_drive_desc ** dd)20070Sstevel@tonic-gate metafreedrivedesc(md_drive_desc **dd)
20080Sstevel@tonic-gate {
20090Sstevel@tonic-gate md_drive_desc *p, *next = NULL;
20100Sstevel@tonic-gate
20110Sstevel@tonic-gate for (p = *dd; p != NULL; p = next) {
20120Sstevel@tonic-gate next = p->dd_next;
20130Sstevel@tonic-gate Free(p);
20140Sstevel@tonic-gate }
20150Sstevel@tonic-gate *dd = NULL;
20160Sstevel@tonic-gate }
20170Sstevel@tonic-gate
20180Sstevel@tonic-gate md_drive_desc *
metaget_drivedesc(mdsetname_t * sp,int flags,md_error_t * ep)20190Sstevel@tonic-gate metaget_drivedesc(
20200Sstevel@tonic-gate mdsetname_t *sp,
20210Sstevel@tonic-gate int flags,
20220Sstevel@tonic-gate md_error_t *ep
20230Sstevel@tonic-gate )
20240Sstevel@tonic-gate {
20250Sstevel@tonic-gate side_t sideno = MD_SIDEWILD;
20260Sstevel@tonic-gate
20270Sstevel@tonic-gate assert(! (flags & MD_BYPASS_DAEMON));
20280Sstevel@tonic-gate
20290Sstevel@tonic-gate if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
20300Sstevel@tonic-gate return (NULL);
20310Sstevel@tonic-gate
20320Sstevel@tonic-gate return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
20330Sstevel@tonic-gate }
20340Sstevel@tonic-gate
20350Sstevel@tonic-gate md_drive_desc *
metaget_drivedesc_fromnamelist(mdsetname_t * sp,mdnamelist_t * nlp,md_error_t * ep)20360Sstevel@tonic-gate metaget_drivedesc_fromnamelist(
20370Sstevel@tonic-gate mdsetname_t *sp,
20380Sstevel@tonic-gate mdnamelist_t *nlp,
20390Sstevel@tonic-gate md_error_t *ep
20400Sstevel@tonic-gate )
20410Sstevel@tonic-gate {
20420Sstevel@tonic-gate md_set_desc *sd;
20430Sstevel@tonic-gate mdnamelist_t *p;
20440Sstevel@tonic-gate md_drive_desc *dd = NULL;
20450Sstevel@tonic-gate
20460Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
20470Sstevel@tonic-gate return (NULL);
20480Sstevel@tonic-gate
20490Sstevel@tonic-gate for (p = nlp; p != NULL; p = p->next)
20500Sstevel@tonic-gate (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
20510Sstevel@tonic-gate sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
20520Sstevel@tonic-gate
20530Sstevel@tonic-gate return (dd);
20540Sstevel@tonic-gate }
20550Sstevel@tonic-gate
20560Sstevel@tonic-gate md_drive_desc *
metaget_drivedesc_sideno(mdsetname_t * sp,side_t sideno,int flags,md_error_t * ep)20570Sstevel@tonic-gate metaget_drivedesc_sideno(
20580Sstevel@tonic-gate mdsetname_t *sp,
20590Sstevel@tonic-gate side_t sideno,
20600Sstevel@tonic-gate int flags,
20610Sstevel@tonic-gate md_error_t *ep
20620Sstevel@tonic-gate )
20630Sstevel@tonic-gate {
20640Sstevel@tonic-gate md_set_desc *sd = NULL;
20650Sstevel@tonic-gate
20660Sstevel@tonic-gate assert(! (flags & MD_BYPASS_DAEMON));
20670Sstevel@tonic-gate
20680Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
20690Sstevel@tonic-gate return (NULL);
20700Sstevel@tonic-gate
20710Sstevel@tonic-gate if (sd->sd_drvs)
20720Sstevel@tonic-gate return (sd->sd_drvs);
20730Sstevel@tonic-gate
20740Sstevel@tonic-gate if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
20750Sstevel@tonic-gate return (NULL);
20760Sstevel@tonic-gate
20770Sstevel@tonic-gate return (sd->sd_drvs);
20780Sstevel@tonic-gate }
20790Sstevel@tonic-gate
20800Sstevel@tonic-gate int
metaget_setownership(mdsetname_t * sp,md_error_t * ep)20810Sstevel@tonic-gate metaget_setownership(
20820Sstevel@tonic-gate mdsetname_t *sp,
20830Sstevel@tonic-gate md_error_t *ep
20840Sstevel@tonic-gate )
20850Sstevel@tonic-gate {
20860Sstevel@tonic-gate md_set_desc *sd;
20870Sstevel@tonic-gate int bool;
20880Sstevel@tonic-gate int i;
20890Sstevel@tonic-gate md_mnnode_desc *nd;
20900Sstevel@tonic-gate
20910Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL)
20920Sstevel@tonic-gate return (-1);
20930Sstevel@tonic-gate
20940Sstevel@tonic-gate if (MD_MNSET_DESC(sd)) {
20950Sstevel@tonic-gate nd = sd->sd_nodelist;
20960Sstevel@tonic-gate while (nd) {
20970Sstevel@tonic-gate /* If node isn't alive, can't own diskset */
20980Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
20990Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN;
21000Sstevel@tonic-gate nd = nd->nd_next;
21010Sstevel@tonic-gate continue;
21020Sstevel@tonic-gate }
21030Sstevel@tonic-gate /*
21040Sstevel@tonic-gate * If can't communicate with rpc.metad, then mark
21050Sstevel@tonic-gate * this node as not an owner. That node may
21060Sstevel@tonic-gate * in fact, be an owner, but without rpc.metad running
21070Sstevel@tonic-gate * that node can't do much.
21080Sstevel@tonic-gate */
21090Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
21100Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN;
21110Sstevel@tonic-gate } else if (bool == TRUE) {
21120Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN;
21130Sstevel@tonic-gate } else {
21140Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN;
21150Sstevel@tonic-gate }
21160Sstevel@tonic-gate nd = nd->nd_next;
21170Sstevel@tonic-gate }
21180Sstevel@tonic-gate return (0);
21190Sstevel@tonic-gate }
21200Sstevel@tonic-gate
21210Sstevel@tonic-gate /* Rest of code handles traditional disksets */
21220Sstevel@tonic-gate
21230Sstevel@tonic-gate for (i = 0; i < MD_MAXSIDES; i++)
21240Sstevel@tonic-gate sd->sd_isown[i] = 0;
21250Sstevel@tonic-gate
21260Sstevel@tonic-gate if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
21270Sstevel@tonic-gate return (-1);
21280Sstevel@tonic-gate
21290Sstevel@tonic-gate if (bool == TRUE)
21300Sstevel@tonic-gate sd->sd_isown[getmyside(sp, ep)] = 1;
21310Sstevel@tonic-gate
21320Sstevel@tonic-gate return (0);
21330Sstevel@tonic-gate }
21340Sstevel@tonic-gate
21350Sstevel@tonic-gate char *
mynode(void)21360Sstevel@tonic-gate mynode(void)
21370Sstevel@tonic-gate {
21380Sstevel@tonic-gate static struct utsname myuname;
21390Sstevel@tonic-gate static int done = 0;
21400Sstevel@tonic-gate
21410Sstevel@tonic-gate if (! done) {
21420Sstevel@tonic-gate if (uname(&myuname) == -1) {
21430Sstevel@tonic-gate md_perror(dgettext(TEXT_DOMAIN, "uname"));
21440Sstevel@tonic-gate assert(0);
21450Sstevel@tonic-gate }
21460Sstevel@tonic-gate done = 1;
21470Sstevel@tonic-gate }
21480Sstevel@tonic-gate return (myuname.nodename);
21490Sstevel@tonic-gate }
21500Sstevel@tonic-gate
/*
 * strinlst
 *
 * Return TRUE when `str' compares equal (strcmp) to one of the first `cnt'
 * strings of `lst', FALSE otherwise.  The entries are examined in order and
 * the search stops at the first match.  A `cnt' of zero or less always
 * yields FALSE.
 */
int
strinlst(char *str, int cnt, char **lst)
{
	int	idx = 0;

	while (idx < cnt) {
		if (strcmp(lst[idx], str) == 0)
			return (TRUE);
		idx++;
	}

	return (FALSE);
}
21620Sstevel@tonic-gate
21630Sstevel@tonic-gate /*
21640Sstevel@tonic-gate * meta_get_reserved_names
21650Sstevel@tonic-gate * returns an mdnamelist_t of reserved slices
21660Sstevel@tonic-gate * reserved slices are those that are used but don't necessarily
21670Sstevel@tonic-gate * show up as metadevices (ex. reserved slice for db in sets, logs)
21680Sstevel@tonic-gate */
21690Sstevel@tonic-gate
21700Sstevel@tonic-gate /*ARGSUSED*/
21710Sstevel@tonic-gate int
meta_get_reserved_names(mdsetname_t * sp,mdnamelist_t ** nlpp,int options,md_error_t * ep)21720Sstevel@tonic-gate meta_get_reserved_names(
21730Sstevel@tonic-gate mdsetname_t *sp,
21740Sstevel@tonic-gate mdnamelist_t **nlpp,
21750Sstevel@tonic-gate int options,
21760Sstevel@tonic-gate md_error_t *ep)
21770Sstevel@tonic-gate {
21780Sstevel@tonic-gate int count = 0;
21790Sstevel@tonic-gate mdname_t *np = NULL;
21800Sstevel@tonic-gate mdnamelist_t *transnlp = NULL;
21810Sstevel@tonic-gate mdnamelist_t **tailpp = nlpp;
21820Sstevel@tonic-gate mdnamelist_t *nlp;
21830Sstevel@tonic-gate md_drive_desc *dd, *di;
21840Sstevel@tonic-gate
21850Sstevel@tonic-gate if (metaislocalset(sp))
21860Sstevel@tonic-gate goto out;
21870Sstevel@tonic-gate
21880Sstevel@tonic-gate if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
21890Sstevel@tonic-gate count = -1;
21900Sstevel@tonic-gate goto out;
21910Sstevel@tonic-gate }
21920Sstevel@tonic-gate
21930Sstevel@tonic-gate /* db in for sets on reserved slice */
21940Sstevel@tonic-gate for (di = dd; di && count >= 0; di = di->dd_next) {
21950Sstevel@tonic-gate uint_t rep_slice;
21960Sstevel@tonic-gate
21970Sstevel@tonic-gate /*
21980Sstevel@tonic-gate * Add the name struct to the end of the
21990Sstevel@tonic-gate * namelist but keep a pointer to the last
22000Sstevel@tonic-gate * element so that we don't incur the overhead
22010Sstevel@tonic-gate * of traversing the list each time
22020Sstevel@tonic-gate */
22030Sstevel@tonic-gate if (di->dd_dnp &&
22040Sstevel@tonic-gate (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
22050Sstevel@tonic-gate (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
22060Sstevel@tonic-gate (tailpp = meta_namelist_append_wrapper(tailpp, np)))
22070Sstevel@tonic-gate count++;
22080Sstevel@tonic-gate else
22090Sstevel@tonic-gate count = -1;
22100Sstevel@tonic-gate }
22110Sstevel@tonic-gate
22120Sstevel@tonic-gate /* now find logs */
22130Sstevel@tonic-gate if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
22140Sstevel@tonic-gate count = -1;
22150Sstevel@tonic-gate goto out;
22160Sstevel@tonic-gate }
22170Sstevel@tonic-gate
22180Sstevel@tonic-gate for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
22190Sstevel@tonic-gate mdname_t *transnp = nlp->namep;
22200Sstevel@tonic-gate md_trans_t *transp;
22210Sstevel@tonic-gate
22220Sstevel@tonic-gate if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
22230Sstevel@tonic-gate count = -1;
22240Sstevel@tonic-gate goto out;
22250Sstevel@tonic-gate }
22260Sstevel@tonic-gate if (transp->lognamep) {
22270Sstevel@tonic-gate /*
22280Sstevel@tonic-gate * Add the name struct to the end of the
22290Sstevel@tonic-gate * namelist but keep a pointer to the last
22300Sstevel@tonic-gate * element so that we don't incur the overhead
22310Sstevel@tonic-gate * of traversing the list each time
22320Sstevel@tonic-gate */
22330Sstevel@tonic-gate tailpp = meta_namelist_append_wrapper(
22340Sstevel@tonic-gate tailpp, transp->lognamep);
22350Sstevel@tonic-gate }
22360Sstevel@tonic-gate }
22370Sstevel@tonic-gate out:
22380Sstevel@tonic-gate metafreenamelist(transnlp);
22390Sstevel@tonic-gate return (count);
22400Sstevel@tonic-gate }
22410Sstevel@tonic-gate
22420Sstevel@tonic-gate /*
22430Sstevel@tonic-gate * Entry point to join a node to MultiNode diskset.
22440Sstevel@tonic-gate *
22450Sstevel@tonic-gate * Validate host in diskset.
22460Sstevel@tonic-gate * - Should be in membership list from API
22470Sstevel@tonic-gate * - Should not already be joined into diskset.
22480Sstevel@tonic-gate * - Set must have drives
22490Sstevel@tonic-gate * Assume valid configuration is stored in the set/drive/node records
22500Sstevel@tonic-gate * in the local mddb since no node or drive can be added to the MNset
22510Sstevel@tonic-gate * unless all drives and nodes are available. Reconfig steps will
22520Sstevel@tonic-gate * resync all ALIVE nodes in case of panic in critical areas.
22530Sstevel@tonic-gate *
22540Sstevel@tonic-gate * Lock down the set.
22550Sstevel@tonic-gate * Verify host is a member of this diskset.
22560Sstevel@tonic-gate * If drives exist in the configuration, load the mddbs.
22570Sstevel@tonic-gate * Set this node to active by notifying master if one exists.
22580Sstevel@tonic-gate * If this is the first node active in the diskset, this node
22590Sstevel@tonic-gate * becomes the master.
22600Sstevel@tonic-gate * Unlock the set.
22610Sstevel@tonic-gate *
22620Sstevel@tonic-gate * Mirror Resync:
22630Sstevel@tonic-gate * If this node is the last node to join the set and clustering
22640Sstevel@tonic-gate * isn't running, then start the 'metasync -r' type resync
22650Sstevel@tonic-gate * on all mirrors in this diskset.
22660Sstevel@tonic-gate * If clustering is running, this resync operation will
22670Sstevel@tonic-gate * be handled by the reconfig steps and should NOT
22680Sstevel@tonic-gate * be handled during a join operation.
22690Sstevel@tonic-gate *
22700Sstevel@tonic-gate * There are multiple return values in order to assist
22710Sstevel@tonic-gate * the join operation of all sets in the metaset command.
22720Sstevel@tonic-gate *
22730Sstevel@tonic-gate * Return values:
22740Sstevel@tonic-gate * 0 - Node successfully joined to set.
22750Sstevel@tonic-gate * -1 - Join attempted but failed
22760Sstevel@tonic-gate * - any failure from libmeta calls
22770Sstevel@tonic-gate * - node not in the member list
22780Sstevel@tonic-gate * -2 - Join not attempted since
22790Sstevel@tonic-gate * - this set had no drives in set
22800Sstevel@tonic-gate * - this node already joined to set
22810Sstevel@tonic-gate * - set is not a multinode set
22820Sstevel@tonic-gate * -3 - Node joined to STALE set.
22830Sstevel@tonic-gate */
22840Sstevel@tonic-gate extern int
meta_set_join(mdsetname_t * sp,md_error_t * ep)22850Sstevel@tonic-gate meta_set_join(
22860Sstevel@tonic-gate mdsetname_t *sp,
22870Sstevel@tonic-gate md_error_t *ep
22880Sstevel@tonic-gate )
22890Sstevel@tonic-gate {
22900Sstevel@tonic-gate md_set_desc *sd;
22910Sstevel@tonic-gate md_drive_desc *dd;
22920Sstevel@tonic-gate md_mnnode_desc *nd, *nd2, my_nd;
22930Sstevel@tonic-gate int rval = 0;
22940Sstevel@tonic-gate md_setkey_t *cl_sk;
22950Sstevel@tonic-gate md_error_t xep = mdnullerror;
22960Sstevel@tonic-gate md_error_t ep_snarf = mdnullerror;
22970Sstevel@tonic-gate int master_flag = 0;
22980Sstevel@tonic-gate md_mnset_record *mas_mnsr = NULL;
22990Sstevel@tonic-gate int clear_nr_flags = 0;
23000Sstevel@tonic-gate md_mnnode_record *nr;
23010Sstevel@tonic-gate int stale_set = 0;
23020Sstevel@tonic-gate int rb_flags = 0;
23030Sstevel@tonic-gate int stale_bool = FALSE;
23040Sstevel@tonic-gate int suspendall_flag = 0;
23050Sstevel@tonic-gate int suspend1_flag = 0;
23060Sstevel@tonic-gate sigset_t oldsigs;
23070Sstevel@tonic-gate int send_reinit = 0;
23080Sstevel@tonic-gate
23090Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
23100Sstevel@tonic-gate return (-1);
23110Sstevel@tonic-gate }
23120Sstevel@tonic-gate
23130Sstevel@tonic-gate /* Must be a multinode diskset */
23140Sstevel@tonic-gate if (!MD_MNSET_DESC(sd)) {
23150Sstevel@tonic-gate (void) mderror(ep, MDE_NOT_MN, sp->setname);
23160Sstevel@tonic-gate return (-2);
23170Sstevel@tonic-gate }
23180Sstevel@tonic-gate
23190Sstevel@tonic-gate /* Verify that the node is ALIVE (i.e. is in the API membership list) */
23200Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
23210Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
23224932Spetede sd->sd_mn_mynode->nd_nodename, NULL, sp->setname);
23230Sstevel@tonic-gate return (-1);
23240Sstevel@tonic-gate }
23250Sstevel@tonic-gate
23260Sstevel@tonic-gate /* Make sure we are blocking all signals */
23270Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0)
23280Sstevel@tonic-gate mdclrerror(&xep);
23290Sstevel@tonic-gate
23300Sstevel@tonic-gate /*
23310Sstevel@tonic-gate * Lock the set on current set members.
23320Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against
23330Sstevel@tonic-gate * other meta* commands running on the other nodes.
23340Sstevel@tonic-gate */
23350Sstevel@tonic-gate nd = sd->sd_nodelist;
23360Sstevel@tonic-gate while (nd) {
23370Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
23380Sstevel@tonic-gate nd = nd->nd_next;
23390Sstevel@tonic-gate continue;
23400Sstevel@tonic-gate }
23410Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
23420Sstevel@tonic-gate rval = -1;
23430Sstevel@tonic-gate goto out;
23440Sstevel@tonic-gate }
23450Sstevel@tonic-gate nd = nd->nd_next;
23460Sstevel@tonic-gate }
23470Sstevel@tonic-gate
23480Sstevel@tonic-gate /*
23490Sstevel@tonic-gate * Lock out other meta* commands by suspending
23500Sstevel@tonic-gate * class 1 messages across the diskset.
23510Sstevel@tonic-gate */
23520Sstevel@tonic-gate nd = sd->sd_nodelist;
23530Sstevel@tonic-gate while (nd) {
23540Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
23550Sstevel@tonic-gate nd = nd->nd_next;
23560Sstevel@tonic-gate continue;
23570Sstevel@tonic-gate }
23580Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
23594932Spetede sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
23600Sstevel@tonic-gate rval = -1;
23610Sstevel@tonic-gate goto out;
23620Sstevel@tonic-gate }
23630Sstevel@tonic-gate suspend1_flag = 1;
23640Sstevel@tonic-gate nd = nd->nd_next;
23650Sstevel@tonic-gate }
23660Sstevel@tonic-gate
23670Sstevel@tonic-gate /*
23680Sstevel@tonic-gate * Verify that this host is a member (in the host list) of the set.
23690Sstevel@tonic-gate */
23700Sstevel@tonic-gate nd = sd->sd_nodelist;
23710Sstevel@tonic-gate while (nd) {
23720Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) {
23730Sstevel@tonic-gate break;
23740Sstevel@tonic-gate }
23750Sstevel@tonic-gate nd = nd->nd_next;
23760Sstevel@tonic-gate }
23770Sstevel@tonic-gate if (!nd) {
23780Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
23794932Spetede sd->sd_mn_mynode->nd_nodename, NULL,
23804932Spetede sp->setname);
23810Sstevel@tonic-gate rval = -1;
23820Sstevel@tonic-gate goto out;
23830Sstevel@tonic-gate }
23840Sstevel@tonic-gate
23850Sstevel@tonic-gate /*
23860Sstevel@tonic-gate * Need to return failure if host is already 'joined'
23870Sstevel@tonic-gate * into the set. This is done so that if later the user
23880Sstevel@tonic-gate * issues a command to join all sets and a failure is
23890Sstevel@tonic-gate * encountered - that the resulting cleanup effort
23900Sstevel@tonic-gate * (withdrawing from all sets that were joined
23910Sstevel@tonic-gate * during that command) won't withdraw from this set.
23920Sstevel@tonic-gate */
23930Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) {
23940Sstevel@tonic-gate rval = -2;
23950Sstevel@tonic-gate goto out2;
23960Sstevel@tonic-gate }
23970Sstevel@tonic-gate
23980Sstevel@tonic-gate /*
23990Sstevel@tonic-gate * Call metaget_setownership that calls each node in diskset and
24000Sstevel@tonic-gate * marks in set descriptor if node is an owner of the set or not.
24010Sstevel@tonic-gate * metaget_setownership checks to see if a node is an owner by
24020Sstevel@tonic-gate * checking to see if that node's kernel has the mddb loaded.
24030Sstevel@tonic-gate * If a node had panic'd during a reconfig or an
24040Sstevel@tonic-gate * add/delete/join/withdraw operation, the other nodes' node
24050Sstevel@tonic-gate * records may not reflect the current state of the diskset,
24060Sstevel@tonic-gate * so calling metaget_setownership is the safest thing to do.
24070Sstevel@tonic-gate */
24080Sstevel@tonic-gate if (metaget_setownership(sp, ep) == -1) {
24090Sstevel@tonic-gate rval = -1;
24100Sstevel@tonic-gate goto out;
24110Sstevel@tonic-gate }
24120Sstevel@tonic-gate
24130Sstevel@tonic-gate /* If first active member of diskset, become the master. */
24140Sstevel@tonic-gate nd = sd->sd_nodelist;
24150Sstevel@tonic-gate while (nd) {
24160Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN)
24170Sstevel@tonic-gate break;
24180Sstevel@tonic-gate nd = nd->nd_next;
24190Sstevel@tonic-gate }
24200Sstevel@tonic-gate if (nd == NULL)
24210Sstevel@tonic-gate master_flag = 1;
24220Sstevel@tonic-gate
24230Sstevel@tonic-gate /*
24240Sstevel@tonic-gate * If not first active member of diskset, then get the
24250Sstevel@tonic-gate * master information from a node that is already joined
24260Sstevel@tonic-gate * and set the master information for this node. Be sure
24270Sstevel@tonic-gate * that this node (the already joined node) has its own
24280Sstevel@tonic-gate * join flag set. If not, then this diskset isn't currently
24290Sstevel@tonic-gate * consistent and shouldn't allow a node to join. This diskset
24300Sstevel@tonic-gate * inconsistency should only occur when a node has panic'd in
24310Sstevel@tonic-gate * the set while doing a metaset operation and the sysadmin is
24320Sstevel@tonic-gate * attempting to join a node into the set. This inconsistency
24330Sstevel@tonic-gate * will be fixed during a reconfig cycle which should be occurring
24340Sstevel@tonic-gate * soon since a node panic'd.
24350Sstevel@tonic-gate *
24360Sstevel@tonic-gate * If unable to get this information from an owning node, then
24370Sstevel@tonic-gate * this diskset isn't currently consistent and shouldn't
24380Sstevel@tonic-gate * allow a node to join.
24390Sstevel@tonic-gate */
24400Sstevel@tonic-gate if (!master_flag) {
24410Sstevel@tonic-gate /* get master information from an owner (joined) node */
24420Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname,
24430Sstevel@tonic-gate sp->setno, &mas_mnsr, ep) == -1) {
24440Sstevel@tonic-gate rval = -1;
24450Sstevel@tonic-gate goto out;
24460Sstevel@tonic-gate }
24470Sstevel@tonic-gate
24480Sstevel@tonic-gate /* Verify that owner (joined) node has its own JOIN flag set */
24490Sstevel@tonic-gate nr = mas_mnsr->sr_nodechain;
24500Sstevel@tonic-gate while (nr) {
24510Sstevel@tonic-gate if ((nd->nd_nodeid == nr->nr_nodeid) &&
24520Sstevel@tonic-gate ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
24530Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOSET,
24540Sstevel@tonic-gate sp->setno, nd->nd_nodename, NULL,
24550Sstevel@tonic-gate nd->nd_nodename);
24560Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr);
24570Sstevel@tonic-gate rval = -1;
24580Sstevel@tonic-gate goto out;
24590Sstevel@tonic-gate }
24600Sstevel@tonic-gate nr = nr->nr_next;
24610Sstevel@tonic-gate }
24620Sstevel@tonic-gate
24630Sstevel@tonic-gate /*
24640Sstevel@tonic-gate * Does master have set marked as STALE?
24650Sstevel@tonic-gate * If so, need to pass this down to kernel when
24660Sstevel@tonic-gate * this node snarfs the set.
24670Sstevel@tonic-gate */
24680Sstevel@tonic-gate if (clnt_mn_is_stale(nd->nd_nodename, sp,
24690Sstevel@tonic-gate &stale_bool, ep) == -1) {
24700Sstevel@tonic-gate rval = -1;
24710Sstevel@tonic-gate goto out;
24720Sstevel@tonic-gate }
24730Sstevel@tonic-gate
24740Sstevel@tonic-gate /* set master information in my rpc.metad's set record */
24750Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
24760Sstevel@tonic-gate mas_mnsr->sr_master_nodeid, ep)) {
24770Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr);
24780Sstevel@tonic-gate rval = -1;
24790Sstevel@tonic-gate goto out;
24800Sstevel@tonic-gate }
24810Sstevel@tonic-gate
24820Sstevel@tonic-gate /* set master information in my cached set desc */
24830Sstevel@tonic-gate (void) strcpy(sd->sd_mn_master_nodenm,
24840Sstevel@tonic-gate mas_mnsr->sr_master_nodenm);
24850Sstevel@tonic-gate sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
24860Sstevel@tonic-gate nd2 = sd->sd_nodelist;
24870Sstevel@tonic-gate while (nd2) {
24884932Spetede if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
24894932Spetede sd->sd_mn_masternode = nd2;
24904932Spetede break;
24914932Spetede }
24924932Spetede nd2 = nd2->nd_next;
24930Sstevel@tonic-gate }
24940Sstevel@tonic-gate free_sr((md_set_record *)mas_mnsr);
24950Sstevel@tonic-gate
24960Sstevel@tonic-gate /*
24970Sstevel@tonic-gate * Set the node flags in mynode's rpc.metad node records for
24980Sstevel@tonic-gate * the nodes that are in the diskset. Can use my sd
24990Sstevel@tonic-gate * since earlier call to metaget_setownership set the
25000Sstevel@tonic-gate * owner flags based on whether that node had snarfed
25010Sstevel@tonic-gate * the MN diskset mddb. Reconfig steps guarantee that
25020Sstevel@tonic-gate * return of metaget_setownership will match the owning
25030Sstevel@tonic-gate * node's owner list except in the case where a node
25040Sstevel@tonic-gate * has just panic'd and in this case, a reconfig will
25050Sstevel@tonic-gate * be starting immediately and the owner lists will
25060Sstevel@tonic-gate * be sync'd up by the reconfig.
25070Sstevel@tonic-gate *
25080Sstevel@tonic-gate * Flag of SET means to take no action except to
25090Sstevel@tonic-gate * set the node flags as given in the nodelist linked list.
25100Sstevel@tonic-gate */
25110Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
25120Sstevel@tonic-gate MD_NR_SET, NULL, ep)) {
25130Sstevel@tonic-gate rval = -1;
25140Sstevel@tonic-gate goto out;
25150Sstevel@tonic-gate }
25160Sstevel@tonic-gate }
25170Sstevel@tonic-gate
25180Sstevel@tonic-gate /*
25190Sstevel@tonic-gate * Read in the mddb if there are drives in the set.
25200Sstevel@tonic-gate */
25210Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
25220Sstevel@tonic-gate ep)) == NULL) {
25230Sstevel@tonic-gate /* No drives in list */
25240Sstevel@tonic-gate if (! mdisok(ep)) {
25250Sstevel@tonic-gate rval = -1;
25260Sstevel@tonic-gate goto out;
25270Sstevel@tonic-gate }
25280Sstevel@tonic-gate rval = -2;
25290Sstevel@tonic-gate goto out;
25300Sstevel@tonic-gate }
25310Sstevel@tonic-gate
25320Sstevel@tonic-gate /*
25330Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change.
25340Sstevel@tonic-gate * Start by suspending rpc.mdcommd (which drains it of all messages),
25350Sstevel@tonic-gate * then change the nodelist followed by a reinit and resume.
25360Sstevel@tonic-gate */
25370Sstevel@tonic-gate nd = sd->sd_nodelist;
25380Sstevel@tonic-gate while (nd) {
25390Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
25400Sstevel@tonic-gate nd = nd->nd_next;
25410Sstevel@tonic-gate continue;
25420Sstevel@tonic-gate }
25430Sstevel@tonic-gate
25440Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
25450Sstevel@tonic-gate MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
25460Sstevel@tonic-gate rval = -1;
25470Sstevel@tonic-gate goto out;
25480Sstevel@tonic-gate }
25490Sstevel@tonic-gate suspendall_flag = 1;
25500Sstevel@tonic-gate nd = nd->nd_next;
25510Sstevel@tonic-gate }
25520Sstevel@tonic-gate
25530Sstevel@tonic-gate /* Set master in my set record in rpc.metad */
25540Sstevel@tonic-gate if (master_flag) {
25550Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp,
25560Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename,
25570Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodeid, ep)) {
25580Sstevel@tonic-gate rval = -1;
25590Sstevel@tonic-gate goto out;
25600Sstevel@tonic-gate }
25610Sstevel@tonic-gate }
2562650Sskamm /*
2563650Sskamm * Causes mddbs to be loaded into the kernel.
2564650Sskamm * Set the force flag so that replica locations can be
2565650Sskamm * loaded into the kernel even if a mediator node was
2566650Sskamm * unavailable. This allows a node to join an MO
2567650Sskamm * diskset when there are sufficient replicas available,
2568650Sskamm * but a mediator node in unavailable.
2569650Sskamm */
2570650Sskamm if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
25710Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
25720Sstevel@tonic-gate "Host not able to start diskset."));
25730Sstevel@tonic-gate rval = -1;
25740Sstevel@tonic-gate goto out;
25750Sstevel@tonic-gate }
25760Sstevel@tonic-gate
25770Sstevel@tonic-gate if (! mdisok(ep)) {
25780Sstevel@tonic-gate rval = -1;
25790Sstevel@tonic-gate goto out;
25800Sstevel@tonic-gate }
25810Sstevel@tonic-gate
25820Sstevel@tonic-gate /*
25830Sstevel@tonic-gate * Set rollback flags to 1 so that halt_set is called if a failure
25840Sstevel@tonic-gate * is seen after this point. If snarf_set fails, still need to
25850Sstevel@tonic-gate * call halt_set to cleanup the diskset.
25860Sstevel@tonic-gate */
25870Sstevel@tonic-gate rb_flags = 1;
25880Sstevel@tonic-gate
25890Sstevel@tonic-gate /* Starts the set */
25900Sstevel@tonic-gate if (snarf_set(sp, stale_bool, ep) != 0) {
25910Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_STALE)) {
25920Sstevel@tonic-gate /*
25930Sstevel@tonic-gate * Don't fail join, STALE means that set has
25940Sstevel@tonic-gate * < 50% mddbs.
25950Sstevel@tonic-gate */
25960Sstevel@tonic-gate (void) mdstealerror(&ep_snarf, ep);
25970Sstevel@tonic-gate stale_set = 1;
25980Sstevel@tonic-gate } else if (mdisok(ep)) {
25990Sstevel@tonic-gate /* If snarf failed, but no error was set - set it */
260062Sjeanm (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
26010Sstevel@tonic-gate sp->setno, 0, NULL);
26020Sstevel@tonic-gate rval = -1;
26030Sstevel@tonic-gate goto out;
26040Sstevel@tonic-gate } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
26050Sstevel@tonic-gate /*
26060Sstevel@tonic-gate * Don't fail join if ACCOK; ACCOK means that mediator
26070Sstevel@tonic-gate * provided extra vote.
26080Sstevel@tonic-gate */
26090Sstevel@tonic-gate rval = -1;
26100Sstevel@tonic-gate goto out;
26110Sstevel@tonic-gate }
26120Sstevel@tonic-gate }
26130Sstevel@tonic-gate
26140Sstevel@tonic-gate /* Did set really get snarfed? */
26150Sstevel@tonic-gate if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
26160Sstevel@tonic-gate if (mdisok(ep)) {
26170Sstevel@tonic-gate /* If snarf failed, but no error was set - set it */
261862Sjeanm (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
26194932Spetede sp->setno, 0, NULL);
26200Sstevel@tonic-gate }
26210Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
26220Sstevel@tonic-gate "Host not able to start diskset."));
26230Sstevel@tonic-gate rval = -1;
26240Sstevel@tonic-gate goto out;
26250Sstevel@tonic-gate }
26260Sstevel@tonic-gate
26270Sstevel@tonic-gate /* Change to nodelist so need to send reinit to rpc.mdcommd */
26280Sstevel@tonic-gate send_reinit = 1;
26290Sstevel@tonic-gate
26300Sstevel@tonic-gate /* If first node to enter set, setup master and clear change log */
26310Sstevel@tonic-gate if (master_flag) {
26320Sstevel@tonic-gate /* Set master in my locally cached set descriptor */
26330Sstevel@tonic-gate (void) strcpy(sd->sd_mn_master_nodenm,
26340Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodename);
26350Sstevel@tonic-gate sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
26360Sstevel@tonic-gate sd->sd_mn_am_i_master = 1;
26370Sstevel@tonic-gate
26380Sstevel@tonic-gate /*
26390Sstevel@tonic-gate * If first node to join set, then clear out change log
26400Sstevel@tonic-gate * entries. Change log entries are only needed when a
26410Sstevel@tonic-gate * change of master is occurring in a diskset that has
26420Sstevel@tonic-gate * multiple owners. Since this node is the first owner
26430Sstevel@tonic-gate * of the diskset, clear the entries.
26440Sstevel@tonic-gate *
26450Sstevel@tonic-gate * Only do this if we are in a single node non-SC3.x
26460Sstevel@tonic-gate * situation.
26470Sstevel@tonic-gate */
26480Sstevel@tonic-gate if (meta_mn_singlenode() &&
26494932Spetede mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) {
26500Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
26510Sstevel@tonic-gate "Unable to reset changelog."));
26520Sstevel@tonic-gate rval = -1;
26530Sstevel@tonic-gate goto out;
26540Sstevel@tonic-gate }
26550Sstevel@tonic-gate }
26560Sstevel@tonic-gate
26570Sstevel@tonic-gate /* Set my locally cached flag */
26580Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
26590Sstevel@tonic-gate
26600Sstevel@tonic-gate /*
26610Sstevel@tonic-gate * Set this node's own flag on all joined nodes in the set
26620Sstevel@tonic-gate * (including my node).
26630Sstevel@tonic-gate */
26640Sstevel@tonic-gate clear_nr_flags = 1;
26650Sstevel@tonic-gate
26660Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode);
26670Sstevel@tonic-gate my_nd.nd_next = NULL;
26680Sstevel@tonic-gate nd = sd->sd_nodelist;
26690Sstevel@tonic-gate while (nd) {
26700Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
26710Sstevel@tonic-gate nd = nd->nd_next;
26720Sstevel@tonic-gate continue;
26730Sstevel@tonic-gate }
26740Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
26750Sstevel@tonic-gate MD_NR_JOIN, NULL, ep)) {
26760Sstevel@tonic-gate rval = -1;
26770Sstevel@tonic-gate goto out;
26780Sstevel@tonic-gate }
26790Sstevel@tonic-gate nd = nd->nd_next;
26800Sstevel@tonic-gate }
26810Sstevel@tonic-gate
26820Sstevel@tonic-gate out:
26830Sstevel@tonic-gate if (rval != NULL) {
26840Sstevel@tonic-gate /*
26850Sstevel@tonic-gate * If rollback flag is 1, then node was joined to set.
26860Sstevel@tonic-gate * Since an error occurred, withdraw node from set in
26870Sstevel@tonic-gate * order to rollback to before command was run.
26880Sstevel@tonic-gate * Need to preserve ep so that calling function can
26890Sstevel@tonic-gate * get error information.
26900Sstevel@tonic-gate */
26910Sstevel@tonic-gate if (rb_flags == 1) {
26920Sstevel@tonic-gate if (halt_set(sp, &xep)) {
26930Sstevel@tonic-gate mdclrerror(&xep);
26940Sstevel@tonic-gate }
26950Sstevel@tonic-gate }
26960Sstevel@tonic-gate
26970Sstevel@tonic-gate /*
26980Sstevel@tonic-gate * If error, reset master to INVALID.
26990Sstevel@tonic-gate * Ignore error since (next) first node to successfully join
27000Sstevel@tonic-gate * will set master on all nodes.
27010Sstevel@tonic-gate */
27020Sstevel@tonic-gate (void) clnt_mnsetmaster(mynode(), sp, "",
27034932Spetede MD_MN_INVALID_NID, &xep);
27040Sstevel@tonic-gate mdclrerror(&xep);
27050Sstevel@tonic-gate /* Reset master in my locally cached set descriptor */
27060Sstevel@tonic-gate sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
27070Sstevel@tonic-gate sd->sd_mn_am_i_master = 0;
27080Sstevel@tonic-gate
27090Sstevel@tonic-gate /*
27100Sstevel@tonic-gate * If nr flags set on other nodes, reset them.
27110Sstevel@tonic-gate */
27120Sstevel@tonic-gate if (clear_nr_flags) {
27130Sstevel@tonic-gate nd = sd->sd_nodelist;
27140Sstevel@tonic-gate while (nd) {
27150Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
27160Sstevel@tonic-gate nd = nd->nd_next;
27170Sstevel@tonic-gate continue;
27180Sstevel@tonic-gate }
27190Sstevel@tonic-gate (void) clnt_upd_nr_flags(nd->nd_nodename, sp,
27204932Spetede &my_nd, MD_NR_WITHDRAW, NULL, &xep);
27210Sstevel@tonic-gate mdclrerror(&xep);
27220Sstevel@tonic-gate nd = nd->nd_next;
27230Sstevel@tonic-gate }
27240Sstevel@tonic-gate /* Reset my locally cached flag */
27250Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
27260Sstevel@tonic-gate }
27270Sstevel@tonic-gate }
27280Sstevel@tonic-gate
27290Sstevel@tonic-gate /*
27300Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change.
27310Sstevel@tonic-gate * Send reinit command to mdcommd which forces it to get
27320Sstevel@tonic-gate * fresh set description.
27330Sstevel@tonic-gate */
27340Sstevel@tonic-gate if (send_reinit) {
27350Sstevel@tonic-gate /* Send reinit */
27360Sstevel@tonic-gate nd = sd->sd_nodelist;
27370Sstevel@tonic-gate while (nd) {
27380Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
27390Sstevel@tonic-gate nd = nd->nd_next;
27400Sstevel@tonic-gate continue;
27410Sstevel@tonic-gate }
27420Sstevel@tonic-gate
27430Sstevel@tonic-gate /* Class is ignored for REINIT */
27440Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
27454932Spetede sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
27460Sstevel@tonic-gate /*
27470Sstevel@tonic-gate * We are here because we failed to resume
27480Sstevel@tonic-gate * rpc.mdcommd. However we potentially have
27490Sstevel@tonic-gate * an error from the previous call
27500Sstevel@tonic-gate * If the previous call did fail, we capture
27510Sstevel@tonic-gate * that error and generate a perror with
27520Sstevel@tonic-gate * the string, "Unable to resume...".
27530Sstevel@tonic-gate * Setting rval to -1 ensures that in the
27540Sstevel@tonic-gate * next iteration of the loop, ep is not
27550Sstevel@tonic-gate * clobbered.
27560Sstevel@tonic-gate */
27570Sstevel@tonic-gate if (rval == 0)
27580Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
27590Sstevel@tonic-gate else
27600Sstevel@tonic-gate mdclrerror(&xep);
27610Sstevel@tonic-gate rval = -1;
27620Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
27630Sstevel@tonic-gate "Unable to reinit rpc.mdcommd."));
27640Sstevel@tonic-gate }
27650Sstevel@tonic-gate nd = nd->nd_next;
27660Sstevel@tonic-gate }
27670Sstevel@tonic-gate
27680Sstevel@tonic-gate }
27690Sstevel@tonic-gate
27700Sstevel@tonic-gate out2:
27710Sstevel@tonic-gate /*
27720Sstevel@tonic-gate * Unlock diskset by resuming messages across the diskset.
27730Sstevel@tonic-gate * Just resume all classes so that resume is the same whether
27740Sstevel@tonic-gate * just one class was locked or all classes were locked.
27750Sstevel@tonic-gate */
27760Sstevel@tonic-gate if ((suspend1_flag) || (suspendall_flag)) {
27770Sstevel@tonic-gate nd = sd->sd_nodelist;
27780Sstevel@tonic-gate while (nd) {
27790Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
27800Sstevel@tonic-gate nd = nd->nd_next;
27810Sstevel@tonic-gate continue;
27820Sstevel@tonic-gate }
27830Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
27844932Spetede sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
27850Sstevel@tonic-gate /*
27860Sstevel@tonic-gate * We are here because we failed to resume
27870Sstevel@tonic-gate * rpc.mdcommd. However we potentially have
27880Sstevel@tonic-gate * an error from the previous call
27890Sstevel@tonic-gate * If the previous call did fail, we capture
27900Sstevel@tonic-gate * that error and generate a perror with
27910Sstevel@tonic-gate * the string, "Unable to resume...".
27920Sstevel@tonic-gate * Setting rval to -1 ensures that in the
27930Sstevel@tonic-gate * next iteration of the loop, ep is not
27940Sstevel@tonic-gate * clobbered.
27950Sstevel@tonic-gate */
27960Sstevel@tonic-gate if (rval == 0)
27970Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
27980Sstevel@tonic-gate else
27990Sstevel@tonic-gate mdclrerror(&xep);
28000Sstevel@tonic-gate rval = -1;
28010Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
28020Sstevel@tonic-gate "Unable to resume rpc.mdcommd."));
28030Sstevel@tonic-gate }
28040Sstevel@tonic-gate nd = nd->nd_next;
28050Sstevel@tonic-gate }
28060Sstevel@tonic-gate meta_ping_mnset(sp->setno);
28070Sstevel@tonic-gate }
28080Sstevel@tonic-gate
28090Sstevel@tonic-gate /*
28100Sstevel@tonic-gate * Unlock set. This flushes the caches on the servers.
28110Sstevel@tonic-gate */
28120Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
28130Sstevel@tonic-gate nd = sd->sd_nodelist;
28140Sstevel@tonic-gate while (nd) {
28150Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
28160Sstevel@tonic-gate nd = nd->nd_next;
28170Sstevel@tonic-gate continue;
28180Sstevel@tonic-gate }
28190Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
28200Sstevel@tonic-gate if (rval == 0)
28210Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
28220Sstevel@tonic-gate else
28230Sstevel@tonic-gate mdclrerror(&xep);
28240Sstevel@tonic-gate rval = -1;
28250Sstevel@tonic-gate }
28260Sstevel@tonic-gate nd = nd->nd_next;
28270Sstevel@tonic-gate }
28280Sstevel@tonic-gate
28290Sstevel@tonic-gate /*
28300Sstevel@tonic-gate * If this node is the last to join the diskset and clustering isn't
28310Sstevel@tonic-gate * running, then resync the mirrors in the diskset. We have to wait
28320Sstevel@tonic-gate * until all nodes are joined so that the status gets propagated to
28330Sstevel@tonic-gate * all of the members of the set.
28340Sstevel@tonic-gate * Ignore any error from the resync as the join function shouldn't fail
28350Sstevel@tonic-gate * because the mirror resync had a problem.
28360Sstevel@tonic-gate *
28370Sstevel@tonic-gate * Don't start resync if set is stale.
28380Sstevel@tonic-gate */
28390Sstevel@tonic-gate if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
28400Sstevel@tonic-gate (stale_set != 1)) {
28410Sstevel@tonic-gate nd = sd->sd_nodelist;
28420Sstevel@tonic-gate while (nd) {
28430Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN))
28440Sstevel@tonic-gate break;
28450Sstevel@tonic-gate nd = nd->nd_next;
28460Sstevel@tonic-gate }
28470Sstevel@tonic-gate /*
28480Sstevel@tonic-gate * nd set to NULL means that we have no nodes in the set that
28490Sstevel@tonic-gate * haven't joined. In this case we start the resync.
28500Sstevel@tonic-gate */
28510Sstevel@tonic-gate if (nd == NULL) {
28520Sstevel@tonic-gate (void) meta_mirror_resync_all(sp, 0, &xep);
28530Sstevel@tonic-gate mdclrerror(&xep);
28540Sstevel@tonic-gate }
28550Sstevel@tonic-gate }
28560Sstevel@tonic-gate
28570Sstevel@tonic-gate /* Update ABR state for all soft partitions */
28580Sstevel@tonic-gate (void) meta_sp_update_abr(sp, &xep);
28590Sstevel@tonic-gate mdclrerror(&xep);
28600Sstevel@tonic-gate
28610Sstevel@tonic-gate /*
28620Sstevel@tonic-gate * call metaflushsetnames to reset local cache for master and
28630Sstevel@tonic-gate * node information.
28640Sstevel@tonic-gate */
28650Sstevel@tonic-gate metaflushsetname(sp);
28660Sstevel@tonic-gate
28670Sstevel@tonic-gate /* release signals back to what they were on entry */
28680Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0)
28690Sstevel@tonic-gate mdclrerror(&xep);
28700Sstevel@tonic-gate
28710Sstevel@tonic-gate /*
28720Sstevel@tonic-gate * If no error and stale_set is set, then set ep back
28730Sstevel@tonic-gate * to ep from snarf_set call and return -3. If another error
28740Sstevel@tonic-gate * occurred and rval is not 0, then that error would have
28750Sstevel@tonic-gate * caused the node to be withdrawn from the set and would
28760Sstevel@tonic-gate * have set ep to that error information.
28770Sstevel@tonic-gate */
28780Sstevel@tonic-gate if ((rval == 0) && (stale_set)) {
28790Sstevel@tonic-gate (void) mdstealerror(ep, &ep_snarf);
28800Sstevel@tonic-gate return (-3);
28810Sstevel@tonic-gate }
28820Sstevel@tonic-gate
28830Sstevel@tonic-gate return (rval);
28840Sstevel@tonic-gate }
28850Sstevel@tonic-gate
28860Sstevel@tonic-gate /*
28870Sstevel@tonic-gate * Entry point to withdraw a node from MultiNode diskset.
28880Sstevel@tonic-gate *
28890Sstevel@tonic-gate * Validate host in diskset.
28900Sstevel@tonic-gate * - Should be joined into diskset.
28910Sstevel@tonic-gate * Assume valid configuration is stored in the set/drive/node records
28920Sstevel@tonic-gate * in the local mddb since no node or drive can be added to the MNset
28930Sstevel@tonic-gate * unless all drives and nodes are available. Reconfig steps will
28940Sstevel@tonic-gate * resync all ALIVE nodes in case of panic in critical areas.
28950Sstevel@tonic-gate *
28960Sstevel@tonic-gate * Lock down the set.
28970Sstevel@tonic-gate * Verify that drives exist in configuration.
28980Sstevel@tonic-gate * Verify host is a member of this diskset.
28990Sstevel@tonic-gate * Verify host is an owner of the diskset (host is joined to diskset).
29000Sstevel@tonic-gate * Only allow withdrawal of master node if master node is the only joined
29010Sstevel@tonic-gate * in the diskset.
29020Sstevel@tonic-gate * Halt the diskset on this node.
29030Sstevel@tonic-gate * Reset Master on this node.
29040Sstevel@tonic-gate * Updated node flags that this node with withdrawn.
29050Sstevel@tonic-gate * Unlock the set.
29060Sstevel@tonic-gate *
29070Sstevel@tonic-gate * Return values:
29080Sstevel@tonic-gate * 0 - Node successfully withdrew from set.
29090Sstevel@tonic-gate * -1 - Withdrawal attempted but failed
29100Sstevel@tonic-gate * - any failure from libmeta calls
29110Sstevel@tonic-gate * - node not in the member list
29120Sstevel@tonic-gate * -2 - Withdrawal not attempted since
29130Sstevel@tonic-gate * - this set had no drives in set
29140Sstevel@tonic-gate * - this node not joined to set
29150Sstevel@tonic-gate * - set is not a multinode set
29160Sstevel@tonic-gate */
29170Sstevel@tonic-gate extern int
meta_set_withdraw(mdsetname_t * sp,md_error_t * ep)29180Sstevel@tonic-gate meta_set_withdraw(
29190Sstevel@tonic-gate mdsetname_t *sp,
29200Sstevel@tonic-gate md_error_t *ep
29210Sstevel@tonic-gate )
29220Sstevel@tonic-gate {
29230Sstevel@tonic-gate md_set_desc *sd;
29240Sstevel@tonic-gate md_drive_desc *dd = 0;
29250Sstevel@tonic-gate md_mnnode_desc *nd, my_nd;
29260Sstevel@tonic-gate int rval = 0;
29270Sstevel@tonic-gate md_setkey_t *cl_sk;
29280Sstevel@tonic-gate md_error_t xep = mdnullerror;
29290Sstevel@tonic-gate int set_halted = 0;
29300Sstevel@tonic-gate int suspendall_flag = 0;
29310Sstevel@tonic-gate int suspend1_flag = 0;
29320Sstevel@tonic-gate bool_t stale_bool = FALSE;
29330Sstevel@tonic-gate mddb_config_t c;
29340Sstevel@tonic-gate int node_id_list[1];
29350Sstevel@tonic-gate sigset_t oldsigs;
29360Sstevel@tonic-gate int send_reinit = 0;
29370Sstevel@tonic-gate
29380Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
29390Sstevel@tonic-gate return (-1);
29400Sstevel@tonic-gate }
29410Sstevel@tonic-gate
29420Sstevel@tonic-gate /* Must be a multinode diskset */
29430Sstevel@tonic-gate if (!MD_MNSET_DESC(sd)) {
29440Sstevel@tonic-gate (void) mderror(ep, MDE_NOT_MN, sp->setname);
29450Sstevel@tonic-gate return (-1);
29460Sstevel@tonic-gate }
29470Sstevel@tonic-gate
29480Sstevel@tonic-gate /* Make sure we are blocking all signals */
29490Sstevel@tonic-gate if (procsigs(TRUE, &oldsigs, &xep) < 0)
29500Sstevel@tonic-gate mdclrerror(&xep);
29510Sstevel@tonic-gate
29520Sstevel@tonic-gate /*
29530Sstevel@tonic-gate * Lock the set on current set members.
29540Sstevel@tonic-gate * For MN diskset lock_set and SUSPEND are used to protect against
29550Sstevel@tonic-gate * other meta* commands running on the other nodes.
29560Sstevel@tonic-gate */
29570Sstevel@tonic-gate nd = sd->sd_nodelist;
29580Sstevel@tonic-gate while (nd) {
29590Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
29600Sstevel@tonic-gate nd = nd->nd_next;
29610Sstevel@tonic-gate continue;
29620Sstevel@tonic-gate }
29630Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
29640Sstevel@tonic-gate rval = -1;
29650Sstevel@tonic-gate goto out;
29660Sstevel@tonic-gate }
29670Sstevel@tonic-gate nd = nd->nd_next;
29680Sstevel@tonic-gate }
29690Sstevel@tonic-gate /*
29700Sstevel@tonic-gate * Lock out other meta* commands by suspending
29710Sstevel@tonic-gate * class 1 messages across the diskset.
29720Sstevel@tonic-gate */
29730Sstevel@tonic-gate nd = sd->sd_nodelist;
29740Sstevel@tonic-gate while (nd) {
29750Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
29760Sstevel@tonic-gate nd = nd->nd_next;
29770Sstevel@tonic-gate continue;
29780Sstevel@tonic-gate }
29790Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
29804932Spetede sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
29810Sstevel@tonic-gate rval = -1;
29820Sstevel@tonic-gate goto out;
29830Sstevel@tonic-gate }
29840Sstevel@tonic-gate suspend1_flag = 1;
29850Sstevel@tonic-gate nd = nd->nd_next;
29860Sstevel@tonic-gate }
29870Sstevel@tonic-gate
29880Sstevel@tonic-gate /* Get list of drives - needed in case of failure */
29890Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
29900Sstevel@tonic-gate ep)) == NULL) {
29910Sstevel@tonic-gate /* Error getting drives in list */
29920Sstevel@tonic-gate if (! mdisok(ep)) {
29930Sstevel@tonic-gate rval = -1;
29940Sstevel@tonic-gate goto out2;
29950Sstevel@tonic-gate }
29960Sstevel@tonic-gate /* no drives in list */
29970Sstevel@tonic-gate rval = -2;
29980Sstevel@tonic-gate goto out2;
29990Sstevel@tonic-gate }
30000Sstevel@tonic-gate
30010Sstevel@tonic-gate /*
30020Sstevel@tonic-gate * Verify that this host is a member (in the host list) of the set.
30030Sstevel@tonic-gate */
30040Sstevel@tonic-gate nd = sd->sd_nodelist;
30050Sstevel@tonic-gate while (nd) {
30060Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) {
30070Sstevel@tonic-gate break;
30080Sstevel@tonic-gate }
30090Sstevel@tonic-gate nd = nd->nd_next;
30100Sstevel@tonic-gate }
30110Sstevel@tonic-gate if (!nd) {
30120Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
30134932Spetede sd->sd_mn_mynode->nd_nodename, NULL,
30144932Spetede sp->setname);
30150Sstevel@tonic-gate rval = -1;
30160Sstevel@tonic-gate goto out2;
30170Sstevel@tonic-gate }
30180Sstevel@tonic-gate
30190Sstevel@tonic-gate /*
30200Sstevel@tonic-gate * Call metaget_setownership that calls each node in diskset and
30210Sstevel@tonic-gate * marks in set descriptor if node is an owner of the set or not.
30220Sstevel@tonic-gate * metaget_setownership checks to see if a node is an owner by
30230Sstevel@tonic-gate * checking to see if that node's kernel has the mddb loaded.
30240Sstevel@tonic-gate * If a node had panic'd during a reconfig or an
30250Sstevel@tonic-gate * add/delete/join/withdraw operation, the other nodes' node
30260Sstevel@tonic-gate * records may not reflect the current state of the diskset,
30270Sstevel@tonic-gate * so calling metaget_setownership is the safest thing to do.
30280Sstevel@tonic-gate */
30290Sstevel@tonic-gate if (metaget_setownership(sp, ep) == -1) {
30300Sstevel@tonic-gate rval = -1;
30310Sstevel@tonic-gate goto out2;
30320Sstevel@tonic-gate }
30330Sstevel@tonic-gate
30340Sstevel@tonic-gate /*
30350Sstevel@tonic-gate * Verify that this node is joined
30360Sstevel@tonic-gate * to diskset (i.e. is an owner of the diskset).
30370Sstevel@tonic-gate */
30380Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
30390Sstevel@tonic-gate rval = -2;
30400Sstevel@tonic-gate goto out2;
30410Sstevel@tonic-gate }
30420Sstevel@tonic-gate
30430Sstevel@tonic-gate /*
30440Sstevel@tonic-gate * For a MN diskset, only withdraw master if it is
30450Sstevel@tonic-gate * the only joined node.
30460Sstevel@tonic-gate */
30470Sstevel@tonic-gate if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
30480Sstevel@tonic-gate nd = sd->sd_nodelist;
30490Sstevel@tonic-gate while (nd) {
30500Sstevel@tonic-gate /* Skip my node since checking for other owners */
30510Sstevel@tonic-gate if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
30520Sstevel@tonic-gate nd = nd->nd_next;
30530Sstevel@tonic-gate continue;
30540Sstevel@tonic-gate }
30550Sstevel@tonic-gate /* If another owner node if found, error */
30560Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) {
30570Sstevel@tonic-gate (void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
30584932Spetede sp->setno,
30594932Spetede sd->sd_mn_mynode->nd_nodename, NULL,
30604932Spetede sp->setname);
30610Sstevel@tonic-gate rval = -1;
30620Sstevel@tonic-gate goto out2;
30630Sstevel@tonic-gate }
30640Sstevel@tonic-gate nd = nd->nd_next;
30650Sstevel@tonic-gate }
30660Sstevel@tonic-gate }
30670Sstevel@tonic-gate
30680Sstevel@tonic-gate /*
30690Sstevel@tonic-gate * Is current set STALE?
30700Sstevel@tonic-gate */
30710Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c));
30720Sstevel@tonic-gate c.c_id = 0;
30730Sstevel@tonic-gate c.c_setno = sp->setno;
30740Sstevel@tonic-gate if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
30750Sstevel@tonic-gate (void) mdstealerror(ep, &c.c_mde);
30760Sstevel@tonic-gate rval = -1;
30770Sstevel@tonic-gate goto out;
30780Sstevel@tonic-gate }
30790Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) {
30800Sstevel@tonic-gate stale_bool = TRUE;
30810Sstevel@tonic-gate }
30820Sstevel@tonic-gate
30830Sstevel@tonic-gate /*
30840Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change.
30850Sstevel@tonic-gate * Start by suspending rpc.mdcommd (which drains it of all messages),
30860Sstevel@tonic-gate * then change the nodelist followed by a reinit and resume.
30870Sstevel@tonic-gate */
30880Sstevel@tonic-gate nd = sd->sd_nodelist;
30890Sstevel@tonic-gate while (nd) {
30900Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
30910Sstevel@tonic-gate nd = nd->nd_next;
30920Sstevel@tonic-gate continue;
30930Sstevel@tonic-gate }
30940Sstevel@tonic-gate
30950Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
30960Sstevel@tonic-gate sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
30970Sstevel@tonic-gate rval = -1;
30980Sstevel@tonic-gate goto out;
30990Sstevel@tonic-gate }
31000Sstevel@tonic-gate suspendall_flag = 1;
31010Sstevel@tonic-gate nd = nd->nd_next;
31020Sstevel@tonic-gate }
31030Sstevel@tonic-gate
31040Sstevel@tonic-gate /*
31050Sstevel@tonic-gate * Withdraw the set - halt set.
31060Sstevel@tonic-gate * This will fail if any I/O is occuring to any metadevice which
31070Sstevel@tonic-gate * includes a resync to a mirror metadevice.
31080Sstevel@tonic-gate */
31090Sstevel@tonic-gate set_halted = 1;
31100Sstevel@tonic-gate if (halt_set(sp, ep)) {
31110Sstevel@tonic-gate /* Was set actually halted? */
31120Sstevel@tonic-gate if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
31130Sstevel@tonic-gate set_halted = 0;
31140Sstevel@tonic-gate }
31150Sstevel@tonic-gate rval = -1;
31160Sstevel@tonic-gate goto out;
31170Sstevel@tonic-gate }
31180Sstevel@tonic-gate
31190Sstevel@tonic-gate /* Change to nodelist so need to send reinit to rpc.mdcommd */
31200Sstevel@tonic-gate send_reinit = 1;
31210Sstevel@tonic-gate
31220Sstevel@tonic-gate /* Reset master on withdrawn node */
31230Sstevel@tonic-gate if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
31240Sstevel@tonic-gate MD_MN_INVALID_NID, ep)) {
31250Sstevel@tonic-gate rval = -1;
31260Sstevel@tonic-gate goto out;
31270Sstevel@tonic-gate }
31280Sstevel@tonic-gate
31290Sstevel@tonic-gate /* Mark my node as withdrawn and send to other nodes */
31300Sstevel@tonic-gate nd = sd->sd_nodelist;
31310Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode); /* structure copy */
31320Sstevel@tonic-gate my_nd.nd_next = NULL;
31330Sstevel@tonic-gate while (nd) {
31340Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
31350Sstevel@tonic-gate nd = nd->nd_next;
31360Sstevel@tonic-gate continue;
31370Sstevel@tonic-gate }
31380Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
31390Sstevel@tonic-gate MD_NR_WITHDRAW, NULL, ep)) {
31400Sstevel@tonic-gate rval = -1;
31410Sstevel@tonic-gate goto out;
31420Sstevel@tonic-gate }
31430Sstevel@tonic-gate nd = nd->nd_next;
31440Sstevel@tonic-gate }
31450Sstevel@tonic-gate
31460Sstevel@tonic-gate /*
31470Sstevel@tonic-gate * If withdrawn node is a mirror owner, reset mirror owner
31480Sstevel@tonic-gate * to NULL. If an error occurs, print a warning and continue.
31490Sstevel@tonic-gate * Don't fail metaset because of mirror owner reset problem since
31500Sstevel@tonic-gate * next node to grab mirror will resolve this issue.
31510Sstevel@tonic-gate * Before next node grabs mirrors, metaset will show the withdrawn
31520Sstevel@tonic-gate * node as owner which is why an attempt to reset the mirror owner
31530Sstevel@tonic-gate * is made.
31540Sstevel@tonic-gate */
31550Sstevel@tonic-gate node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */
31560Sstevel@tonic-gate nd = sd->sd_nodelist;
31570Sstevel@tonic-gate while (nd) {
31580Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
31590Sstevel@tonic-gate nd = nd->nd_next;
31600Sstevel@tonic-gate continue;
31610Sstevel@tonic-gate }
31620Sstevel@tonic-gate if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
31630Sstevel@tonic-gate 1, &node_id_list[0], &xep) == 01) {
31640Sstevel@tonic-gate mde_perror(&xep, dgettext(TEXT_DOMAIN,
31650Sstevel@tonic-gate "Unable to reset mirror owner on node %s"),
31660Sstevel@tonic-gate nd->nd_nodename);
31670Sstevel@tonic-gate mdclrerror(&xep);
31680Sstevel@tonic-gate }
31690Sstevel@tonic-gate nd = nd->nd_next;
31700Sstevel@tonic-gate }
31710Sstevel@tonic-gate
31720Sstevel@tonic-gate out:
31730Sstevel@tonic-gate if (rval == -1) {
31740Sstevel@tonic-gate /* Rejoin node - Mark node as joined and send to other nodes */
31750Sstevel@tonic-gate nd = sd->sd_nodelist;
31760Sstevel@tonic-gate my_nd = *(sd->sd_mn_mynode); /* structure copy */
31770Sstevel@tonic-gate my_nd.nd_next = NULL;
31780Sstevel@tonic-gate while (nd) {
31790Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
31800Sstevel@tonic-gate nd = nd->nd_next;
31810Sstevel@tonic-gate continue;
31820Sstevel@tonic-gate }
31830Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
31840Sstevel@tonic-gate MD_NR_JOIN, NULL, &xep)) {
31850Sstevel@tonic-gate mdclrerror(&xep);
31860Sstevel@tonic-gate }
31870Sstevel@tonic-gate nd = nd->nd_next;
31880Sstevel@tonic-gate }
31890Sstevel@tonic-gate
31900Sstevel@tonic-gate /* Set master on withdrawn node */
31910Sstevel@tonic-gate if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
31920Sstevel@tonic-gate sd->sd_mn_master_nodenm,
31930Sstevel@tonic-gate sd->sd_mn_master_nodeid, &xep)) {
31940Sstevel@tonic-gate mdclrerror(&xep);
31950Sstevel@tonic-gate }
31960Sstevel@tonic-gate
31970Sstevel@tonic-gate /* Join set if halt_set had succeeded */
31980Sstevel@tonic-gate if (set_halted) {
3199650Sskamm /*
3200650Sskamm * Causes mddbs to be loaded into the kernel.
3201650Sskamm * Set the force flag so that replica locations can be
3202650Sskamm * loaded into the kernel even if a mediator node was
3203650Sskamm * unavailable. This allows a node to join an MO
3204650Sskamm * diskset when there are sufficient replicas available,
3205650Sskamm * but a mediator node in unavailable.
3206650Sskamm */
3207650Sskamm if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
32080Sstevel@tonic-gate mdclrerror(&xep);
32090Sstevel@tonic-gate }
32100Sstevel@tonic-gate /* If set previously stale - make it so at re-join */
32110Sstevel@tonic-gate if (snarf_set(sp, stale_bool, &xep) != 0) {
32120Sstevel@tonic-gate mdclrerror(&xep);
32130Sstevel@tonic-gate (void) halt_set(sp, &xep);
32140Sstevel@tonic-gate mdclrerror(&xep);
32150Sstevel@tonic-gate }
32160Sstevel@tonic-gate }
32170Sstevel@tonic-gate }
32180Sstevel@tonic-gate
32190Sstevel@tonic-gate /*
32200Sstevel@tonic-gate * Notify rpc.mdcommd on all nodes of a nodelist change.
32210Sstevel@tonic-gate * Send reinit command to mdcommd which forces it to get
32220Sstevel@tonic-gate * fresh set description.
32230Sstevel@tonic-gate */
32240Sstevel@tonic-gate if (send_reinit) {
32250Sstevel@tonic-gate /* Send reinit */
32260Sstevel@tonic-gate nd = sd->sd_nodelist;
32270Sstevel@tonic-gate while (nd) {
32280Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
32290Sstevel@tonic-gate nd = nd->nd_next;
32300Sstevel@tonic-gate continue;
32310Sstevel@tonic-gate }
32320Sstevel@tonic-gate
32330Sstevel@tonic-gate /* Class is ignored for REINIT */
32340Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
32354932Spetede sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
32360Sstevel@tonic-gate /*
32370Sstevel@tonic-gate * We are here because we failed to resume
32380Sstevel@tonic-gate * rpc.mdcommd. However we potentially have
32390Sstevel@tonic-gate * an error from the previous call.
32400Sstevel@tonic-gate * If the previous call did fail, we
32410Sstevel@tonic-gate * capture that error and generate a perror
32420Sstevel@tonic-gate * withthe string, "Unable to resume...".
32430Sstevel@tonic-gate * Setting rval to -1 ensures that in the
32440Sstevel@tonic-gate * next iteration of the loop, ep is not
32450Sstevel@tonic-gate * clobbered.
32460Sstevel@tonic-gate */
32470Sstevel@tonic-gate if (rval == 0)
32480Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
32490Sstevel@tonic-gate else
32500Sstevel@tonic-gate mdclrerror(&xep);
32510Sstevel@tonic-gate rval = -1;
32520Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
32530Sstevel@tonic-gate "Unable to reinit rpc.mdcommd."));
32540Sstevel@tonic-gate }
32550Sstevel@tonic-gate nd = nd->nd_next;
32560Sstevel@tonic-gate }
32570Sstevel@tonic-gate }
32580Sstevel@tonic-gate
32590Sstevel@tonic-gate out2:
32600Sstevel@tonic-gate /*
32610Sstevel@tonic-gate * Unlock diskset by resuming messages across the diskset.
32620Sstevel@tonic-gate * Just resume all classes so that resume is the same whether
32630Sstevel@tonic-gate * just one class was locked or all classes were locked.
32640Sstevel@tonic-gate */
32650Sstevel@tonic-gate if ((suspend1_flag) || (suspendall_flag)) {
32660Sstevel@tonic-gate nd = sd->sd_nodelist;
32670Sstevel@tonic-gate while (nd) {
32680Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
32690Sstevel@tonic-gate nd = nd->nd_next;
32700Sstevel@tonic-gate continue;
32710Sstevel@tonic-gate }
32720Sstevel@tonic-gate if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
32734932Spetede sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
32740Sstevel@tonic-gate /*
32750Sstevel@tonic-gate * We are here because we failed to resume
32760Sstevel@tonic-gate * rpc.mdcommd. However we potentially have
32770Sstevel@tonic-gate * an error from the previous call
32780Sstevel@tonic-gate * If the previous call did fail, we capture
32790Sstevel@tonic-gate * that error and generate a perror with
32800Sstevel@tonic-gate * the string, "Unable to resume...".
32810Sstevel@tonic-gate * Setting rval to -1 ensures that in the
32820Sstevel@tonic-gate * next iteration of the loop, ep is not
32830Sstevel@tonic-gate * clobbered.
32840Sstevel@tonic-gate */
32850Sstevel@tonic-gate if (rval == 0)
32860Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
32870Sstevel@tonic-gate else
32880Sstevel@tonic-gate mdclrerror(&xep);
32890Sstevel@tonic-gate rval = -1;
32900Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
32910Sstevel@tonic-gate "Unable to resume rpc.mdcommd."));
32920Sstevel@tonic-gate }
32930Sstevel@tonic-gate nd = nd->nd_next;
32940Sstevel@tonic-gate }
32950Sstevel@tonic-gate meta_ping_mnset(sp->setno);
32960Sstevel@tonic-gate }
32970Sstevel@tonic-gate
32980Sstevel@tonic-gate /*
32990Sstevel@tonic-gate * Unlock set. This flushes the caches on the servers.
33000Sstevel@tonic-gate */
33010Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
33020Sstevel@tonic-gate nd = sd->sd_nodelist;
33030Sstevel@tonic-gate while (nd) {
33040Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
33050Sstevel@tonic-gate nd = nd->nd_next;
33060Sstevel@tonic-gate continue;
33070Sstevel@tonic-gate }
33080Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
33090Sstevel@tonic-gate if (rval == 0)
33100Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
33110Sstevel@tonic-gate else
33120Sstevel@tonic-gate mdclrerror(&xep);
33130Sstevel@tonic-gate rval = -1;
33140Sstevel@tonic-gate }
33150Sstevel@tonic-gate nd = nd->nd_next;
33160Sstevel@tonic-gate }
33170Sstevel@tonic-gate
33180Sstevel@tonic-gate /*
33190Sstevel@tonic-gate * call metaflushsetnames to reset local cache for master and
33200Sstevel@tonic-gate * node information.
33210Sstevel@tonic-gate */
33220Sstevel@tonic-gate metaflushsetname(sp);
33230Sstevel@tonic-gate
33240Sstevel@tonic-gate /* release signals back to what they were on entry */
33250Sstevel@tonic-gate if (procsigs(FALSE, &oldsigs, &xep) < 0)
33260Sstevel@tonic-gate mdclrerror(&xep);
33270Sstevel@tonic-gate
33280Sstevel@tonic-gate return (rval);
33290Sstevel@tonic-gate
33300Sstevel@tonic-gate }
33310Sstevel@tonic-gate
33320Sstevel@tonic-gate /*
33330Sstevel@tonic-gate * Update nodelist with cluster member information.
33340Sstevel@tonic-gate * A node not in the member list will be marked
33350Sstevel@tonic-gate * as not ALIVE and not OWN.
33360Sstevel@tonic-gate * A node in the member list will be marked ALIVE, but
33370Sstevel@tonic-gate * the OWN bit will not be changed.
33380Sstevel@tonic-gate *
33390Sstevel@tonic-gate * If mynode isn't in the membership list, fail causing
33400Sstevel@tonic-gate * another reconfig cycle to be started since a non-member
33410Sstevel@tonic-gate * node shouldn't be taking part in the reconfig cycle.
33420Sstevel@tonic-gate *
33430Sstevel@tonic-gate * Return values:
33440Sstevel@tonic-gate * 0 - No problem.
33450Sstevel@tonic-gate * 1 - Any failure including RPC failure to my node.
33460Sstevel@tonic-gate */
33470Sstevel@tonic-gate int
meta_reconfig_update_nodelist(mdsetname_t * sp,mndiskset_membershiplist_t * nl,md_set_desc * sd,md_error_t * ep)33480Sstevel@tonic-gate meta_reconfig_update_nodelist(
33490Sstevel@tonic-gate mdsetname_t *sp,
33500Sstevel@tonic-gate mndiskset_membershiplist_t *nl,
33510Sstevel@tonic-gate md_set_desc *sd,
33520Sstevel@tonic-gate md_error_t *ep
33530Sstevel@tonic-gate )
33540Sstevel@tonic-gate {
33550Sstevel@tonic-gate mndiskset_membershiplist_t *nl2;
33560Sstevel@tonic-gate md_mnnode_desc *nd;
33570Sstevel@tonic-gate md_error_t xep = mdnullerror;
33580Sstevel@tonic-gate int rval = 0;
33590Sstevel@tonic-gate
33600Sstevel@tonic-gate /*
33610Sstevel@tonic-gate * Walk through nodelist, checking to see if each
33620Sstevel@tonic-gate * node is in the member list.
33630Sstevel@tonic-gate * If node is not a member, reset ALIVE and OWN node flag.
33640Sstevel@tonic-gate * If node is a member, set ALIVE.
33650Sstevel@tonic-gate * If mynode's OWN flag gets reset, then halt the diskset on this node.
33660Sstevel@tonic-gate */
33670Sstevel@tonic-gate nd = sd->sd_nodelist;
33680Sstevel@tonic-gate while (nd) {
33690Sstevel@tonic-gate nl2 = nl;
33700Sstevel@tonic-gate while (nl2) {
33710Sstevel@tonic-gate /* If node is in member list, set ALIVE */
33720Sstevel@tonic-gate if (nl2->msl_node_id == nd->nd_nodeid) {
33730Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_ALIVE;
33740Sstevel@tonic-gate break;
33750Sstevel@tonic-gate } else {
33760Sstevel@tonic-gate nl2 = nl2->next;
33770Sstevel@tonic-gate }
33780Sstevel@tonic-gate /* node is not in member list, mark !ALIVE and !OWN */
33790Sstevel@tonic-gate if (nl2 == NULL) {
33800Sstevel@tonic-gate /* If node is mynode, then halt set if needed */
33810Sstevel@tonic-gate if (strcmp(mynode(), nd->nd_nodename) == 0) {
33820Sstevel@tonic-gate /*
33830Sstevel@tonic-gate * This shouldn't happen, but just
33840Sstevel@tonic-gate * in case... Any node not in the
33850Sstevel@tonic-gate * membership list should be dead and
33860Sstevel@tonic-gate * not running reconfig step1.
33870Sstevel@tonic-gate */
33880Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) {
33890Sstevel@tonic-gate if (halt_set(sp, &xep)) {
33900Sstevel@tonic-gate mde_perror(&xep, "");
33910Sstevel@tonic-gate mdclrerror(&xep);
33920Sstevel@tonic-gate }
33930Sstevel@tonic-gate }
33940Sstevel@tonic-gate /*
33950Sstevel@tonic-gate * Return failure since this node
33960Sstevel@tonic-gate * (mynode) is not in the membership
33970Sstevel@tonic-gate * list, but process the rest of the
33980Sstevel@tonic-gate * nodelist first so that rpc.metad
33990Sstevel@tonic-gate * can be updated with the latest
34000Sstevel@tonic-gate * membership information.
34010Sstevel@tonic-gate */
34020Sstevel@tonic-gate (void) mddserror(ep,
34030Sstevel@tonic-gate MDE_DS_NOTINMEMBERLIST,
34040Sstevel@tonic-gate sp->setno, nd->nd_nodename, NULL,
34050Sstevel@tonic-gate sp->setname);
34060Sstevel@tonic-gate rval = 1;
34070Sstevel@tonic-gate }
34080Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ALIVE;
34090Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN;
34100Sstevel@tonic-gate }
34110Sstevel@tonic-gate }
34120Sstevel@tonic-gate nd = nd->nd_next;
34130Sstevel@tonic-gate }
34140Sstevel@tonic-gate
34150Sstevel@tonic-gate /* Send this information to rpc.metad */
34160Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
34170Sstevel@tonic-gate MD_NR_SET, MNSET_IN_RECONFIG, &xep)) {
34180Sstevel@tonic-gate /* Return failure if can't send node flags to rpc.metad */
34190Sstevel@tonic-gate if (rval == 0) {
34200Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
34210Sstevel@tonic-gate rval = 1;
34220Sstevel@tonic-gate }
34230Sstevel@tonic-gate }
34240Sstevel@tonic-gate return (rval);
34250Sstevel@tonic-gate }
34260Sstevel@tonic-gate
34270Sstevel@tonic-gate /*
34280Sstevel@tonic-gate * Choose master determines the master for a diskset.
34290Sstevel@tonic-gate * Each node determines the master on its own and
34300Sstevel@tonic-gate * adds this information to its local rpc.metad nodelist
34310Sstevel@tonic-gate * and also sends it to the kernel.
34320Sstevel@tonic-gate *
34330Sstevel@tonic-gate * Nodelist in set descriptor (sd) is sorted in
34340Sstevel@tonic-gate * monotonically increasing sequence of nodeid.
34350Sstevel@tonic-gate *
34360Sstevel@tonic-gate * Return values:
34370Sstevel@tonic-gate * 0 - No problem.
34380Sstevel@tonic-gate * 205 - There was an RPC problem to another node.
34390Sstevel@tonic-gate * -1 - There was an error. This could be an RPC error to my node.
34400Sstevel@tonic-gate * This is a catastrophic failure causing node to panic.
34410Sstevel@tonic-gate */
34420Sstevel@tonic-gate int
meta_reconfig_choose_master_for_set(mdsetname_t * sp,md_set_desc * sd,md_error_t * ep)34430Sstevel@tonic-gate meta_reconfig_choose_master_for_set(
34440Sstevel@tonic-gate mdsetname_t *sp,
34450Sstevel@tonic-gate md_set_desc *sd,
34460Sstevel@tonic-gate md_error_t *ep
34470Sstevel@tonic-gate )
34480Sstevel@tonic-gate {
34490Sstevel@tonic-gate int is_owner;
34500Sstevel@tonic-gate md_mnset_record *mnsr = NULL;
34510Sstevel@tonic-gate int lowest_alive_nodeid = 0;
34520Sstevel@tonic-gate uint_t master_nodeid;
34530Sstevel@tonic-gate md_mnnode_desc *nd, *nd2;
34540Sstevel@tonic-gate md_mnnode_record *nr;
34550Sstevel@tonic-gate md_drive_desc *dd;
34560Sstevel@tonic-gate md_setkey_t *cl_sk;
34570Sstevel@tonic-gate int rval = 0;
34580Sstevel@tonic-gate md_error_t xep = mdnullerror;
34590Sstevel@tonic-gate mddb_setflags_config_t sf;
34600Sstevel@tonic-gate
34610Sstevel@tonic-gate /*
34620Sstevel@tonic-gate * Is current node joined to diskset?
34630Sstevel@tonic-gate * Don't trust flags, really check to see if mddb is snarfed.
34640Sstevel@tonic-gate */
34650Sstevel@tonic-gate if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
34660Sstevel@tonic-gate /*
34670Sstevel@tonic-gate * If a node is joined to the diskset, this node checks
34680Sstevel@tonic-gate * to see if the current master of the diskset is valid and
34690Sstevel@tonic-gate * is still in the membership list (ALIVE) and is
34700Sstevel@tonic-gate * still joined (OWN). Need to verify if master is
34710Sstevel@tonic-gate * really joined - don't trust the flags. (Can trust
34720Sstevel@tonic-gate * ALIVE since set during earlier part of reconfig cycle.)
34730Sstevel@tonic-gate * If the current master is valid, still in the membership
34740Sstevel@tonic-gate * list and joined, then master is not changed on this node.
34750Sstevel@tonic-gate * Just return.
34760Sstevel@tonic-gate *
34770Sstevel@tonic-gate * Verify that nodeid is valid before accessing masternode.
34780Sstevel@tonic-gate */
34790Sstevel@tonic-gate if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
34800Sstevel@tonic-gate (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
34810Sstevel@tonic-gate if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
34820Sstevel@tonic-gate &is_owner, ep) == -1) {
34830Sstevel@tonic-gate /* If RPC failure to another node return 205 */
34840Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
34850Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
34860Sstevel@tonic-gate sd->sd_mn_master_nodeid)) {
34870Sstevel@tonic-gate return (205);
34880Sstevel@tonic-gate } else {
34890Sstevel@tonic-gate /* Any other failure */
34900Sstevel@tonic-gate return (-1);
34910Sstevel@tonic-gate }
34920Sstevel@tonic-gate } else {
34930Sstevel@tonic-gate if (is_owner == TRUE) {
34940Sstevel@tonic-gate
34950Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(
34960Sstevel@tonic-gate TEXT_DOMAIN, "Set %s previous "
34970Sstevel@tonic-gate "master chosen %s (%d): %s"),
34980Sstevel@tonic-gate sp->setname,
34990Sstevel@tonic-gate sd->sd_mn_master_nodenm,
35000Sstevel@tonic-gate sd->sd_mn_master_nodeid,
35010Sstevel@tonic-gate meta_print_hrtime(gethrtime() -
35020Sstevel@tonic-gate start_time));
35030Sstevel@tonic-gate
35040Sstevel@tonic-gate /* Previous master is ok - done */
35050Sstevel@tonic-gate return (0);
35060Sstevel@tonic-gate }
35070Sstevel@tonic-gate }
35080Sstevel@tonic-gate }
35090Sstevel@tonic-gate
35100Sstevel@tonic-gate /*
35110Sstevel@tonic-gate * If current master is no longer in the membership list or
35120Sstevel@tonic-gate * is no longer joined, then this node uses the following
35130Sstevel@tonic-gate * algorithm:
35140Sstevel@tonic-gate * - node calls RPC routine clnt_ownset to get latest
35150Sstevel@tonic-gate * information on which nodes are owners of diskset.
35160Sstevel@tonic-gate * clnt_ownset checks on each node to see if its kernel
35170Sstevel@tonic-gate * has that diskset snarfed.
35180Sstevel@tonic-gate */
35190Sstevel@tonic-gate nd = sd->sd_nodelist;
35200Sstevel@tonic-gate while (nd) {
35210Sstevel@tonic-gate /* Don't consider node that isn't in member list */
35220Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
35230Sstevel@tonic-gate nd = nd->nd_next;
35240Sstevel@tonic-gate continue;
35250Sstevel@tonic-gate }
35260Sstevel@tonic-gate
35270Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp,
35280Sstevel@tonic-gate &is_owner, ep) == -1) {
35290Sstevel@tonic-gate /* If RPC failure to another node return 205 */
35300Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
35310Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
35320Sstevel@tonic-gate nd->nd_nodeid)) {
35330Sstevel@tonic-gate return (205);
35340Sstevel@tonic-gate } else {
35350Sstevel@tonic-gate /* Any other failure */
35360Sstevel@tonic-gate return (-1);
35370Sstevel@tonic-gate }
35380Sstevel@tonic-gate }
35390Sstevel@tonic-gate
35400Sstevel@tonic-gate /*
35410Sstevel@tonic-gate * Set owner flag for each node based on whether
35420Sstevel@tonic-gate * that node really has a diskset mddb snarfed in
35430Sstevel@tonic-gate * or not.
35440Sstevel@tonic-gate */
35450Sstevel@tonic-gate if (is_owner == TRUE)
35460Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN;
35470Sstevel@tonic-gate else
35480Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN;
35490Sstevel@tonic-gate
35500Sstevel@tonic-gate nd = nd->nd_next;
35510Sstevel@tonic-gate }
35520Sstevel@tonic-gate
35530Sstevel@tonic-gate /*
35540Sstevel@tonic-gate * - node walks through nodelist looking for nodes that are
35550Sstevel@tonic-gate * owners of the diskset that are in the membership list.
35560Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_mngetset to
35570Sstevel@tonic-gate * see if that node has its node record set to OK.
35580Sstevel@tonic-gate * - If so, master is chosen to be this owner node.
35590Sstevel@tonic-gate */
35600Sstevel@tonic-gate nd = sd->sd_nodelist;
35610Sstevel@tonic-gate while (nd) {
35620Sstevel@tonic-gate /* Don't consider node that isn't in member list */
35630Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
35640Sstevel@tonic-gate nd = nd->nd_next;
35650Sstevel@tonic-gate continue;
35660Sstevel@tonic-gate }
35670Sstevel@tonic-gate
35680Sstevel@tonic-gate /* Don't consider a node that isn't an owner */
35690Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
35700Sstevel@tonic-gate nd = nd->nd_next;
35710Sstevel@tonic-gate continue;
35720Sstevel@tonic-gate }
35730Sstevel@tonic-gate
35740Sstevel@tonic-gate /* Does node have its own node record set to OK? */
35750Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname,
35760Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) {
35770Sstevel@tonic-gate /* If RPC failure to another node return 205 */
35780Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
35790Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
35800Sstevel@tonic-gate nd->nd_nodeid)) {
35810Sstevel@tonic-gate return (205);
35820Sstevel@tonic-gate } else {
35830Sstevel@tonic-gate /* Any other failure */
35840Sstevel@tonic-gate return (-1);
35850Sstevel@tonic-gate }
35860Sstevel@tonic-gate }
35870Sstevel@tonic-gate nr = mnsr->sr_nodechain;
35880Sstevel@tonic-gate while (nr) {
35890Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) {
35900Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) {
35910Sstevel@tonic-gate /* Found a master */
35920Sstevel@tonic-gate free_sr(
35930Sstevel@tonic-gate (md_set_record *)mnsr);
35940Sstevel@tonic-gate goto found_master;
35950Sstevel@tonic-gate }
35960Sstevel@tonic-gate }
35970Sstevel@tonic-gate nr = nr->nr_next;
35980Sstevel@tonic-gate }
35990Sstevel@tonic-gate free_sr((md_set_record *)mnsr);
36000Sstevel@tonic-gate nd = nd->nd_next;
36010Sstevel@tonic-gate }
36020Sstevel@tonic-gate
36030Sstevel@tonic-gate /*
36040Sstevel@tonic-gate * - If no owner node has its own node record on its own node
36050Sstevel@tonic-gate * set to OK, then this node checks all of the non-owner
36060Sstevel@tonic-gate * nodes that are in the membership list.
36070Sstevel@tonic-gate * - for each non-owner, node calls RPC routine clnt_mngetset to
36080Sstevel@tonic-gate * see if that node has its node record set to OK.
36090Sstevel@tonic-gate * - If set doesn't exist, don't choose node for master.
36100Sstevel@tonic-gate * - If so, master is chosen to be this non-owner node.
36110Sstevel@tonic-gate *
36120Sstevel@tonic-gate */
36130Sstevel@tonic-gate nd = sd->sd_nodelist;
36140Sstevel@tonic-gate while (nd) {
36150Sstevel@tonic-gate /* Don't consider node that isn't in member list */
36160Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
36170Sstevel@tonic-gate nd = nd->nd_next;
36180Sstevel@tonic-gate continue;
36190Sstevel@tonic-gate }
36200Sstevel@tonic-gate
36210Sstevel@tonic-gate /* Only checking non-owner nodes this time around */
36220Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) {
36230Sstevel@tonic-gate nd = nd->nd_next;
36240Sstevel@tonic-gate continue;
36250Sstevel@tonic-gate }
36260Sstevel@tonic-gate
36270Sstevel@tonic-gate /* Does node have its own node record set to OK? */
36280Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname,
36290Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) {
36300Sstevel@tonic-gate /*
36310Sstevel@tonic-gate * If set doesn't exist on non-owner node,
36320Sstevel@tonic-gate * don't consider this node for master.
36330Sstevel@tonic-gate */
36340Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) {
36350Sstevel@tonic-gate nd = nd->nd_next;
36360Sstevel@tonic-gate continue;
36370Sstevel@tonic-gate } else if ((mdanyrpcerror(ep)) &&
36380Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
36390Sstevel@tonic-gate nd->nd_nodeid)) {
36400Sstevel@tonic-gate /* RPC failure to another node */
36410Sstevel@tonic-gate return (205);
36420Sstevel@tonic-gate } else {
36430Sstevel@tonic-gate /* Any other failure */
36440Sstevel@tonic-gate return (-1);
36450Sstevel@tonic-gate }
36460Sstevel@tonic-gate }
36470Sstevel@tonic-gate nr = mnsr->sr_nodechain;
36480Sstevel@tonic-gate while (nr) {
36490Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) {
36500Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) {
36510Sstevel@tonic-gate /* Found a master */
36520Sstevel@tonic-gate free_sr(
36530Sstevel@tonic-gate (md_set_record *)mnsr);
36540Sstevel@tonic-gate goto found_master;
36550Sstevel@tonic-gate }
36560Sstevel@tonic-gate }
36570Sstevel@tonic-gate nr = nr->nr_next;
36580Sstevel@tonic-gate }
36590Sstevel@tonic-gate free_sr((md_set_record *)mnsr);
36600Sstevel@tonic-gate nd = nd->nd_next;
36610Sstevel@tonic-gate }
36620Sstevel@tonic-gate
36630Sstevel@tonic-gate /*
36640Sstevel@tonic-gate * - If no node can be found that has its own node record on
36650Sstevel@tonic-gate * its node to be set to OK, then all alive nodes
36660Sstevel@tonic-gate * were in the process of being added to or deleted
36670Sstevel@tonic-gate * from set. Each alive node will remove all
36680Sstevel@tonic-gate * information pertaining to this set from its node.
36690Sstevel@tonic-gate *
36700Sstevel@tonic-gate * If all nodes in set are ALIVE, then call sdssc end routines
36710Sstevel@tonic-gate * since set was truly being initially created or destroyed.
36720Sstevel@tonic-gate */
36730Sstevel@tonic-gate goto delete_set;
36740Sstevel@tonic-gate } else {
36750Sstevel@tonic-gate
36760Sstevel@tonic-gate /*
36770Sstevel@tonic-gate * If node is not joined to diskset, then this
36780Sstevel@tonic-gate * node uses the following algorithm:
36790Sstevel@tonic-gate * - If unjoined node doesn't have a node record for itself,
36800Sstevel@tonic-gate * just delete the diskset since diskset was in the
36810Sstevel@tonic-gate * process of being created.
36820Sstevel@tonic-gate * - node needs to find master of diskset before
36830Sstevel@tonic-gate * reconfig cycle, if a master existed.
36840Sstevel@tonic-gate * - node calls RPC routine clnt_ownset to get latest
36850Sstevel@tonic-gate * information on which nodes are owners of diskset.
36860Sstevel@tonic-gate * clnt_ownset checks on each node to see if its
36870Sstevel@tonic-gate * kernel has that diskset snarfed.
36880Sstevel@tonic-gate */
36890Sstevel@tonic-gate
36900Sstevel@tonic-gate /*
36910Sstevel@tonic-gate * Is my node in the set description?
36920Sstevel@tonic-gate * If not, delete the set from this node.
36930Sstevel@tonic-gate * sr2setdesc sets sd_mn_mynode pointer to the node
36940Sstevel@tonic-gate * descriptor for this node if there was a node
36950Sstevel@tonic-gate * record for this node.
36960Sstevel@tonic-gate *
36970Sstevel@tonic-gate */
36980Sstevel@tonic-gate if (sd->sd_mn_mynode == NULL) {
36990Sstevel@tonic-gate goto delete_set;
37000Sstevel@tonic-gate }
37010Sstevel@tonic-gate
37020Sstevel@tonic-gate nd = sd->sd_nodelist;
37030Sstevel@tonic-gate while (nd) {
37040Sstevel@tonic-gate /* Don't consider node that isn't in member list */
37050Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
37060Sstevel@tonic-gate nd = nd->nd_next;
37070Sstevel@tonic-gate continue;
37080Sstevel@tonic-gate }
37090Sstevel@tonic-gate
37100Sstevel@tonic-gate if (clnt_ownset(nd->nd_nodename, sp,
37110Sstevel@tonic-gate &is_owner, ep) == -1) {
37120Sstevel@tonic-gate /* If RPC failure to another node return 205 */
37130Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
37140Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
37150Sstevel@tonic-gate nd->nd_nodeid)) {
37160Sstevel@tonic-gate return (205);
37170Sstevel@tonic-gate } else {
37180Sstevel@tonic-gate /* Any other failure */
37190Sstevel@tonic-gate return (-1);
37200Sstevel@tonic-gate }
37210Sstevel@tonic-gate }
37220Sstevel@tonic-gate
37230Sstevel@tonic-gate /*
37240Sstevel@tonic-gate * Set owner flag for each node based on whether
37250Sstevel@tonic-gate * that node really has a diskset mddb snarfed in
37260Sstevel@tonic-gate * or not.
37270Sstevel@tonic-gate */
37280Sstevel@tonic-gate if (is_owner == TRUE)
37290Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN;
37300Sstevel@tonic-gate else
37310Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN;
37320Sstevel@tonic-gate
37330Sstevel@tonic-gate nd = nd->nd_next;
37340Sstevel@tonic-gate }
37350Sstevel@tonic-gate
37360Sstevel@tonic-gate /*
37370Sstevel@tonic-gate * - node walks through nodelist looking for nodes that
37380Sstevel@tonic-gate * are owners of the diskset that are in
37390Sstevel@tonic-gate * the membership list.
37400Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_mngetset to
37410Sstevel@tonic-gate * see if that node has a master set and to get the
37420Sstevel@tonic-gate * diskset description.
37430Sstevel@tonic-gate * - If the owner node has a set description that doesn't
37440Sstevel@tonic-gate * include the non-joined node in the nodelist, this node
37450Sstevel@tonic-gate * removes its set description of that diskset
37460Sstevel@tonic-gate * (i.e. removes the set from its local mddbs). This is
37470Sstevel@tonic-gate * handling the case of when a node was removed from a
37480Sstevel@tonic-gate * diskset while it was not in the cluster membership
37490Sstevel@tonic-gate * list.
37500Sstevel@tonic-gate * - If that node has a master set and the master is in the
37510Sstevel@tonic-gate * membership list and is an owner, then either this was
37520Sstevel@tonic-gate * the master from before the reconfig cycle or this
37530Sstevel@tonic-gate * node has already chosen a new master - either way,
37540Sstevel@tonic-gate * the master value is valid as long as it is in the
37550Sstevel@tonic-gate * membership list and is an owner
37560Sstevel@tonic-gate * - master is chosen to be owner node's master
37570Sstevel@tonic-gate */
37580Sstevel@tonic-gate nd = sd->sd_nodelist;
37590Sstevel@tonic-gate while (nd) {
37600Sstevel@tonic-gate /* Don't consider node that isn't in member list */
37610Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
37620Sstevel@tonic-gate nd = nd->nd_next;
37630Sstevel@tonic-gate continue;
37640Sstevel@tonic-gate }
37650Sstevel@tonic-gate
37660Sstevel@tonic-gate /* Don't consider a node that isn't an owner */
37670Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
37680Sstevel@tonic-gate nd = nd->nd_next;
37690Sstevel@tonic-gate continue;
37700Sstevel@tonic-gate }
37710Sstevel@tonic-gate
37720Sstevel@tonic-gate /* Get owner node's set record */
37730Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname,
37740Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) {
37750Sstevel@tonic-gate /* If RPC failure to another node return 205 */
37760Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
37770Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
37780Sstevel@tonic-gate nd->nd_nodeid)) {
37790Sstevel@tonic-gate return (205);
37800Sstevel@tonic-gate } else {
37810Sstevel@tonic-gate /* Any other failure */
37820Sstevel@tonic-gate return (-1);
37830Sstevel@tonic-gate }
37840Sstevel@tonic-gate }
37850Sstevel@tonic-gate
37860Sstevel@tonic-gate /* Is this node in the owner node's set record */
37870Sstevel@tonic-gate nr = mnsr->sr_nodechain;
37880Sstevel@tonic-gate while (nr) {
37890Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid ==
37900Sstevel@tonic-gate nr->nr_nodeid) {
37910Sstevel@tonic-gate break;
37920Sstevel@tonic-gate }
37930Sstevel@tonic-gate nr = nr->nr_next;
37940Sstevel@tonic-gate }
37950Sstevel@tonic-gate if (nr == NULL) {
37960Sstevel@tonic-gate /* my node not found - delete set */
37970Sstevel@tonic-gate free_sr((md_set_record *)mnsr);
37980Sstevel@tonic-gate goto delete_set;
37990Sstevel@tonic-gate }
38000Sstevel@tonic-gate
38010Sstevel@tonic-gate /* Is owner's node's master valid? */
38020Sstevel@tonic-gate master_nodeid = mnsr->sr_master_nodeid;
38030Sstevel@tonic-gate free_sr((md_set_record *)mnsr);
38040Sstevel@tonic-gate if (master_nodeid == MD_MN_INVALID_NID) {
38050Sstevel@tonic-gate nd = nd->nd_next;
38060Sstevel@tonic-gate continue;
38070Sstevel@tonic-gate }
38080Sstevel@tonic-gate
38090Sstevel@tonic-gate nd2 = sd->sd_nodelist;
38100Sstevel@tonic-gate while (nd2) {
38110Sstevel@tonic-gate if ((nd2->nd_nodeid == master_nodeid) &&
38120Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
38130Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_OWN)) {
38140Sstevel@tonic-gate nd = nd2;
38150Sstevel@tonic-gate goto found_master;
38160Sstevel@tonic-gate }
38170Sstevel@tonic-gate nd2 = nd2->nd_next;
38180Sstevel@tonic-gate }
38190Sstevel@tonic-gate nd = nd->nd_next;
38200Sstevel@tonic-gate }
38210Sstevel@tonic-gate
38220Sstevel@tonic-gate /*
38230Sstevel@tonic-gate * - If no owner node has a valid master, then follow
38240Sstevel@tonic-gate * algorithm of when a node is joined to the diskset.
38250Sstevel@tonic-gate * - node walks through nodelist looking for nodes that are
38260Sstevel@tonic-gate * owners of the diskset that are in the membership list.
38270Sstevel@tonic-gate * - for each owner, node calls RPC routine clnt_mngetset to
38280Sstevel@tonic-gate * see if that node has its node record set to OK.
38290Sstevel@tonic-gate * - If so, master is chosen to be this owner node.
38300Sstevel@tonic-gate */
38310Sstevel@tonic-gate nd = sd->sd_nodelist;
38320Sstevel@tonic-gate while (nd) {
38330Sstevel@tonic-gate /* Don't consider node that isn't in member list */
38340Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
38350Sstevel@tonic-gate nd = nd->nd_next;
38360Sstevel@tonic-gate continue;
38370Sstevel@tonic-gate }
38380Sstevel@tonic-gate
38390Sstevel@tonic-gate /* Don't consider a node that isn't an owner */
38400Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
38410Sstevel@tonic-gate nd = nd->nd_next;
38420Sstevel@tonic-gate continue;
38430Sstevel@tonic-gate }
38440Sstevel@tonic-gate
38450Sstevel@tonic-gate /* Does node have its own node record set to OK? */
38460Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname,
38470Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) {
38480Sstevel@tonic-gate /* If RPC failure to another node return 205 */
38490Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
38500Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
38510Sstevel@tonic-gate nd->nd_nodeid)) {
38520Sstevel@tonic-gate return (205);
38530Sstevel@tonic-gate } else {
38540Sstevel@tonic-gate /* Any other failure */
38550Sstevel@tonic-gate return (-1);
38560Sstevel@tonic-gate }
38570Sstevel@tonic-gate }
38580Sstevel@tonic-gate nr = mnsr->sr_nodechain;
38590Sstevel@tonic-gate while (nr) {
38600Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) {
38610Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) {
38620Sstevel@tonic-gate /* Found a master */
38630Sstevel@tonic-gate free_sr(
38640Sstevel@tonic-gate (md_set_record *)mnsr);
38650Sstevel@tonic-gate goto found_master;
38660Sstevel@tonic-gate }
38670Sstevel@tonic-gate }
38680Sstevel@tonic-gate nr = nr->nr_next;
38690Sstevel@tonic-gate }
38700Sstevel@tonic-gate free_sr((md_set_record *)mnsr);
38710Sstevel@tonic-gate nd = nd->nd_next;
38720Sstevel@tonic-gate }
38730Sstevel@tonic-gate
38740Sstevel@tonic-gate /*
38750Sstevel@tonic-gate * - If no owner node has its own node record on its own node
38760Sstevel@tonic-gate * set to OK, then this node checks all of the non-owner
38770Sstevel@tonic-gate * nodes that are in the membership list.
38780Sstevel@tonic-gate * - for each non-owner, node calls RPC routine clnt_mngetset to
38790Sstevel@tonic-gate * see if that node has its node record set to OK.
38800Sstevel@tonic-gate * - If set doesn't exist, don't choose node for master.
38810Sstevel@tonic-gate * - If this node doesn't exist in the nodelist on any of the
38820Sstevel@tonic-gate * non-owner nodes, this node removes its set description
38830Sstevel@tonic-gate * of that diskset (i.e. removes the set from its local
38840Sstevel@tonic-gate * mddbs). This is handling the case of when a node was
38850Sstevel@tonic-gate * removed from a diskset while it was not in the
38860Sstevel@tonic-gate * cluster membership list.
38870Sstevel@tonic-gate * - If non-owner node has its node record set to OK and if
38880Sstevel@tonic-gate * this node hasn't removed this diskset (step directly
38890Sstevel@tonic-gate * before this one), then the master is chosen to be this
38900Sstevel@tonic-gate * non-owner node.
38910Sstevel@tonic-gate */
38920Sstevel@tonic-gate nd = sd->sd_nodelist;
38930Sstevel@tonic-gate while (nd) {
38940Sstevel@tonic-gate /* Don't consider node that isn't in member list */
38950Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
38960Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
38970Sstevel@tonic-gate nd = nd->nd_next;
38980Sstevel@tonic-gate continue;
38990Sstevel@tonic-gate }
39000Sstevel@tonic-gate
39010Sstevel@tonic-gate /* Don't consider owner nodes since none are OK */
39020Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) {
39030Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
39040Sstevel@tonic-gate nd = nd->nd_next;
39050Sstevel@tonic-gate continue;
39060Sstevel@tonic-gate }
39070Sstevel@tonic-gate
39080Sstevel@tonic-gate /*
39090Sstevel@tonic-gate * Don't need to get nodelist from my node since
39100Sstevel@tonic-gate * this is where sd_nodelist was obtained.
39110Sstevel@tonic-gate */
39120Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
39130Sstevel@tonic-gate nd = nd->nd_next;
39140Sstevel@tonic-gate continue;
39150Sstevel@tonic-gate }
39160Sstevel@tonic-gate
39170Sstevel@tonic-gate /*
39180Sstevel@tonic-gate * If node has already been decided against for
39190Sstevel@tonic-gate * master, then skip it.
39200Sstevel@tonic-gate */
39210Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_DEL) {
39220Sstevel@tonic-gate nd = nd->nd_next;
39230Sstevel@tonic-gate continue;
39240Sstevel@tonic-gate }
39250Sstevel@tonic-gate
39260Sstevel@tonic-gate /*
39270Sstevel@tonic-gate * Does node in my nodelist have its own node
39280Sstevel@tonic-gate * record marked OK on its node? And does node
39290Sstevel@tonic-gate * in my nodelist exist on all other nodes?
39300Sstevel@tonic-gate * Don't want to choose a node for master unless
39310Sstevel@tonic-gate * that node is marked OK on its own node and that
39320Sstevel@tonic-gate * node exists on all other alive nodes.
39330Sstevel@tonic-gate *
39340Sstevel@tonic-gate * This is guarding against the case when several
39350Sstevel@tonic-gate * nodes are down and one of the downed nodes is
39360Sstevel@tonic-gate * deleted from the diskset. When the down nodes
39370Sstevel@tonic-gate * are rebooted into the cluster, you don't want
39380Sstevel@tonic-gate * any node to pick the deleted node as the master.
39390Sstevel@tonic-gate */
39400Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname,
39410Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) {
39420Sstevel@tonic-gate /*
39430Sstevel@tonic-gate * If set doesn't exist on non-owner node,
39440Sstevel@tonic-gate * don't consider this node for master.
39450Sstevel@tonic-gate */
39460Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) {
39470Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
39480Sstevel@tonic-gate nd = nd->nd_next;
39490Sstevel@tonic-gate continue;
39500Sstevel@tonic-gate } else if (mdanyrpcerror(ep)) {
39510Sstevel@tonic-gate /* RPC failure to another node */
39520Sstevel@tonic-gate return (205);
39530Sstevel@tonic-gate } else {
39540Sstevel@tonic-gate /* Any other failure */
39550Sstevel@tonic-gate return (-1);
39560Sstevel@tonic-gate }
39570Sstevel@tonic-gate }
39580Sstevel@tonic-gate /*
39590Sstevel@tonic-gate * Is my node in the nodelist gotten from the other
39600Sstevel@tonic-gate * node? If not, then remove the set from my node
39610Sstevel@tonic-gate * since set was deleted from my node while my node
39620Sstevel@tonic-gate * was out of the cluster.
39630Sstevel@tonic-gate */
39640Sstevel@tonic-gate nr = mnsr->sr_nodechain;
39650Sstevel@tonic-gate while (nr) {
39660Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid ==
39670Sstevel@tonic-gate nr->nr_nodeid) {
39680Sstevel@tonic-gate break;
39690Sstevel@tonic-gate }
39700Sstevel@tonic-gate nr = nr->nr_next;
39710Sstevel@tonic-gate }
39720Sstevel@tonic-gate if (nr == NULL) {
39730Sstevel@tonic-gate /* my node not found - delete set */
39740Sstevel@tonic-gate free_sr((md_set_record *)mnsr);
39750Sstevel@tonic-gate goto delete_set;
39760Sstevel@tonic-gate }
39770Sstevel@tonic-gate
39780Sstevel@tonic-gate /* Is node being checked marked OK on its own node? */
39790Sstevel@tonic-gate nr = mnsr->sr_nodechain;
39800Sstevel@tonic-gate while (nr) {
39810Sstevel@tonic-gate if (nd->nd_nodeid == nr->nr_nodeid) {
39820Sstevel@tonic-gate if (!(nr->nr_flags & MD_MN_NODE_OK)) {
39830Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
39840Sstevel@tonic-gate }
39850Sstevel@tonic-gate break;
39860Sstevel@tonic-gate }
39870Sstevel@tonic-gate nr = nr->nr_next;
39880Sstevel@tonic-gate }
39890Sstevel@tonic-gate /*
39900Sstevel@tonic-gate * If node being checked doesn't exist on its
39910Sstevel@tonic-gate * own node - don't choose it as master.
39920Sstevel@tonic-gate */
39930Sstevel@tonic-gate if (nr == NULL) {
39940Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
39950Sstevel@tonic-gate }
39960Sstevel@tonic-gate
39970Sstevel@tonic-gate /*
39980Sstevel@tonic-gate * Check every node in my node's nodelist against
39990Sstevel@tonic-gate * the nodelist gotten from the other node.
40000Sstevel@tonic-gate * If a node in my node's nodelist is not found in the
40010Sstevel@tonic-gate * other node's nodelist, then set the DEL flag.
40020Sstevel@tonic-gate */
40030Sstevel@tonic-gate nd2 = sd->sd_nodelist;
40040Sstevel@tonic-gate while (nd2) {
40050Sstevel@tonic-gate nr = mnsr->sr_nodechain;
40060Sstevel@tonic-gate while (nr) {
40070Sstevel@tonic-gate if (nd2->nd_nodeid == nr->nr_nodeid) {
40080Sstevel@tonic-gate break;
40090Sstevel@tonic-gate }
40100Sstevel@tonic-gate nr = nr->nr_next;
40110Sstevel@tonic-gate }
40120Sstevel@tonic-gate /* nd2 not found in other node's nodelist */
40130Sstevel@tonic-gate if (nr == NULL) {
40140Sstevel@tonic-gate nd2->nd_flags |= MD_MN_NODE_DEL;
40150Sstevel@tonic-gate }
40160Sstevel@tonic-gate nd2 = nd2->nd_next;
40170Sstevel@tonic-gate }
40180Sstevel@tonic-gate
40190Sstevel@tonic-gate free_sr((md_set_record *)mnsr);
40200Sstevel@tonic-gate nd = nd->nd_next;
40210Sstevel@tonic-gate }
40220Sstevel@tonic-gate
40230Sstevel@tonic-gate /*
40240Sstevel@tonic-gate * Rescan list looking for a node that has not been marked DEL.
40250Sstevel@tonic-gate * First node found is the master.
40260Sstevel@tonic-gate */
40270Sstevel@tonic-gate nd = sd->sd_nodelist;
40280Sstevel@tonic-gate while (nd) {
40290Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
40300Sstevel@tonic-gate break;
40310Sstevel@tonic-gate }
40320Sstevel@tonic-gate nd = nd->nd_next;
40330Sstevel@tonic-gate continue;
40340Sstevel@tonic-gate }
40350Sstevel@tonic-gate if (nd) {
40360Sstevel@tonic-gate /* Found a master */
40370Sstevel@tonic-gate goto found_master;
40380Sstevel@tonic-gate }
40390Sstevel@tonic-gate
40400Sstevel@tonic-gate /*
40410Sstevel@tonic-gate * - If no node can be found that has its own node record on
40420Sstevel@tonic-gate * its node to be set to OK, then all alive nodes
40430Sstevel@tonic-gate * were in the process of being added to or deleted
40440Sstevel@tonic-gate * from set. Each alive node will remove all
40450Sstevel@tonic-gate * information pertaining to this set from its node.
40460Sstevel@tonic-gate *
40470Sstevel@tonic-gate * If all nodes in set are ALIVE, then call sdssc end routines
40480Sstevel@tonic-gate * since set was truly being initially created or destroyed.
40490Sstevel@tonic-gate */
40500Sstevel@tonic-gate goto delete_set;
40510Sstevel@tonic-gate }
40520Sstevel@tonic-gate
40530Sstevel@tonic-gate found_master:
40540Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
40550Sstevel@tonic-gate "Set %s master chosen %s (%d): %s"),
40560Sstevel@tonic-gate sp->setname, nd->nd_nodename, nd->nd_nodeid,
40570Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time));
40580Sstevel@tonic-gate
40590Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep) == -1) {
40600Sstevel@tonic-gate return (-1);
40610Sstevel@tonic-gate }
40620Sstevel@tonic-gate
40630Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
40640Sstevel@tonic-gate
40650Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp,
40660Sstevel@tonic-gate nd->nd_nodename, nd->nd_nodeid, ep)) {
40670Sstevel@tonic-gate rval = -1;
40680Sstevel@tonic-gate } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
40690Sstevel@tonic-gate /* If this node is new master, set flag in this node's kernel */
40700Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf));
40710Sstevel@tonic-gate sf.sf_setno = sp->setno;
40720Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
40730Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */
40740Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC;
40750Sstevel@tonic-gate sf.sf_flags = MDDB_NM_SET;
40760Sstevel@tonic-gate
40770Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
40780Sstevel@tonic-gate "Setting new master flag for set %s: %s"),
40790Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
40800Sstevel@tonic-gate
40810Sstevel@tonic-gate /*
40820Sstevel@tonic-gate * Fail reconfig cycle if ioctl fails since it is critical
40830Sstevel@tonic-gate * to set new master flag.
40840Sstevel@tonic-gate */
40850Sstevel@tonic-gate if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
40860Sstevel@tonic-gate NULL) != NULL) {
40870Sstevel@tonic-gate (void) mdstealerror(ep, &sf.sf_mde);
40880Sstevel@tonic-gate rval = -1;
40890Sstevel@tonic-gate }
40900Sstevel@tonic-gate }
40910Sstevel@tonic-gate
40920Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
40930Sstevel@tonic-gate if (rval == 0) {
40940Sstevel@tonic-gate (void) mdstealerror(ep, &xep);
40950Sstevel@tonic-gate rval = -1;
40960Sstevel@tonic-gate }
40970Sstevel@tonic-gate }
40980Sstevel@tonic-gate
40990Sstevel@tonic-gate cl_set_setkey(NULL);
41000Sstevel@tonic-gate
41010Sstevel@tonic-gate metaflushsetname(sp);
41020Sstevel@tonic-gate
41030Sstevel@tonic-gate return (rval);
41040Sstevel@tonic-gate
41050Sstevel@tonic-gate delete_set:
41060Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
41070Sstevel@tonic-gate "Master not chosen, deleting set %s: %s"),
41080Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
41090Sstevel@tonic-gate
41100Sstevel@tonic-gate /*
41110Sstevel@tonic-gate * Remove all set information from this node:
41120Sstevel@tonic-gate * - node records for this set
41130Sstevel@tonic-gate * - drive records for this set
41140Sstevel@tonic-gate * - set record for this set
41150Sstevel@tonic-gate * (Only do this on this node since each node
41160Sstevel@tonic-gate * will do it for its own local mddb.)
41170Sstevel@tonic-gate *
41180Sstevel@tonic-gate * If all nodes in set are ALIVE, then
41190Sstevel@tonic-gate * the lowest numbered ALIVE nodeid in set
41200Sstevel@tonic-gate * (regardless of whether it is an owner node or not) will
41210Sstevel@tonic-gate * call the DCS service to cleanup for create/delete of set.
41220Sstevel@tonic-gate * sdssc_create_end(cleanup) if set was being created or
41230Sstevel@tonic-gate * sdssc_delete_end(cleanup) if set was being deleted.
41240Sstevel@tonic-gate * A node record with flag ADD denotes a set being
41250Sstevel@tonic-gate * created. A node record with flag DEL denotes a
41260Sstevel@tonic-gate * set being deleted.
41270Sstevel@tonic-gate */
41280Sstevel@tonic-gate nd = sd->sd_nodelist;
41290Sstevel@tonic-gate while (nd) {
41300Sstevel@tonic-gate /* Found a node that isn't alive */
41310Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
41320Sstevel@tonic-gate break;
41330Sstevel@tonic-gate
41340Sstevel@tonic-gate /* Is my node the lowest numbered ALIVE node? */
41350Sstevel@tonic-gate if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
41360Sstevel@tonic-gate break;
41370Sstevel@tonic-gate }
41380Sstevel@tonic-gate nd = nd->nd_next;
41390Sstevel@tonic-gate }
41400Sstevel@tonic-gate if (nd == NULL) {
41410Sstevel@tonic-gate /* All nodes ALIVE and this is the lowest nodeid */
41420Sstevel@tonic-gate lowest_alive_nodeid = 1;
41430Sstevel@tonic-gate }
41440Sstevel@tonic-gate
41450Sstevel@tonic-gate if (clnt_lock_set(mynode(), sp, ep) == -1) {
41460Sstevel@tonic-gate return (-1);
41470Sstevel@tonic-gate }
41480Sstevel@tonic-gate
41490Sstevel@tonic-gate
41500Sstevel@tonic-gate /*
41510Sstevel@tonic-gate * If this node had been joined, withdraw and reset master.
41520Sstevel@tonic-gate *
41530Sstevel@tonic-gate * This could happen if a node was being added to or removed
41540Sstevel@tonic-gate * from a diskset and the node doing the add/delete operation and
41550Sstevel@tonic-gate * all other nodes in the diskset have left the cluster.
41560Sstevel@tonic-gate */
41570Sstevel@tonic-gate if (sd->sd_mn_mynode) {
41580Sstevel@tonic-gate nd = sd->sd_mn_mynode;
41590Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OWN) {
41600Sstevel@tonic-gate if (clnt_withdrawset(mynode(), sp, ep)) {
41610Sstevel@tonic-gate rval = -1;
41620Sstevel@tonic-gate goto out;
41630Sstevel@tonic-gate }
41640Sstevel@tonic-gate if (clnt_mnsetmaster(mynode(), sp, "",
41650Sstevel@tonic-gate MD_MN_INVALID_NID, ep)) {
41660Sstevel@tonic-gate rval = -1;
41670Sstevel@tonic-gate goto out;
41680Sstevel@tonic-gate }
41690Sstevel@tonic-gate }
41700Sstevel@tonic-gate }
41710Sstevel@tonic-gate
41720Sstevel@tonic-gate /*
41730Sstevel@tonic-gate * Remove side records for this node (side) from local mddb
41740Sstevel@tonic-gate * (clnt_deldrvs does this) if there are drives in the set.
41750Sstevel@tonic-gate *
41760Sstevel@tonic-gate * Don't need to mark this node as DEL since already marked as
41770Sstevel@tonic-gate * ADD or DEL (or this node would have been chosen as master).
41780Sstevel@tonic-gate * Don't need to mark other node records, drive records or
41790Sstevel@tonic-gate * set records as DEL. If a panic occurs during clnt_delset,
41800Sstevel@tonic-gate * these records will be deleted the next time this node
41810Sstevel@tonic-gate * becomes a member and goes through the reconfig cycle.
41820Sstevel@tonic-gate */
41830Sstevel@tonic-gate /* Get the drive descriptors for this set */
41840Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
41850Sstevel@tonic-gate ep)) == NULL) {
41860Sstevel@tonic-gate if (! mdisok(ep)) {
41870Sstevel@tonic-gate /*
41880Sstevel@tonic-gate * Ignore and clear out any failures from
41890Sstevel@tonic-gate * metaget_drivedesc since a panic could have
41900Sstevel@tonic-gate * occurred when a node was partially added to a set.
41910Sstevel@tonic-gate */
41920Sstevel@tonic-gate mdclrerror(ep);
41930Sstevel@tonic-gate }
41940Sstevel@tonic-gate } else {
41950Sstevel@tonic-gate if (clnt_deldrvs(mynode(), sp, dd, ep)) {
41960Sstevel@tonic-gate rval = -1;
41970Sstevel@tonic-gate goto out;
41980Sstevel@tonic-gate }
41990Sstevel@tonic-gate }
42000Sstevel@tonic-gate
42010Sstevel@tonic-gate /*
42020Sstevel@tonic-gate * Now, delete the set - this removes the node, drive
42030Sstevel@tonic-gate * and set records from the local mddb.
42040Sstevel@tonic-gate */
42050Sstevel@tonic-gate if (clnt_delset(mynode(), sp, ep)) {
42060Sstevel@tonic-gate rval = -1;
42070Sstevel@tonic-gate goto out;
42080Sstevel@tonic-gate }
42090Sstevel@tonic-gate
42100Sstevel@tonic-gate out:
42110Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
42120Sstevel@tonic-gate
42130Sstevel@tonic-gate /*
42140Sstevel@tonic-gate * Ignore errors from unlock of set since set is no longer
42150Sstevel@tonic-gate * known (if clnt_delset worked).
42160Sstevel@tonic-gate */
42170Sstevel@tonic-gate if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
42180Sstevel@tonic-gate mdclrerror(&xep);
42190Sstevel@tonic-gate }
42200Sstevel@tonic-gate
42210Sstevel@tonic-gate cl_set_setkey(NULL);
42220Sstevel@tonic-gate
42230Sstevel@tonic-gate metaflushsetname(sp);
42240Sstevel@tonic-gate
42250Sstevel@tonic-gate /*
42260Sstevel@tonic-gate * If this node is the lowest numbered nodeid then
42270Sstevel@tonic-gate * call sdssc_create/delete_end depending on whether
42280Sstevel@tonic-gate * this node is marked as ADD or DEL in the node record.
42290Sstevel@tonic-gate */
42300Sstevel@tonic-gate if (lowest_alive_nodeid) {
42310Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_ADD)
42320Sstevel@tonic-gate sdssc_create_end(sp->setname, SDSSC_CLEANUP);
42330Sstevel@tonic-gate else if (nd->nd_flags & MD_MN_NODE_DEL)
42340Sstevel@tonic-gate sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
42350Sstevel@tonic-gate }
42360Sstevel@tonic-gate
42370Sstevel@tonic-gate /* Finished with this set -- return */
42380Sstevel@tonic-gate return (rval);
42390Sstevel@tonic-gate }
42400Sstevel@tonic-gate
42410Sstevel@tonic-gate /*
42420Sstevel@tonic-gate * Reconfig step to choose a new master for all MN disksets.
42430Sstevel@tonic-gate * Return values:
42440Sstevel@tonic-gate * 0 - Everything is great.
42450Sstevel@tonic-gate * 1 - This node failed to reconfig.
42460Sstevel@tonic-gate * 205 - Cause another reconfig due to a nodelist problem
42470Sstevel@tonic-gate * or RPC failure to another node
42480Sstevel@tonic-gate */
42490Sstevel@tonic-gate int
meta_reconfig_choose_master(long timeout,md_error_t * ep)42500Sstevel@tonic-gate meta_reconfig_choose_master(
42513073Sjkennedy long timeout,
42520Sstevel@tonic-gate md_error_t *ep
42530Sstevel@tonic-gate )
42540Sstevel@tonic-gate {
42550Sstevel@tonic-gate set_t max_sets, setno;
42560Sstevel@tonic-gate int nodecnt;
42570Sstevel@tonic-gate mndiskset_membershiplist_t *nl;
42580Sstevel@tonic-gate md_set_desc *sd;
42590Sstevel@tonic-gate mdsetname_t *sp;
42600Sstevel@tonic-gate int rval = 0;
42610Sstevel@tonic-gate mddb_setflags_config_t sf;
42620Sstevel@tonic-gate int start_node_delayed = 0;
42630Sstevel@tonic-gate
42640Sstevel@tonic-gate if ((max_sets = get_max_sets(ep)) == 0) {
42650Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
42660Sstevel@tonic-gate "Unable to get number of sets"));
42670Sstevel@tonic-gate return (1);
42680Sstevel@tonic-gate }
42690Sstevel@tonic-gate
42700Sstevel@tonic-gate /*
42710Sstevel@tonic-gate * Get membershiplist from API routine. If there's
42720Sstevel@tonic-gate * an error, return a 205 to cause another reconfig.
42730Sstevel@tonic-gate */
42740Sstevel@tonic-gate if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
42750Sstevel@tonic-gate mde_perror(ep, "");
42760Sstevel@tonic-gate return (205);
42770Sstevel@tonic-gate }
42780Sstevel@tonic-gate
42790Sstevel@tonic-gate for (setno = 1; setno < max_sets; setno++) {
42800Sstevel@tonic-gate if ((sp = metasetnosetname(setno, ep)) == NULL) {
42810Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) {
42820Sstevel@tonic-gate /* No set for this setno - continue */
42830Sstevel@tonic-gate mdclrerror(ep);
42840Sstevel@tonic-gate continue;
42850Sstevel@tonic-gate } else {
42860Sstevel@tonic-gate /*
42870Sstevel@tonic-gate * If encountered an RPC error from my node,
42880Sstevel@tonic-gate * then immediately fail.
42890Sstevel@tonic-gate */
42900Sstevel@tonic-gate if (mdanyrpcerror(ep)) {
42910Sstevel@tonic-gate mde_perror(ep, "");
42920Sstevel@tonic-gate return (1);
42930Sstevel@tonic-gate }
42940Sstevel@tonic-gate /* Can't get set information */
42950Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
42964932Spetede "Unable to get information for "
42974932Spetede "set number %d"), setno);
42980Sstevel@tonic-gate mdclrerror(ep);
42990Sstevel@tonic-gate continue;
43000Sstevel@tonic-gate }
43010Sstevel@tonic-gate }
43020Sstevel@tonic-gate
43030Sstevel@tonic-gate /* If setname is there, set desc should exist. */
43040Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
43050Sstevel@tonic-gate /*
43060Sstevel@tonic-gate * If encountered an RPC error from my node,
43070Sstevel@tonic-gate * then immediately fail.
43080Sstevel@tonic-gate */
43090Sstevel@tonic-gate if (mdanyrpcerror(ep)) {
43100Sstevel@tonic-gate mde_perror(ep, "");
43110Sstevel@tonic-gate return (1);
43120Sstevel@tonic-gate }
43130Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
43144932Spetede "Unable to get set %s desc information"),
43154932Spetede sp->setname);
43160Sstevel@tonic-gate mdclrerror(ep);
43170Sstevel@tonic-gate continue;
43180Sstevel@tonic-gate }
43190Sstevel@tonic-gate
43200Sstevel@tonic-gate /* Only reconfig MN disksets */
43210Sstevel@tonic-gate if (!MD_MNSET_DESC(sd)) {
43220Sstevel@tonic-gate continue;
43230Sstevel@tonic-gate }
43240Sstevel@tonic-gate
43250Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
43260Sstevel@tonic-gate "Begin choose master for set %s: %s"),
43270Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
43280Sstevel@tonic-gate
43290Sstevel@tonic-gate /* Update nodelist with member information. */
43300Sstevel@tonic-gate if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
43310Sstevel@tonic-gate /*
43320Sstevel@tonic-gate * If encountered an RPC error from my node,
43330Sstevel@tonic-gate * then immediately fail.
43340Sstevel@tonic-gate */
43350Sstevel@tonic-gate if (mdanyrpcerror(ep)) {
43360Sstevel@tonic-gate mde_perror(ep, "");
43370Sstevel@tonic-gate return (1);
43380Sstevel@tonic-gate }
43390Sstevel@tonic-gate mde_perror(ep, "");
43400Sstevel@tonic-gate mdclrerror(ep);
43410Sstevel@tonic-gate continue;
43420Sstevel@tonic-gate }
43430Sstevel@tonic-gate
43440Sstevel@tonic-gate /*
43450Sstevel@tonic-gate * If all nodes in a cluster are starting, then
43460Sstevel@tonic-gate * all nodes will attempt to contact all other nodes
43470Sstevel@tonic-gate * to determine a master node. This can lead to a
43480Sstevel@tonic-gate * problem where node 1 is trying to contact the rpc.metad
43490Sstevel@tonic-gate * node 2 and node 2 is trying to contact the rpc.metad
43500Sstevel@tonic-gate * on node 1 -- and this causes the rpc call to fail
43510Sstevel@tonic-gate * on both nodes and causes a new reconfig cycle.
43520Sstevel@tonic-gate *
43530Sstevel@tonic-gate * In order to break this problem, a newly starting node
43540Sstevel@tonic-gate * will delay a small amount of time (nodeid mod 4 seconds)
43550Sstevel@tonic-gate * and will then run the code to choose a master for the
43560Sstevel@tonic-gate * first set. Delay will only be done once regardless of the
43570Sstevel@tonic-gate * number of sets.
43580Sstevel@tonic-gate */
43590Sstevel@tonic-gate if (start_node_delayed == 0) {
43600Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf));
43610Sstevel@tonic-gate sf.sf_setno = sp->setno;
43620Sstevel@tonic-gate sf.sf_flags = MDDB_NM_GET;
43630Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */
43640Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC;
43650Sstevel@tonic-gate if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
43660Sstevel@tonic-gate &sf.sf_mde, NULL) == 0) &&
43670Sstevel@tonic-gate ((sf.sf_setflags & MD_SET_MN_START_RC) ==
43680Sstevel@tonic-gate MD_SET_MN_START_RC)) {
43690Sstevel@tonic-gate (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
43700Sstevel@tonic-gate }
43710Sstevel@tonic-gate start_node_delayed = 1;
43720Sstevel@tonic-gate }
43730Sstevel@tonic-gate
43740Sstevel@tonic-gate /* Choose master for this set */
43750Sstevel@tonic-gate rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
43760Sstevel@tonic-gate if (rval == -1) {
43770Sstevel@tonic-gate mde_perror(ep, "");
43780Sstevel@tonic-gate return (1);
43790Sstevel@tonic-gate } else if (rval == 205) {
43800Sstevel@tonic-gate mde_perror(ep, "");
43810Sstevel@tonic-gate return (205);
43820Sstevel@tonic-gate }
43830Sstevel@tonic-gate
43843073Sjkennedy /* reinit rpc.mdcommd with new nodelist */
43853073Sjkennedy if (mdmn_reinit_set(sp->setno, timeout)) {
43863073Sjkennedy md_eprintf(dgettext(TEXT_DOMAIN,
43873073Sjkennedy "Could not re-initialise rpc.mdcommd for "
43883073Sjkennedy "set %s\n"), sp->setname);
43893073Sjkennedy return (1);
43903073Sjkennedy }
43910Sstevel@tonic-gate
43920Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
43930Sstevel@tonic-gate "Choose master for set %s completed: %s"),
43940Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
43950Sstevel@tonic-gate }
43960Sstevel@tonic-gate
43970Sstevel@tonic-gate /*
43980Sstevel@tonic-gate * Each node turns on I/Os for all MN disksets.
43990Sstevel@tonic-gate * This is to recover from the situation where the master died
44000Sstevel@tonic-gate * during a previous reconfig cycle when I/Os were suspended
44010Sstevel@tonic-gate * for a MN diskset.
44020Sstevel@tonic-gate * If a failure occurs return a 1 which will force this node to
44030Sstevel@tonic-gate * panic. Cannot leave node in the situation where I/Os are
44040Sstevel@tonic-gate * not resumed.
44050Sstevel@tonic-gate */
44060Sstevel@tonic-gate setno = 0; /* 0 means all MN sets */
44070Sstevel@tonic-gate if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
44080Sstevel@tonic-gate mde_perror(ep, "");
44090Sstevel@tonic-gate return (1);
44100Sstevel@tonic-gate }
44110Sstevel@tonic-gate
44120Sstevel@tonic-gate /* Free the nodelist */
44130Sstevel@tonic-gate if (nodecnt)
44140Sstevel@tonic-gate meta_free_nodelist(nl);
44150Sstevel@tonic-gate
44160Sstevel@tonic-gate return (0);
44170Sstevel@tonic-gate }
44180Sstevel@tonic-gate
44190Sstevel@tonic-gate /*
44200Sstevel@tonic-gate * meta_mnsync_user_records will synchronize the diskset user records across
44210Sstevel@tonic-gate * all nodes in the diskset. The diskset user records are stored in
44220Sstevel@tonic-gate * each node's local set mddb.
44230Sstevel@tonic-gate *
44240Sstevel@tonic-gate * This needs to be done even if there is no master change during the
44250Sstevel@tonic-gate * reconfig cycle since this routine should clean up any mess left by
44260Sstevel@tonic-gate * the untimely termination of a metaset or metadb command (due to a
44270Sstevel@tonic-gate * node panic or to user intervention).
44280Sstevel@tonic-gate *
44290Sstevel@tonic-gate * Caller is the Master node.
44300Sstevel@tonic-gate *
44310Sstevel@tonic-gate * Returns 0 - Success
44320Sstevel@tonic-gate * 205 - Failure during RPC to another node
44330Sstevel@tonic-gate * -1 - Any other failure and ep is filled in.
44340Sstevel@tonic-gate */
44350Sstevel@tonic-gate int
meta_mnsync_user_records(mdsetname_t * sp,md_error_t * ep)44360Sstevel@tonic-gate meta_mnsync_user_records(
44370Sstevel@tonic-gate mdsetname_t *sp,
44380Sstevel@tonic-gate md_error_t *ep
44390Sstevel@tonic-gate )
44400Sstevel@tonic-gate {
44410Sstevel@tonic-gate md_set_desc *sd;
44420Sstevel@tonic-gate md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail;
44430Sstevel@tonic-gate md_mnset_record *mnsr;
44440Sstevel@tonic-gate md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL;
44450Sstevel@tonic-gate md_mnnode_record *nr;
44460Sstevel@tonic-gate md_drive_record *dr;
44470Sstevel@tonic-gate int dr_cnt, dd_cnt;
44480Sstevel@tonic-gate int found_my_nr;
44490Sstevel@tonic-gate md_drive_desc *dd, *dd_prev, *master_dd, *other_dd;
44500Sstevel@tonic-gate int all_drives_ok;
44510Sstevel@tonic-gate int rval = 0;
44520Sstevel@tonic-gate int max_genid = 0;
44530Sstevel@tonic-gate int num_alive_nodes, num_alive_nodes_del = 0;
44540Sstevel@tonic-gate int set_locked = 0;
44550Sstevel@tonic-gate md_setkey_t *cl_sk;
44560Sstevel@tonic-gate md_error_t xep = mdnullerror;
44570Sstevel@tonic-gate char *anode[1];
44580Sstevel@tonic-gate mddb_setflags_config_t sf;
44590Sstevel@tonic-gate
44600Sstevel@tonic-gate /*
44610Sstevel@tonic-gate * Sync up node records first.
44620Sstevel@tonic-gate * Construct a master nodelist using the nodelist from this
44630Sstevel@tonic-gate * node's rpc.metad node records and then setting the state of each
44640Sstevel@tonic-gate * node following these rules:
44650Sstevel@tonic-gate * - If a node record is marked OK on its node, mark it OK
44660Sstevel@tonic-gate * in the master nodelist (and later OK on all nodes)
44670Sstevel@tonic-gate * If a node record is also marked OWN on its node,
44680Sstevel@tonic-gate * mark it OWN in the master nodelist.
44690Sstevel@tonic-gate * - If a node record is not marked OK on its node, then mark
44700Sstevel@tonic-gate * it as DEL in the master list (later deleting it)
44710Sstevel@tonic-gate * - If node record doesn't exist on that node, then mark it DEL
44720Sstevel@tonic-gate * (later deleting it)
44730Sstevel@tonic-gate * - If set record doesn't exist on that node, mark node as DEL
44740Sstevel@tonic-gate * - If a node record doesn't exist on all nodes, then mark it DEL
44750Sstevel@tonic-gate * - If a node is not ALIVE, then
44760Sstevel@tonic-gate * - If that node marked DEL on any node - mark it DEL
44770Sstevel@tonic-gate * in master list but leave in nodelist
44780Sstevel@tonic-gate * - If that node is marked as ADD on any node, mark it
44790Sstevel@tonic-gate * ADD in the master list but leave in nodelist
44800Sstevel@tonic-gate * - When that node returns to the living, the DEL
44810Sstevel@tonic-gate * node record will be removed and the ADD node
44820Sstevel@tonic-gate * record may be removed if marked ADD on that
44830Sstevel@tonic-gate * node.
44840Sstevel@tonic-gate * The key rule is to not remove a node from the nodelist until
44850Sstevel@tonic-gate * that node record is removed from its own node. Do not want to
44860Sstevel@tonic-gate * remove a node's record from all other nodes and then have
44870Sstevel@tonic-gate * that node have its own record marked OK so that a node will pick
44880Sstevel@tonic-gate * a different master than the other nodes.
44890Sstevel@tonic-gate *
44900Sstevel@tonic-gate * Next,
44910Sstevel@tonic-gate * If node is ALIVE and node record is marked DEL in master nodelist,
44920Sstevel@tonic-gate * remove node from set.
44930Sstevel@tonic-gate * If node is ALIVE and node record is marked OK in master nodelist,
44940Sstevel@tonic-gate * mark it OK on all other nodes.
44950Sstevel@tonic-gate * If node is not ALIVE and node record is marked DEL in master
44960Sstevel@tonic-gate * nodelist, mark it DEL on all other nodes.
44970Sstevel@tonic-gate * If node is not ALIVE and node record is marked ADD in master,
44980Sstevel@tonic-gate * nodelist, mark it ADD on all other nodes.
44990Sstevel@tonic-gate */
45000Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
45010Sstevel@tonic-gate return (-1);
45020Sstevel@tonic-gate }
45030Sstevel@tonic-gate master_nodelist = sd->sd_nodelist;
45040Sstevel@tonic-gate
45050Sstevel@tonic-gate /*
45060Sstevel@tonic-gate * Walk through nodelist creating a master nodelist.
45070Sstevel@tonic-gate */
45080Sstevel@tonic-gate num_alive_nodes = 0;
45090Sstevel@tonic-gate nd = master_nodelist;
45100Sstevel@tonic-gate while (nd) {
45110Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
45120Sstevel@tonic-gate nd = nd->nd_next;
45130Sstevel@tonic-gate continue;
45140Sstevel@tonic-gate }
45150Sstevel@tonic-gate num_alive_nodes++;
45160Sstevel@tonic-gate if (clnt_mngetset(nd->nd_nodename, sp->setname,
45170Sstevel@tonic-gate MD_SET_BAD, &mnsr, ep) == -1) {
45180Sstevel@tonic-gate if (mdiserror(ep, MDE_NO_SET)) {
45190Sstevel@tonic-gate /* set doesn't exist, mark node as DEL */
45200Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK;
45210Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD;
45220Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
45230Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_NOSET;
45240Sstevel@tonic-gate nd = nd->nd_next;
45250Sstevel@tonic-gate continue;
45260Sstevel@tonic-gate } else {
45270Sstevel@tonic-gate /* If RPC failure to another node return 205 */
45280Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
45290Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
45300Sstevel@tonic-gate nd->nd_nodeid)) {
45310Sstevel@tonic-gate rval = 205;
45320Sstevel@tonic-gate } else {
45330Sstevel@tonic-gate /* Any other failure */
45340Sstevel@tonic-gate rval = -1;
45350Sstevel@tonic-gate }
45360Sstevel@tonic-gate goto out;
45370Sstevel@tonic-gate }
45380Sstevel@tonic-gate }
45390Sstevel@tonic-gate /* Find biggest genid in records for this diskset */
45400Sstevel@tonic-gate if (mnsr->sr_genid > max_genid)
45410Sstevel@tonic-gate max_genid = mnsr->sr_genid;
45420Sstevel@tonic-gate
45430Sstevel@tonic-gate dr = mnsr->sr_drivechain;
45440Sstevel@tonic-gate while (dr) {
45450Sstevel@tonic-gate /* Find biggest genid in records for this diskset */
45460Sstevel@tonic-gate if (dr->dr_genid > max_genid) {
45470Sstevel@tonic-gate max_genid = dr->dr_genid;
45480Sstevel@tonic-gate }
45490Sstevel@tonic-gate dr = dr->dr_next;
45500Sstevel@tonic-gate }
45510Sstevel@tonic-gate
45520Sstevel@tonic-gate found_my_nr = 0;
45530Sstevel@tonic-gate nr = mnsr->sr_nodechain;
45540Sstevel@tonic-gate /* nr is the list of node recs from nd_nodename node */
45550Sstevel@tonic-gate while (nr) {
45560Sstevel@tonic-gate /* Find biggest genid in records for this diskset */
45570Sstevel@tonic-gate if (nr->nr_genid > max_genid)
45580Sstevel@tonic-gate max_genid = nr->nr_genid;
45590Sstevel@tonic-gate nd2 = master_nodelist;
45600Sstevel@tonic-gate ndtail = NULL;
45610Sstevel@tonic-gate /* For each node record, is it in master list? */
45620Sstevel@tonic-gate while (nd2) {
45630Sstevel@tonic-gate if (nd2->nd_nodeid == nr->nr_nodeid)
45640Sstevel@tonic-gate break;
45650Sstevel@tonic-gate if (nd2->nd_next == NULL)
45660Sstevel@tonic-gate ndtail = nd2;
45670Sstevel@tonic-gate nd2 = nd2->nd_next;
45680Sstevel@tonic-gate }
45690Sstevel@tonic-gate /*
45700Sstevel@tonic-gate * Found node record not in master list -- add it
45710Sstevel@tonic-gate * to list marking it as DEL since node record
45720Sstevel@tonic-gate * should exist on all nodes unless a panic occurred
45730Sstevel@tonic-gate * during addition or deletion of host to diskset.
45740Sstevel@tonic-gate */
45750Sstevel@tonic-gate if (nd2 == NULL) {
45760Sstevel@tonic-gate nd2 = Zalloc(sizeof (*nd2));
45770Sstevel@tonic-gate (void) strcpy(nd2->nd_nodename,
45780Sstevel@tonic-gate nr->nr_nodename);
45790Sstevel@tonic-gate nd2->nd_flags = nr->nr_flags;
45800Sstevel@tonic-gate nd2->nd_flags |= MD_MN_NODE_DEL;
45810Sstevel@tonic-gate nd2->nd_nodeid = nr->nr_nodeid;
45820Sstevel@tonic-gate nd2->nd_next = NULL;
45830Sstevel@tonic-gate ndtail->nd_next = nd2;
45840Sstevel@tonic-gate nd2 = NULL;
45850Sstevel@tonic-gate nr = nr->nr_next;
45860Sstevel@tonic-gate continue;
45870Sstevel@tonic-gate }
45880Sstevel@tonic-gate /*
45890Sstevel@tonic-gate * Is this the node record for the node that
45900Sstevel@tonic-gate * we requested the set desc from?
45910Sstevel@tonic-gate * If so, check if node has its own node record
45920Sstevel@tonic-gate * marked OK. If marked OK, check for the OWN bit.
45930Sstevel@tonic-gate */
45940Sstevel@tonic-gate if (nr->nr_nodeid == nd->nd_nodeid) {
45950Sstevel@tonic-gate found_my_nr = 1;
45960Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OK) {
45970Sstevel@tonic-gate /*
45980Sstevel@tonic-gate * If node record is marked OK
45990Sstevel@tonic-gate * on its own node, then mark it OK
46000Sstevel@tonic-gate * in the master list. Node record
46010Sstevel@tonic-gate * would have to exist on all nodes
46020Sstevel@tonic-gate * in the ADD state before it could
46030Sstevel@tonic-gate * be put into the OK state.
46040Sstevel@tonic-gate */
46050Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OK;
46060Sstevel@tonic-gate nd->nd_flags &=
46070Sstevel@tonic-gate ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
46080Sstevel@tonic-gate /*
46090Sstevel@tonic-gate * Mark own in master list as marked
46100Sstevel@tonic-gate * on own node.
46110Sstevel@tonic-gate */
46120Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_OWN)
46130Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN;
46140Sstevel@tonic-gate else
46150Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OWN;
46160Sstevel@tonic-gate } else {
46170Sstevel@tonic-gate /* Otherwise, mark node as DEL */
46180Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK;
46190Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD;
46200Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
46210Sstevel@tonic-gate }
46220Sstevel@tonic-gate }
46230Sstevel@tonic-gate /*
46240Sstevel@tonic-gate * If node is not ALIVE and marked DEL
46250Sstevel@tonic-gate * on any node, make it DEL in master list.
46260Sstevel@tonic-gate * If node is not ALIVE and marked ADD
46270Sstevel@tonic-gate * on any node, make it ADD in master list
46280Sstevel@tonic-gate * unless node record has already been marked DEL.
46290Sstevel@tonic-gate */
46300Sstevel@tonic-gate if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
46310Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_ADD) {
46320Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
46330Sstevel@tonic-gate /* If not DEL - mark it ADD */
46340Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_ADD;
46350Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK;
46360Sstevel@tonic-gate }
46370Sstevel@tonic-gate }
46380Sstevel@tonic-gate if (nr->nr_flags & MD_MN_NODE_DEL) {
46390Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
46400Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK;
46410Sstevel@tonic-gate /* Could already be ADD - make it DEL */
46420Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_ADD;
46430Sstevel@tonic-gate }
46440Sstevel@tonic-gate }
46450Sstevel@tonic-gate nr = nr->nr_next;
46460Sstevel@tonic-gate }
46470Sstevel@tonic-gate /*
46480Sstevel@tonic-gate * If a node record doesn't exist on its own node,
46490Sstevel@tonic-gate * then mark node as DEL.
46500Sstevel@tonic-gate */
46510Sstevel@tonic-gate if (found_my_nr == 0) {
46520Sstevel@tonic-gate nd->nd_flags &= ~MD_MN_NODE_OK;
46530Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_DEL;
46540Sstevel@tonic-gate }
46550Sstevel@tonic-gate
46560Sstevel@tonic-gate /*
46570Sstevel@tonic-gate * If node is OK - put mnsr onto master_mnsr_node list for
46580Sstevel@tonic-gate * later use when syncing up the drive records in the set.
46590Sstevel@tonic-gate */
46600Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_OK) {
46610Sstevel@tonic-gate mnsr_node = Zalloc(sizeof (*mnsr_node));
46620Sstevel@tonic-gate mnsr_node->mmn_mnsr = mnsr;
46630Sstevel@tonic-gate (void) strncpy(mnsr_node->mmn_nodename,
46644932Spetede nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
46650Sstevel@tonic-gate mnsr_node->mmn_next = master_mnsr_node;
46660Sstevel@tonic-gate master_mnsr_node = mnsr_node;
46670Sstevel@tonic-gate } else {
46680Sstevel@tonic-gate free_sr((struct md_set_record *)mnsr);
46690Sstevel@tonic-gate }
46700Sstevel@tonic-gate
46710Sstevel@tonic-gate nd = nd->nd_next;
46720Sstevel@tonic-gate }
46730Sstevel@tonic-gate
46740Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
46750Sstevel@tonic-gate "Master nodelist created for set %s: %s"),
46760Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
46770Sstevel@tonic-gate
46780Sstevel@tonic-gate /*
46790Sstevel@tonic-gate * Send master nodelist to the rpc.metad on all nodes (including
46800Sstevel@tonic-gate * myself) and each node will update itself. This will set the
46810Sstevel@tonic-gate * ADD and DEL flags on each node as setup in the master nodelist.
46820Sstevel@tonic-gate * Don't send nodelist to node where set doesn't exist.
46830Sstevel@tonic-gate */
46840Sstevel@tonic-gate nd = master_nodelist;
46850Sstevel@tonic-gate while (nd) {
46860Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
46870Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) {
46880Sstevel@tonic-gate nd = nd->nd_next;
46890Sstevel@tonic-gate continue;
46900Sstevel@tonic-gate }
46910Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp,
46920Sstevel@tonic-gate master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
46930Sstevel@tonic-gate /* If RPC failure to another node return 205 */
46940Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
46950Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
46960Sstevel@tonic-gate nd->nd_nodeid)) {
46970Sstevel@tonic-gate rval = 205;
46980Sstevel@tonic-gate } else {
46990Sstevel@tonic-gate /* Any other failure */
47000Sstevel@tonic-gate rval = -1;
47010Sstevel@tonic-gate }
47020Sstevel@tonic-gate goto out;
47030Sstevel@tonic-gate }
47040Sstevel@tonic-gate nd = nd->nd_next;
47050Sstevel@tonic-gate }
47060Sstevel@tonic-gate
47070Sstevel@tonic-gate /*
47080Sstevel@tonic-gate * Now, delete nodes that need to be deleted.
47090Sstevel@tonic-gate */
47100Sstevel@tonic-gate if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
47110Sstevel@tonic-gate ep)) == NULL) {
47120Sstevel@tonic-gate if (! mdisok(ep)) {
47130Sstevel@tonic-gate rval = -1;
47140Sstevel@tonic-gate goto out;
47150Sstevel@tonic-gate }
47160Sstevel@tonic-gate }
47170Sstevel@tonic-gate
47180Sstevel@tonic-gate /*
47190Sstevel@tonic-gate * May be doing lots of RPC commands to the nodes, so lock the
47200Sstevel@tonic-gate * ALIVE members of the set since most of the rpc.metad routines
47210Sstevel@tonic-gate * require this for security reasons.
47220Sstevel@tonic-gate */
47230Sstevel@tonic-gate nd = master_nodelist;
47240Sstevel@tonic-gate while (nd) {
47250Sstevel@tonic-gate /* Skip non-alive nodes and node without set */
47260Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
47270Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) {
47280Sstevel@tonic-gate nd = nd->nd_next;
47290Sstevel@tonic-gate continue;
47300Sstevel@tonic-gate }
47310Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
47320Sstevel@tonic-gate /* If RPC failure to another node return 205 */
47330Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
47340Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
47350Sstevel@tonic-gate nd->nd_nodeid)) {
47360Sstevel@tonic-gate rval = 205;
47370Sstevel@tonic-gate } else {
47380Sstevel@tonic-gate /* Any other failure */
47390Sstevel@tonic-gate rval = -1;
47400Sstevel@tonic-gate }
47410Sstevel@tonic-gate goto out;
47420Sstevel@tonic-gate }
47430Sstevel@tonic-gate set_locked = 1;
47440Sstevel@tonic-gate nd = nd->nd_next;
47450Sstevel@tonic-gate }
47460Sstevel@tonic-gate
47470Sstevel@tonic-gate nd = master_nodelist;
47480Sstevel@tonic-gate while (nd) {
47490Sstevel@tonic-gate /* Skip non-alive nodes */
47500Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
47510Sstevel@tonic-gate nd = nd->nd_next;
47520Sstevel@tonic-gate continue;
47530Sstevel@tonic-gate }
47540Sstevel@tonic-gate if (nd->nd_flags & MD_MN_NODE_DEL) {
47550Sstevel@tonic-gate num_alive_nodes_del++;
47560Sstevel@tonic-gate /*
47570Sstevel@tonic-gate * Delete this node rec from all ALIVE nodes in diskset.
47580Sstevel@tonic-gate */
47590Sstevel@tonic-gate nd2 = master_nodelist;
47600Sstevel@tonic-gate while (nd2) {
47610Sstevel@tonic-gate /* Skip non-alive nodes and node without set */
47620Sstevel@tonic-gate if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
47630Sstevel@tonic-gate (nd2->nd_flags & MD_MN_NODE_NOSET)) {
47640Sstevel@tonic-gate nd2 = nd2->nd_next;
47650Sstevel@tonic-gate continue;
47660Sstevel@tonic-gate }
47670Sstevel@tonic-gate
47680Sstevel@tonic-gate /* This is a node being deleted from set */
47690Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid) {
47700Sstevel@tonic-gate /* Mark set record as DEL */
47710Sstevel@tonic-gate if (clnt_upd_sr_flags(nd->nd_nodename,
47720Sstevel@tonic-gate sp, MD_SR_DEL, ep)) {
47730Sstevel@tonic-gate /* RPC failure to !my node */
47740Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
47750Sstevel@tonic-gate (sd->sd_mn_mynode->
47760Sstevel@tonic-gate nd_nodeid
47770Sstevel@tonic-gate != nd->nd_nodeid)) {
47780Sstevel@tonic-gate rval = 205;
47790Sstevel@tonic-gate } else {
47800Sstevel@tonic-gate /* Any other failure */
47810Sstevel@tonic-gate rval = -1;
47820Sstevel@tonic-gate }
47830Sstevel@tonic-gate goto out;
47840Sstevel@tonic-gate }
47850Sstevel@tonic-gate if (clnt_deldrvs(nd->nd_nodename, sp,
47860Sstevel@tonic-gate dd, ep)) {
47870Sstevel@tonic-gate /* RPC failure to !my node */
47880Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
47890Sstevel@tonic-gate (sd->sd_mn_mynode->
47900Sstevel@tonic-gate nd_nodeid
47910Sstevel@tonic-gate != nd->nd_nodeid)) {
47920Sstevel@tonic-gate rval = 205;
47930Sstevel@tonic-gate } else {
47940Sstevel@tonic-gate /* Any other failure */
47950Sstevel@tonic-gate rval = -1;
47960Sstevel@tonic-gate }
47970Sstevel@tonic-gate goto out;
47980Sstevel@tonic-gate }
47990Sstevel@tonic-gate if (clnt_delset(nd->nd_nodename, sp,
48000Sstevel@tonic-gate ep) == -1) {
48010Sstevel@tonic-gate /* RPC failure to !my node */
48020Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
48030Sstevel@tonic-gate (sd->sd_mn_mynode->
48040Sstevel@tonic-gate nd_nodeid
48050Sstevel@tonic-gate != nd->nd_nodeid)) {
48060Sstevel@tonic-gate rval = 205;
48070Sstevel@tonic-gate } else {
48080Sstevel@tonic-gate /* Any other failure */
48090Sstevel@tonic-gate rval = -1;
48100Sstevel@tonic-gate }
48110Sstevel@tonic-gate goto out;
48120Sstevel@tonic-gate }
48130Sstevel@tonic-gate } else {
48140Sstevel@tonic-gate /*
48150Sstevel@tonic-gate * Delete host from sets on hosts
48160Sstevel@tonic-gate * not being deleted.
48170Sstevel@tonic-gate */
48180Sstevel@tonic-gate anode[0] = Strdup(nd->nd_nodename);
48190Sstevel@tonic-gate if (clnt_delhosts(nd2->nd_nodename, sp,
48200Sstevel@tonic-gate 1, anode, ep) == -1) {
48210Sstevel@tonic-gate Free(anode[0]);
48220Sstevel@tonic-gate /* RPC failure to !my node */
48230Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
48240Sstevel@tonic-gate (sd->sd_mn_mynode->
48250Sstevel@tonic-gate nd_nodeid
48260Sstevel@tonic-gate != nd2->nd_nodeid)) {
48270Sstevel@tonic-gate rval = 205;
48280Sstevel@tonic-gate } else {
48290Sstevel@tonic-gate /* Any other failure */
48300Sstevel@tonic-gate rval = -1;
48310Sstevel@tonic-gate }
48320Sstevel@tonic-gate goto out;
48330Sstevel@tonic-gate }
48340Sstevel@tonic-gate
48350Sstevel@tonic-gate meta_mc_log(MC_LOG5,
48360Sstevel@tonic-gate dgettext(TEXT_DOMAIN,
48370Sstevel@tonic-gate "Deleted node %s (%d) on node %s "
48380Sstevel@tonic-gate "from set %s: %s"),
48390Sstevel@tonic-gate nd->nd_nodename, nd->nd_nodeid,
48400Sstevel@tonic-gate nd2->nd_nodename,
48410Sstevel@tonic-gate sp->setname,
48420Sstevel@tonic-gate meta_print_hrtime(
48430Sstevel@tonic-gate gethrtime() - start_time));
48440Sstevel@tonic-gate
48450Sstevel@tonic-gate Free(anode[0]);
48460Sstevel@tonic-gate }
48470Sstevel@tonic-gate nd2 = nd2->nd_next;
48480Sstevel@tonic-gate }
48490Sstevel@tonic-gate }
48500Sstevel@tonic-gate nd = nd->nd_next;
48510Sstevel@tonic-gate }
48520Sstevel@tonic-gate
48530Sstevel@tonic-gate nd = master_nodelist;
48540Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
48550Sstevel@tonic-gate while (nd) {
48560Sstevel@tonic-gate /* Skip non-alive nodes and node without set */
48570Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
48580Sstevel@tonic-gate (nd->nd_flags & MD_MN_NODE_NOSET)) {
48590Sstevel@tonic-gate nd = nd->nd_next;
48600Sstevel@tonic-gate continue;
48610Sstevel@tonic-gate }
48620Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
48630Sstevel@tonic-gate /* If RPC failure to another node return 205 */
48640Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
48650Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
48660Sstevel@tonic-gate nd->nd_nodeid)) {
48670Sstevel@tonic-gate rval = 205;
48680Sstevel@tonic-gate } else {
48690Sstevel@tonic-gate /* Any other failure */
48700Sstevel@tonic-gate rval = -1;
48710Sstevel@tonic-gate }
48720Sstevel@tonic-gate goto out;
48730Sstevel@tonic-gate }
48740Sstevel@tonic-gate nd = nd->nd_next;
48750Sstevel@tonic-gate }
48760Sstevel@tonic-gate cl_set_setkey(NULL);
48770Sstevel@tonic-gate set_locked = 0;
48780Sstevel@tonic-gate
48790Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
48800Sstevel@tonic-gate "Nodelist syncronization complete for set %s: %s"),
48810Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
48820Sstevel@tonic-gate
48830Sstevel@tonic-gate metaflushsetname(sp);
48840Sstevel@tonic-gate
48850Sstevel@tonic-gate /*
48860Sstevel@tonic-gate * If all alive nodes have been deleted from set, just
48870Sstevel@tonic-gate * return since nothing else can be done until non-alive
48880Sstevel@tonic-gate * nodes (if there are any) rejoin the cluster.
48890Sstevel@tonic-gate */
48900Sstevel@tonic-gate if (num_alive_nodes == num_alive_nodes_del) {
48910Sstevel@tonic-gate rval = 0;
48920Sstevel@tonic-gate goto out;
48930Sstevel@tonic-gate }
48940Sstevel@tonic-gate
48950Sstevel@tonic-gate /*
48960Sstevel@tonic-gate * Sync up drive records.
48970Sstevel@tonic-gate *
48980Sstevel@tonic-gate * If a node panic'd (or metaset command was killed) during the
48990Sstevel@tonic-gate * addition or deletion of a drive to the diskset, the nodes
49000Sstevel@tonic-gate * may have a different view of the drive list. During cleanup
49010Sstevel@tonic-gate * of the drive list during reconfig, a drive will be deleted
49020Sstevel@tonic-gate * from the list if the master node sees that the drive has been
49030Sstevel@tonic-gate * marked in the ADD state on any node or is marked in the DEL state
49040Sstevel@tonic-gate * on all nodes.
49050Sstevel@tonic-gate * This cleanup must occur even if all nodes in the cluster are
49060Sstevel@tonic-gate * not part of the cluster so that all nodes have the same view
49070Sstevel@tonic-gate * of the drivelist.
49080Sstevel@tonic-gate * Then if the entire cluster goes down and comes back up, the
49090Sstevel@tonic-gate * new master node could be a node that wasn't in the cluster when
49100Sstevel@tonic-gate * the node was deleted. This could lead to a situation where the
49110Sstevel@tonic-gate * master node thinks that a drive is OK, but this drive isn't
49120Sstevel@tonic-gate * known to the other nodes.
49130Sstevel@tonic-gate * This situation can also occur during the addition of a drive
49140Sstevel@tonic-gate * where a node has the drive marked OK, but the node executing the
49150Sstevel@tonic-gate * metaset command encountered a failure before marking that drive OK
49160Sstevel@tonic-gate * on the rest of the nodes. If the node with the OK drive then
49170Sstevel@tonic-gate * panics, then rest of the nodes will remove that drive marked ADD
49180Sstevel@tonic-gate * and when the node with the OK drive rejoins the cluster, it will
49190Sstevel@tonic-gate * have a drive marked OK that is unknown by the other nodes.
49200Sstevel@tonic-gate *
49210Sstevel@tonic-gate * There are 2 situations to consider:
49220Sstevel@tonic-gate * A) Master knows about a drive that other nodes don't know about.
49230Sstevel@tonic-gate * B) At least one slave node knows about a drive that the master
49240Sstevel@tonic-gate * node doesn't know about.
49250Sstevel@tonic-gate *
49260Sstevel@tonic-gate * To handle these situations the following steps are followed:
49270Sstevel@tonic-gate * 1) Count number of drives known by this master node and the
49280Sstevel@tonic-gate * other slave nodes.
49290Sstevel@tonic-gate * If all nodes have the same number of drives and the master has
49300Sstevel@tonic-gate * all drives marked OK, then skip to step4.
49310Sstevel@tonic-gate *
49320Sstevel@tonic-gate * 2) If a node has less drives listed than the master, the master
49330Sstevel@tonic-gate * must get the drive descriptor list from that node so that
49340Sstevel@tonic-gate * master can determine which drive it needs to delete from that
49350Sstevel@tonic-gate * node. Master must get the drive descriptor list since the
49360Sstevel@tonic-gate * drive record list does not contain the name of the drive, but
49370Sstevel@tonic-gate * only a key and the key can only be interpreted on that other
49380Sstevel@tonic-gate * node.
49390Sstevel@tonic-gate *
49400Sstevel@tonic-gate * 3) The master will then create the master drive list by doing:
49410Sstevel@tonic-gate * - Master starts with drive list known by master.
49420Sstevel@tonic-gate * - Any drive marked ADD will be removed from the list.
49430Sstevel@tonic-gate * - Any drive not known by another node (from step2) will be
49440Sstevel@tonic-gate * removed from the drive list.
49450Sstevel@tonic-gate * - If a drive is marked DEL on the master, the master must
49460Sstevel@tonic-gate * verify that the drive record is marked DEL on all nodes.
49470Sstevel@tonic-gate * If any node has the drive record marked OK, mark it OK
49480Sstevel@tonic-gate * on the master. (The reason why is described below).
49490Sstevel@tonic-gate *
49500Sstevel@tonic-gate * 4) The master sends out the master drive list and the slave
49510Sstevel@tonic-gate * nodes will force their drive lists to match the master
49520Sstevel@tonic-gate * drive list by deleting drives, if necessary and by changing
49530Sstevel@tonic-gate * the drive record states from ADD->OK if master has drive
49540Sstevel@tonic-gate * marked OK and slave has drive marked ADD.
49550Sstevel@tonic-gate *
49560Sstevel@tonic-gate * Interesting scenarios:
49570Sstevel@tonic-gate *
49580Sstevel@tonic-gate * 1) System has 4 nodes with node 1 as the master. Node 3 starts
49590Sstevel@tonic-gate * to delete a drive record (drive record on node 1 is marked DEL),
49600Sstevel@tonic-gate * but is stopped when node 3 panics. Node 1 also panics.
49610Sstevel@tonic-gate * During reconfig cycle, node 2 is picked as master and the drive
49620Sstevel@tonic-gate * record is left alone since all nodes in the cluster have it
49630Sstevel@tonic-gate * marked OK. User now sees drive as part of diskset.
49640Sstevel@tonic-gate * Now, entire cluster is rebooted and node 1 rejoins the cluster.
49650Sstevel@tonic-gate * Node 1 is picked as the master and node 1 has drive record
49660Sstevel@tonic-gate * marked DEL. Node 1 contacts all other nodes in the cluster
49670Sstevel@tonic-gate * and since at least one node has the drive record marked OK,
49680Sstevel@tonic-gate * the master marks the drive record OK.
49690Sstevel@tonic-gate * User continues to see the drive as part of the diskset.
49700Sstevel@tonic-gate */
49710Sstevel@tonic-gate
49720Sstevel@tonic-gate /* Reget set descriptor since flushed above */
49730Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
49740Sstevel@tonic-gate rval = -1;
49750Sstevel@tonic-gate goto out;
49760Sstevel@tonic-gate }
49770Sstevel@tonic-gate
49780Sstevel@tonic-gate /* Has side effect of setting sd->sd_drvs to same as master_dd */
49790Sstevel@tonic-gate if ((master_dd = metaget_drivedesc_sideno(sp,
49800Sstevel@tonic-gate sd->sd_mn_mynode->nd_nodeid,
49810Sstevel@tonic-gate (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
49820Sstevel@tonic-gate /* No drives in list */
49830Sstevel@tonic-gate if (!mdisok(ep)) {
49840Sstevel@tonic-gate /*
49850Sstevel@tonic-gate * Can't get drive list for this node, so
49860Sstevel@tonic-gate * return -1 causing this node to be removed
49870Sstevel@tonic-gate * from cluster config and fixed.
49880Sstevel@tonic-gate */
49890Sstevel@tonic-gate rval = -1;
49900Sstevel@tonic-gate goto out;
49910Sstevel@tonic-gate }
49920Sstevel@tonic-gate }
49930Sstevel@tonic-gate
49940Sstevel@tonic-gate /* Count the number of drives for all nodes */
49950Sstevel@tonic-gate mnsr_node = master_mnsr_node;
49960Sstevel@tonic-gate while (mnsr_node) {
49970Sstevel@tonic-gate dr_cnt = 0;
49980Sstevel@tonic-gate dr = mnsr_node->mmn_mnsr->sr_drivechain;
49990Sstevel@tonic-gate while (dr) {
50000Sstevel@tonic-gate dr_cnt++;
50010Sstevel@tonic-gate dr = dr->dr_next;
50020Sstevel@tonic-gate }
50030Sstevel@tonic-gate mnsr_node->mmn_numdrives = dr_cnt;
50040Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next;
50050Sstevel@tonic-gate }
50060Sstevel@tonic-gate
50070Sstevel@tonic-gate /* Count the number of drives for the master; also check flags */
50080Sstevel@tonic-gate all_drives_ok = 1;
50090Sstevel@tonic-gate dd_cnt = 0;
50100Sstevel@tonic-gate dd = master_dd;
50110Sstevel@tonic-gate while (dd) {
50120Sstevel@tonic-gate dd_cnt++;
50130Sstevel@tonic-gate if (!(dd->dd_flags & MD_DR_OK))
50140Sstevel@tonic-gate all_drives_ok = 0;
50150Sstevel@tonic-gate dd = dd->dd_next;
50160Sstevel@tonic-gate }
50170Sstevel@tonic-gate
50180Sstevel@tonic-gate /* If all drives are ok, do quick check against number of drives */
50190Sstevel@tonic-gate if (all_drives_ok) {
50200Sstevel@tonic-gate /* If all nodes have same number of drives, almost done */
50210Sstevel@tonic-gate mnsr_node = master_mnsr_node;
50220Sstevel@tonic-gate while (mnsr_node) {
50230Sstevel@tonic-gate if (mnsr_node->mmn_numdrives != dd_cnt)
50240Sstevel@tonic-gate break;
50250Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next;
50260Sstevel@tonic-gate }
50270Sstevel@tonic-gate /* All nodes have same number of drives, just send flags */
50280Sstevel@tonic-gate if (mnsr_node == NULL) {
50290Sstevel@tonic-gate goto send_drive_list;
50300Sstevel@tonic-gate }
50310Sstevel@tonic-gate }
50320Sstevel@tonic-gate
50330Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
50340Sstevel@tonic-gate "Begin detailed drive synchronization for set %s: %s"),
50350Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
50360Sstevel@tonic-gate
50370Sstevel@tonic-gate /* Detailed check required */
50380Sstevel@tonic-gate mnsr_node = master_mnsr_node;
50390Sstevel@tonic-gate while (mnsr_node) {
50400Sstevel@tonic-gate /* Does slave node have less drives than master? */
50410Sstevel@tonic-gate if (mnsr_node->mmn_numdrives < dd_cnt) {
50420Sstevel@tonic-gate /* Yes - must determine which drive is missing */
50430Sstevel@tonic-gate if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
50440Sstevel@tonic-gate &other_dd, ep)) {
50450Sstevel@tonic-gate /* RPC failure to !my node */
50460Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
50470Sstevel@tonic-gate (strcmp(mynode(), mnsr_node->mmn_nodename)
50480Sstevel@tonic-gate != 0)) {
50490Sstevel@tonic-gate rval = 205;
50500Sstevel@tonic-gate } else {
50510Sstevel@tonic-gate /* Any other failure */
50520Sstevel@tonic-gate rval = -1;
50530Sstevel@tonic-gate }
50540Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
50550Sstevel@tonic-gate "Master node %s unable to "
50560Sstevel@tonic-gate "retrieve drive list from node %s"),
50570Sstevel@tonic-gate mynode(), mnsr_node->mmn_nodename);
50580Sstevel@tonic-gate goto out;
50590Sstevel@tonic-gate }
50600Sstevel@tonic-gate mnsr_node->mmn_dd = other_dd;
50610Sstevel@tonic-gate dd = master_dd;
50620Sstevel@tonic-gate while (dd) {
50630Sstevel@tonic-gate if (!(dd->dd_flags & MD_DR_OK)) {
50640Sstevel@tonic-gate dd = dd->dd_next;
50650Sstevel@tonic-gate continue;
50660Sstevel@tonic-gate }
50670Sstevel@tonic-gate other_dd = mnsr_node->mmn_dd;
50680Sstevel@tonic-gate while (other_dd) {
50690Sstevel@tonic-gate /* Convert to devids, when available */
50700Sstevel@tonic-gate if (strcmp(other_dd->dd_dnp->cname,
50710Sstevel@tonic-gate dd->dd_dnp->cname) == 0) {
50720Sstevel@tonic-gate break;
50730Sstevel@tonic-gate }
50740Sstevel@tonic-gate other_dd = other_dd->dd_next;
50750Sstevel@tonic-gate }
50760Sstevel@tonic-gate /*
50770Sstevel@tonic-gate * dd not found on slave so mark it
50780Sstevel@tonic-gate * ADD for later deletion (drives in ADD
50790Sstevel@tonic-gate * state are deleted later in this routine).
50800Sstevel@tonic-gate */
50810Sstevel@tonic-gate if (other_dd == NULL) {
50820Sstevel@tonic-gate dd->dd_flags = MD_DR_ADD;
50830Sstevel@tonic-gate }
50840Sstevel@tonic-gate dd = dd->dd_next;
50850Sstevel@tonic-gate }
50860Sstevel@tonic-gate
50870Sstevel@tonic-gate }
50880Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next;
50890Sstevel@tonic-gate }
50900Sstevel@tonic-gate
50910Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
50920Sstevel@tonic-gate "Drive check completed for set %s: %s"),
50930Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
50940Sstevel@tonic-gate
50950Sstevel@tonic-gate dd = master_dd;
50960Sstevel@tonic-gate dd_prev = 0;
50970Sstevel@tonic-gate while (dd) {
50980Sstevel@tonic-gate /* Remove any ADD drives from list */
50990Sstevel@tonic-gate if (dd->dd_flags & MD_DR_ADD) {
51000Sstevel@tonic-gate if (dd_prev) {
51010Sstevel@tonic-gate dd_prev->dd_next = dd->dd_next;
51020Sstevel@tonic-gate dd->dd_next = NULL;
51030Sstevel@tonic-gate metafreedrivedesc(&dd);
51040Sstevel@tonic-gate dd = dd_prev->dd_next;
51050Sstevel@tonic-gate } else {
51060Sstevel@tonic-gate /*
51070Sstevel@tonic-gate * If removing drive descriptor from head
51080Sstevel@tonic-gate * of linked list, also change sd->sd_drvs.
51090Sstevel@tonic-gate */
51100Sstevel@tonic-gate master_dd = sd->sd_drvs = dd->dd_next;
51110Sstevel@tonic-gate dd->dd_next = NULL;
51120Sstevel@tonic-gate metafreedrivedesc(&dd);
51130Sstevel@tonic-gate dd = master_dd;
51140Sstevel@tonic-gate }
51150Sstevel@tonic-gate /* dd setup in if/else above */
51160Sstevel@tonic-gate continue;
51170Sstevel@tonic-gate }
51180Sstevel@tonic-gate /*
51190Sstevel@tonic-gate * If drive is marked DEL, check all other nodes.
51200Sstevel@tonic-gate * If drive on another node is marked OK, mark drive OK
51210Sstevel@tonic-gate * in master list. If drive is marked DEL or doesn't exist
51220Sstevel@tonic-gate * on all nodes, remove drive from list.
51230Sstevel@tonic-gate */
51240Sstevel@tonic-gate if (dd->dd_flags & MD_DR_DEL) {
51250Sstevel@tonic-gate mnsr_node = master_mnsr_node;
51260Sstevel@tonic-gate while (mnsr_node) {
51270Sstevel@tonic-gate if (mnsr_node->mmn_dd == NULL) {
51284932Spetede if (clnt_getdrivedesc(
51294932Spetede mnsr_node->mmn_nodename, sp,
51304932Spetede &other_dd, ep)) {
51314932Spetede /* RPC failure to !my node */
51324932Spetede if ((mdanyrpcerror(ep)) &&
51334932Spetede (strcmp(mynode(),
51344932Spetede mnsr_node->mmn_nodename)
51354932Spetede != 0)) {
51364932Spetede rval = 205;
51374932Spetede } else {
51384932Spetede /* Any other failure */
51394932Spetede rval = -1;
51404932Spetede }
51414932Spetede mde_perror(ep,
51424932Spetede dgettext(TEXT_DOMAIN,
51434932Spetede "Master node %s unable "
51444932Spetede "to retrieve drive list "
51454932Spetede "from node %s"), mynode(),
51464932Spetede mnsr_node->mmn_nodename);
51474932Spetede goto out;
51484932Spetede }
51494932Spetede mnsr_node->mmn_dd = other_dd;
51500Sstevel@tonic-gate }
51510Sstevel@tonic-gate other_dd = mnsr_node->mmn_dd;
51520Sstevel@tonic-gate while (other_dd) {
51530Sstevel@tonic-gate /* Found drive (OK) from other node */
51540Sstevel@tonic-gate if (strcmp(dd->dd_dnp->cname,
51550Sstevel@tonic-gate other_dd->dd_dnp->cname)
51560Sstevel@tonic-gate == 0) {
51570Sstevel@tonic-gate /* Drive marked OK */
51580Sstevel@tonic-gate if (other_dd->dd_flags &
51590Sstevel@tonic-gate MD_DR_OK) {
51604932Spetede dd->dd_flags = MD_DR_OK;
51610Sstevel@tonic-gate }
51620Sstevel@tonic-gate break;
51630Sstevel@tonic-gate }
51640Sstevel@tonic-gate other_dd = other_dd->dd_next;
51650Sstevel@tonic-gate }
51660Sstevel@tonic-gate if (dd->dd_flags == MD_DR_OK)
51670Sstevel@tonic-gate break;
51680Sstevel@tonic-gate
51690Sstevel@tonic-gate mnsr_node = mnsr_node->mmn_next;
51700Sstevel@tonic-gate }
51710Sstevel@tonic-gate /*
51720Sstevel@tonic-gate * If no node had this drive marked OK, delete it.
51730Sstevel@tonic-gate */
51740Sstevel@tonic-gate if (dd->dd_flags & MD_DR_DEL) {
51750Sstevel@tonic-gate if (dd_prev) {
51760Sstevel@tonic-gate dd_prev->dd_next = dd->dd_next;
51770Sstevel@tonic-gate dd->dd_next = NULL;
51780Sstevel@tonic-gate metafreedrivedesc(&dd);
51790Sstevel@tonic-gate dd = dd_prev->dd_next;
51800Sstevel@tonic-gate } else {
51810Sstevel@tonic-gate /*
51820Sstevel@tonic-gate * If removing drive descriptor from
51830Sstevel@tonic-gate * head of linked list, also change
51840Sstevel@tonic-gate * sd->sd_drvs.
51850Sstevel@tonic-gate */
51860Sstevel@tonic-gate master_dd = sd->sd_drvs = dd->dd_next;
51870Sstevel@tonic-gate dd->dd_next = NULL;
51880Sstevel@tonic-gate metafreedrivedesc(&dd);
51890Sstevel@tonic-gate dd = master_dd;
51900Sstevel@tonic-gate }
51910Sstevel@tonic-gate /* dd setup in if/else above */
51920Sstevel@tonic-gate continue;
51930Sstevel@tonic-gate }
51940Sstevel@tonic-gate }
51950Sstevel@tonic-gate dd_prev = dd;
51960Sstevel@tonic-gate dd = dd->dd_next;
51970Sstevel@tonic-gate }
51980Sstevel@tonic-gate
51990Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
52000Sstevel@tonic-gate "Setting drive states completed for set %s: %s"),
52010Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
52020Sstevel@tonic-gate
52030Sstevel@tonic-gate send_drive_list:
52040Sstevel@tonic-gate /*
52050Sstevel@tonic-gate * Set genid on all drives to be the highest value seen.
52060Sstevel@tonic-gate */
52070Sstevel@tonic-gate dd = master_dd;
52080Sstevel@tonic-gate while (dd) {
52090Sstevel@tonic-gate dd->dd_genid = max_genid;
52100Sstevel@tonic-gate dd = dd->dd_next;
52110Sstevel@tonic-gate }
52120Sstevel@tonic-gate /*
52130Sstevel@tonic-gate * Send updated drive list to all alive nodes.
52140Sstevel@tonic-gate * Will also set genid on set and node records to have same
52150Sstevel@tonic-gate * as the drive records.
52160Sstevel@tonic-gate */
52170Sstevel@tonic-gate nd = sd->sd_nodelist;
52180Sstevel@tonic-gate while (nd) {
52190Sstevel@tonic-gate /* Skip non-alive nodes */
52200Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
52210Sstevel@tonic-gate nd = nd->nd_next;
52220Sstevel@tonic-gate continue;
52230Sstevel@tonic-gate }
52240Sstevel@tonic-gate if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
52250Sstevel@tonic-gate /* RPC failure to another node */
52260Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
52270Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
52280Sstevel@tonic-gate rval = 205;
52290Sstevel@tonic-gate } else {
52300Sstevel@tonic-gate /* Any other failure */
52310Sstevel@tonic-gate rval = -1;
52320Sstevel@tonic-gate }
52330Sstevel@tonic-gate goto out;
52340Sstevel@tonic-gate }
52350Sstevel@tonic-gate nd = nd->nd_next;
52360Sstevel@tonic-gate }
52370Sstevel@tonic-gate
52380Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
52390Sstevel@tonic-gate "Sent drive list to all nodes for set %s: %s"),
52400Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
52410Sstevel@tonic-gate
52420Sstevel@tonic-gate /*
52430Sstevel@tonic-gate * If no drive records left in set and nodes had been joined,
52440Sstevel@tonic-gate * withdraw the nodes. Always reset the master and mark
52450Sstevel@tonic-gate * all nodes as withdrawn on all nodes.
52460Sstevel@tonic-gate */
52470Sstevel@tonic-gate if (master_dd == NULL) {
52480Sstevel@tonic-gate /* Reset new master flag since no longer master */
52490Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf));
52500Sstevel@tonic-gate sf.sf_setno = sp->setno;
52510Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
52520Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET;
52530Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */
52540Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC;
52550Sstevel@tonic-gate /* Ignore failure, failure to reset flag isn't catastrophic */
52560Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
52570Sstevel@tonic-gate &sf.sf_mde, NULL);
52580Sstevel@tonic-gate
52590Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
52600Sstevel@tonic-gate "Reset new master flag for " "set %s: %s"),
52610Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
52620Sstevel@tonic-gate
52630Sstevel@tonic-gate nd = sd->sd_nodelist;
52640Sstevel@tonic-gate while (nd) {
52650Sstevel@tonic-gate /* Skip non-alive nodes */
52660Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
52670Sstevel@tonic-gate nd = nd->nd_next;
52680Sstevel@tonic-gate continue;
52690Sstevel@tonic-gate }
52700Sstevel@tonic-gate
52710Sstevel@tonic-gate if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
52720Sstevel@tonic-gate /* RPC failure to another node */
52730Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
52740Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
52750Sstevel@tonic-gate nd->nd_nodeid)) {
52760Sstevel@tonic-gate rval = 205;
52770Sstevel@tonic-gate } else {
52780Sstevel@tonic-gate /* Any other failure */
52790Sstevel@tonic-gate rval = -1;
52800Sstevel@tonic-gate }
52810Sstevel@tonic-gate goto out;
52820Sstevel@tonic-gate }
52830Sstevel@tonic-gate set_locked = 1;
52840Sstevel@tonic-gate
52850Sstevel@tonic-gate /* Withdraw node from set if owner */
52860Sstevel@tonic-gate if ((nd->nd_flags & MD_MN_NODE_OWN) &&
52870Sstevel@tonic-gate (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
52880Sstevel@tonic-gate /* RPC failure to another node */
52890Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
52900Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
52910Sstevel@tonic-gate nd->nd_nodeid)) {
52920Sstevel@tonic-gate rval = 205;
52930Sstevel@tonic-gate } else {
52940Sstevel@tonic-gate /* Any other failure */
52950Sstevel@tonic-gate rval = -1;
52960Sstevel@tonic-gate }
52970Sstevel@tonic-gate goto out;
52980Sstevel@tonic-gate }
52990Sstevel@tonic-gate
53000Sstevel@tonic-gate /* Mark all nodes as withdrawn on this node */
53010Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp,
53020Sstevel@tonic-gate sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
53030Sstevel@tonic-gate /* RPC failure to another node */
53040Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
53050Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
53060Sstevel@tonic-gate nd->nd_nodeid)) {
53070Sstevel@tonic-gate rval = 205;
53080Sstevel@tonic-gate } else {
53090Sstevel@tonic-gate /* Any other failure */
53100Sstevel@tonic-gate rval = -1;
53110Sstevel@tonic-gate }
53120Sstevel@tonic-gate goto out;
53130Sstevel@tonic-gate }
53140Sstevel@tonic-gate
53150Sstevel@tonic-gate /* Resets master to no-master on this node */
53160Sstevel@tonic-gate if (clnt_mnsetmaster(nd->nd_nodename, sp,
53170Sstevel@tonic-gate "", MD_MN_INVALID_NID, ep)) {
53180Sstevel@tonic-gate /* RPC failure to another node */
53190Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
53200Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
53210Sstevel@tonic-gate nd->nd_nodeid)) {
53220Sstevel@tonic-gate rval = 205;
53230Sstevel@tonic-gate } else {
53240Sstevel@tonic-gate /* Any other failure */
53250Sstevel@tonic-gate rval = -1;
53260Sstevel@tonic-gate }
53270Sstevel@tonic-gate goto out;
53280Sstevel@tonic-gate }
53290Sstevel@tonic-gate
53300Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
53310Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
53320Sstevel@tonic-gate /* RPC failure to another node */
53330Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
53340Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
53350Sstevel@tonic-gate nd->nd_nodeid)) {
53360Sstevel@tonic-gate rval = 205;
53370Sstevel@tonic-gate } else {
53380Sstevel@tonic-gate /* Any other failure */
53390Sstevel@tonic-gate rval = -1;
53400Sstevel@tonic-gate }
53410Sstevel@tonic-gate goto out;
53420Sstevel@tonic-gate }
53430Sstevel@tonic-gate set_locked = 0;
53440Sstevel@tonic-gate nd = nd->nd_next;
53450Sstevel@tonic-gate }
53460Sstevel@tonic-gate }
53470Sstevel@tonic-gate
53480Sstevel@tonic-gate out:
53490Sstevel@tonic-gate /*
53500Sstevel@tonic-gate * If got here and set is still locked, then an error has
53510Sstevel@tonic-gate * occurred and master_nodelist is still valid.
53520Sstevel@tonic-gate * If error is not an RPC error, then unlock.
53530Sstevel@tonic-gate * If error is an RPC error, skip unlocks since this could cause
53540Sstevel@tonic-gate * yet another RPC timeout if a node has failed.
53550Sstevel@tonic-gate * Ignore failures in unlock since unlock is just trying to
53560Sstevel@tonic-gate * clean things up.
53570Sstevel@tonic-gate */
53580Sstevel@tonic-gate if ((set_locked) && !(mdanyrpcerror(ep))) {
53590Sstevel@tonic-gate nd = master_nodelist;
53600Sstevel@tonic-gate cl_sk = cl_get_setkey(sp->setno, sp->setname);
53610Sstevel@tonic-gate while (nd) {
53620Sstevel@tonic-gate /* Skip non-alive nodes */
53630Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
53640Sstevel@tonic-gate nd = nd->nd_next;
53650Sstevel@tonic-gate continue;
53660Sstevel@tonic-gate }
53670Sstevel@tonic-gate /*
53680Sstevel@tonic-gate * If clnt_unlock fails, just break out since next
53690Sstevel@tonic-gate * reconfig cycle will reset the locks anyway.
53700Sstevel@tonic-gate */
53710Sstevel@tonic-gate if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
53720Sstevel@tonic-gate break;
53730Sstevel@tonic-gate }
53740Sstevel@tonic-gate nd = nd->nd_next;
53750Sstevel@tonic-gate }
53760Sstevel@tonic-gate cl_set_setkey(NULL);
53770Sstevel@tonic-gate }
53780Sstevel@tonic-gate /* Free master_mnsr and drive descs */
53790Sstevel@tonic-gate mnsr_node = master_mnsr_node;
53800Sstevel@tonic-gate while (mnsr_node) {
53810Sstevel@tonic-gate master_mnsr_node = mnsr_node->mmn_next;
53820Sstevel@tonic-gate free_sr((md_set_record *)mnsr_node->mmn_mnsr);
53830Sstevel@tonic-gate free_rem_dd(mnsr_node->mmn_dd);
53840Sstevel@tonic-gate Free(mnsr_node);
53850Sstevel@tonic-gate mnsr_node = master_mnsr_node;
53860Sstevel@tonic-gate }
53870Sstevel@tonic-gate
53880Sstevel@tonic-gate /* Frees sd->sd_drvs (which is also master_dd) */
53890Sstevel@tonic-gate metaflushsetname(sp);
53900Sstevel@tonic-gate return (rval);
53910Sstevel@tonic-gate }
53920Sstevel@tonic-gate
53930Sstevel@tonic-gate /*
53940Sstevel@tonic-gate * meta_mnsync_diskset_mddbs
53950Sstevel@tonic-gate * Calling node is guaranteed to be an owner node.
53960Sstevel@tonic-gate * Calling node is the master node.
53970Sstevel@tonic-gate *
53980Sstevel@tonic-gate * Master node verifies that ondisk mddb format matches its incore format.
53990Sstevel@tonic-gate * If no nodes are joined to set, remove the change log entries.
54000Sstevel@tonic-gate * If a node is joined to set, play the change log.
54010Sstevel@tonic-gate *
54020Sstevel@tonic-gate * Returns 0 - Success
54030Sstevel@tonic-gate * 1 - Master unable to join to set.
54040Sstevel@tonic-gate * 205 - Failure during RPC to another node
54050Sstevel@tonic-gate * -1 - Any other failure and ep is filled in.
54060Sstevel@tonic-gate * -1 return will eventually cause node to panic
54070Sstevel@tonic-gate * in a SunCluster environment.
54080Sstevel@tonic-gate */
54090Sstevel@tonic-gate int
meta_mnsync_diskset_mddbs(mdsetname_t * sp,md_error_t * ep)54100Sstevel@tonic-gate meta_mnsync_diskset_mddbs(
54110Sstevel@tonic-gate mdsetname_t *sp,
54120Sstevel@tonic-gate md_error_t *ep
54130Sstevel@tonic-gate )
54140Sstevel@tonic-gate {
54150Sstevel@tonic-gate md_set_desc *sd;
54160Sstevel@tonic-gate mddb_config_t c;
54170Sstevel@tonic-gate md_mn_msgclass_t class;
54180Sstevel@tonic-gate mddb_setflags_config_t sf;
54190Sstevel@tonic-gate md_mnnode_desc *nd, *nd2;
54200Sstevel@tonic-gate md_error_t xep = mdnullerror;
54210Sstevel@tonic-gate int stale_set = 0;
54220Sstevel@tonic-gate
54230Sstevel@tonic-gate /* If setname is there, set desc should exist. */
54240Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
54250Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
54260Sstevel@tonic-gate "Unable to get set %s desc information"), sp->setname);
54270Sstevel@tonic-gate return (-1);
54280Sstevel@tonic-gate }
54290Sstevel@tonic-gate
54300Sstevel@tonic-gate /* Are there drives in the set? */
54310Sstevel@tonic-gate if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
54320Sstevel@tonic-gate ep) == NULL) {
54330Sstevel@tonic-gate if (! mdisok(ep)) {
54340Sstevel@tonic-gate return (-1);
54350Sstevel@tonic-gate }
54360Sstevel@tonic-gate /* No drives in set -- nothing to sync up */
54370Sstevel@tonic-gate return (0);
54380Sstevel@tonic-gate }
54390Sstevel@tonic-gate
54400Sstevel@tonic-gate /*
54410Sstevel@tonic-gate * Is master node (which is this node) joined to set?
54420Sstevel@tonic-gate * If master node isn't joined (which means that no nodes
54430Sstevel@tonic-gate * are joined to diskset), remove the change log entries
54440Sstevel@tonic-gate * since no need to replay them - all nodes will have same
54450Sstevel@tonic-gate * view of mddbs since all nodes are reading in the mddbs
54460Sstevel@tonic-gate * from disk.
54470Sstevel@tonic-gate * There is also no need to sync up the master and ondisk mddbs
54480Sstevel@tonic-gate * since master has no incore knowledge.
54490Sstevel@tonic-gate * Need to join master to set in order to flush the change
54500Sstevel@tonic-gate * log entries. Don't need to block I/O during join of master
54510Sstevel@tonic-gate * to set since no other nodes are joined to set and so no I/O
54520Sstevel@tonic-gate * can be occurring.
54530Sstevel@tonic-gate */
54540Sstevel@tonic-gate if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
54550Sstevel@tonic-gate /* Join master to set */
54560Sstevel@tonic-gate if (clnt_joinset(mynode(), sp,
54570Sstevel@tonic-gate MNSET_IN_RECONFIG, ep)) {
54580Sstevel@tonic-gate if (mdismddberror(ep, MDE_DB_STALE)) {
54590Sstevel@tonic-gate /*
54600Sstevel@tonic-gate * If STALE, print message and continue on.
54610Sstevel@tonic-gate * Don't do any writes or reads to mddbs
54620Sstevel@tonic-gate * so don't clear change log.
54630Sstevel@tonic-gate */
54640Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
54650Sstevel@tonic-gate "Join of master node to STALE set %s"),
54660Sstevel@tonic-gate sp->setname);
54670Sstevel@tonic-gate stale_set = 1;
54680Sstevel@tonic-gate mdclrerror(ep);
54690Sstevel@tonic-gate } else if (mdismddberror(ep, MDE_DB_ACCOK)) {
54700Sstevel@tonic-gate /* ACCOK means mediator provided extra vote */
54710Sstevel@tonic-gate mdclrerror(ep);
54720Sstevel@tonic-gate } else {
54730Sstevel@tonic-gate /*
54740Sstevel@tonic-gate * If master is unable to join set, print an
54750Sstevel@tonic-gate * error message. Don't return failure or node
54760Sstevel@tonic-gate * will panic during cluster reconfig cycle.
54770Sstevel@tonic-gate * Also, withdraw node from set in order to
54780Sstevel@tonic-gate * cleanup from failed join attempt.
54790Sstevel@tonic-gate */
54800Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
54810Sstevel@tonic-gate "Join of master node in set %s failed"),
54820Sstevel@tonic-gate sp->setname);
54830Sstevel@tonic-gate if (clnt_withdrawset(mynode(), sp, &xep))
54840Sstevel@tonic-gate mdclrerror(&xep);
54850Sstevel@tonic-gate return (1);
54860Sstevel@tonic-gate }
54870Sstevel@tonic-gate }
54880Sstevel@tonic-gate /*
54890Sstevel@tonic-gate * Master node successfully joined.
54900Sstevel@tonic-gate * Set local copy of flags to OWN and
54910Sstevel@tonic-gate * send owner flag to rpc.metad. If not stale,
54920Sstevel@tonic-gate * flush the change log.
54930Sstevel@tonic-gate */
54940Sstevel@tonic-gate sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
54950Sstevel@tonic-gate if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
54960Sstevel@tonic-gate MNSET_IN_RECONFIG, ep)) {
54970Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
54980Sstevel@tonic-gate "Flag update of master node join in set %s failed"),
54990Sstevel@tonic-gate sp->setname);
55000Sstevel@tonic-gate return (-1);
55010Sstevel@tonic-gate }
55020Sstevel@tonic-gate
55030Sstevel@tonic-gate if (!stale_set) {
55040Sstevel@tonic-gate if (mdmn_reset_changelog(sp, ep,
55050Sstevel@tonic-gate MDMN_CLF_RESETLOG) != 0) {
55060Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
55070Sstevel@tonic-gate "Unable to reset changelog."));
55080Sstevel@tonic-gate return (-1);
55090Sstevel@tonic-gate }
55100Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
55110Sstevel@tonic-gate "Removed changelog entries for set %s: %s"),
55120Sstevel@tonic-gate sp->setname,
55130Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time));
55140Sstevel@tonic-gate }
55150Sstevel@tonic-gate /* Reset new master flag before return */
55160Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf));
55170Sstevel@tonic-gate sf.sf_setno = sp->setno;
55180Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
55190Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET;
55200Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */
55210Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC;
55220Sstevel@tonic-gate /* Ignore failure, failure to reset flag isn't catastrophic */
55230Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
55240Sstevel@tonic-gate &sf.sf_mde, NULL);
55250Sstevel@tonic-gate
55260Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
55270Sstevel@tonic-gate "Reset new master flag for set %s: %s"),
55280Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
55290Sstevel@tonic-gate
55300Sstevel@tonic-gate return (0);
55310Sstevel@tonic-gate }
55320Sstevel@tonic-gate
55330Sstevel@tonic-gate /*
55340Sstevel@tonic-gate * Is master already joined to STALE set (< 50% mddbs avail)?
55350Sstevel@tonic-gate * If so, can make no config changes to mddbs so don't check or play
55360Sstevel@tonic-gate * changelog and don't sync master node to ondisk mddbs.
55370Sstevel@tonic-gate * To get out of the stale state all nodes must be withdrawn
55380Sstevel@tonic-gate * from set. Then as nodes are re-joined, all nodes will
55390Sstevel@tonic-gate * have same view of mddbs since all nodes are reading the
55400Sstevel@tonic-gate * mddbs from disk.
55410Sstevel@tonic-gate */
55420Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c));
55430Sstevel@tonic-gate c.c_id = 0;
55440Sstevel@tonic-gate c.c_setno = sp->setno;
55450Sstevel@tonic-gate if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
55460Sstevel@tonic-gate (void) mdstealerror(ep, &c.c_mde);
55470Sstevel@tonic-gate return (-1);
55480Sstevel@tonic-gate }
55490Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) {
55500Sstevel@tonic-gate return (0);
55510Sstevel@tonic-gate }
55520Sstevel@tonic-gate
55530Sstevel@tonic-gate /*
55540Sstevel@tonic-gate * If this node is NOT a newly chosen master, then there's
55550Sstevel@tonic-gate * nothing else to do since the change log should be empty and
55560Sstevel@tonic-gate * the ondisk and incore mddbs are already consistent.
55570Sstevel@tonic-gate *
55580Sstevel@tonic-gate * A newly chosen master is a node that was not the master
55590Sstevel@tonic-gate * at the beginning of the reconfig cycle. If a node is a new
55600Sstevel@tonic-gate * master, then the new master state is reset after the ondisk
55610Sstevel@tonic-gate * and incore mddbs are consistent and the change log has
55620Sstevel@tonic-gate * been replayed.
55630Sstevel@tonic-gate */
55640Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf));
55650Sstevel@tonic-gate sf.sf_setno = sp->setno;
55660Sstevel@tonic-gate sf.sf_flags = MDDB_NM_GET;
55670Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */
55680Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC;
55690Sstevel@tonic-gate if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
55700Sstevel@tonic-gate ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
55710Sstevel@tonic-gate return (0);
55720Sstevel@tonic-gate }
55730Sstevel@tonic-gate
55740Sstevel@tonic-gate /*
55750Sstevel@tonic-gate * Now, sync up incore master view to ondisk mddbs.
55760Sstevel@tonic-gate * This is needed in the case where a master node
55770Sstevel@tonic-gate * had made a change to the mddb, but this change
55780Sstevel@tonic-gate * may not have been relayed to the slaves yet.
55790Sstevel@tonic-gate * So, the new master needs to verify that the ondisk
55800Sstevel@tonic-gate * mddbs match what the new master has incore -
55810Sstevel@tonic-gate * if different, new master rewrites all of the mddbs.
55820Sstevel@tonic-gate * Then the new master will replay the changelog and the
55830Sstevel@tonic-gate * new master will then execute what the old master had
55840Sstevel@tonic-gate * done.
55850Sstevel@tonic-gate *
55860Sstevel@tonic-gate * Block all I/Os to disks in this diskset on all nodes in
55870Sstevel@tonic-gate * the diskset. This will allow the rewriting of the mddbs
55880Sstevel@tonic-gate * (if needed), to proceed in a timely manner.
55890Sstevel@tonic-gate *
55900Sstevel@tonic-gate * If block of I/Os fail, return a -1.
55910Sstevel@tonic-gate */
55920Sstevel@tonic-gate
55930Sstevel@tonic-gate nd = sd->sd_nodelist;
55940Sstevel@tonic-gate while (nd) {
55950Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */
55960Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
55970Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) {
55980Sstevel@tonic-gate nd = nd->nd_next;
55990Sstevel@tonic-gate continue;
56000Sstevel@tonic-gate }
56010Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
56020Sstevel@tonic-gate MN_SUSP_IO, ep)) {
56030Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
56040Sstevel@tonic-gate "Unable to suspend I/O on node %s in set %s"),
56050Sstevel@tonic-gate nd->nd_nodename, sp->setname);
56060Sstevel@tonic-gate
56070Sstevel@tonic-gate /*
56080Sstevel@tonic-gate * Resume all other nodes that had been suspended.
56090Sstevel@tonic-gate * (Reconfig return step also resumes I/Os
56100Sstevel@tonic-gate * for all sets.)
56110Sstevel@tonic-gate */
56120Sstevel@tonic-gate nd2 = sd->sd_nodelist;
56130Sstevel@tonic-gate while (nd2) {
56140Sstevel@tonic-gate /* Stop when reaching failed node */
56150Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid)
56160Sstevel@tonic-gate break;
56170Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */
56180Sstevel@tonic-gate if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
56190Sstevel@tonic-gate (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
56200Sstevel@tonic-gate nd2 = nd2->nd_next;
56210Sstevel@tonic-gate continue;
56220Sstevel@tonic-gate }
56230Sstevel@tonic-gate (void) (clnt_mn_susp_res_io(nd2->nd_nodename,
56244932Spetede sp->setno, MN_RES_IO, &xep));
56250Sstevel@tonic-gate nd2 = nd2->nd_next;
56260Sstevel@tonic-gate }
56270Sstevel@tonic-gate
56280Sstevel@tonic-gate /*
56290Sstevel@tonic-gate * If an RPC failure on another node, return a 205.
56300Sstevel@tonic-gate * Otherwise, exit with failure.
56310Sstevel@tonic-gate */
56320Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
56330Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
56340Sstevel@tonic-gate nd->nd_nodeid)) {
56350Sstevel@tonic-gate return (205);
56360Sstevel@tonic-gate } else {
56370Sstevel@tonic-gate return (-1);
56380Sstevel@tonic-gate }
56390Sstevel@tonic-gate
56400Sstevel@tonic-gate }
56410Sstevel@tonic-gate nd = nd->nd_next;
56420Sstevel@tonic-gate }
56430Sstevel@tonic-gate
56440Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c));
56450Sstevel@tonic-gate c.c_id = 0;
56460Sstevel@tonic-gate c.c_setno = sp->setno;
56470Sstevel@tonic-gate /* Master can't sync up to ondisk mddbs? Kick it out of cluster */
56480Sstevel@tonic-gate if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
56490Sstevel@tonic-gate return (-1);
56500Sstevel@tonic-gate
56510Sstevel@tonic-gate /*
56520Sstevel@tonic-gate * Resume I/Os that were suspended above.
56530Sstevel@tonic-gate */
56540Sstevel@tonic-gate nd = sd->sd_nodelist;
56550Sstevel@tonic-gate while (nd) {
56560Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */
56570Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
56580Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) {
56590Sstevel@tonic-gate nd = nd->nd_next;
56600Sstevel@tonic-gate continue;
56610Sstevel@tonic-gate }
56620Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
56630Sstevel@tonic-gate MN_RES_IO, ep)) {
56640Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
56650Sstevel@tonic-gate "Unable to resume I/O on node %s in set %s"),
56660Sstevel@tonic-gate nd->nd_nodename, sp->setname);
56670Sstevel@tonic-gate
56680Sstevel@tonic-gate /*
56690Sstevel@tonic-gate * If an RPC failure then don't do any
56700Sstevel@tonic-gate * more RPC calls, since one timeout is enough
56710Sstevel@tonic-gate * to endure. If RPC failure to another node, return
56720Sstevel@tonic-gate * 205. If RPC failure to my node, return -1.
56730Sstevel@tonic-gate * If not an RPC failure, continue resuming the
56740Sstevel@tonic-gate * rest of the nodes and then return -1.
56750Sstevel@tonic-gate */
56760Sstevel@tonic-gate if (mdanyrpcerror(ep)) {
56770Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid ==
56780Sstevel@tonic-gate nd->nd_nodeid) {
56790Sstevel@tonic-gate return (-1);
56800Sstevel@tonic-gate } else {
56810Sstevel@tonic-gate return (205);
56820Sstevel@tonic-gate }
56830Sstevel@tonic-gate }
56840Sstevel@tonic-gate
56850Sstevel@tonic-gate /*
56860Sstevel@tonic-gate * If not an RPC error, continue resuming rest of
56870Sstevel@tonic-gate * nodes, ignoring any failures except for an
56880Sstevel@tonic-gate * RPC failure which constitutes an immediate exit.
56890Sstevel@tonic-gate * Start in middle of list with failing node.
56900Sstevel@tonic-gate */
56910Sstevel@tonic-gate nd2 = nd->nd_next;
56920Sstevel@tonic-gate while (nd2) {
56930Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */
56940Sstevel@tonic-gate if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
56950Sstevel@tonic-gate (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
56960Sstevel@tonic-gate nd2 = nd2->nd_next;
56970Sstevel@tonic-gate continue;
56980Sstevel@tonic-gate }
56990Sstevel@tonic-gate (void) (clnt_mn_susp_res_io(nd2->nd_nodename,
57004932Spetede sp->setno, MN_RES_IO, &xep));
57010Sstevel@tonic-gate if (mdanyrpcerror(&xep)) {
57020Sstevel@tonic-gate return (-1);
57030Sstevel@tonic-gate }
57040Sstevel@tonic-gate nd2 = nd2->nd_next;
57050Sstevel@tonic-gate }
57060Sstevel@tonic-gate }
57070Sstevel@tonic-gate nd = nd->nd_next;
57080Sstevel@tonic-gate }
57090Sstevel@tonic-gate
57100Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
57110Sstevel@tonic-gate "checking/writing the mddb for set %s: %s"), sp->setname,
57120Sstevel@tonic-gate meta_print_hrtime(gethrtime() - start_time));
57130Sstevel@tonic-gate
57140Sstevel@tonic-gate /*
57150Sstevel@tonic-gate * Send (aka replay) all messages we find in the changelog.
57160Sstevel@tonic-gate * Flag the messages with
57170Sstevel@tonic-gate * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
57180Sstevel@tonic-gate * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
57190Sstevel@tonic-gate */
57200Sstevel@tonic-gate for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
57210Sstevel@tonic-gate mdmn_changelog_record_t *lr;
57220Sstevel@tonic-gate md_error_t xep = mdnullerror;
57230Sstevel@tonic-gate md_mn_result_t *resultp = NULL;
57240Sstevel@tonic-gate int ret;
57250Sstevel@tonic-gate
57260Sstevel@tonic-gate lr = mdmn_get_changelogrec(sp->setno, class);
57270Sstevel@tonic-gate if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
57280Sstevel@tonic-gate /* no entry for this class */
57290Sstevel@tonic-gate continue;
57300Sstevel@tonic-gate }
57310Sstevel@tonic-gate
57320Sstevel@tonic-gate meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
57330Sstevel@tonic-gate "replaying message ID=(%d, 0x%llx-%d)\n"),
57340Sstevel@tonic-gate MSGID_ELEMS(lr->lr_msg.msg_msgid));
57350Sstevel@tonic-gate
57360Sstevel@tonic-gate ret = mdmn_send_message_with_msgid(
57374932Spetede lr->lr_msg.msg_setno,
57384932Spetede lr->lr_msg.msg_type,
57394932Spetede lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG |
57404932Spetede MD_MSGF_OVERRIDE_SUSPEND,
5741*8452SJohn.Wren.Kennedy@Sun.COM lr->lr_msg.msg_recipient,
57424932Spetede lr->lr_msg.msg_event_data,
57434932Spetede lr->lr_msg.msg_event_size,
57444932Spetede &resultp,
57454932Spetede &lr->lr_msg.msg_msgid,
57464932Spetede &xep);
57470Sstevel@tonic-gate
57480Sstevel@tonic-gate meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
57490Sstevel@tonic-gate "mdmn_send_message returned %d\n"), ret);
57500Sstevel@tonic-gate
57510Sstevel@tonic-gate if (resultp)
57520Sstevel@tonic-gate free_result(resultp);
57530Sstevel@tonic-gate }
57540Sstevel@tonic-gate
57550Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
57560Sstevel@tonic-gate "Playing changelog completed for set %s: %s"),
57570Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
57580Sstevel@tonic-gate
57590Sstevel@tonic-gate /*
57600Sstevel@tonic-gate * Now that new master has ondisk and incore mddbs in sync, reset
57610Sstevel@tonic-gate * this node's new master kernel flag (for this set). If this node
57620Sstevel@tonic-gate * re-enters another reconfig cycle before the completion of this
57630Sstevel@tonic-gate * reconfig cycle, this master node won't need to check if the ondisk
57640Sstevel@tonic-gate * and incore mddbs are in sync since this node won't be considered
57650Sstevel@tonic-gate * a new master (since this flag is being reset here in the middle of
57660Sstevel@tonic-gate * step2). This will save time during any subsequent reconfig
57670Sstevel@tonic-gate * cycles as long as this node continues to be master.
57680Sstevel@tonic-gate */
57690Sstevel@tonic-gate (void) memset(&sf, 0, sizeof (sf));
57700Sstevel@tonic-gate sf.sf_setno = sp->setno;
57710Sstevel@tonic-gate sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
57720Sstevel@tonic-gate sf.sf_flags = MDDB_NM_RESET;
57730Sstevel@tonic-gate /* Use magic to help protect ioctl against attack. */
57740Sstevel@tonic-gate sf.sf_magic = MDDB_SETFLAGS_MAGIC;
57750Sstevel@tonic-gate /* Ignore failure, since failure to reset flag isn't catastrophic */
57760Sstevel@tonic-gate (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
57770Sstevel@tonic-gate
57780Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
57790Sstevel@tonic-gate "Reset new master flag for set %s: %s"),
57800Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
57810Sstevel@tonic-gate
57820Sstevel@tonic-gate return (0);
57830Sstevel@tonic-gate }
57840Sstevel@tonic-gate
57850Sstevel@tonic-gate /*
57860Sstevel@tonic-gate * meta_mnjoin_all will join all starting nodes in the diskset.
57870Sstevel@tonic-gate * A starting node is considered to be any node that is not
57880Sstevel@tonic-gate * an owner of the set but is a member of the cluster.
57890Sstevel@tonic-gate * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
57900Sstevel@tonic-gate *
57910Sstevel@tonic-gate * Caller is the Master node.
57920Sstevel@tonic-gate *
57930Sstevel@tonic-gate * Returns 0 - Success
57940Sstevel@tonic-gate * 205 - Failure during RPC to another node
57950Sstevel@tonic-gate * -1 - Any other failure and ep is filled in.
57960Sstevel@tonic-gate */
57970Sstevel@tonic-gate int
meta_mnjoin_all(mdsetname_t * sp,md_error_t * ep)57980Sstevel@tonic-gate meta_mnjoin_all(
57990Sstevel@tonic-gate mdsetname_t *sp,
58000Sstevel@tonic-gate md_error_t *ep
58010Sstevel@tonic-gate )
58020Sstevel@tonic-gate {
58030Sstevel@tonic-gate md_set_desc *sd;
58040Sstevel@tonic-gate md_mnnode_desc *nd, *nd2;
58050Sstevel@tonic-gate int rval = 0;
58060Sstevel@tonic-gate int stale_flag = 0;
58070Sstevel@tonic-gate mddb_config_t c;
58080Sstevel@tonic-gate int susp_res_flag = 0;
58090Sstevel@tonic-gate md_error_t xep = mdnullerror;
58100Sstevel@tonic-gate
58110Sstevel@tonic-gate /* If setname is there, set desc should exist. */
58120Sstevel@tonic-gate if ((sd = metaget_setdesc(sp, ep)) == NULL) {
58130Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
58140Sstevel@tonic-gate "Unable to get set %s desc information"), sp->setname);
58150Sstevel@tonic-gate return (-1);
58160Sstevel@tonic-gate }
58170Sstevel@tonic-gate
58180Sstevel@tonic-gate /* Are there drives in the set? */
58190Sstevel@tonic-gate if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
58200Sstevel@tonic-gate ep) == NULL) {
58210Sstevel@tonic-gate if (! mdisok(ep)) {
58220Sstevel@tonic-gate return (-1);
58230Sstevel@tonic-gate }
58240Sstevel@tonic-gate /* No drives in set -- nothing to join */
58250Sstevel@tonic-gate return (0);
58260Sstevel@tonic-gate }
58270Sstevel@tonic-gate
58280Sstevel@tonic-gate /*
58290Sstevel@tonic-gate * Is set currently stale?
58300Sstevel@tonic-gate */
58310Sstevel@tonic-gate (void) memset(&c, 0, sizeof (c));
58320Sstevel@tonic-gate c.c_id = 0;
58330Sstevel@tonic-gate c.c_setno = sp->setno;
58340Sstevel@tonic-gate /* Ignore failure since master node may not be joined yet */
58350Sstevel@tonic-gate (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
58360Sstevel@tonic-gate if (c.c_flags & MDDB_C_STALE) {
58370Sstevel@tonic-gate stale_flag = MNSET_IS_STALE;
58380Sstevel@tonic-gate }
58390Sstevel@tonic-gate
58400Sstevel@tonic-gate /*
58410Sstevel@tonic-gate * If any nodes are going to be joined to diskset, then
58420Sstevel@tonic-gate * suspend I/O to all disks in diskset so that nodes can join
58430Sstevel@tonic-gate * (read in mddbs) in a reasonable amount of time even under
58440Sstevel@tonic-gate * high I/O load. Don't need to do this if set is STALE since
58450Sstevel@tonic-gate * no I/O can be occurring to a STALE set.
58460Sstevel@tonic-gate */
58470Sstevel@tonic-gate if (stale_flag != MNSET_IS_STALE) {
58480Sstevel@tonic-gate nd = sd->sd_nodelist;
58490Sstevel@tonic-gate while (nd) {
58500Sstevel@tonic-gate /* Found a node that will be joined to diskset */
58510Sstevel@tonic-gate if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
58520Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) {
58530Sstevel@tonic-gate /* Set flag that diskset should be suspended */
58540Sstevel@tonic-gate susp_res_flag = 1;
58550Sstevel@tonic-gate break;
58560Sstevel@tonic-gate }
58570Sstevel@tonic-gate nd = nd->nd_next;
58580Sstevel@tonic-gate }
58590Sstevel@tonic-gate }
58600Sstevel@tonic-gate
58610Sstevel@tonic-gate if (susp_res_flag) {
58620Sstevel@tonic-gate /*
58630Sstevel@tonic-gate * Block all I/Os to disks in this diskset on all joined
58640Sstevel@tonic-gate * nodes in the diskset.
58650Sstevel@tonic-gate * If block of I/Os fails due to an RPC failure on another
58660Sstevel@tonic-gate * node, return 205; otherwise, return -1.
58670Sstevel@tonic-gate */
58680Sstevel@tonic-gate nd = sd->sd_nodelist;
58690Sstevel@tonic-gate while (nd) {
58700Sstevel@tonic-gate /* Skip non-alive and non-owner nodes */
58710Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
58720Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) {
58730Sstevel@tonic-gate nd = nd->nd_next;
58740Sstevel@tonic-gate continue;
58750Sstevel@tonic-gate }
58760Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
58770Sstevel@tonic-gate MN_SUSP_IO, ep)) {
58780Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
58790Sstevel@tonic-gate "Unable to suspend I/O on node %s"
58800Sstevel@tonic-gate " in set %s"), nd->nd_nodename,
58810Sstevel@tonic-gate sp->setname);
58820Sstevel@tonic-gate /*
58830Sstevel@tonic-gate * Resume other nodes that had been suspended.
58840Sstevel@tonic-gate * (Reconfig return step also resumes I/Os
58850Sstevel@tonic-gate * for all sets.)
58860Sstevel@tonic-gate */
58870Sstevel@tonic-gate nd2 = sd->sd_nodelist;
58880Sstevel@tonic-gate while (nd2) {
58890Sstevel@tonic-gate /* Stop when reaching failed node */
58900Sstevel@tonic-gate if (nd2->nd_nodeid == nd->nd_nodeid)
58910Sstevel@tonic-gate break;
58920Sstevel@tonic-gate /* Skip non-alive/non-owner nodes */
58930Sstevel@tonic-gate if ((!(nd2->nd_flags &
58940Sstevel@tonic-gate MD_MN_NODE_ALIVE)) ||
58950Sstevel@tonic-gate (!(nd2->nd_flags &
58960Sstevel@tonic-gate MD_MN_NODE_OWN))) {
58970Sstevel@tonic-gate nd2 = nd2->nd_next;
58980Sstevel@tonic-gate continue;
58990Sstevel@tonic-gate }
59000Sstevel@tonic-gate (void) (clnt_mn_susp_res_io(
59010Sstevel@tonic-gate nd2->nd_nodename, sp->setno,
59020Sstevel@tonic-gate MN_RES_IO, &xep));
59030Sstevel@tonic-gate nd2 = nd2->nd_next;
59040Sstevel@tonic-gate }
59050Sstevel@tonic-gate
59060Sstevel@tonic-gate /*
59070Sstevel@tonic-gate * If the suspend failed due to an
59080Sstevel@tonic-gate * RPC failure on another node, return
59090Sstevel@tonic-gate * a 205.
59100Sstevel@tonic-gate * Otherwise, exit with failure.
59110Sstevel@tonic-gate * The return reconfig step will resume
59120Sstevel@tonic-gate * I/Os for all disksets.
59130Sstevel@tonic-gate */
59140Sstevel@tonic-gate if ((mdanyrpcerror(ep)) &&
59150Sstevel@tonic-gate (sd->sd_mn_mynode->nd_nodeid !=
59160Sstevel@tonic-gate nd->nd_nodeid)) {
59170Sstevel@tonic-gate return (205);
59180Sstevel@tonic-gate } else {
59190Sstevel@tonic-gate return (-1);
59200Sstevel@tonic-gate }
59210Sstevel@tonic-gate }
59220Sstevel@tonic-gate nd = nd->nd_next;
59230Sstevel@tonic-gate }
59240Sstevel@tonic-gate }
59250Sstevel@tonic-gate
59260Sstevel@tonic-gate nd = sd->sd_nodelist;
59270Sstevel@tonic-gate while (nd) {
59280Sstevel@tonic-gate /*
59290Sstevel@tonic-gate * If a node is in the membership list but isn't joined
59300Sstevel@tonic-gate * to the set, try to join the node.
59310Sstevel@tonic-gate */
59320Sstevel@tonic-gate if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
59330Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) {
59340Sstevel@tonic-gate if (clnt_joinset(nd->nd_nodename, sp,
59350Sstevel@tonic-gate (MNSET_IN_RECONFIG | stale_flag), ep)) {
59360Sstevel@tonic-gate /*
59370Sstevel@tonic-gate * If RPC failure to another node
59380Sstevel@tonic-gate * then exit without attempting anything else.
59390Sstevel@tonic-gate * (Reconfig return step will resume I/Os
59400Sstevel@tonic-gate * for all sets.)
59410Sstevel@tonic-gate */
59420Sstevel@tonic-gate if (mdanyrpcerror(ep)) {
59430Sstevel@tonic-gate mde_perror(ep, "");
59440Sstevel@tonic-gate return (205);
59450Sstevel@tonic-gate }
59460Sstevel@tonic-gate /*
59470Sstevel@tonic-gate * STALE and ACCOK failures aren't true
59480Sstevel@tonic-gate * failures. STALE means that <50% mddbs
59490Sstevel@tonic-gate * are available. ACCOK means that the
59500Sstevel@tonic-gate * mediator provided the extra vote.
59510Sstevel@tonic-gate * If a true failure, then print messasge
59520Sstevel@tonic-gate * and withdraw node from set in order to
59530Sstevel@tonic-gate * cleanup from failed join attempt.
59540Sstevel@tonic-gate */
59550Sstevel@tonic-gate if ((!mdismddberror(ep, MDE_DB_STALE)) &&
59560Sstevel@tonic-gate (!mdismddberror(ep, MDE_DB_ACCOK))) {
59570Sstevel@tonic-gate mde_perror(ep,
59580Sstevel@tonic-gate "WARNING: Unable to join node %s "
59590Sstevel@tonic-gate "to set %s", nd->nd_nodename,
59600Sstevel@tonic-gate sp->setname);
59610Sstevel@tonic-gate mdclrerror(ep);
59620Sstevel@tonic-gate if (clnt_withdrawset(nd->nd_nodename,
59630Sstevel@tonic-gate sp, &xep))
59640Sstevel@tonic-gate mdclrerror(&xep);
59650Sstevel@tonic-gate nd = nd->nd_next;
59660Sstevel@tonic-gate continue;
59670Sstevel@tonic-gate }
59680Sstevel@tonic-gate }
59690Sstevel@tonic-gate /* Set owner flag even if STALE or ACCOK */
59700Sstevel@tonic-gate nd->nd_flags |= MD_MN_NODE_OWN;
59710Sstevel@tonic-gate }
59720Sstevel@tonic-gate nd = nd->nd_next;
59730Sstevel@tonic-gate }
59740Sstevel@tonic-gate /*
59750Sstevel@tonic-gate * Resume I/Os if suspended above.
59760Sstevel@tonic-gate */
59770Sstevel@tonic-gate if (susp_res_flag) {
59780Sstevel@tonic-gate nd = sd->sd_nodelist;
59790Sstevel@tonic-gate while (nd) {
59800Sstevel@tonic-gate /*
59810Sstevel@tonic-gate * Skip non-alive and non-owner nodes
59820Sstevel@tonic-gate * (this list doesn't include any of
59830Sstevel@tonic-gate * the nodes that were joined).
59840Sstevel@tonic-gate */
59850Sstevel@tonic-gate if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
59860Sstevel@tonic-gate (!(nd->nd_flags & MD_MN_NODE_OWN))) {
59870Sstevel@tonic-gate nd = nd->nd_next;
59880Sstevel@tonic-gate continue;
59890Sstevel@tonic-gate }
59900Sstevel@tonic-gate if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
59910Sstevel@tonic-gate MN_RES_IO, ep)) {
59920Sstevel@tonic-gate mde_perror(ep, dgettext(TEXT_DOMAIN,
59930Sstevel@tonic-gate "Unable to resume I/O on node %s"
59940Sstevel@tonic-gate " in set %s"), nd->nd_nodename,
59950Sstevel@tonic-gate sp->setname);
59960Sstevel@tonic-gate
59970Sstevel@tonic-gate /*
59980Sstevel@tonic-gate * If an RPC failure then don't do any
59990Sstevel@tonic-gate * more RPC calls, since one timeout is enough
60000Sstevel@tonic-gate * to endure. If RPC failure to another node,
60010Sstevel@tonic-gate * return 205. If RPC failure to my node,
60020Sstevel@tonic-gate * return -1.
60030Sstevel@tonic-gate * (Reconfig return step will resume I/Os
60040Sstevel@tonic-gate * for all sets.)
60050Sstevel@tonic-gate * If not an RPC failure, continue resuming the
60060Sstevel@tonic-gate * rest of the nodes and then return -1.
60070Sstevel@tonic-gate */
60080Sstevel@tonic-gate if (mdanyrpcerror(ep)) {
60090Sstevel@tonic-gate if (sd->sd_mn_mynode->nd_nodeid ==
60100Sstevel@tonic-gate nd->nd_nodeid) {
60110Sstevel@tonic-gate return (-1);
60120Sstevel@tonic-gate } else {
60130Sstevel@tonic-gate return (205);
60140Sstevel@tonic-gate }
60150Sstevel@tonic-gate }
60160Sstevel@tonic-gate
60170Sstevel@tonic-gate /*
60180Sstevel@tonic-gate * If not an RPC error, continue resuming rest
60190Sstevel@tonic-gate * of nodes, ignoring any failures except for
60200Sstevel@tonic-gate * an RPC failure which constitutes an
60210Sstevel@tonic-gate * immediate exit.
60220Sstevel@tonic-gate * Start in middle of list with failing node.
60230Sstevel@tonic-gate */
60240Sstevel@tonic-gate nd2 = nd->nd_next;
60250Sstevel@tonic-gate while (nd2) {
60260Sstevel@tonic-gate /* Skip non-owner nodes */
60270Sstevel@tonic-gate if ((!(nd2->nd_flags &
60280Sstevel@tonic-gate MD_MN_NODE_ALIVE)) ||
60290Sstevel@tonic-gate (!(nd2->nd_flags &
60300Sstevel@tonic-gate MD_MN_NODE_OWN))) {
60310Sstevel@tonic-gate nd2 = nd2->nd_next;
60320Sstevel@tonic-gate continue;
60330Sstevel@tonic-gate }
60340Sstevel@tonic-gate (void) (clnt_mn_susp_res_io(
60350Sstevel@tonic-gate nd2->nd_nodename, sp->setno,
60360Sstevel@tonic-gate MN_RES_IO, &xep));
60370Sstevel@tonic-gate if (mdanyrpcerror(&xep)) {
60380Sstevel@tonic-gate return (-1);
60390Sstevel@tonic-gate }
60400Sstevel@tonic-gate nd2 = nd2->nd_next;
60410Sstevel@tonic-gate }
60420Sstevel@tonic-gate }
60430Sstevel@tonic-gate nd = nd->nd_next;
60440Sstevel@tonic-gate }
60450Sstevel@tonic-gate }
60460Sstevel@tonic-gate
60470Sstevel@tonic-gate nd = sd->sd_nodelist;
60480Sstevel@tonic-gate while (nd) {
60490Sstevel@tonic-gate if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
60500Sstevel@tonic-gate nd = nd->nd_next;
60510Sstevel@tonic-gate continue;
60520Sstevel@tonic-gate }
60530Sstevel@tonic-gate /*
60540Sstevel@tonic-gate * If 1 node fails - go ahead and update the rest except
60550Sstevel@tonic-gate * in the case of an RPC failure, fail immediately.
60560Sstevel@tonic-gate */
60570Sstevel@tonic-gate if (clnt_upd_nr_flags(nd->nd_nodename, sp,
60580Sstevel@tonic-gate sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
60590Sstevel@tonic-gate /* RPC failure to another node */
60600Sstevel@tonic-gate if (mdanyrpcerror(ep)) {
60610Sstevel@tonic-gate return (205);
60620Sstevel@tonic-gate }
60630Sstevel@tonic-gate nd = nd->nd_next;
60640Sstevel@tonic-gate rval = -1;
60650Sstevel@tonic-gate continue;
60660Sstevel@tonic-gate }
60670Sstevel@tonic-gate nd = nd->nd_next;
60680Sstevel@tonic-gate }
60690Sstevel@tonic-gate
60700Sstevel@tonic-gate meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
60710Sstevel@tonic-gate "Join of all nodes completed for set %s: %s"),
60720Sstevel@tonic-gate sp->setname, meta_print_hrtime(gethrtime() - start_time));
60730Sstevel@tonic-gate
60740Sstevel@tonic-gate return (rval);
60750Sstevel@tonic-gate }
6076