/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Just in case we're not in a build environment, make sure that * TEXT_DOMAIN gets set to something. */ #if !defined(TEXT_DOMAIN) #define TEXT_DOMAIN "SYS_TEST" #endif /* * Metadevice diskset interfaces */ #include "meta_set_prv.h" #include #include #include #include static int add_db_sidenms( mdsetname_t *sp, md_error_t *ep ) { md_replicalist_t *rlp = NULL; md_replicalist_t *rl; int rval = 0; if (metareplicalist(sp, MD_FULLNAME_ONLY, &rlp, ep) < 0) return (-1); for (rl = rlp; rl != NULL; rl = rl->rl_next) { md_replica_t *r = rl->rl_repp; /* * This is not the first replica being added to the * diskset so call with ADDSIDENMS_BCAST. If this * is a traditional diskset, the bcast flag is ignored * since traditional disksets don't use the rpc.mdcommd. 
*/ if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno, DB_ADDSIDENMS_BCAST, ep)) { rval = -1; goto out; } } out: metafreereplicalist(rlp); return (rval); } static int add_drvs_to_hosts( mdsetname_t *sp, int node_c, char **node_v, md_error_t *ep ) { int i; md_set_desc *sd; md_drive_desc *dd; md_timeval32_t now; ulong_t genid; if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) { if (! mdisok(ep)) return (-1); return (0); } now = sd->sd_ctime; genid = sd->sd_genid - 1; for (i = 0; i < node_c; i++) { if (clnt_adddrvs(node_v[i], sp, dd, now, genid, ep) == -1) return (-1); } return (0); } static int add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep) { mdnm_params_t nm; char *cname, *dname; side_t tmp_sideno; minor_t mnum; int done, i; int rval = 0; md_set_desc *sd; (void) memset(&nm, '\0', sizeof (nm)); nm.key = MD_KEYWILD; if (!metaislocalset(sp)) { if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); } /* Use rpc.mdcommd to add md side info from all nodes */ if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { md_mn_result_t *resultp = NULL; md_mn_msg_meta_md_addside_t md_as; int send_rval; md_as.msg_sideno = sideno; md_as.msg_otherside = otherside; /* * If reconfig cycle has been started, this node is stuck in * in the return step until this command has completed. If * mdcommd is suspended, ask send_message to fail (instead of * retrying) so that metaset can finish allowing the * reconfig cycle to proceed. 
*/ send_rval = mdmn_send_message(sp->setno, MD_MN_MSG_META_MD_ADDSIDE, MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t), &resultp, ep); if (send_rval != 0) { (void) mdstealerror(ep, &(resultp->mmr_ep)); if (resultp) free_result(resultp); return (-1); } if (resultp) free_result(resultp); return (0); } else { /*CONSTCOND*/ while (1) { char *drvnm = NULL; nm.mde = mdnullerror; nm.setno = sp->setno; nm.side = otherside; if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) return (mdstealerror(ep, &nm.mde)); if (nm.key == MD_KEYWILD) return (0); /* * Okay we have a valid key * Let's see if it is hsp or not */ nm.devname = (uintptr_t)meta_getnmentbykey(sp->setno, otherside, nm.key, &drvnm, NULL, NULL, ep); if (nm.devname == NULL || drvnm == NULL) { if (nm.devname) Free((void *)(uintptr_t)nm.devname); if (drvnm) Free((void *)(uintptr_t)drvnm); return (-1); } /* * If it is hsp add here */ if (strcmp(drvnm, MD_HOTSPARES) == 0) { if (add_name(sp, sideno, nm.key, MD_HOTSPARES, minor(NODEV), (char *)(uintptr_t)nm.devname, NULL, NULL, ep) == -1) { Free((void *)(uintptr_t)nm.devname); Free((void *)(uintptr_t)drvnm); return (-1); } else { Free((void *)(uintptr_t)nm.devname); Free((void *)(uintptr_t)drvnm); continue; } } nm.side = sideno; if (MD_MNSET_DESC(sd)) { tmp_sideno = sideno; } else { tmp_sideno = sideno - 1; } if ((done = meta_getnextside_devinfo(sp, (char *)(uintptr_t)nm.devname, &tmp_sideno, &cname, &dname, &mnum, ep)) == -1) { Free((void *)(uintptr_t)nm.devname); return (-1); } assert(done == 1); Free((void *)(uintptr_t)nm.devname); Free((void *)(uintptr_t)drvnm); /* * The device reference count can be greater than 1 if * more than one softpart is configured on top of the * same device. If this is the case then we want to * increment the count to sync up with the other sides. 
*/ for (i = 0; i < nm.ref_count; i++) { if (add_name(sp, sideno, nm.key, dname, mnum, cname, NULL, NULL, ep) == -1) rval = -1; } Free(cname); Free(dname); if (rval != 0) return (rval); } } /*NOTREACHED*/ } static int check_setdrvs_againstnode(mdsetname_t *sp, char *node, md_error_t *ep) { mddrivename_t *dp; md_drive_desc *dd, *ddp; if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) if (! mdisok(ep)) return (-1); for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { dp = ddp->dd_dnp; if (checkdrive_onnode(sp, dp, node, ep)) return (-1); } return (0); } static int create_multinode_set_on_hosts( mdsetname_t *sp, int node_c, /* Number of new nodes */ char **node_v, /* Nodes which are being added */ int new_set, md_error_t *ep ) { int i; md_set_desc *sd; md_timeval32_t now; ulong_t genid; int rval = 0; md_mnnode_desc *nd, *ndm = NULL; md_mnnode_desc *nd_prev, *nd_curr; int nodecnt; mndiskset_membershiplist_t *nl, *nl2; if (!new_set) { if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); now = sd->sd_ctime; genid = sd->sd_genid - 1; if (sd->sd_drvs) genid--; } else { sd = Zalloc(sizeof (*sd)); if (meta_gettimeofday(&now) == -1) { (void) mdsyserror(ep, errno, dgettext(TEXT_DOMAIN, "meta_gettimeofday()")); rval = -1; goto out; } /* Put the new entries into the set */ /* * Get membershiplist from API routine. If there's * an error, fail to create set and pass back error. */ if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { rval = -1; goto out; } /* * meta_set_addhosts has already verified that * this node list is in the membership list * so set ALIVE flag. * Since this is a new set, all hosts being * added are new to the set, so also set ADD flag. 
*/ for (i = 0; i < node_c; i++) { nd = Zalloc(sizeof (*nd)); (void) strcpy(nd->nd_nodename, node_v[i]); nd->nd_ctime = now; nd->nd_flags = (MD_MN_NODE_ALIVE | MD_MN_NODE_ADD); nl2 = nl; while (nl2) { if (strcmp(nl2->msl_node_name, node_v[i]) == 0) { nd->nd_nodeid = nl2->msl_node_id; (void) strcpy(nd->nd_priv_ic, nl2->msl_node_addr); break; } nl2 = nl2->next; } /* * Nodelist must be kept in ascending * nodeid order. */ if (sd->sd_nodelist == NULL) { /* Nothing in list, just add it */ sd->sd_nodelist = nd; } else if (nd->nd_nodeid < sd->sd_nodelist->nd_nodeid) { /* Add to head of list */ nd->nd_next = sd->sd_nodelist; sd->sd_nodelist = nd; } else { nd_curr = sd->sd_nodelist->nd_next; nd_prev = sd->sd_nodelist; /* Search for place ot add it */ while (nd_curr) { if (nd->nd_nodeid < nd_curr->nd_nodeid) { /* Add before nd_curr */ nd->nd_next = nd_curr; nd_prev->nd_next = nd; break; } nd_prev = nd_curr; nd_curr = nd_curr->nd_next; } /* Add to end of list */ if (nd_curr == NULL) { nd_prev->nd_next = nd; } } /* Set master to be first node added */ if (ndm == NULL) ndm = nd; } meta_free_nodelist(nl); /* * Creating mnset for first time. * Set master to be invalid until first drive is * in set. */ (void) strcpy(sd->sd_mn_master_nodenm, ""); sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; sd->sd_mn_masternode = ndm; sd->sd_ctime = now; genid = sd->sd_genid = 0; } /* Create the set where needed */ for (i = 0; i < node_c; i++) { /* * Create the set on each new node. If the set already * exists, then the node list being created on each new node * is the current node list from before the new nodes * were added. If the set doesn't exist, then the node * list being created on each new node is the entire * new node list. 
*/ if (clnt_mncreateset(node_v[i], sp, sd->sd_nodelist, now, genid, sd->sd_mn_master_nodenm, sd->sd_mn_master_nodeid, ep) == -1) { rval = -1; break; } } out: if (new_set) { nd = sd->sd_nodelist; while (nd) { sd->sd_nodelist = nd->nd_next; Free(nd); nd = sd->sd_nodelist; } Free(sd); } if (rval != 0 || new_set) return (rval); /* * Add the drive records to the new sets * and names for the new sides. */ return (add_drvs_to_hosts(sp, node_c, node_v, ep)); } static int create_traditional_set_on_hosts( mdsetname_t *sp, int node_c, /* Number of new nodes */ char **node_v, /* Nodes which are being added */ int new_set, md_error_t *ep ) { int i; md_set_desc *sd; md_timeval32_t now; ulong_t genid; int rval = 0; if (!new_set) { if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); now = sd->sd_ctime; genid = sd->sd_genid; if (sd->sd_drvs) genid--; } else { if (node_c > MD_MAXSIDES) return (mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, sp->setno, NULL, NULL, sp->setname)); sd = Zalloc(sizeof (*sd)); /* Put the new entries into the set */ for (i = 0; i < node_c; i++) { (void) strcpy(sd->sd_nodes[i], node_v[i]); } if (meta_gettimeofday(&now) == -1) { (void) mdsyserror(ep, errno, "meta_gettimeofday()"); rval = -1; goto out; } sd->sd_ctime = now; genid = sd->sd_genid = 0; } /* Create the set where needed */ for (i = 0; i < node_c; i++) { /* * Create the set on each new host */ if (clnt_createset(node_v[i], sp, sd->sd_nodes, now, genid, ep) == -1) { rval = -1; break; } } out: if (new_set) Free(sd); if (rval != 0 || new_set) return (rval); /* * Add the drive records to the new sets * and names for the new sides. */ return (add_drvs_to_hosts(sp, node_c, node_v, ep)); } static int create_set_on_hosts( mdsetname_t *sp, int multi_node, /* Multi_node diskset or not? 
*/ int node_c, /* Number of new nodes */ char **node_v, /* Nodes which are being added */ int new_set, md_error_t *ep ) { if (multi_node) return (create_multinode_set_on_hosts(sp, node_c, node_v, new_set, ep)); else return (create_traditional_set_on_hosts(sp, node_c, node_v, new_set, ep)); } static int create_set( mdsetname_t *sp, int multi_node, /* Multi-node diskset or not? */ int node_c, char **node_v, int auto_take, md_error_t *ep ) { int i; int rval = 0; set_t max_sets; set_t setno; int bool; uint_t sr_flags; sigset_t oldsigs; md_setkey_t *cl_sk; int rb_level = 0; md_error_t xep = mdnullerror; rval_e sdssc_rval; int lock_flag = 0; int sig_flag = 0; if ((max_sets = get_max_sets(ep)) == 0) return (-1); /* We must be a member of the set we are creating */ if (! strinlst(mynode(), node_c, node_v)) return (mddserror(ep, MDE_DS_SELFNOTIN, sp->setno, mynode(), NULL, sp->setname)); /* * If auto_take then we must be the only member of the set * that we are creating. */ if (auto_take && node_c > 1) return (mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL, sp->setname)); /* * If we're part of SC3.0 we'll already have allocated the * set number so we can skip the allocation algorithm used. * Set number is unique across traditional and MN disksets. */ if ((sdssc_rval = sdssc_get_index(sp->setname, &setno)) == SDSSC_NOT_BOUND) { for (i = 0; i < node_c; i++) { int has_set; /* Skip my node */ if (strcmp(mynode(), node_v[i]) == 0) continue; /* * Make sure this set name is not used on the * other hosts */ has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep); if (has_set < 0) { if (! 
mdiserror(ep, MDE_NO_SET)) { rval = -1; goto out; } mdclrerror(ep); continue; } if (has_set) { (void) mddserror(ep, MDE_DS_NODEHASSET, sp->setno, node_v[i], NULL, sp->setname); rval = -1; goto out; } } for (setno = 1; setno < max_sets; setno++) { for (i = 0; i < node_c; i++) { if (clnt_setnumbusy(node_v[i], setno, &bool, ep) == -1) { rval = -1; goto out; } if (bool == TRUE) break; } if (i == node_c) break; } } else if (sdssc_rval != SDSSC_OKAY) { (void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL, NULL, sp->setname); rval = -1; goto out; } if (setno == max_sets) { (void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL, NULL, sp->setname); rval = -1; goto out; } sp->setno = setno; /* * Lock the set on current set members. * Set locking done much earlier for MN diskset than for traditional * diskset since lock_set is used to protect against * other meta* commands running on the other nodes. * Don't issue mdcommd SUSPEND command since there is nothing * to suspend since there currently is no set. 
*/ if (multi_node) { /* Make sure we are blocking all signals */ if (procsigs(TRUE, &oldsigs, &xep) < 0) mdclrerror(&xep); sig_flag = 1; /* Lock the set on new set members */ for (i = 0; i < node_c; i++) { if (clnt_lock_set(node_v[i], sp, ep)) { rval = -1; goto out; } lock_flag = 1; } /* Now have the diskset locked, verify set number is still ok */ for (i = 0; i < node_c; i++) { if (clnt_setnumbusy(node_v[i], setno, &bool, ep) == -1) { rval = -1; goto out; } } } if (meta_set_checkname(sp->setname, ep)) { rval = -1; goto out; } for (i = 0; i < node_c; i++) { if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) { rval = -1; goto out; } if (bool == FALSE) { (void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno, node_v[i], NULL, sp->setname); rval = -1; goto out; } } /* END CHECK CODE */ /* Lock the set on new set members */ if (!multi_node) { md_rb_sig_handling_on(); sig_flag = 1; for (i = 0; i < node_c; i++) { if (clnt_lock_set(node_v[i], sp, ep)) { rval = -1; goto out; } lock_flag = 1; } } RB_TEST(1, "create_set", ep) RB_PREEMPT; rb_level = 1; /* level 1 */ RB_TEST(2, "create_set", ep) if ((rval = create_set_on_hosts(sp, multi_node, node_c, node_v, 1, ep)) == -1) goto rollback; RB_TEST(3, "create_set", ep) if (auto_take) sr_flags = MD_SR_OK | MD_SR_AUTO_TAKE; else sr_flags = MD_SR_OK; /* * Mark the set record MD_SR_OK */ for (i = 0; i < node_c; i++) if (clnt_upd_sr_flags(node_v[i], sp, sr_flags, ep)) goto rollback; rb_level = 2; /* level 2 */ /* * For MN diskset: * On each added node, set the node record for that node * to OK. Then set all node records for the newly added * nodes on all nodes to ok. * * By setting a node's own node record to ok first, even if * the node adding the hosts panics, the rest of the nodes can * determine the same node list during the choosing of the master * during reconfig. So, only nodes considered for mastership * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set * on that node's rpc.metad. 
If all nodes have MD_SR_OK set, * but no node has its own MD_MN_NODE_OK set, then the set will * be removed during reconfig since a panic occurred during the * creation of the initial diskset. */ if (multi_node) { md_mnnode_desc *nd, *saved_nd_next; md_set_desc *sd; if ((sd = metaget_setdesc(sp, ep)) == NULL) { goto rollback; } for (i = 0; i < node_c; i++) { nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } /* Something wrong, will pick this up in next loop */ if (nd == NULL) continue; /* Only changing my local cache of node list */ saved_nd_next = nd->nd_next; nd->nd_next = NULL; /* Set node record for added host to ok on that host */ if (clnt_upd_nr_flags(node_v[i], sp, nd, MD_NR_OK, NULL, ep)) { nd->nd_next = saved_nd_next; goto rollback; } nd->nd_next = saved_nd_next; } /* Now set all node records on all nodes to be ok */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_upd_nr_flags(nd->nd_nodename, sp, sd->sd_nodelist, MD_NR_OK, NULL, ep)) { goto rollback; } nd = nd->nd_next; } } RB_TEST(4, "create_set", ep) out: if ((rval == 0) && multi_node) { /* * Set successfully created. * Notify rpc.mdcommd on all nodes of a nodelist change. * Send reinit command to mdcommd which forces it to get * fresh set description. Then send resume. * Resume on class 0 will resume all classes. 
*/ for (i = 0; i < node_c; i++) { /* Class is ignored for REINIT */ if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT, sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to reinit rpc.mdcommd.\n")); } } for (i = 0; i < node_c; i++) { if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME, sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to resume rpc.mdcommd.\n")); } } meta_ping_mnset(sp->setno); } if (lock_flag) { cl_sk = cl_get_setkey(sp->setno, sp->setname); for (i = 0; i < node_c; i++) { if (clnt_unlock_set(node_v[i], cl_sk, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; } } cl_set_setkey(NULL); } if (sig_flag) { if (multi_node) { /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); } else { md_rb_sig_handling_off(md_got_sig(), md_which_sig()); } } return (rval); rollback: /* all signals already blocked for MN disket */ if (!multi_node) { /* Make sure we are blocking all signals */ if (procsigs(TRUE, &oldsigs, &xep) < 0) mdclrerror(&xep); } rval = -1; /* * For MN diskset: * On each added node (which is now each node to be deleted), * set the node record for that node to DEL. Then set all * node records for the newly added (soon to be deleted) nodes * on all nodes to ok. * * By setting a node's own node record to DEL first, even if * the node doing the rollback panics, the rest of the nodes can * determine the same node list during the choosing of the master * during reconfig. 
*/ /* level 3 */ if ((rb_level > 1) && (multi_node)) { md_mnnode_desc *nd, *saved_nd_next; md_set_desc *sd; if ((sd = metaget_setdesc(sp, &xep)) == NULL) { mdclrerror(&xep); } for (i = 0; i < node_c; i++) { nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } /* Something wrong, will pick this up in next loop */ if (nd == NULL) continue; /* Only changing my local cache of node list */ saved_nd_next = nd->nd_next; nd->nd_next = NULL; /* Set node record for added host to DEL on that host */ if (clnt_upd_nr_flags(node_v[i], sp, nd, MD_NR_DEL, NULL, &xep)) { nd->nd_next = saved_nd_next; mdclrerror(&xep); } nd->nd_next = saved_nd_next; } /* Now set all node records on all nodes to be DEL */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_upd_nr_flags(nd->nd_nodename, sp, sd->sd_nodelist, MD_NR_DEL, NULL, &xep)) { mdclrerror(&xep); } nd = nd->nd_next; } /* Mark set record on all hosts to be DELETED */ for (i = 0; i < node_c; i++) { if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) { mdclrerror(&xep); } } } /* level 1 */ if (rb_level > 0) { for (i = 0; i < node_c; i++) { if (clnt_delset(node_v[i], sp, &xep) == -1) mdclrerror(&xep); } } /* level 0 */ /* Don't test lock flag since guaranteed to be set if in rollback */ cl_sk = cl_get_setkey(sp->setno, sp->setname); for (i = 0; i < node_c; i++) { if (clnt_unlock_set(node_v[i], cl_sk, &xep)) mdclrerror(&xep); } cl_set_setkey(NULL); /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); if ((sig_flag) && (!multi_node)) md_rb_sig_handling_off(md_got_sig(), md_which_sig()); return (rval); } static int del_db_sidenms( mdsetname_t *sp, side_t sideno, md_error_t *ep ) { md_replicalist_t *rlp = NULL; md_replicalist_t *rl; int rval = 0; if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) return (-1); for (rl = rlp; rl != NULL; rl = 
rl->rl_next) { md_replica_t *r = rl->rl_repp; if (meta_db_delsidenm(sp, sideno, r->r_namep, r->r_blkno, ep)) { rval = -1; goto out; } } out: metafreereplicalist(rlp); return (rval); } static int del_drvs_from_hosts( mdsetname_t *sp, md_set_desc *sd, md_drive_desc *dd, int node_c, char **node_v, int oha, md_error_t *ep ) { int i; md_mnnode_desc *nd; for (i = 0; i < node_c; i++) { if (MD_MNSET_DESC(sd) && (oha == TRUE)) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } if (nd == NULL) { return (mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, nd->nd_nodename, NULL, sp->setname)); } if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { continue; } if (clnt_deldrvs(node_v[i], sp, dd, ep)) { return (-1); } } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) { /* * All nodes should be alive in non-oha mode. */ if (clnt_deldrvs(node_v[i], sp, dd, ep)) { return (-1); } } else { /* * For traditional diskset, issue the RPC and * ignore RPC failure if in OHA mode. */ if (clnt_deldrvs(node_v[i], sp, dd, ep)) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } return (-1); } } } return (0); } static int del_host_noset( mdsetname_t *sp, char **anode, md_error_t *ep ) { int rval = 0; md_setkey_t *cl_sk; md_drive_desc *dd; md_error_t xep = mdnullerror; md_set_desc *sd; if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); /* Make sure we own the set */ if (meta_check_ownership(sp, ep) != 0) return (-1); /* Lock the set on our side */ if (clnt_lock_set(mynode(), sp, ep)) { rval = -1; goto out; } if (clnt_delhosts(mynode(), sp, 1, anode, ep)) { rval = -1; goto out; } if (!MD_MNSET_DESC(sd)) { if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { if (! 
mdisok(ep)) { rval = -1; goto out; } } /* If we have drives */ if (dd != NULL) { if (clnt_del_drv_sidenms(mynode(), sp, ep)) { rval = -1; goto out; } } } out: cl_sk = cl_get_setkey(sp->setno, sp->setname); if (clnt_unlock_set(mynode(), cl_sk, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; } cl_set_setkey(NULL); metaflushsetname(sp); return (rval); } static int del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep) { mdnm_params_t nm; md_set_desc *sd; int i; if (!metaislocalset(sp)) { if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); } /* Use rpc.mdcommd to add md side info from all nodes */ if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { md_mn_result_t *resultp = NULL; md_mn_msg_meta_md_delside_t md_ds; int send_rval; md_ds.msg_sideno = sideno; /* * If reconfig cycle has been started, this node is stuck in * in the return step until this command has completed. If * mdcommd is suspended, ask send_message to fail (instead of * retrying) so that metaset can finish allowing the * reconfig cycle to proceed. */ send_rval = mdmn_send_message(sp->setno, MD_MN_MSG_META_MD_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t), &resultp, ep); if (send_rval != 0) { (void) mdstealerror(ep, &(resultp->mmr_ep)); if (resultp) free_result(resultp); return (-1); } if (resultp) free_result(resultp); } else { (void) memset(&nm, '\0', sizeof (nm)); nm.key = MD_KEYWILD; /*CONSTCOND*/ while (1) { nm.mde = mdnullerror; nm.setno = sp->setno; nm.side = MD_SIDEWILD; if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) return (mdstealerror(ep, &nm.mde)); if (nm.key == MD_KEYWILD) return (0); /* * The device reference count can be greater than 1 if * more than one softpart is configured on top of the * same device. If this is the case then we want to * decrement the count to zero so the entry can be * actually removed. 
*/ for (i = 0; i < nm.ref_count; i++) { if (del_name(sp, sideno, nm.key, ep) == -1) return (-1); } } } return (0); } static void recreate_set( mdsetname_t *sp, md_set_desc *sd ) { int i; int has_set; md_error_t xep = mdnullerror; md_mnnode_desc *nd; if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } has_set = nodehasset(sp, nd->nd_nodename, NHS_NST_EQ, &xep); if (has_set >= 0) { nd = nd->nd_next; continue; } mdclrerror(&xep); if (clnt_mncreateset(nd->nd_nodename, sp, sd->sd_nodelist, sd->sd_ctime, sd->sd_genid, sd->sd_mn_master_nodenm, sd->sd_mn_master_nodeid, &xep) == -1) mdclrerror(&xep); nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, &xep); if (has_set >= 0) continue; mdclrerror(&xep); if (clnt_createset(sd->sd_nodes[i], sp, sd->sd_nodes, sd->sd_ctime, sd->sd_genid, &xep) == -1) mdclrerror(&xep); } } } /* * If a MN diskset, set is already locked on all nodes via clnt_lock_set. */ static int del_set_nodrives( mdsetname_t *sp, int node_c, char **node_v, int oha, md_error_t *ep ) { md_set_desc *sd; int i; sigset_t oldsigs; md_setkey_t *cl_sk; int rb_level = 0; ulong_t max_genid = 0; int rval = 0; md_error_t xep = mdnullerror; md_mnnode_desc *nd; int delete_end = 1; if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); if (MD_MNSET_DESC(sd)) { /* Make sure we are blocking all signals */ if (procsigs(TRUE, &oldsigs, &xep) < 0) mdclrerror(&xep); } else { md_rb_sig_handling_on(); } /* * Lock the set on current set members for traditional disksets. */ if (!(MD_MNSET_DESC(sd))) { for (i = 0; i < node_c; i++) { /* * For traditional diskset, issue the RPC and * ignore RPC failure if in OHA mode. 
*/ if (clnt_lock_set(node_v[i], sp, ep)) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } rval = -1; goto out; } } } RB_TEST(1, "deletehosts", ep) RB_PREEMPT; rb_level = 1; /* level 1 */ RB_TEST(2, "deletehosts", ep) /* * Mark the set record MD_SR_DEL */ for (i = 0; i < node_c; i++) { RB_TEST(3, "deletehosts", ep) if (MD_MNSET_DESC(sd) && (oha == TRUE)) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } if (nd == NULL) { (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, nd->nd_nodename, NULL, sp->setname); goto rollback; } if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { continue; } if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { goto rollback; } } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) { /* * All nodes should be alive in non-oha mode. */ if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { goto rollback; } } else { /* * For traditional diskset, issue the RPC and * ignore RPC failure if in OHA mode. */ if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } goto rollback; } } RB_TEST(4, "deletehosts", ep) } RB_TEST(5, "deletehosts", ep) RB_PREEMPT; rb_level = 2; /* level 2 */ RB_TEST(6, "deletehosts", ep) if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) if (metad_isautotakebyname(sp->setname)) delete_end = 0; else goto rollback; /* The set is OK to delete, make it so. */ for (i = 0; i < node_c; i++) { RB_TEST(7, "deletehosts", ep) if (MD_MNSET_DESC(sd) && (oha == TRUE)) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. 
*/ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } if (nd == NULL) { (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, nd->nd_nodename, NULL, sp->setname); goto rollback; } if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { continue; } if (clnt_delset(node_v[i], sp, ep) == -1) { goto rollback; } } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) { /* * All nodes should be alive in non-oha mode. */ if (clnt_delset(node_v[i], sp, ep) == -1) { goto rollback; } } else { /* * For traditional diskset, issue the RPC and * ignore RPC failure if in OHA mode. */ if (clnt_delset(node_v[i], sp, ep) == -1) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } goto rollback; } } RB_TEST(8, "deletehosts", ep) } RB_TEST(9, "deletehosts", ep) out: /* * Unlock the set on current set members * for traditional disksets. */ if (!(MD_MNSET_DESC(sd))) { cl_sk = cl_get_setkey(sp->setno, sp->setname); for (i = 0; i < node_c; i++) { /* * For traditional diskset, issue the RPC and * ignore RPC failure if in OHA mode. */ if (clnt_unlock_set(node_v[i], cl_sk, &xep)) { if (oha == TRUE && mdanyrpcerror(&xep)) { mdclrerror(&xep); continue; } if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; } } cl_set_setkey(NULL); } /* * A MN diskset has the clnt_locks held by meta_set_deletehosts so * don't flush that data until meta_set_deletehosts has finished * with it. meta_set_deletehosts will handle the flush of the * setname. 
*/ if (!(MD_MNSET_DESC(sd))) { metaflushsetname(sp); } if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) rval = -1; if (MD_MNSET_DESC(sd)) { /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); } else { md_rb_sig_handling_off(md_got_sig(), md_which_sig()); } return (rval); rollback: /* all signals already blocked for MN disket */ if (!(MD_MNSET_DESC(sd))) { /* Make sure we are blocking all signals */ if (procsigs(TRUE, &oldsigs, &xep) < 0) mdclrerror(&xep); } rval = -1; max_genid = sd->sd_genid; /* level 2 */ if (rb_level > 1) { recreate_set(sp, sd); max_genid++; if (delete_end) (void) sdssc_delete_end(sp->setname, SDSSC_CLEANUP); } /* level 1 */ if (rb_level > 0) { max_genid++; resync_genid(sp, sd, max_genid, node_c, node_v); } /* level 0 */ /* * Unlock the set on current set members * for traditional disksets. */ if (!(MD_MNSET_DESC(sd))) { cl_sk = cl_get_setkey(sp->setno, sp->setname); for (i = 0; i < node_c; i++) { /* * For traditional diskset, issue the RPC and * ignore RPC failure if in OHA mode. */ if (clnt_unlock_set(node_v[i], cl_sk, &xep)) mdclrerror(&xep); } cl_set_setkey(NULL); } /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); /* * A MN diskset has the clnt_locks held by meta_set_deletehosts so * don't flush that data until meta_set_deletehosts has finished * with it. meta_set_deletehosts will handle the flush of the * setname. */ if (!(MD_MNSET_DESC(sd))) { metaflushsetname(sp); md_rb_sig_handling_off(md_got_sig(), md_which_sig()); } return (rval); } /* * On entry: * procsigs already called for MN diskset. * md_rb_sig_handling already called for traditional diskset. 
*/ static int del_set_on_hosts( mdsetname_t *sp, md_set_desc *sd, md_drive_desc *dd, int node_c, /* Number of nodes */ char **node_v, /* Nodes being deleted */ int oha, md_error_t *ep ) { int i; int j; side_t sideno; md_replicalist_t *rlp = NULL; sigset_t oldsigs; md_setkey_t *cl_sk; ulong_t max_genid = 0; int rb_level = 1; /* This is a special case */ md_error_t xep = mdnullerror; md_mnnode_desc *nd; RB_PREEMPT; RB_TEST(7, "deletehosts", ep) if (dd != NULL) { /* * May need this to re-add sidenames on roll back. */ if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0) goto rollback; RB_TEST(8, "deletehosts", ep) RB_PREEMPT; rb_level = 2; /* level 2 */ RB_TEST(9, "deletehosts", ep) if (del_drvs_from_hosts(sp, sd, dd, node_c, node_v, oha, ep)) goto rollback; RB_TEST(10, "deletehosts", ep) RB_PREEMPT; rb_level = 3; /* level 3 */ RB_TEST(11, "deletehosts", ep) /* * Delete the db replica sides * This is done before the next loop, so that * the db does not get unloaded before we are finished * deleting the sides. */ if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* Skip hosts not being deleted */ if (! strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } if (del_db_sidenms(sp, nd->nd_nodeid, ep)) goto rollback; RB_TEST(12, "deletehosts", ep) nd = nd->nd_next; } } else { for (sideno = 0; sideno < MD_MAXSIDES; sideno++) { /* Skip empty slots */ if (sd->sd_nodes[sideno][0] == '\0') continue; /* Skip hosts not being deleted */ if (! strinlst(sd->sd_nodes[sideno], node_c, node_v)) continue; if (del_db_sidenms(sp, sideno, ep)) goto rollback; RB_TEST(12, "deletehosts", ep) } } RB_TEST(13, "deletehosts", ep) RB_PREEMPT; rb_level = 4; /* level 4 */ RB_TEST(14, "deletehosts", ep) /* Delete the names from the namespace */ if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* Skip hosts not being deleted */ if (! 
strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } if (del_md_sidenms(sp, nd->nd_nodeid, ep)) goto rollback; RB_TEST(15, "deletehosts", ep) nd = nd->nd_next; } } else { for (sideno = 0; sideno < MD_MAXSIDES; sideno++) { /* Skip empty slots */ if (sd->sd_nodes[sideno][0] == '\0') continue; /* Skip hosts not being deleted */ if (! strinlst(sd->sd_nodes[sideno], node_c, node_v)) continue; if (del_md_sidenms(sp, sideno, ep)) goto rollback; RB_TEST(15, "deletehosts", ep) } } } RB_TEST(16, "deletehosts", ep) RB_PREEMPT; rb_level = 5; /* level 6 */ RB_TEST(17, "deletehosts", ep) for (i = 0; i < node_c; i++) { if (MD_MNSET_DESC(sd) && (oha == TRUE)) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } if (nd == NULL) { (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, nd->nd_nodename, NULL, sp->setname); goto rollback; } if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { continue; } if (clnt_delset(node_v[i], sp, ep) == -1) { goto rollback; } } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) { /* * All nodes should be alive in non-oha mode. */ if (clnt_delset(node_v[i], sp, ep) == -1) { goto rollback; } } else { /* * For traditional diskset, issue the RPC and * ignore RPC failure if in OHA mode. 
*/ if (clnt_delset(node_v[i], sp, ep) == -1) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } goto rollback; } } RB_TEST(18, "deletehosts", ep) } metafreereplicalist(rlp); if (MD_MNSET_DESC(sd)) { /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); } else { md_rb_sig_handling_off(md_got_sig(), md_which_sig()); } return (0); rollback: /* all signals already blocked for MN disket */ if (!(MD_MNSET_DESC(sd))) { /* Make sure we are blocking all signals */ if (procsigs(TRUE, &oldsigs, &xep) < 0) mdclrerror(&xep); } max_genid = sd->sd_genid; /* level 5 */ if (rb_level > 4) { recreate_set(sp, sd); max_genid++; } /* level 2 */ if (rb_level > 1 && dd != NULL) { /* * See if we have to re-add the drives specified. */ for (i = 0; i < node_c; i++) { md_set_record *sr; if (MD_MNSET_DESC(sd) && (oha == TRUE)) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } if (nd == NULL) continue; if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) continue; } /* Don't care if set record is MN or not */ if (clnt_getset(node_v[i], sp->setname, MD_SET_BAD, &sr, &xep) == -1) { mdclrerror(&xep); continue; } /* Drive already added, skip to next node */ if (sr->sr_drivechain != NULL) { /* * Set record structure was allocated from RPC * routine getset so this structure is only of * size md_set_record even if the MN flag is * set. So, clear the flag so that the free * code doesn't attempt to free a structure * the size of md_mnset_record. 
*/ sr->sr_flags &= ~MD_SR_MN; free_sr(sr); continue; } if (clnt_adddrvs(node_v[i], sp, dd, sr->sr_ctime, sr->sr_genid, &xep) == -1) mdclrerror(&xep); if (clnt_upd_dr_flags(node_v[i], sp, dd, MD_DR_OK, &xep) == -1) mdclrerror(&xep); /* * Set record structure was allocated from RPC routine * getset so this structure is only of size * md_set_record even if the MN flag is set. So, * clear the flag so that the free code doesn't * attempt to free a structure the size of * md_mnset_record. */ sr->sr_flags &= ~MD_SR_MN; free_sr(sr); } max_genid += 3; } /* level 3 */ if (rb_level > 2 && dd != NULL) { md_replicalist_t *rl; for (rl = rlp; rl != NULL; rl = rl->rl_next) { md_replica_t *r = rl->rl_repp; /* * This is not the first replica being added to the * diskset so call with ADDSIDENMS_BCAST. If this * is a traditional diskset, the bcast flag is ignored * since traditional disksets don't use the rpc.mdcommd. */ if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno, DB_ADDSIDENMS_BCAST, &xep)) mdclrerror(&xep); } } /* level 4 */ if (rb_level > 3 && dd != NULL) { int nodeid_addsides = 0; /* * Add the device names for the new sides into the namespace, * on all hosts not being deleted. */ if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* Find a node that is not being deleted */ if (! strinlst(nd->nd_nodename, node_c, node_v)) { nodeid_addsides = nd->nd_nodeid; break; } nd = nd->nd_next; } } else { for (j = 0; j < MD_MAXSIDES; j++) { /* Skip empty slots */ if (sd->sd_nodes[j][0] == '\0') continue; /* Find a node that is not being deleted */ if (! 
strinlst(sd->sd_nodes[j], node_c, node_v)) break; } nodeid_addsides = j; } if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* Skip nodes not being deleted */ if (!strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } /* this side was just created, add the names */ if (add_md_sidenms(sp, nd->nd_nodeid, nodeid_addsides, &xep)) mdclrerror(&xep); nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; /* Skip nodes not being deleted */ if (!strinlst(sd->sd_nodes[i], node_c, node_v)) continue; /* this side was just created, add the names */ if (add_md_sidenms(sp, i, nodeid_addsides, &xep)) mdclrerror(&xep); } } } /* level 1 */ if (rb_level > 0) { max_genid++; resync_genid(sp, sd, max_genid, node_c, node_v); } /* level 0 */ cl_sk = cl_get_setkey(sp->setno, sp->setname); if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) continue; /* To balance lock/unlock; can send to dead node */ if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) mdclrerror(&xep); nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) mdclrerror(&xep); } } cl_set_setkey(NULL); /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); metafreereplicalist(rlp); if (!(MD_MNSET_DESC(sd))) { md_rb_sig_handling_off(md_got_sig(), md_which_sig()); } return (-1); } static int make_sideno_sidenm( mdsetname_t *sp, mddrivename_t *dnp, side_t sideno, md_error_t *ep ) { mdsidenames_t *sn, **sn_next; md_set_desc *sd; mdname_t *np; uint_t rep_slice; int err = 0; assert(dnp->side_names_key != MD_KEYWILD); if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); /* find the end of the link list */ for (sn = dnp->side_names; sn->next != NULL; sn = sn->next) ; sn_next = &sn->next; if 
(meta_replicaslice(dnp, &rep_slice, ep) != 0) return (-1); if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) return (-1); sn = Zalloc(sizeof (*sn)); sn->sideno = sideno; if (MD_MNSET_DESC(sd)) { /* * For MO diskset the sideno is not an index into * the array of nodes. Hence getside_devinfo is * used instead of meta_getnextside_devinfo. */ if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname, &sn->dname, &sn->mnum, ep) == -1) err = -1; } else { /* decrement sideno, to look like the previous sideno */ sideno--; if (meta_getnextside_devinfo(sp, np->bname, &sideno, &sn->cname, &sn->dname, &sn->mnum, ep) == -1) err = -1; } if (err) { Free(sn); return (err); } assert(sn->sideno == sideno); /* Add to the end of the linked list */ *sn_next = sn; return (0); } static int validate_nodes( mdsetname_t *sp, int node_c, char **node_v, md_error_t *ep ) { char *hostname; int i; for (i = 0; i < node_c; i++) { if (strlen(node_v[i]) > (size_t)MD_MAX_NODENAME) return (mddserror(ep, MDE_DS_NODENAMETOOLONG, sp->setno, node_v[i], NULL, sp->setname)); if (clnt_hostname(node_v[i], &hostname, ep)) return (-1); if (strcmp(node_v[i], hostname) != 0) { Free(hostname); return (mddserror(ep, MDE_DS_NOTNODENAME, sp->setno, node_v[i], NULL, sp->setname)); } Free(hostname); } return (0); } /* * Exported Entry Points */ /* * Check the given disk set name for syntactic correctness. */ int meta_set_checkname(char *setname, md_error_t *ep) { char *cp; if (strlen(setname) > (size_t)MD_MAX_SETNAME) return (mddserror(ep, MDE_DS_SETNAMETOOLONG, MD_SET_BAD, NULL, NULL, setname)); for (cp = setname; *cp; cp++) if (!isprint(*cp) || strchr(INVALID_IN_NAMES, *cp) != NULL) return (mddserror(ep, MDE_DS_INVALIDSETNAME, MD_SET_BAD, NULL, NULL, setname)); return (0); } /* * Add host(s) to the multi-node diskset provided in sp. * - create set if non-existent. 
*/ static int meta_multinode_set_addhosts( mdsetname_t *sp, int multi_node, int node_c, char **node_v, int auto_take, md_error_t *ep ) { md_set_desc *sd; md_drive_desc *dd, *p; int rval = 0; int bool; int nodeindex; int i; int has_set; sigset_t oldsigs; md_setkey_t *cl_sk; int rb_level = 0; md_error_t xep = mdnullerror; md_mnnode_desc *nd, *nd_curr, *nd_prev; md_timeval32_t now; int nodecnt; mndiskset_membershiplist_t *nl, *nl2; int suspendall_flag = 0; int suspend1_flag = 0; int lock_flag = 0; int stale_flag = 0; md_mnnode_desc *saved_nd_next; int remote_sets_created = 0; /* * Check membershiplist first. If there's * an error, fail to create set and pass back error. */ if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { return (-1); } /* Verify that all nodes are in member list */ for (i = 0; i < node_c; i++) { /* * If node in list isn't a member of the membership, * just return error. */ if (meta_is_member(node_v[i], NULL, nl) == 0) { meta_free_nodelist(nl); return (mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, node_v[i], NULL, sp->setname)); } } /* * Node list is needed later, but there is a lot of error * checking and possible failures between here and there, so * just re-get the list later if there are no errors. */ meta_free_nodelist(nl); nl = NULL; /* * Verify that list of nodes being added contains no * duplicates. */ if (nodesuniq(sp, node_c, node_v, ep)) return (-1); /* * Verify that each node being added thinks that its nodename * is the same as the nodename given. */ if (validate_nodes(sp, node_c, node_v, ep)) return (-1); if ((sd = metaget_setdesc(sp, ep)) == NULL) { if (! 
mdiserror(ep, MDE_NO_SET)) return (-1); mdclrerror(ep); return (create_set(sp, multi_node, node_c, node_v, auto_take, ep)); } else { /* * If this node and another node were both attempting to * create the same setname at the same time, and the other * node has just created the set on this node then sd would * be non-NULL, but sp->setno would be null (setno is filled * in by the create_set). If this is true, then fail since * the other node has already won this race. */ if (sp->setno == NULL) { return (mddserror(ep, MDE_DS_NODEINSET, NULL, mynode(), NULL, sp->setname)); } } /* The auto_take behavior is inconsistent with multiple hosts. */ if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) { (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL, sp->setname); return (-1); } /* * We already have the set. */ /* Make sure we own the set */ if (meta_check_ownership(sp, ep) != 0) return (-1); /* * The drive and node records are stored in the local mddbs of each * node in the diskset. Each node's rpc.metad daemon reads in the set, * drive and node records from that node's local mddb and caches them * internally. Any process needing diskset information contacts its * local rpc.metad to get this information. Since each node in the * diskset is independently reading the set information from its local * mddb, the set, drive and node records in the local mddbs must stay * in-sync, so that all nodes have a consistent view of the diskset. * * For a multinode diskset, explicitly verify that all nodes in the * diskset are ALIVE (i.e. are in the API membership list). Otherwise, * fail this operation since all nodes must be ALIVE in order to add * the new node record to their local mddb. If a panic of this node * leaves the local mddbs set, node and drive records out-of-sync, the * reconfig cycle will fix the local mddbs and force them back into * synchronization. 
*/ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { return (mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, nd->nd_nodename, NULL, sp->setname)); } nd = nd->nd_next; } /* * Check if node is already in set. */ for (i = 0; i < node_c; i++) { /* Is node already in set? */ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } if (nd) { return (mddserror(ep, MDE_DS_NODEINSET, sp->setno, node_v[i], NULL, sp->setname)); } } /* * Lock the set on current set members. * Set locking done much earlier for MN diskset than for traditional * diskset since lock_set and SUSPEND are used to protect against * other meta* commands running on the other nodes. */ /* Make sure we are blocking all signals */ if (procsigs(TRUE, &oldsigs, &xep) < 0) mdclrerror(&xep); nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_lock_set(nd->nd_nodename, sp, ep)) { rval = -1; goto out; } lock_flag = 1; nd = nd->nd_next; } /* * Lock out other meta* commands by suspending * class 1 messages across the diskset. */ nd = sd->sd_nodelist; /* Send suspend to nodes in nodelist before addhosts call */ /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { rval = -1; goto out; } suspend1_flag = 1; nd = nd->nd_next; } /* Lock the set on new set members */ for (i = 0; i < node_c; i++) { /* Already verified to be alive */ if (clnt_lock_set(node_v[i], sp, ep)) { rval = -1; goto out; } lock_flag = 1; } /* * Perform the required checks for new hosts */ for (i = 0; i < node_c; i++) { /* Make sure this set name is not used on the other hosts */ has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep); if (has_set < 0) { if (! 
mdiserror(ep, MDE_NO_SET)) { rval = -1; goto out; } /* Keep on truck'n */ mdclrerror(ep); } else if (has_set) { (void) mddserror(ep, MDE_DS_NODEHASSET, sp->setno, node_v[i], NULL, sp->setname); rval = -1; goto out; } if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1) { rval = -1; goto out; } if (bool == TRUE) { (void) mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno, node_v[i], NULL, sp->setname); rval = -1; goto out; } if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) { rval = -1; goto out; } if (bool == FALSE) { (void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno, node_v[i], NULL, sp->setname); rval = -1; goto out; } if (check_setdrvs_againstnode(sp, node_v[i], ep)) { rval = -1; goto out; } } /* Get drive descriptors for the set */ if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) { if (! mdisok(ep)) { rval = -1; goto out; } } /* END CHECK CODE */ RB_TEST(1, "addhosts", ep) RB_PREEMPT; rb_level = 1; /* level 1 */ RB_TEST(2, "addhosts", ep) /* * Create the set where needed */ if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) { goto rollback; } /* * Send suspend to rpc.mdcommd on nodes where a set has been * created since rpc.mdcommd must now be running on the remote nodes. */ remote_sets_created = 1; for (i = 0; i < node_c; i++) { /* * Lock out other meta* commands by suspending * class 1 messages across the diskset. */ if (clnt_mdcommdctl(node_v[i], COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { rval = -1; goto rollback; } } /* * Merge the new entries into the set with the existing sides. * Get membershiplist from API routine. If there's * an error, fail to create set and pass back error. 
*/ if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { goto rollback; } if (meta_gettimeofday(&now) == -1) { meta_free_nodelist(nl); (void) mdsyserror(ep, errno, dgettext(TEXT_DOMAIN, "meta_gettimeofday()")); goto rollback; } for (nodeindex = 0; nodeindex < node_c; nodeindex++) { nd = Zalloc(sizeof (*nd)); (void) strcpy(nd->nd_nodename, node_v[nodeindex]); nd->nd_ctime = now; nl2 = nl; while (nl2) { if (strcmp(nl2->msl_node_name, node_v[nodeindex]) == 0) { nd->nd_nodeid = nl2->msl_node_id; (void) strcpy(nd->nd_priv_ic, nl2->msl_node_addr); break; } nl2 = nl2->next; } /* * Nodelist must be kept in ascending nodeid order. */ if (sd->sd_nodelist == NULL) { /* Nothing in list, just add it */ sd->sd_nodelist = nd; } else if (nd->nd_nodeid < sd->sd_nodelist->nd_nodeid) { /* Add to head of list */ nd->nd_next = sd->sd_nodelist; sd->sd_nodelist = nd; } else { nd_curr = sd->sd_nodelist->nd_next; nd_prev = sd->sd_nodelist; /* Search for place to add it */ while (nd_curr) { if (nd->nd_nodeid < nd_curr->nd_nodeid) { /* Add before nd_curr */ nd->nd_next = nd_curr; nd_prev->nd_next = nd; break; } nd_prev = nd_curr; nd_curr = nd_curr->nd_next; } /* Add to end of list */ if (nd_curr == NULL) { nd_prev->nd_next = nd; } } /* Node already verified to be in membership */ nd->nd_flags |= MD_MN_NODE_ALIVE; } meta_free_nodelist(nl); /* If we have drives */ if (dd != NULL) { /* * For all the hosts being added, create a sidename structure */ nd = sd->sd_nodelist; while (nd) { /* Skip nodes not being added */ if (!strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } for (p = dd; p != NULL; p = p->dd_next) { if (make_sideno_sidenm(sp, p->dd_dnp, nd->nd_nodeid, ep) != 0) goto rollback; } nd = nd->nd_next; } RB_PREEMPT; rb_level = 2; /* level 2 */ RB_TEST(4, "addhosts", ep) /* * Add the new sidename for each drive to all the hosts * * If a multi-node diskset, each host only stores * the side information for itself. 
So, only send * side information to the new hosts where each host * will add the appropriate side information to its * local mddb. */ nd = sd->sd_nodelist; while (nd) { /* Skip nodes not being added */ if (!strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } /* Add side info to new hosts */ if (clnt_add_drv_sidenms(nd->nd_nodename, mynode(), sp, sd, node_c, node_v, ep)) goto rollback; nd = nd->nd_next; } RB_TEST(5, "addhosts", ep) RB_PREEMPT; rb_level = 3; /* level 3 */ RB_TEST(6, "addhosts", ep) /* * Add the device names for the new sides into the namespace * for all hosts being added. This is adding the side * names to the diskset's mddb so add sidenames for all * of the new hosts. */ nd = sd->sd_nodelist; while (nd) { /* Skip nodes not being added */ if (!strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } /* this side was just created, add the names */ if (add_md_sidenms(sp, nd->nd_nodeid, MD_SIDEWILD, ep)) goto rollback; nd = nd->nd_next; } RB_TEST(7, "addhosts", ep) RB_PREEMPT; rb_level = 4; /* level 4 */ RB_TEST(8, "addhosts", ep) if (add_db_sidenms(sp, ep)) goto rollback; } else { RB_PREEMPT; rb_level = 4; } RB_TEST(9, "addhosts", ep) RB_PREEMPT; rb_level = 5; /* level 5 */ RB_TEST(10, "addhosts", ep) if (dd != NULL) { /* * Notify rpc.mdcommd on all nodes of a nodelist change. * Start by suspending rpc.mdcommd (which drains it of all * messages), then change the nodelist followed by a reinit * and resume. 
*/ nd = sd->sd_nodelist; /* Send suspend_all to nodes in nodelist (existing + new) */ /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { rval = -1; goto rollback; } suspendall_flag = 1; nd = nd->nd_next; } } /* Add the node(s) to the each host that is currently in the set */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_addhosts(nd->nd_nodename, sp, node_c, node_v, ep)) { goto rollback; } nd = nd->nd_next; } RB_TEST(11, "addhosts", ep) if (dd != NULL) { /* * Mark the drives MD_DR_OK. */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd, MD_DR_OK, ep) == -1) goto rollback; nd = nd->nd_next; } } RB_TEST(12, "addhosts", ep) RB_PREEMPT; rb_level = 6; /* level 6 */ RB_TEST(13, "addhosts", ep) /* Add the mediator information to all hosts in the set. */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep)) goto rollback; nd = nd->nd_next; } RB_TEST(14, "addhosts", ep) /* * If a MN diskset and there are drives in the set, * set the master on the new nodes and * automatically join the new nodes into the set. */ if (dd != NULL) { mddb_config_t c; /* * Is current set STALE? 
*/ (void) memset(&c, 0, sizeof (c)); c.c_id = 0; c.c_setno = sp->setno; if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { (void) mdstealerror(ep, &c.c_mde); rval = -1; goto out; } if (c.c_flags & MDDB_C_STALE) { stale_flag = MNSET_IS_STALE; } /* Set master on newly added nodes */ for (i = 0; i < node_c; i++) { if (clnt_mnsetmaster(node_v[i], sp, sd->sd_mn_master_nodenm, sd->sd_mn_master_nodeid, ep)) { goto rollback; } } /* Join newly added nodes to diskset and set OWN flag */ for (i = 0; i < node_c; i++) { if (clnt_joinset(node_v[i], sp, stale_flag, ep)) goto rollback; nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) { nd->nd_flags |= MD_MN_NODE_OWN; /* * Also set ADD flag since this flag * is already set in rpc.metad - it's * just not in the local copy. * Could flush local cache and call * metaget_setdesc, but this just * adds time. Since this node knows * the state of the node flags in * rpc.metad, just set the ADD * flag and save time. */ nd->nd_flags |= MD_MN_NODE_ADD; break; } nd = nd->nd_next; } } /* Send new node flag list to all Owner nodes */ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_OWN)) { nd = nd->nd_next; continue; } /* * Will effectively set OWN flag in records kept * cached in rpc.metad. The ADD flag would have * already been set by the call to clnt_addhosts. */ if (clnt_upd_nr_flags(nd->nd_nodename, sp, sd->sd_nodelist, MD_NR_SET, NULL, ep)) { goto rollback; } nd = nd->nd_next; } } /* * Mark the set record MD_SR_OK */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_upd_sr_flags(nd->nd_nodename, sp, MD_SR_OK, ep)) { goto rollback; } nd = nd->nd_next; } /* * For MN diskset: * On each newly added node, set the node record for that node * to OK. Then set all node records for the newly added * nodes on all nodes to ok. 
* * By setting a node's own node record to ok first, even if * the node adding the hosts panics, the rest of the nodes can * determine the same node list during the choosing of the master * during reconfig. So, only nodes considered for mastership * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set * on that node's rpc.metad. If all nodes have MD_SR_OK set, * but no node has its own MD_MN_NODE_OK set, then the set will * be removed during reconfig since a panic occurred during the * creation of the initial diskset. */ for (i = 0; i < node_c; i++) { nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } /* Something wrong, will pick this up in next loop */ if (nd == NULL) continue; /* Only changing my local cache of node list */ saved_nd_next = nd->nd_next; nd->nd_next = NULL; /* Set node record for added host to ok on that host */ if (clnt_upd_nr_flags(node_v[i], sp, nd, MD_NR_OK, NULL, ep)) { nd->nd_next = saved_nd_next; goto rollback; } nd->nd_next = saved_nd_next; } /* Now set all node records on all nodes to be ok */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_upd_nr_flags(nd->nd_nodename, sp, sd->sd_nodelist, MD_NR_OK, NULL, ep)) { goto rollback; } nd = nd->nd_next; } RB_TEST(15, "addhosts", ep) out: /* * Notify rpc.mdcommd on all nodes of a nodelist change. * Send reinit command to mdcommd which forces it to get * fresh set description. Then send resume. * Resume on class 0 will resume all classes, so can skip * doing an explicit resume of class1 (ignore suspend1_flag). */ if (suspendall_flag) { /* * Don't know if nodelist contains the nodes being added * or not, so do reinit to nodes not being added (by skipping * any nodes in the nodelist being added) and then do * reinit to nodes being added if remote_sets_created is 1. 
*/ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { /* Skip nodes being added - handled later */ if (strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } /* Class is ignored for REINIT */ if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to reinit rpc.mdcommd.\n")); } nd = nd->nd_next; } /* * Send reinit to added nodes that had a set created since * rpc.mdcommd is running on the nodes with a set. */ if (remote_sets_created == 1) { for (i = 0; i < node_c; i++) { if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT, sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to reinit rpc.mdcommd.\n")); } } } } if ((suspend1_flag) || (suspendall_flag)) { /* * Unlock diskset by resuming messages across the diskset. * Just resume all classes so that resume is the same whether * just one class was locked or all classes were locked. * * Don't know if nodelist contains the nodes being added * or not, so do resume_all to nodes not being added (by * skipping any nodes in the nodelist being added) and then do * resume_all to nodes being added if remote_sets_created is 1. */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { /* Skip nodes being added - handled later */ if (strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to resume rpc.mdcommd.\n")); } nd = nd->nd_next; } /* * Send resume to added nodes that had a set created since * rpc.mdcommd is be running on the nodes with a set. 
*/ if (remote_sets_created == 1) { for (i = 0; i < node_c; i++) { /* Already verified to be alive */ if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME, sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to resume rpc.mdcommd.\n")); } } } meta_ping_mnset(sp->setno); /* * Start a resync thread on the newly added nodes * if set is not stale. Also start a thread to update the * abr state of all soft partitions */ if (stale_flag != MNSET_IS_STALE) { for (i = 0; i < node_c; i++) { if (clnt_mn_mirror_resync_all(node_v[i], sp->setno, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to start resync " "thread.\n")); } if (clnt_mn_sp_update_abr(node_v[i], sp->setno, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to start sp update " "thread.\n")); } } } } cl_sk = cl_get_setkey(sp->setno, sp->setname); /* * Don't know if nodelist contains the nodes being added * or not, so do clnt_unlock_set to nodes not being added (by * skipping any nodes in the nodelist being added) and then do * clnt_unlock_set to nodes being added. 
*/ if (lock_flag) { nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { /* Skip hosts we get in the next loop */ if (strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; } nd = nd->nd_next; } for (i = 0; i < node_c; i++) { /* Already verified to be alive */ if (clnt_unlock_set(node_v[i], cl_sk, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; } } } cl_set_setkey(NULL); metaflushsetname(sp); /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); return (rval); rollback: rval = -1; /* level 6 */ if (rb_level > 5) { /* * For each node being deleted, set DEL flag and * reset OK flag on that node first. * Until a node has turned off its own * rpc.metad's NODE_OK flag, that node could be * considered for master during a reconfig. */ for (i = 0; i < node_c; i++) { nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } /* Something wrong, handle this in next loop */ if (nd == NULL) continue; /* Only changing my local cache of node list */ saved_nd_next = nd->nd_next; nd->nd_next = NULL; /* Set flags for del host to DEL on that host */ if (clnt_upd_nr_flags(node_v[i], sp, nd, MD_NR_DEL, NULL, &xep)) { mdclrerror(&xep); } nd->nd_next = saved_nd_next; } for (i = 0; i < node_c; i++) { if (dd != NULL) { /* Reset master on newly added node */ if (clnt_mnsetmaster(node_v[i], sp, "", MD_MN_INVALID_NID, &xep)) mdclrerror(&xep); /* Withdraw set on newly added node */ if (clnt_withdrawset(node_v[i], sp, &xep)) mdclrerror(&xep); } /* * Turn off owner flag in nodes to be deleted * if there are drives in the set. * Also, turn off NODE_OK and turn on NODE_DEL * for nodes to be deleted. * These flags are used to set the node * record flags in all nodes in the set. 
*/ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) { if (dd != NULL) { nd->nd_flags &= ~MD_MN_NODE_OWN; } nd->nd_flags |= MD_MN_NODE_DEL; nd->nd_flags &= ~MD_MN_NODE_OK; break; } nd = nd->nd_next; } } /* * Now, reset owner and set delete flags for the deleted * nodes on all nodes. */ nd = sd->sd_nodelist; while (nd) { if (clnt_upd_nr_flags(nd->nd_nodename, sp, sd->sd_nodelist, MD_NR_SET, NULL, &xep)) { mdclrerror(&xep); } nd = nd->nd_next; } /* * On each node being deleted, set the set record * to be in DEL state. */ for (i = 0; i < node_c; i++) { if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) { mdclrerror(&xep); } } } /* level 5 */ if (rb_level > 4) { nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { if (clnt_delhosts(nd->nd_nodename, sp, node_c, node_v, &xep) == -1) mdclrerror(&xep); nd = nd->nd_next; } } /* * Notify rpc.mdcommd on all nodes of a nodelist change. * Send reinit command to mdcommd which forces it to get * fresh set description. Then send resume. * Nodelist contains all nodes (existing + added). */ if (suspendall_flag) { /* Send reinit */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ /* Send reinit to nodes in nodelist before addhosts call */ while (nd) { /* * Skip nodes being added if remote sets were not * created since rpc.mdcommd may not be running * on the remote nodes. */ if ((remote_sets_created == 0) && (strinlst(nd->nd_nodename, node_c, node_v))) { nd = nd->nd_next; continue; } /* Class is ignored for REINIT */ if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to reinit rpc.mdcommd.\n")); mdclrerror(&xep); } nd = nd->nd_next; } /* Send resume */ nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE */ while (nd) { /* * Skip nodes being added if remote sets were not * created since rpc.mdcommd may not be running * on the remote nodes. 
*/ if ((remote_sets_created == 0) && (strinlst(nd->nd_nodename, node_c, node_v))) { nd = nd->nd_next; continue; } /* * Resume all classes but class 1 so that lock is held * against meta* commands. * Send resume_all_but_1 to nodes in nodelist * before addhosts call. */ if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1, &xep)) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to resume rpc.mdcommd.\n")); mdclrerror(&xep); } nd = nd->nd_next; } meta_ping_mnset(sp->setno); } /* level 4 */ /* Nodelist may or may not contain nodes being added. */ if (rb_level > 3 && dd != NULL) { nd = sd->sd_nodelist; while (nd) { /* Skip nodes not being added */ if (!strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } if (del_db_sidenms(sp, nd->nd_nodeid, &xep)) mdclrerror(&xep); nd = nd->nd_next; } } /* level 3 */ /* Nodelist may or may not contain nodes being added. */ if (rb_level > 2 && dd != NULL) { nd = sd->sd_nodelist; while (nd) { /* Skip nodes not being added */ if (!strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } if (del_md_sidenms(sp, nd->nd_nodeid, &xep)) mdclrerror(&xep); nd = nd->nd_next; } } /* level 1 */ if (rb_level > 0) { if (dd != NULL) { /* delete the drive records */ for (i = 0; i < node_c; i++) { if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1) mdclrerror(&xep); } } /* delete the set record */ for (i = 0; i < node_c; i++) { if (clnt_delset(node_v[i], sp, &xep) == -1) mdclrerror(&xep); } } /* level 0 */ cl_sk = cl_get_setkey(sp->setno, sp->setname); /* Don't test lock flag since guaranteed to be set if in rollback */ /* Nodelist may or may not contain nodes being added. */ /* * Unlock diskset by resuming messages across the diskset. * Just resume all classes so that resume is the same whether * just one class was locked or all classes were locked. 
*/
	if ((suspend1_flag) || (suspendall_flag)) {
		/* All nodes are guaranteed to be ALIVE */
		nd = sd->sd_nodelist;
		while (nd) {
			/*
			 * Skip nodes being added since remote sets
			 * were either created and then deleted or
			 * were never created.  Either way - rpc.mdcommd
			 * may not be running on the remote node.
			 */
			if (strinlst(nd->nd_nodename, node_c, node_v)) {
				nd = nd->nd_next;
				continue;
			}
			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
				mde_perror(&xep, dgettext(TEXT_DOMAIN,
				    "Unable to resume rpc.mdcommd.\n"));
				mdclrerror(&xep);
			}
			nd = nd->nd_next;
		}
		meta_ping_mnset(sp->setno);
	}

	nd = sd->sd_nodelist;
	/* All nodes are guaranteed to be ALIVE */
	while (nd) {
		/* Skip hosts we get in the next loop */
		if (strinlst(nd->nd_nodename, node_c, node_v)) {
			nd = nd->nd_next;
			continue;
		}

		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
			mdclrerror(&xep);
		nd = nd->nd_next;
	}

	for (i = 0; i < node_c; i++)
		if (clnt_unlock_set(node_v[i], cl_sk, &xep))
			mdclrerror(&xep);
	cl_set_setkey(NULL);

	/* release signals back to what they were on entry */
	if (procsigs(FALSE, &oldsigs, &xep) < 0)
		mdclrerror(&xep);

	metaflushsetname(sp);

	return (rval);
}

/*
 * Add host(s) to the traditional diskset provided in sp.
 *	- create set if non-existent.
 *
 * Arguments:
 *	sp		set the hosts are added to
 *	multi_node	passed through unchanged to create_set() and
 *			create_set_on_hosts()
 *	node_c, node_v	count and names of the hosts being added
 *	auto_take	handed to create_set() when the set does not exist
 *			yet; when the set already exists a non-zero value
 *			(or MD_SR_AUTO_TAKE in the set flags) fails the
 *			request with MDE_DS_SINGLEHOST
 *	ep		receives the error on failure
 *
 * Returns 0 on success.  When the set does not exist the result of
 * create_set() is returned.  Failures detected by the up-front checks
 * return -1 or whatever mddserror() returns; failures after the set has
 * been locked return -1.
 *
 * After the checks the work proceeds in numbered levels.  rb_level holds
 * the highest level reached; the code at the rollback label tests it in
 * descending order so that only the levels already entered are undone.
 *
 * RB_TEST and RB_PREEMPT are macros defined outside this part of the file.
 * NOTE(review): they presumably can transfer control to the rollback
 * label - confirm against their definitions.
 */
static int
meta_traditional_set_addhosts(
	mdsetname_t	*sp,
	int		multi_node,
	int		node_c,
	char		**node_v,
	int		auto_take,
	md_error_t	*ep
)
{
	md_set_desc	*sd;
	md_drive_desc	*dd, *p;
	med_rec_t	medr;		/* mediator record with new node list */
	med_rec_t	rb_medr;	/* mediator record as it was on entry */
	int		rval = 0;
	int		bool;
	int		nodeindex;
	int		i;
	int		has_set;
	int		numsides;
	sigset_t	oldsigs;
	md_setkey_t	*cl_sk;
	int		rb_level = 0;
	md_error_t	xep = mdnullerror;	/* scratch error for cleanup */
	int		max_meds;

	if (nodesuniq(sp, node_c, node_v, ep))
		return (-1);

	if (validate_nodes(sp, node_c, node_v, ep))
		return (-1);

	/*
	 * A missing set (MDE_NO_SET) is not an error here: the error is
	 * cleared and the set is created instead.  Any other failure to get
	 * the set description is returned to the caller.
	 */
	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
		if (! mdiserror(ep, MDE_NO_SET))
			return (-1);
		mdclrerror(ep);
		return (create_set(sp, multi_node, node_c, node_v, auto_take,
		    ep));
	}

	/* The auto_take behavior is inconsistent with multiple hosts. */
	if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
		(void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
		    sp->setname);
		return (-1);
	}

	/*
	 * We already have the set.
	 */

	/* Make sure we own the set */
	if (meta_check_ownership(sp, ep) != 0)
		return (-1);

	/*
	 * Perform the required checks for new hosts
	 */
	for (i = 0; i < node_c; i++) {
		/* A host that already has a side in this set is rejected */
		if (getnodeside(node_v[i], sd) != MD_SIDEWILD)
			return (mddserror(ep, MDE_DS_NODEINSET, sp->setno,
			    node_v[i], NULL, sp->setname));

		/* Make sure this set name is not used on the other hosts */
		has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
		if (has_set < 0) {
			if (! mdiserror(ep, MDE_NO_SET))
				return (-1);
			/* Keep on truck'n */
			mdclrerror(ep);
		} else if (has_set)
			return (mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
			    node_v[i], NULL, sp->setname));

		if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1)
			return (-1);

		if (bool == TRUE)
			return (mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
			    node_v[i], NULL, sp->setname));

		if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1)
			return (-1);

		if (bool == FALSE)
			return (mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
			    node_v[i], NULL, sp->setname));

		if (check_setdrvs_againstnode(sp, node_v[i], ep))
			return (-1);
	}

	/* Count the number of occupied slots */
	numsides = 0;
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Count occupied slots */
		if (sd->sd_nodes[i][0] != '\0')
			numsides++;
	}

	/* Make sure we have space to add the new sides */
	if ((numsides + node_c) > MD_MAXSIDES) {
		(void) mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, sp->setno, NULL,
		    NULL, sp->setname);
		return (-1);
	}

	/*
	 * Get drive descriptors for the set.  A NULL result with no error
	 * in ep is accepted; every drive related step below is guarded by
	 * dd != NULL.
	 */
	if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
		if (! mdisok(ep))
			return (-1);

	/*
	 * Setup the mediator record roll-back structure.  It is built from
	 * the node list as it is before any host is added, and is what the
	 * rollback code sends back to the mediator hosts.
	 */
	(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
	rb_medr.med_rec_mag = MED_REC_MAGIC;
	rb_medr.med_rec_rev = MED_REC_REV;
	rb_medr.med_rec_fl = 0;
	rb_medr.med_rec_sn = sp->setno;
	(void) strcpy(rb_medr.med_rec_snm, sp->setname);
	for (i = 0; i < MD_MAXSIDES; i++)
		(void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
	rb_medr.med_rec_meds = sd->sd_med;	/* structure assignment */
	(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
	rb_medr.med_rec_foff = 0;
	crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);

	/* A result of 0 from get_max_meds() is treated as failure */
	if ((max_meds = get_max_meds(ep)) == 0)
		return (-1);

	/* END CHECK CODE */

	md_rb_sig_handling_on();

	/* Lock the set on current set members */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
			rval = -1;
			goto out;
		}
	}

	/* Lock the set on new set members */
	for (i = 0; i < node_c; i++) {
		if (clnt_lock_set(node_v[i], sp, ep)) {
			rval = -1;
			goto out;
		}
	}

	RB_TEST(1, "addhosts", ep)

	RB_PREEMPT;
	rb_level = 1;	/* level 1 */

	RB_TEST(2, "addhosts", ep)

	/*
	 * Add the new hosts to the existing set record on the existing hosts
	 */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, node_v, ep))
			goto rollback;
	}

	RB_PREEMPT;
	rb_level = 2;	/* level 2 */

	RB_TEST(3, "addhosts", ep);

	/*
	 * Merge the new entries into the set with the existing sides.
	 * This updates the in-memory copy in sd: empty slots are filled in
	 * slot order with the new host names until all node_c names have
	 * been placed.
	 * NOTE(review): with node_c == 0 this would read node_v[0] and the
	 * break below would never trigger - confirm callers always pass
	 * node_c >= 1.
	 */
	nodeindex = 0;
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip full slots */
		if (sd->sd_nodes[i][0] != '\0')
			continue;

		(void) strcpy(sd->sd_nodes[i], node_v[nodeindex++]);
		if (nodeindex == node_c)
			break;
	}

	/* If we have drives */
	if (dd != NULL) {
		/*
		 * For all the hosts being added, create a sidename structure.
		 * The slot index i is what is passed as the side number.
		 */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip nodes not being added */
			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
				continue;

			for (p = dd; p != NULL; p = p->dd_next) {
				if (make_sideno_sidenm(sp, p->dd_dnp, i,
				    ep) != 0)
					goto rollback;
			}
		}

		/*
		 * Add the new sidename for each drive to the existing hosts
		 */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip nodes being added */
			if (strinlst(sd->sd_nodes[i], node_c, node_v))
				continue;

			if (clnt_add_drv_sidenms(sd->sd_nodes[i], mynode(), sp,
			    sd, node_c, node_v, ep)) {
				goto rollback;
			}
		}

		RB_TEST(4, "addhosts", ep)

		RB_PREEMPT;
		rb_level = 3;	/* level 3 */

		RB_TEST(5, "addhosts", ep)

		if (add_db_sidenms(sp, ep)) {
			goto rollback;
		}

	} else {
		/* No drives: no sidename work, but the level still advances */
		RB_PREEMPT;
		rb_level = 3;
	}

	RB_TEST(6, "addhosts", ep)

	RB_PREEMPT;
	rb_level = 4;	/* level 4 */

	RB_TEST(7, "addhosts", ep)

	/* create the set on the new nodes, this adds the drives as well */
	if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
		goto rollback;
	}

	RB_TEST(8, "addhosts", ep)

	RB_PREEMPT;
	rb_level = 5;	/* level 5 */

	RB_TEST(9, "addhosts", ep)

	if (dd != NULL) {
		/*
		 * Add the device entries for the new sides into the namespace.
		 */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip nodes not being added */
			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
				continue;

			if (add_md_sidenms(sp, i, MD_SIDEWILD, ep))
				goto rollback;
		}
	}

	RB_TEST(10, "addhosts", ep)

	RB_PREEMPT;
	rb_level = 6;	/* level 6 */

	RB_TEST(11, "addhosts", ep);

	if (dd != NULL) {
		/*
		 * Mark the drives MD_DR_OK.
		 */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
			    ep) == -1) {
				goto rollback;
			}
		}
	}

	RB_TEST(12, "addhosts", ep)

	/*
	 * Bring the mediator record up to date with the set record: start
	 * from the roll-back copy, replace the node names with the merged
	 * list now held in sd, and regenerate the checksum.
	 */
	medr = rb_medr;		/* structure assignment */
	for (i = 0; i < MD_MAXSIDES; i++)
		(void) strcpy(medr.med_rec_nodes[i], sd->sd_nodes[i]);
	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);

	/* Inform the mediator hosts of the new node list */
	for (i = 0; i < max_meds; i++) {
		/* Entries with a_cnt == 0 are skipped */
		if (sd->sd_med.n_lst[i].a_cnt == 0)
			continue;

		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
			goto rollback;
	}

	/* Add the mediator information to all hosts in the set */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
			goto rollback;
	}

	RB_TEST(13, "addhosts", ep)

	/*
	 * Mark the set record MD_SR_OK
	 */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, MD_SR_OK, ep))
			goto rollback;
	}

	RB_TEST(14, "addhosts", ep)

	/*
	 * Common exit for success and for a failed clnt_lock_set().
	 * Only the first unlock failure is moved into ep; rval is set to -1
	 * on any unlock failure.
	 */
out:
	cl_sk = cl_get_setkey(sp->setno, sp->setname);
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		/* Skip hosts we get in the next loop */
		if (strinlst(sd->sd_nodes[i], node_c, node_v))
			continue;

		if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
			if (rval == 0)
				(void) mdstealerror(ep, &xep);
			rval = -1;
		}
	}

	/*
	 * The new hosts are unlocked here only while rval is still 0.
	 * NOTE(review): when this label is reached from a failed
	 * clnt_lock_set() on a new host, new hosts locked earlier in that
	 * loop are not unlocked by this code - confirm this is intended.
	 */
	if (rval == 0) {
		for (i = 0; i < node_c; i++)
			if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = -1;
			}
	}
	cl_set_setkey(NULL);

	metaflushsetname(sp);

	md_rb_sig_handling_off(md_got_sig(), md_which_sig());

	return (rval);

	/*
	 * Undo path.  Every call below reports into the scratch error xep,
	 * which is cleared on failure, so ep is left holding the error that
	 * caused the rollback.
	 */
rollback:
	/* Make sure we are blocking all signals */
	if (procsigs(TRUE, &oldsigs, &xep) < 0)
		mdclrerror(&xep);

	rval = -1;

	/*
	 * level 6: send the on-entry mediator record back to the mediator
	 * hosts and remove the namespace entries of the new sides.
	 */
	if (rb_level > 5) {
		for (i = 0; i < max_meds; i++) {
			if (sd->sd_med.n_lst[i].a_cnt == 0)
				continue;

			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
			    &rb_medr, &xep))
				mdclrerror(&xep);
		}
		if (dd != NULL) {
			for (i = 0; i < MD_MAXSIDES; i++) {
				/* Skip empty slots */
				if (sd->sd_nodes[i][0] == '\0')
					continue;

				/* Skip nodes not being added */
				if (! strinlst(sd->sd_nodes[i], node_c, node_v))
					continue;

				if (del_md_sidenms(sp, i, &xep))
					mdclrerror(&xep);
			}
		}
	}

	/* level 5 */
	if (rb_level > 4) {
		if (dd != NULL) {
			/* delete the drive records */
			for (i = 0; i < node_c; i++) {
				if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
					mdclrerror(&xep);
			}
		}
		/* delete the set record on the 'new' hosts */
		for (i = 0; i < node_c; i++) {
			if (clnt_delset(node_v[i], sp, &xep) == -1)
				mdclrerror(&xep);
		}
	}

	/* level 4 */
	if (rb_level > 3 && dd != NULL) {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip nodes not being added */
			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
				continue;

			if (del_db_sidenms(sp, i, &xep))
				mdclrerror(&xep);
		}
	}

	/*
	 * level 3
	 * NOTE(review): the forward step called clnt_add_drv_sidenms() on
	 * the existing hosts, while this loop calls clnt_del_drv_sidenms()
	 * on the hosts being added - confirm the asymmetry is intended.
	 */
	if (rb_level > 2 && dd != NULL) {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip nodes not being added */
			if (! strinlst(sd->sd_nodes[i], node_c, node_v))
				continue;

			if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
			    &xep) == -1)
				mdclrerror(&xep);
		}
	}

	/*
	 * level 2: remove the new hosts from the set record on every
	 * occupied slot in sd (which includes the new hosts once they have
	 * been merged into sd above).
	 */
	if (rb_level > 1) {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
			    &xep) == -1)
				mdclrerror(&xep);
		}
	}

	/* level 1: unlock the set on both the existing and the new hosts */
	if (rb_level > 0) {
		cl_sk = cl_get_setkey(sp->setno, sp->setname);
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip hosts we get in the next loop */
			if (strinlst(sd->sd_nodes[i], node_c, node_v))
				continue;

			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
				mdclrerror(&xep);
		}

		for (i = 0; i < node_c; i++)
			if (clnt_unlock_set(node_v[i], cl_sk, &xep))
				mdclrerror(&xep);
		cl_set_setkey(NULL);
	}

	/* release signals back to what they were on entry */
	if (procsigs(FALSE, &oldsigs, &xep) < 0)
		mdclrerror(&xep);

	metaflushsetname(sp);

	md_rb_sig_handling_off(md_got_sig(), md_which_sig());

	return (rval);
}

/*
 * Add host(s) to the diskset provided in sp.
 *	- create set if non-existent.
 *
 * Dispatches on multi_node: a non-zero value selects
 * meta_multinode_set_addhosts(), zero selects
 * meta_traditional_set_addhosts().  All arguments are passed through
 * unchanged and the selected routine's return value is returned as is.
 */
int
meta_set_addhosts(
	mdsetname_t	*sp,
	int		multi_node,
	int		node_c,
	char		**node_v,
	int		auto_take,
	md_error_t	*ep
)
{
	if (multi_node)
		return (meta_multinode_set_addhosts(sp, multi_node, node_c,
		    node_v, auto_take, ep));
	else
		return (meta_traditional_set_addhosts(sp, multi_node, node_c,
		    node_v, auto_take, ep));
}

/*
 * Delete host(s) from the diskset provided in sp.
 *	- destroy set if last host in set is removed.
*/ int meta_set_deletehosts( mdsetname_t *sp, int node_c, char **node_v, int forceflg, md_error_t *ep ) { md_set_desc *sd; md_drive_desc *dd; med_rec_t medr; med_rec_t rb_medr; int i, j; int has_set; int numsides = 0; int oha = FALSE; sigset_t oldsigs; mhd_mhiargs_t mhiargs; md_replicalist_t *rlp = NULL; md_setkey_t *cl_sk; ulong_t max_genid = 0; int rval = 0; int rb_level = 0; int max_meds = 0; md_error_t xep = mdnullerror; md_mnnode_desc *nd; md_mnnode_record *nr; int delete_master = 0; int suspendall_flag = 0, suspendall_flag_rb = 0; int suspend1_flag = 0; int lock_flag = 0; int stale_flag = 0; int *node_id_list = NULL; int remote_sets_deleted = 0; if ((sd = metaget_setdesc(sp, ep)) == NULL) return (-1); /* * Verify that list of nodes being deleted contains no * duplicates. */ if (nodesuniq(sp, node_c, node_v, ep)) return (-1); /* Make sure we own the set */ if (meta_check_ownership(sp, ep) != 0) return (-1); /* * The drive and node records are stored in the local mddbs of each * node in the diskset. Each node's rpc.metad daemon reads in the set, * drive and node records from that node's local mddb and caches them * internally. Any process needing diskset information contacts its * local rpc.metad to get this information. Since each node in the * diskset is independently reading the set information from its local * mddb, the set, drive and node records in the local mddbs must stay * in-sync, so that all nodes have a consistent view of the diskset. * * For a multinode diskset, explicitly verify that all nodes in the * diskset are ALIVE (i.e. are in the API membership list) if the * forceflag is FALSE. (The case of forceflag being TRUE is handled * in OHA check above.) * * If forceflag is FALSE and a node in the diskset is not in * the membership list, then fail this operation since all nodes must * be ALIVE in order to delete the node record from their local mddb. 
* If a panic of this node leaves the local mddbs set, node and drive * records out-of-sync, the reconfig cycle will fix the local mddbs * and force them back into synchronization. */ if ((forceflg == FALSE) && (MD_MNSET_DESC(sd))) { nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { return (mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, nd->nd_nodename, NULL, sp->setname)); } nd = nd->nd_next; } } /* * Lock the set on current set members. * Set locking done much earlier for MN diskset than for traditional * diskset since lock_set and SUSPEND are used to protect against * other meta* commands running on the other nodes. */ if (MD_MNSET_DESC(sd)) { /* Make sure we are blocking all signals */ if (procsigs(TRUE, &oldsigs, &xep) < 0) mdclrerror(&xep); nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } if (clnt_lock_set(nd->nd_nodename, sp, ep)) { rval = -1; goto out2; } lock_flag = 1; nd = nd->nd_next; } /* * Lock out other meta* commands by suspending * class 1 messages across the diskset. */ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { rval = -1; goto out2; } suspend1_flag = 1; nd = nd->nd_next; } } for (i = 0; i < node_c; i++) if (getnodeside(node_v[i], sd) == MD_SIDEWILD) { (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, node_v[i], NULL, sp->setname); rval = -1; goto out2; } /* * Count the number of nodes currently in the set. */ if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { numsides++; nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) /* Count full slots */ if (sd->sd_nodes[i][0] != '\0') numsides++; } /* * OHA mode == -f -h * OHA is One Host Administration that occurs when the forceflag (-f) * is set and at least one host in the diskset isn't responding * to RPC requests. 
* * When in OHA mode, a node cannot delete itself from a diskset. * When in OHA mode, a node can delete a list of nodes from a diskset * even if some of the nodes in the diskset are unresponsive. * * For multinode diskset, only allow OHA mode when the nodes that * aren't responding in the diskset are not in the membership list * (i.e. nodes that aren't responding are not marked ALIVE). * Nodes that aren't in the membership list will be rejoining * the diskset through a reconfig cycle and the local mddb set * and node records can be reconciled during the reconfig cycle. * * If a node isn't responding, but is still in the membership list, * fail the request since the node may not be responding because * rpc.metad died and is restarting. In this case, no reconfig * cycle will be started, so there's no way to recover if * the host delete operation was allowed. * * NOTE: if nodes that weren't in the membership when the OHA host * delete occurred are now the only nodes in membership list, * those nodes will see the old view of the diskset. As soon as * a node re-enters the cluster that was present in the cluster * during the host deletion, the diskset will reflect the host * deletion on all nodes presently in the cluster. */ if (forceflg == TRUE) { if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* * If a node isn't ALIVE (in member list), * then allow a force-able delete in OHA mode. */ if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { oha = TRUE; break; } /* * Don't test for clnt_nullproc since already * tested the RPC connections by clnt_lock_set. */ nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; if (clnt_nullproc(sd->sd_nodes[i], ep) == -1) { /* * If we timeout to at least one * client, then we can allow OHA mode, * otherwise, we are in normal mode. 
*/ if (mdanyrpcerror(ep)) { mdclrerror(ep); if (strinlst(sd->sd_nodes[i], node_c, node_v)) { oha = TRUE; break; } } } } } } /* * Don't allow this for MN diskset since meta_set_destroy of 1 node * does NOT remove this node's node record from the other node's set * records in their local mddb. This leaves a MN diskset in a very * messed up state. */ if (!(MD_MNSET_DESC(sd))) { /* Destroy set */ if (forceflg == TRUE && node_c == 1 && strcmp(mynode(), node_v[0]) == 0) { /* Can return since !MN diskset so nothing to unlock */ return (meta_set_destroy(sp, TRUE, ep)); } } /* * In multinode diskset, can only delete self if this * is the last node in the set or if all nodes in * the set are being deleted. The traditional diskset code * allows a node to delete itself (when there are other nodes * in the diskset) when using the force flag, but that code * path doesn't have the node remove itself from * the set node list on the other nodes. Since this isn't * satisfactory for the multinode diskset, just don't * allow this operation. */ if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) && strinlst(mynode(), node_c, node_v)) { (void) mddserror(ep, MDE_DS_MNCANTDELSELF, sp->setno, mynode(), NULL, sp->setname); rval = -1; goto out2; } /* * In multinode diskset, don't allow deletion of master node unless * this is the only node left or unless all nodes are being * deleted since there is no way to switch * master ownership (unless via a cluster reconfig cycle). 
*/ delete_master = strinlst(sd->sd_mn_master_nodenm, node_c, node_v); if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) && delete_master) { (void) mddserror(ep, MDE_DS_CANTDELMASTER, sp->setno, sd->sd_mn_master_nodenm, NULL, sp->setname); rval = -1; goto out2; } /* Deleting self w/o forceflg */ if (forceflg == FALSE && numsides > 1 && strinlst(mynode(), node_c, node_v)) { (void) mddserror(ep, MDE_DS_CANTDELSELF, sp->setno, mynode(), NULL, sp->setname); rval = -1; goto out2; } /* * Setup the mediator record roll-back structure for a trad diskset. * * For a MN diskset, the deletion of a host in the diskset * does not cause an update of the mediator record. If the * host deletion will cause the diskset to be removed (this is * the last host being removed or all hosts are being removed) * then the mediator record must have already been removed by the * user or this delete host operation will fail (a check for * this is done later in this routine). */ if (!(MD_MNSET_DESC(sd))) { (void) memset(&rb_medr, '\0', sizeof (med_rec_t)); rb_medr.med_rec_mag = MED_REC_MAGIC; rb_medr.med_rec_rev = MED_REC_REV; rb_medr.med_rec_fl = 0; rb_medr.med_rec_sn = sp->setno; (void) strcpy(rb_medr.med_rec_snm, sp->setname); for (i = 0; i < MD_MAXSIDES; i++) (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]); rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */ (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t)); rb_medr.med_rec_foff = 0; crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL); /* Bring the mediator record up to date with the set record */ medr = rb_medr; /* structure assignment */ if ((max_meds = get_max_meds(ep)) == 0) { rval = -1; goto out2; } } /* * For traditional diskset: * Check to see if all the hosts we are trying to delete the set from * have a set "setname" that is the same as ours, i.e. - same name, * same time stamp, same genid. We only do this if forceflg is not * specified or we are in OHA mode. 
*/ if (!(MD_MNSET_DESC(sd)) && (forceflg == FALSE || oha == TRUE)) { int fix_node_v = FALSE; int j; for (i = 0; i < node_c; i++) { /* We skip this side */ if (strcmp(mynode(), node_v[i]) == 0) continue; has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep); if (has_set < 0) { char *anode[1]; /* * Can't talk to the host only allowed in OHA * mode. */ if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } /* * We got an error we do not, or are not, * prepared to handle. */ if (! mdiserror(ep, MDE_NO_SET) && ! mdismddberror(ep, MDE_DB_NODB)) { rval = -1; goto out2; } mdclrerror(ep); /* * If we got here: both hosts are up; a host in * our set record does not have the set. So we * delete the host from our set and invalidate * the node. */ anode[0] = Strdup(node_v[i]); rval = del_host_noset(sp, anode, ep); /* * If we delete a host, make sure the mediator * hosts are made aware of this. */ for (j = 0; j < MD_MAXSIDES; j++) { if (strcmp(medr.med_rec_nodes[j], node_v[i]) != 0) continue; (void) memset(&medr.med_rec_nodes[j], '\0', sizeof (md_node_nm_t)); } crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); rb_medr = medr; /* struct assignment */ Free(anode[0]); if (rval == -1) goto out2; node_v[i][0] = '\0'; fix_node_v = TRUE; continue; } /* * If we can talk to the host, and they do not have the * exact set, then we disallow the operation. */ if (has_set == FALSE) { (void) mddserror(ep, MDE_DS_NODENOSET, sp->setno, node_v[i], NULL, sp->setname); rval = -1; goto out2; } } /* * Here we prune the node_v's that were invalidated above. */ if (fix_node_v == TRUE) { i = 0; while (i < node_c) { if (node_v[i][0] == '\0') { for (j = i; (j + 1) < node_c; j++) node_v[j] = node_v[j + 1]; node_c--; } i++; } /* * If we are left with no nodes, then we have * compeleted the operation. 
*/ if (node_c == 0) { /* * Inform the mediator hosts of the new node * list */ for (i = 0; i < max_meds; i++) { if (sd->sd_med.n_lst[i].a_cnt == 0) continue; if (clnt_med_upd_rec( &sd->sd_med.n_lst[i], sp, &medr, ep)) mdclrerror(ep); } rval = 0; goto out2; } } } /* * For multinode diskset: * If forceflag is FALSE then check to see if all the hosts we * are trying to delete the set from have a set "setname" that * is the same as ours, i.e. - same name, same time stamp, same genid. * If forceflag is TRUE, then we don't care if the hosts being * deleted have the same set information or not since user is forcing * those hosts to be deleted. */ if ((MD_MNSET_DESC(sd)) && (forceflg == FALSE)) { for (i = 0; i < node_c; i++) { /* We skip this node since comparing against it */ if (strcmp(mynode(), node_v[i]) == 0) continue; has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep); if (has_set < 0) { rval = -1; goto out2; } /* * If we can talk to the host, and they do not have the * exact set, then we disallow the operation. */ if (has_set == FALSE) { (void) mddserror(ep, MDE_DS_NODENOSET, sp->setno, node_v[i], NULL, sp->setname); rval = -1; goto out2; } } } /* * For traditional diskset: * Can't allow user to delete their node (without deleting all nodes) * out of a set in OHA mode, would leave a real mess. * This action was already failed above for a MN diskset. */ if (!(MD_MNSET_DESC(sd)) && (oha == TRUE) && strinlst(mynode(), node_c, node_v)) { /* Can directly return since !MN diskset; nothing to unlock */ return (mddserror(ep, MDE_DS_OHACANTDELSELF, sp->setno, mynode(), NULL, sp->setname)); } /* Get the drive descriptors for this set */ if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { if (! mdisok(ep)) { rval = -1; goto out2; } } /* * We have been asked to delete all the hosts in the set, i.e. - delete * the whole set. */ if (node_c == numsides) { /* * This is only a valid operation if all drives have been * removed first. 
*/ if (dd != NULL) { (void) mddserror(ep, MDE_DS_HASDRIVES, sp->setno, NULL, NULL, sp->setname); rval = -1; goto out2; } /* * If a mediator is currently associated with this set, * fail the deletion of the last host(s). */ if (sd->sd_med.n_cnt != 0) { (void) mddserror(ep, MDE_DS_HASMED, sp->setno, NULL, NULL, sp->setname); rval = -1; goto out2; } if (! mdisok(ep)) { rval = -1; goto out2; } rval = del_set_nodrives(sp, node_c, node_v, oha, ep); remote_sets_deleted = 1; goto out2; } /* * Get timeout values in case we need to roll back */ (void) memset(&mhiargs, '\0', sizeof (mhiargs)); if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) { rval = -1; goto out2; } if (dd != NULL) { /* * We need this around for re-adding DB side names later. */ if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { rval = -1; goto out2; } /* * Alloc nodeid list if drives are present in diskset. * nodeid list is used to reset mirror owners if the * owner is a deleted node. */ if (MD_MNSET_DESC(sd)) { node_id_list = Zalloc(sizeof (int) * node_c); } } /* Lock the set on current set members */ if (!(MD_MNSET_DESC(sd))) { md_rb_sig_handling_on(); for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } rval = -1; goto out2; } lock_flag = 1; } } RB_TEST(1, "deletehosts", ep) RB_PREEMPT; rb_level = 1; /* level 1 */ RB_TEST(2, "deletehosts", ep) if (MD_MNSET_DESC(sd)) { md_mnnode_desc *saved_nd_next; mddb_config_t c; if (dd != NULL) { /* * Notify rpc.mdcommd on all nodes of a nodelist change. * Start by suspending rpc.mdcommd (which drains it of * all messages), then change the nodelist followed * by a reinit and resume. 
*/ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { rval = -1; goto out2; } suspendall_flag = 1; nd = nd->nd_next; } /* * Is current set STALE? * Need to know this if delete host fails and node * is re-joined to diskset. */ (void) memset(&c, 0, sizeof (c)); c.c_id = 0; c.c_setno = sp->setno; if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { (void) mdstealerror(ep, &c.c_mde); rval = -1; goto out2; } if (c.c_flags & MDDB_C_STALE) { stale_flag = MNSET_IS_STALE; } } /* * For each node being deleted, set DEL flag and * reset OK flag on that node first. * Until a node has turned off its own * rpc.metad's NODE_OK flag, that node could be * considered for master during a reconfig. */ for (i = 0; i < node_c; i++) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } /* Something wrong, handle this in next loop */ if (nd == NULL) continue; /* If node_id_list is alloc'd, fill in for later use */ if (node_id_list) node_id_list[i] = nd->nd_nodeid; /* All nodes are guaranteed to be ALIVE unless OHA */ if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { continue; } /* Only changing my local cache of node list */ saved_nd_next = nd->nd_next; nd->nd_next = NULL; /* Set flags for del host to DEL on that host */ if (clnt_upd_nr_flags(node_v[i], sp, nd, MD_NR_DEL, NULL, ep)) { nd->nd_next = saved_nd_next; goto rollback; } nd->nd_next = saved_nd_next; } for (i = 0; i < node_c; i++) { /* * Turn off owner flag in nodes to be deleted * if this node has been joined. * Also, turn off NODE_OK and turn on NODE_DEL * for nodes to be deleted. * These flags are used to set the node * record flags in all nodes in the set. * Only withdraw nodes that are joined. 
*/ nd = sd->sd_nodelist; while (nd) { /* * Don't communicate with non-ALIVE node if * in OHA - but set flags in master list so * alive nodes are updated correctly. */ if (strcmp(nd->nd_nodename, node_v[i]) == 0) { if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd->nd_flags |= MD_MN_NODE_DEL; nd->nd_flags &= ~MD_MN_NODE_OK; nd = nd->nd_next; continue; } if (nd->nd_flags & MD_MN_NODE_OWN) { /* * Going to set locally cached * node flags to rollback join * so in case of error, the * rollback code knows which * nodes to re-join. rpc.metad * ignores the RB_JOIN flag. */ nd->nd_flags |= MD_MN_NODE_RB_JOIN; nd->nd_flags &= ~MD_MN_NODE_OWN; /* * Be careful in ordering of * following steps so that * recovery from a panic * between the steps is viable. * Only reset master info in * rpc.metad - don't reset * local cached info which will * be used to set master info * back if failure (rollback). */ if (clnt_withdrawset( nd->nd_nodename, sp, ep)) goto rollback; /* * Reset master on deleted node */ if (clnt_mnsetmaster(node_v[i], sp, "", MD_MN_INVALID_NID, ep)) goto rollback; } nd->nd_flags |= MD_MN_NODE_DEL; nd->nd_flags &= ~MD_MN_NODE_OK; } nd = nd->nd_next; } } /* * Now, reset owner and set delete flags for the * deleted nodes on all nodes. */ nd = sd->sd_nodelist; while (nd) { /* Skip non-ALIVE node if in OHA */ if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } if (clnt_upd_nr_flags(nd->nd_nodename, sp, sd->sd_nodelist, MD_NR_SET, NULL, ep)) { goto rollback; } nd = nd->nd_next; } /* * Notify rpc.mdcommd on all nodes of a nodelist change. * Send reinit command to mdcommd which forces it to get * fresh set description. 
*/ if (suspendall_flag) { /* Send reinit */ nd = sd->sd_nodelist; while (nd) { if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } /* Class is ignored for REINIT */ if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, sp, NULL, MD_MSCF_NO_FLAGS, ep)) { mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to reinit rpc.mdcommd.\n")); goto rollback; } nd = nd->nd_next; } /* Send resume */ nd = sd->sd_nodelist; while (nd) { if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1, ep)) { mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to resume rpc.mdcommd.\n")); goto rollback; } nd = nd->nd_next; } meta_ping_mnset(sp->setno); } } /* * Mark the set record MD_SR_DEL on the hosts we are deleting * If a MN diskset and OHA mode, don't issue RPC to nodes that * are not ALIVE. * If a MN diskset and not in OHA mode, then all nodes must respond * to RPC (be alive) or this routine will return failure. * If a traditional diskset, all RPC failures if in OHA mode. */ for (i = 0; i < node_c; i++) { RB_TEST(3, "deletehosts", ep) if ((MD_MNSET_DESC(sd)) && (oha == TRUE)) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) { break; } nd = nd->nd_next; } if (nd == NULL) { (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, node_v[i], NULL, sp->setname); goto rollback; } else if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { /* Skip non-ALIVE node if in OHA mode */ continue; } else { if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { goto rollback; } } } else if ((MD_MNSET_DESC(sd)) && (oha == FALSE)) { /* * All nodes should be alive in non-oha mode. 
*/ if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { goto rollback; } } else { /* * For traditional diskset, issue the RPC and * ignore RPC failure if in OHA mode. */ if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } goto rollback; } } RB_TEST(4, "deletehosts", ep) } RB_TEST(5, "deletehosts", ep) RB_PREEMPT; rb_level = 2; /* level 2 */ RB_TEST(6, "deletehosts", ep) /* Delete the set on the hosts we are deleting */ if (del_set_on_hosts(sp, sd, dd, node_c, node_v, oha, ep)) { if (node_id_list) Free(node_id_list); /* * Failure during del_set_on_hosts would have recreated * the diskset on the remote hosts, but for multi-owner * disksets need to set node flags properly and REINIT and * RESUME rpc.mdcommd, so just let the rollback code * do this. */ if (MD_MNSET_DESC(sd)) goto rollback; return (-1); } remote_sets_deleted = 1; RB_TEST(19, "deletehosts", ep) RB_PREEMPT; rb_level = 3; /* level 3 */ RB_TEST(20, "deletehosts", ep) /* Delete the host from sets on hosts not being deleted */ if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; /* All nodes are guaranteed to be ALIVE unless in oha mode */ while (nd) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. 
*/ if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } /* Skip nodes being deleted */ if (strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } if (clnt_delhosts(nd->nd_nodename, sp, node_c, node_v, ep) == -1) { goto rollback; } RB_TEST(21, "deletehosts", ep) nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; /* Skip nodes being deleted */ if (strinlst(sd->sd_nodes[i], node_c, node_v)) continue; if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v, ep) == -1) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } goto rollback; } RB_TEST(21, "deletehosts", ep) } } /* We have drives */ if (dd != NULL) { RB_TEST(22, "deletehosts", ep) RB_PREEMPT; rb_level = 4; /* level 4 */ RB_TEST(23, "deletehosts", ep) /* * Delete the old sidename for each drive on all the hosts. * If a multi-node diskset, each host only stores * the side information for itself. So, a multi-node * diskset doesn't delete the old sidename for * an old host. * * If a MN diskset, reset owners of mirrors that are * owned by the deleted nodes. */ if (!(MD_MNSET_DESC(sd))) { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; /* Skip nodes being deleted */ if (strinlst(sd->sd_nodes[i], node_c, node_v)) continue; if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp, ep)) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } metaflushsetname(sp); goto rollback; } RB_TEST(24, "deletehosts", ep) } } else { nd = sd->sd_nodelist; /* All nodes guaranteed ALIVE unless in oha mode */ while (nd) { /* * If mirror owner was set to a deleted node, * then each existing node resets mirror owner * to NULL. * * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. 
*/ if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } /* Skip nodes being deleted */ if (strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } /* * If mirror owner is a deleted node, reset * mirror owners to NULL. If an error occurs, * print a warning and continue. Don't fail * metaset because of mirror owner reset * problem since next node to grab mirror * will resolve this issue. Before next node * grabs mirrors, metaset will show the deleted * node as owner which is why an attempt to * reset the mirror owner is made. */ if (clnt_reset_mirror_owner(nd->nd_nodename, sp, node_c, &node_id_list[0], &xep) == -1) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to reset mirror owner on" " node %s\n"), nd->nd_nodename); mdclrerror(&xep); } RB_TEST(21, "deletehosts", ep) nd = nd->nd_next; } } } RB_TEST(25, "deletehosts", ep) RB_PREEMPT; rb_level = 4; /* level 4 */ RB_TEST(26, "deletehosts", ep) /* * Bring the mediator record up to date with the set record for * traditional diskset. */ if (!(MD_MNSET_DESC(sd))) { medr = rb_medr; /* structure assignment */ for (i = 0; i < MD_MAXSIDES; i++) { if (strinlst(sd->sd_nodes[i], node_c, node_v)) (void) memset(&medr.med_rec_nodes[i], '\0', sizeof (md_node_nm_t)); else (void) strcpy(medr.med_rec_nodes[i], sd->sd_nodes[i]); } crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); /* Inform the mediator hosts of the new node list */ for (i = 0; i < max_meds; i++) { if (sd->sd_med.n_lst[i].a_cnt == 0) continue; if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) { if (oha == TRUE && mdanyrpcerror(ep)) { mdclrerror(ep); continue; } goto rollback; } } } RB_TEST(27, "deletehosts", ep) /* * For traditional diskset: * We are deleting ourselves out of the set and we have drives to * consider; so we need to halt the set, release the drives and * reset the timeout. 
**** THIS IS A ONE WAY TICKET, NO ROLL BACK * IS POSSIBLE AS SOON AS THE HALT SET COMPLETES, SO THIS IS DONE * WITH ALL SIGNALS BLOCKED AND LAST **** * * This situation cannot occur in a MN diskset since a node can't * delete itself unless all nodes are being deleted and a diskset * cannot contain any drives if all nodes are being deleted. * So, don't even test for this if a MN diskset. */ if (!(MD_MNSET_DESC(sd)) && (dd != NULL) && strinlst(mynode(), node_c, node_v)) { /* Make sure we are blocking all signals */ if (procsigs(TRUE, &oldsigs, ep) < 0) { rval = -1; goto out1; } if (halt_set(sp, ep)) { rval = -1; goto out1; } if (rel_own_bydd(sp, dd, FALSE, ep)) rval = -1; out1: /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; } } out2: /* * Unlock diskset by resuming messages across the diskset. * Just resume all classes so that resume is the same whether * just one class was locked or all classes were locked. */ if ((suspend1_flag) || (suspendall_flag)) { /* Send resume */ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } /* * Skip nodes being deleted if remote set * was deleted since rpc.mdcommd may no longer * be running on remote node. */ if ((remote_sets_deleted == 1) && (strinlst(nd->nd_nodename, node_c, node_v))) { nd = nd->nd_next; continue; } if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to resume rpc.mdcommd.\n")); } nd = nd->nd_next; } meta_ping_mnset(sp->setno); } cl_sk = cl_get_setkey(sp->setno, sp->setname); if (lock_flag) { if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. 
*/ if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; } nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { if (oha == TRUE && mdanyrpcerror(&xep)) { mdclrerror(&xep); continue; } if (rval == 0) (void) mdstealerror(ep, &xep); rval = -1; } } } } cl_set_setkey(NULL); out3: metafreereplicalist(rlp); if (node_id_list) Free(node_id_list); metaflushsetname(sp); if (MD_MNSET_DESC(sd)) { /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); } else { md_rb_sig_handling_off(md_got_sig(), md_which_sig()); } return (rval); rollback: /* all signals already blocked for MN disket */ if (!(MD_MNSET_DESC(sd))) { if (procsigs(TRUE, &oldsigs, &xep) < 0) mdclrerror(&xep); } rval = -1; max_genid = sd->sd_genid; /* * Send reinit command to rpc.mdcommd which forces it to get * fresh set description and resume all classes but class 0. * Don't send any commands to rpc.mdcommd if set on that node * has been removed. */ if (suspendall_flag) { /* Send reinit */ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } /* * If the remote set was deleted, rpc.mdcommd * may no longer be running so send nothing to it. 
*/ if ((remote_sets_deleted == 1) && (strinlst(nd->nd_nodename, node_c, node_v))) { nd = nd->nd_next; continue; } /* Class is ignored for REINIT */ if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to reinit rpc.mdcommd.\n")); mdclrerror(&xep); } nd = nd->nd_next; } /* Send resume */ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } /* * If the remote set was deleted, rpc.mdcommd * may no longer be running so send nothing to it. */ if ((remote_sets_deleted == 1) && (strinlst(nd->nd_nodename, node_c, node_v))) { nd = nd->nd_next; continue; } if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1, &xep)) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to resume rpc.mdcommd.\n")); mdclrerror(&xep); } nd = nd->nd_next; } meta_ping_mnset(sp->setno); } /* level 2 */ if (rb_level > 1) { md_set_record *sr; md_replicalist_t *rl; recreate_set(sp, sd); /* * Lock out other meta* commands on nodes with the newly * re-created sets by suspending class 1 messages * across the diskset. */ nd = sd->sd_nodelist; while (nd) { /* Skip nodes not being deleted */ if (!(strinlst(nd->nd_nodename, node_c, node_v))) { nd = nd->nd_next; continue; } /* Suspend commd on nodes with re-created sets */ if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, &xep)) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to suspend rpc.mdcommd.\n")); mdclrerror(&xep); } nd = nd->nd_next; } max_genid++; /* * See if we have to re-add the drives specified. */ for (i = 0; i < node_c; i++) { if (MD_MNSET_DESC(sd) && (oha == TRUE)) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. 
*/ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) { break; } nd = nd->nd_next; } if (nd == 0) continue; if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) continue; } /* Don't care if set record is MN or not */ if (clnt_getset(node_v[i], sp->setname, MD_SET_BAD, &sr, &xep) == -1) { mdclrerror(&xep); continue; } /* Drive already added, skip to next node */ if (sr->sr_drivechain != NULL) { /* * Set record structure was allocated from RPC * routine getset so this structure is only of * size md_set_record even if the MN flag is * set. So, clear the flag so that the free * code doesn't attempt to free a structure * the size of md_mnset_record. */ sr->sr_flags &= ~MD_SR_MN; free_sr(sr); continue; } if (clnt_adddrvs(node_v[i], sp, dd, sr->sr_ctime, sr->sr_genid, &xep) == -1) mdclrerror(&xep); if (clnt_upd_dr_flags(node_v[i], sp, dd, MD_DR_OK, &xep) == -1) mdclrerror(&xep); /* * Set record structure was allocated from RPC routine * getset so this structure is only of size * md_set_record even if the MN flag is set. So, * clear the flag so that the free code doesn't * attempt to free a structure the size of * md_mnset_record. */ sr->sr_flags &= ~MD_SR_MN; free_sr(sr); } max_genid += 3; for (rl = rlp; rl != NULL; rl = rl->rl_next) { md_replica_t *r = rl->rl_repp; /* * This is not the first replica being added to the * diskset so call with ADDSIDENMS_BCAST. If this * is a traditional diskset, the bcast flag is ignored * since traditional disksets don't use the rpc.mdcommd. */ if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno, DB_ADDSIDENMS_BCAST, &xep)) mdclrerror(&xep); } /* * Add the device names for the new sides into the namespace, * on all hosts not being deleted. 
*/ if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* Find a node that is not being deleted */ if (!strinlst(nd->nd_nodename, node_c, node_v)) { j = nd->nd_nodeid; break; } nd = nd->nd_next; } } else { for (j = 0; j < MD_MAXSIDES; j++) { /* Skip empty slots */ if (sd->sd_nodes[j][0] == '\0') continue; /* Find a node that is not being deleted */ if (!strinlst(sd->sd_nodes[j], node_c, node_v)) break; } } if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* Skip nodes not being deleted */ if (!strinlst(nd->nd_nodename, node_c, node_v)) { nd = nd->nd_next; continue; } /* this side was just created, add the names */ if (add_md_sidenms(sp, nd->nd_nodeid, j, &xep)) mdclrerror(&xep); nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; /* Skip nodes not being deleted */ if (!strinlst(sd->sd_nodes[i], node_c, node_v)) continue; /* this side was just created, add the names */ if (add_md_sidenms(sp, i, j, &xep)) mdclrerror(&xep); } } } /* level 4 */ if (rb_level > 3 && dd != NULL) { /* * Add the new sidename for each drive to all the hosts * Multi-node disksets only store the sidename for * that host, so there is nothing to re-add. 
*/ if (!(MD_MNSET_DESC(sd))) { for (j = 0; j < MD_MAXSIDES; j++) { /* Skip empty slots */ if (sd->sd_nodes[j][0] == '\0') continue; /* Skip nodes not being deleted */ if (!strinlst(sd->sd_nodes[j], node_c, node_v)) break; } for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; if (clnt_add_drv_sidenms(sd->sd_nodes[i], sd->sd_nodes[j], sp, sd, node_c, node_v, &xep)) mdclrerror(&xep); } } } /* level 5 */ if ((rb_level > 4) && (!(MD_MNSET_DESC(sd)))) { /* rollback the mediator record */ for (i = 0; i < max_meds; i++) { if (sd->sd_med.n_lst[i].a_cnt == 0) continue; if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &rb_medr, &xep)) mdclrerror(&xep); } } /* level 3 */ if (rb_level > 2) { md_set_record *sr; md_mnset_record *mnsr; if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ while (nd) { if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } /* Record should be for a multi-node diskset */ if (clnt_mngetset(nd->nd_nodename, sp->setname, MD_SET_BAD, &mnsr, &xep) == -1) { mdclrerror(&xep); nd = nd->nd_next; continue; } has_set = 1; nr = mnsr->sr_nodechain; while (nr) { if (nd->nd_nodeid == nr->nr_nodeid) { break; } nr = nr->nr_next; } if (nr == NULL) has_set = 0; free_sr((struct md_set_record *)mnsr); if (has_set) { nd = nd->nd_next; continue; } if (clnt_addhosts(nd->nd_nodename, sp, node_c, node_v, &xep) == -1) mdclrerror(&xep); nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; /* Record should be for a non-multi-node set */ if (clnt_getset(sd->sd_nodes[i], sp->setname, MD_SET_BAD, &sr, &xep) == -1) { mdclrerror(&xep); continue; } /* * Set record structure was allocated from RPC * routine getset so this structure is only of * size md_set_record even if the MN flag is * set. 
So, clear the flag so that the free * code doesn't attempt to free a structure * the size of md_mnset_record. */ if (MD_MNSET_REC(sr)) { sr->sr_flags &= ~MD_SR_MN; free_sr(sr); continue; } has_set = 1; for (j = 0; j < MD_MAXSIDES; j++) { /* Skip empty slots */ if (sd->sd_nodes[j][0] == '\0') continue; if (sr->sr_nodes[j][0] == '\0') { has_set = 0; break; } } free_sr(sr); if (has_set) continue; if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, node_v, &xep) == -1) mdclrerror(&xep); } } max_genid++; } /* level 1 */ if (rb_level > 0) { max_genid++; /* Sets MD_SR_OK on given nodes. */ resync_genid(sp, sd, max_genid, node_c, node_v); /* * For MN diskset: * On each newly re-added node, set the node record for that * node to OK. Then set all node records for the newly added * nodes on all nodes to ok. * * By setting a node's own node record to ok first, even if * the node re-adding the hosts panics, the rest of the nodes * can determine the same node list during the choosing of the * master during reconfig. So, only nodes considered for * mastership are nodes that have both MD_MN_NODE_OK and * MD_SR_OK set on that node's rpc.metad. If all nodes have * MD_SR_OK set, but no node has its own MD_MN_NODE_OK set, * then the set will be removed during reconfig since a panic * occurred during the re-creation of the deletion of * the initial diskset. */ if (MD_MNSET_DESC(sd)) { md_mnnode_desc *saved_nd_next; if (dd != NULL) { /* * Notify rpc.mdcommd on all nodes of a * nodelist change. Start by suspending * rpc.mdcommd (which drains it of all * messages), then change the nodelist * followed by a reinit and resume. 
*/ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to suspend " "rpc.mdcommd.\n")); mdclrerror(&xep); } suspendall_flag_rb = 1; nd = nd->nd_next; } } for (i = 0; i < node_c; i++) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } /* Something wrong, finish this in next loop */ if (nd == NULL) continue; if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { continue; } if (dd != NULL) { /* Set master on re-joining node. */ if (clnt_mnsetmaster(node_v[i], sp, sd->sd_mn_master_nodenm, sd->sd_mn_master_nodeid, &xep)) { mdclrerror(&xep); } /* * Re-join set to same state as * before - stale or non-stale. */ if (clnt_joinset(node_v[i], sp, stale_flag, &xep)) { mdclrerror(&xep); } } /* Only changing my local cache of node list */ saved_nd_next = nd->nd_next; nd->nd_next = NULL; /* Set record for host to ok on that host */ if (clnt_upd_nr_flags(node_v[i], sp, nd, MD_NR_OK, NULL, &xep)) { mdclrerror(&xep); } nd->nd_next = saved_nd_next; } /* Now set all node records on all nodes to be ok */ nd = sd->sd_nodelist; while (nd) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } if (clnt_upd_nr_flags(nd->nd_nodename, sp, sd->sd_nodelist, MD_NR_OK, NULL, &xep)) { mdclrerror(&xep); } nd = nd->nd_next; } } } /* * Notify rpc.mdcommd on all nodes of a nodelist change. * Send reinit command to mdcommd which forces it to get * fresh set description. 
*/ if (suspendall_flag_rb) { /* Send reinit */ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } /* Class is ignored for REINIT */ if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to reinit rpc.mdcommd.\n")); mdclrerror(&xep); } nd = nd->nd_next; } } /* * Unlock diskset by resuming messages across the diskset. * Just resume all classes so that resume is the same whether * just one class was locked or all classes were locked. */ if ((suspend1_flag) || (suspendall_flag) || (suspendall_flag_rb)) { /* Send resume */ nd = sd->sd_nodelist; while (nd) { if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { nd = nd->nd_next; continue; } if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { mde_perror(&xep, dgettext(TEXT_DOMAIN, "Unable to resume rpc.mdcommd.\n")); } nd = nd->nd_next; } meta_ping_mnset(sp->setno); } /* * Start a resync thread on the re-added nodes * if set is not stale. Also start a thread to update the * abr state of all soft partitions */ if (stale_flag != MNSET_IS_STALE) { for (i = 0; i < node_c; i++) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. 
*/ nd = sd->sd_nodelist; while (nd) { if (strcmp(nd->nd_nodename, node_v[i]) == 0) break; nd = nd->nd_next; } if (nd == NULL) continue; if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { continue; } if (dd != 0) { if (clnt_mn_mirror_resync_all(node_v[i], sp->setno, &xep)) { mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to start resync " "thread.\n")); } if (clnt_mn_sp_update_abr(node_v[i], sp->setno, &xep)) { mde_perror(ep, dgettext(TEXT_DOMAIN, "Unable to start sp update " "thread.\n")); } } } } /* level 0 */ cl_sk = cl_get_setkey(sp->setno, sp->setname); /* Don't test lock flag since guaranteed to be set if in rollback */ if (MD_MNSET_DESC(sd)) { nd = sd->sd_nodelist; while (nd) { /* * During OHA mode, don't issue RPCs to * non-alive nodes since there is no reason to * wait for RPC timeouts. */ if ((oha == TRUE) && (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { nd = nd->nd_next; continue; } if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) mdclrerror(&xep); nd = nd->nd_next; } } else { for (i = 0; i < MD_MAXSIDES; i++) { /* Skip empty slots */ if (sd->sd_nodes[i][0] == '\0') continue; if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) mdclrerror(&xep); } } cl_set_setkey(NULL); /* release signals back to what they were on entry */ if (procsigs(FALSE, &oldsigs, &xep) < 0) mdclrerror(&xep); metafreereplicalist(rlp); if (node_id_list) Free(node_id_list); metaflushsetname(sp); if (!(MD_MNSET_DESC(sd))) { md_rb_sig_handling_off(md_got_sig(), md_which_sig()); } return (rval); }

/*
 * Enable or disable the MD_SR_AUTO_TAKE flag of a diskset.
 *
 *	sp		set to operate on
 *	take_val	non-zero to enable auto-take, zero to disable it
 *	ep		receives the error describing a failure
 *
 * meta_check_ownership() must succeed for the set.  The set is locked on
 * the local host (mynode()) while the flag is changed and is unlocked again
 * on every path that reaches the "out" label.
 *
 * Enabling:
 *	- if the flag is already set, nothing is changed and the call
 *	  succeeds;
 *	- otherwise every populated slot of sd_nodes must name the local
 *	  host, else the call fails with MDE_DS_SINGLEHOST;
 *	- the flag is set with clnt_enable_sr_flags() and the drives are then
 *	  handed to rel_own_bydd() ("disable SCSI reservations").
 * Disabling:
 *	- if the flag is not set, the call fails with MDE_DS_AUTONOTSET;
 *	- otherwise the flag is cleared with clnt_disable_sr_flags() and the
 *	  drives are handed to tk_own_bydd() ("enable SCSI reservations").
 *
 * Errors from looking up the drives and from the reservation step are
 * deliberately discarded (collected in a scratch md_error_t and cleared);
 * only lock, flag-update and unlock failures are reported.
 *
 * Returns 0 on success, -1 on failure with *ep describing the first error.
 */
int
meta_set_auto_take(
	mdsetname_t	*sp,
	int		take_val,
	md_error_t	*ep
)
{
	int		i;
	md_set_desc	*sd;
	int		rval = 0;
	md_setkey_t	*cl_sk;
	md_error_t	xep = mdnullerror;	/* scratch for ignorable errors */
	char		*hostname;
	md_drive_desc	*dd;

	if ((sd = metaget_setdesc(sp, ep)) == NULL)
		return (-1);

	/* Make sure we own the set */
	if (meta_check_ownership(sp, ep) != 0)
		return (-1);

	hostname = mynode();

	/* Lock the set on our side */
	if (clnt_lock_set(hostname, sp, ep)) {
		rval = -1;
		goto out;
	}

	if (take_val) {
		/* enable auto_take but only if it is not already set */
		if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
			/* verify that we're the only host in the set */
			for (i = 0; i < MD_MAXSIDES; i++) {
				/* Skip empty slots */
				if (sd->sd_nodes[i] == NULL ||
				    sd->sd_nodes[i][0] == '\0')
					continue;

				if (strcmp(sd->sd_nodes[i], hostname) != 0) {
					(void) mddserror(ep, MDE_DS_SINGLEHOST,
					    sp->setno, NULL, NULL, sp->setname);
					rval = -1;
					goto out;
				}
			}

			/*
			 * NOTE(review): a failure here is recorded but the
			 * reservation step below still runs - confirm that
			 * this best-effort behavior is intended.
			 */
			if (clnt_enable_sr_flags(hostname, sp,
			    MD_SR_AUTO_TAKE, ep))
				rval = -1;

			/* Disable SCSI reservations */
			if (sd->sd_flags & MD_SR_MB_DEVID)
				dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
				    PRINT_FAST, &xep);
			else
				dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
				    &xep);

			/* A drive lookup failure is not reported */
			if (! mdisok(&xep))
				mdclrerror(&xep);

			if (dd != NULL) {
				if (rel_own_bydd(sp, dd, TRUE, &xep))
					mdclrerror(&xep);
			}
		}
	} else {
		/* disable auto_take, if set, or error */
		if (sd->sd_flags & MD_SR_AUTO_TAKE) {
			/*
			 * NOTE(review): as in the enable case, the
			 * reservation step runs even if this update fails.
			 */
			if (clnt_disable_sr_flags(hostname, sp,
			    MD_SR_AUTO_TAKE, ep))
				rval = -1;

			/* Enable SCSI reservations */
			if (sd->sd_flags & MD_SR_MB_DEVID)
				dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
				    PRINT_FAST, &xep);
			else
				dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
				    &xep);

			/* A drive lookup failure is not reported */
			if (! mdisok(&xep))
				mdclrerror(&xep);

			if (dd != NULL) {
				mhd_mhiargs_t	mhiargs = defmhiargs;

				if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
					mdclrerror(&xep);
			}
		} else {
			(void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno,
			    NULL, NULL, sp->setname);
			rval = -1;
		}
	}

out:
	cl_sk = cl_get_setkey(sp->setno, sp->setname);
	if (clnt_unlock_set(hostname, cl_sk, &xep)) {
		/*
		 * Report the unlock failure only if nothing failed earlier;
		 * otherwise keep the first error in ep and dispose of the
		 * secondary one so xep is not abandoned while still holding
		 * an error.
		 */
		if (rval == 0)
			(void) mdstealerror(ep, &xep);
		else
			mdclrerror(&xep);
		rval = -1;
	}
	cl_set_setkey(NULL);

	return (rval);
}