1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Just in case we're not in a build environment, make sure that 30 * TEXT_DOMAIN gets set to something. 31 */ 32 #if !defined(TEXT_DOMAIN) 33 #define TEXT_DOMAIN "SYS_TEST" 34 #endif 35 36 #include <meta.h> 37 #include <sdssc.h> 38 #include <arpa/inet.h> 39 #include <sys/lvm/md_mddb.h> 40 41 #define MAX_LINE_SIZE 1024 42 43 /* 44 * Maximum amount of time to spend waiting for an ownership change to complete. 45 */ 46 static const int OWNER_TIMEOUT = 3; 47 48 /* 49 * FUNCTION: meta_is_mn_set() 50 * INPUT: sp - the set name 51 * OUTPUT: ep - return error pointer 52 * RETURNS: int - 1 if MultiNode set else 0 53 * PURPOSE: checks if the set is a MultiNode set 54 */ 55 int 56 meta_is_mn_set( 57 mdsetname_t *sp, 58 md_error_t *ep 59 ) 60 { 61 md_set_desc *sd; 62 63 /* Local set cannot be MultiNode */ 64 if ((sp == NULL) || (sp->setname == NULL) || 65 (strcmp(sp->setname, MD_LOCAL_NAME) == 0)) 66 return (0); 67 sd = metaget_setdesc(sp, ep); 68 ASSERT(sd != NULL); 69 if (sd->sd_flags & MD_SR_MN) 70 return (1); 71 return (0); 72 } 73 74 /* 75 * FUNCTION: meta_is_mn_name() 76 * INPUT: spp - ptr to the set name, if NULL the setname is derived 77 * from the metadevice name (eg set/d10 ) 78 * name - the metadevice/hsp name 79 * OUTPUT: ep - return error pointer 80 * RETURNS: int - 1 if MultiNode set else 0 81 * PURPOSE: checks if the metadevice is in a MultiNode set 82 */ 83 int 84 meta_is_mn_name( 85 mdsetname_t **spp, 86 char *name, 87 md_error_t *ep 88 ) 89 { 90 if (*spp == NULL) { 91 char *cname; 92 93 /* 94 * if the setname is specified in uname and *spp is 95 * not set, then it is setup using that set name value. 96 * If *spp is set and a setname specified in uname and 97 * the set names don't agree then cname will be 98 * returned as NULL 99 */ 100 cname = meta_canonicalize_check_set(spp, name, ep); 101 if (cname == NULL) { 102 mdclrerror(ep); 103 return (0); 104 } 105 106 Free(cname); 107 } 108 109 if ((strcmp((*spp)->setname, MD_LOCAL_NAME) != 0) && 110 (metaget_setdesc(*spp, ep) != NULL) && 111 ((*spp)->setdesc->sd_flags & MD_SR_MN)) { 112 return (1); 113 } 114 return (0); 115 } 116 117 /* 118 * meta_ping_mnset(set_t setno) 119 * Send a test message for this set in order to make commd do some init stuff 120 * Don't bother changelog. 121 * If set is suspended, fail immediately. 122 */ 123 void 124 meta_ping_mnset(set_t setno) 125 { 126 char *data = "test"; 127 md_error_t mde = mdnullerror; 128 md_mn_result_t *resp = NULL; 129 130 (void) mdmn_send_message(setno, MD_MN_MSG_TEST2, 131 MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, data, 132 sizeof (data), &resp, &mde); 133 134 if (resp != (md_mn_result_t *)NULL) { 135 free_result(resp); 136 } 137 } 138 139 /* 140 * 141 * FUNCTION: print_stderr 142 * INPUT: errstr - the error message returned by the command 143 * context - the context string from metainit -a 144 * PURPOSE: called from meta_mn_send_command to print the error message 145 * to stderr. When context is NO_CONTEXT_STRING, the errstr string 146 * is output unchanged. When context is a string, it is the context 147 * string for the metainit -a command and in this case the errstr 148 * string has to be parsed to extract the command and node name 149 * and to send a message to stderr in the format 150 * command: node: context: error message 151 */ 152 static void 153 print_stderr( 154 char *errstr, 155 char *context 156 ) 157 { 158 char *command; 159 char *node; 160 char *message; 161 int length = strlen(errstr + 1); 162 163 if (context == NO_CONTEXT_STRING) { 164 (void) fprintf(stderr, "%s", errstr); 165 } else { 166 command = Malloc(length); 167 node = Malloc(length); 168 message = Malloc(length); 169 if (sscanf(errstr, "%[^:]: %[^:]: %[^\n]", command, node, 170 message) == 3) { 171 (void) fprintf(stderr, "%s: %s: %s: %s\n", command, 172 node, context, message); 173 } else { 174 (void) fprintf(stderr, dgettext(TEXT_DOMAIN, 175 "%s: Invalid format error message"), errstr); 176 } 177 Free(command); 178 Free(node); 179 Free(message); 180 } 181 } 182 183 /* 184 * FUNCTION: meta_mn_send_command() 185 * INPUT: sp - the set name 186 * argc - number of arguments 187 * argv - arg list 188 * flags - some controlling flags 189 * initall_context - context string for metainit -a 190 * OUTPUT: ep - return error pointer 191 * RETURNS: return exitval from mdmn_send_message 192 * PURPOSE: sends the command to the master node for execution 193 */ 194 int 195 meta_mn_send_command( 196 mdsetname_t *sp, 197 int argc, 198 char *argv[], 199 int flags, 200 char *initall_context, 201 md_error_t *ep 202 ) 203 { 204 int a; 205 int err; 206 int retval; 207 int send_message_flags = MD_MSGF_DEFAULT_FLAGS; 208 int send_message_type; 209 char *cmd; 210 md_mn_result_t *resp = NULL; 211 212 cmd = Malloc(1024); 213 (void) strlcpy(cmd, argv[0], 1024); 214 for (a = 1; a < argc; a++) { 215 /* don't copy empty arguments */ 216 if (*argv[a] == '\0') { 217 continue; 218 } 219 (void) strcat(cmd, " "); 220 (void) strcat(cmd, argv[a]); 221 } 222 /* 223 * in dryrun mode stop on the first error 224 * use the CMD_RETRY message type if RETRY_BUSY flag set 225 */ 226 if (flags & MD_DRYRUN) 227 send_message_flags |= MD_MSGF_STOP_ON_ERROR; 228 if (flags & MD_NOLOG) 229 send_message_flags |= MD_MSGF_NO_LOG; 230 if (flags & MD_PANIC_WHEN_INCONSISTENT) 231 send_message_flags |= MD_MSGF_PANIC_WHEN_INCONSISTENT; 232 if (flags & MD_RETRY_BUSY) { 233 send_message_type = MD_MN_MSG_BC_CMD_RETRY; 234 } else { 235 send_message_type = MD_MN_MSG_BC_CMD; 236 } 237 err = mdmn_send_message( 238 sp->setno, send_message_type, send_message_flags, 239 cmd, 1024, &resp, ep); 240 241 free(cmd); 242 243 if (err == 0) { 244 /* 245 * stderr may be turned off by IGNORE_STDERR 246 * In dryrun we only print stderr if the exit_val is non-zero 247 */ 248 if ((resp->mmr_err_size != 0) && 249 ((flags & MD_IGNORE_STDERR) == 0)) { 250 if (((flags & MD_DRYRUN) == 0) || 251 (resp->mmr_exitval != 0)) { 252 print_stderr(resp->mmr_err, initall_context); 253 } 254 } 255 256 /* 257 * If dryrun is set, we don't display stdout, 258 * because the real run has yet to follow. 259 */ 260 if (((flags & MD_DRYRUN) == 0) && (resp->mmr_out_size != 0)) { 261 (void) printf("%s", resp->mmr_out); 262 } 263 retval = resp->mmr_exitval; 264 free_result(resp); 265 return (retval); 266 } 267 if (resp != NULL) { 268 if (resp->mmr_comm_state == MDMNE_CLASS_BUSY) { 269 (void) fprintf(stderr, dgettext(TEXT_DOMAIN, 270 "rpc.mdcommd currently busy. " 271 "Retry operation later.\n")); 272 } else if (resp->mmr_comm_state == MDMNE_NOT_JOINED) { 273 (void) fprintf(stderr, dgettext(TEXT_DOMAIN, 274 "Node %s must join the %s multi-owner diskset to " 275 "issue commands.\n" 276 "To join, use: metaset -s %s -j\n"), 277 mynode(), sp->setname, sp->setname); 278 } else if (resp->mmr_comm_state == MDMNE_LOG_FAIL) { 279 mddb_config_t c; 280 281 (void) memset(&c, 0, sizeof (c)); 282 c.c_setno = sp->setno; 283 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 284 (void) fprintf(stderr, dgettext(TEXT_DOMAIN, 285 "Command not attempted: Unable to log message " 286 "in set %s\n"), sp->setname); 287 if (c.c_flags & MDDB_C_STALE) { 288 (void) mdmddberror(ep, MDE_DB_STALE, 289 (minor_t)NODEV64, sp->setno, 0, NULL); 290 mde_perror(ep, ""); 291 } 292 } else { 293 (void) fprintf(stderr, dgettext(TEXT_DOMAIN, 294 "Command failed: Commd State %d " 295 "encountered.\n"), resp->mmr_comm_state); 296 } 297 free_result(resp); 298 } else { 299 (void) fprintf(stderr, dgettext(TEXT_DOMAIN, 300 "Command failed: mdmn_send_message returned %d.\n"), 301 err); 302 } 303 304 305 return (1); 306 } 307 308 /* 309 * FUNCTION: meta_mn_send_suspend_writes() 310 * INPUT: mnum - minor num of mirror 311 * OUTPUT: ep - return error pointer 312 * RETURNS: return value from mdmn_send_message() 313 * PURPOSE: sends message to all nodes to suspend writes to the mirror. 314 */ 315 int 316 meta_mn_send_suspend_writes( 317 minor_t mnum, 318 md_error_t *ep 319 ) 320 { 321 int result; 322 md_mn_msg_suspwr_t suspwrmsg; 323 md_mn_result_t *resp = NULL; 324 325 suspwrmsg.msg_suspwr_mnum = mnum; 326 /* 327 * This message is never directly issued. 328 * So we launch it with a suspend override flag. 329 * If the commd is suspended, and this message comes 330 * along it must be sent due to replaying a command or similar. 331 * In that case we don't want this message to be blocked. 332 * If the commd is not suspended, the flag does no harm. 333 */ 334 result = mdmn_send_message(MD_MIN2SET(mnum), 335 MD_MN_MSG_SUSPEND_WRITES, 336 MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 337 (char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep); 338 if (resp != NULL) { 339 free_result(resp); 340 } 341 return (result); 342 } 343 344 /* 345 * Parse the multi-node list file 346 * 347 * Return Values: Zero - Success 348 * Non Zero - Failure 349 * 350 * File content: The content of the nodelist file should consist of 351 * triplets of nodeid, nodename and private interconnect 352 * address seperated by one or more white space. 353 * e.g. 354 * 1 node_a 192.168.111.3 355 * 2 node_b 192.168.111.5 356 * 357 * Any missing fields will result in an error. 358 */ 359 int 360 meta_read_nodelist( 361 int *nodecnt, 362 mndiskset_membershiplist_t **nl, 363 md_error_t *ep 364 ) 365 { 366 FILE *fp = NULL; 367 char line[MAX_LINE_SIZE]; 368 char *buf; 369 uint_t i; 370 int sz; 371 mndiskset_membershiplist_t **tailp = nl; 372 373 /* open file */ 374 if ((fp = fopen(META_MNSET_NODELIST, "r")) == NULL) { 375 mndiskset_membershiplist_t *nlp; 376 struct hostent *hp; 377 378 /* return this node with id of 1 */ 379 nlp = *tailp = Zalloc(sizeof (*nlp)); 380 tailp = &nlp->next; 381 382 *nodecnt = 1; 383 nlp->msl_node_id = 1; 384 buf = mynode(); 385 sz = min(strlen(buf), sizeof (nlp->msl_node_name) - 1); 386 (void) strncpy(nlp->msl_node_name, buf, sz); 387 nlp->msl_node_name[sz] = '\0'; 388 389 /* retrieve info about our host */ 390 if ((hp = gethostbyname(buf)) == NULL) { 391 return (mdsyserror(ep, EADDRNOTAVAIL, buf)); 392 } 393 /* We only do IPv4 addresses, for now */ 394 if (hp->h_addrtype != AF_INET) { 395 return (mdsyserror(ep, EPFNOSUPPORT, buf)); 396 } 397 /* We take the first address only */ 398 if (*hp->h_addr_list) { 399 struct in_addr in; 400 401 (void) memcpy(&in.s_addr, *hp->h_addr_list, 402 sizeof (struct in_addr)); 403 (void) strncpy(nlp->msl_node_addr, inet_ntoa(in), 404 MD_MAX_NODENAME); 405 } else { 406 return (mdsyserror(ep, EADDRNOTAVAIL, buf)); 407 } 408 409 return (0); 410 } 411 412 *nl = NULL; 413 *nodecnt = 0; 414 415 while ((fp != NULL) && ((buf = fgets(line, sizeof (line) - 1, fp)) != 416 NULL)) { 417 mndiskset_membershiplist_t *nlp; 418 419 /* skip leading spaces */ 420 while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0) 421 buf++; 422 423 /* skip comments and blank lines */ 424 if (*buf == '\0' || *buf == '#') 425 continue; 426 427 /* allocate memory and set tail pointer */ 428 nlp = *tailp = Zalloc(sizeof (*nlp)); 429 tailp = &nlp->next; 430 431 /* parse node id */ 432 nlp->msl_node_id = strtoul(buf, NULL, 0); 433 buf += i; 434 435 /* skip leading spaces */ 436 while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0) 437 buf++; 438 439 /* fields missing, return error */ 440 if (*buf == '\0' || *buf == '#') { 441 meta_free_nodelist(*nl); 442 *nl = NULL; 443 *nodecnt = 0; 444 445 /* close file and return */ 446 if ((fp) && (fclose(fp) != 0)) 447 return (mdsyserror(ep, errno, 448 META_MNSET_NODELIST)); 449 450 return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST)); 451 } 452 453 /* parse node name */ 454 sz = min(i, sizeof (nlp->msl_node_name) - 1); 455 (void) strncpy(nlp->msl_node_name, buf, sz); 456 nlp->msl_node_name[sz] = '\0'; 457 buf += i; 458 459 /* skip leading spaces */ 460 while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0) 461 buf++; 462 463 /* fields missing, return error */ 464 if (*buf == '\0' || *buf == '#') { 465 meta_free_nodelist(*nl); 466 *nl = NULL; 467 *nodecnt = 0; 468 469 /* close file and return */ 470 if ((fp) && (fclose(fp) != 0)) 471 return (mdsyserror(ep, errno, 472 META_MNSET_NODELIST)); 473 474 return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST)); 475 } 476 477 /* parse node address */ 478 sz = min(i, sizeof (nlp->msl_node_addr) - 1); 479 (void) strncpy(nlp->msl_node_addr, buf, sz); 480 nlp->msl_node_addr[sz] = '\0'; 481 482 ++*nodecnt; 483 } 484 485 /* close file */ 486 if ((fp) && (fclose(fp) != 0)) 487 return (mdsyserror(ep, errno, META_MNSET_NODELIST)); 488 489 return (0); 490 } 491 492 /* 493 * Populate the multi-node list file from a given list of node id's 494 * The nids must have only one node id in each cell. Range of node 495 * id's in the form 1-n are not allowed. 496 * 497 * Return Values: Zero - Success 498 * Non Zero - Failure 499 */ 500 int 501 meta_write_nodelist( 502 int nodecnt, 503 char **nids, 504 md_error_t *ep 505 ) 506 { 507 FILE *fp = NULL; 508 char name[MAX_LINE_SIZE], addr[MAX_LINE_SIZE]; 509 uint_t i, nid; 510 struct in_addr ipaddr; 511 int err = 0; 512 513 /* check if we are running on clustering */ 514 if ((err = sdssc_bind_library()) != SDSSC_OKAY) { 515 return (mdsyserror(ep, err, META_MNSET_NODELIST)); 516 } 517 518 /* open file for writing */ 519 if ((fp = fopen(META_MNSET_NODELIST, "w")) == NULL) { 520 return (mdsyserror(ep, errno, META_MNSET_NODELIST)); 521 } 522 523 for (i = 0; i < nodecnt; i++) { 524 /* extract the node id */ 525 errno = 0; 526 nid = strtoul(nids[i], NULL, 0); 527 if (errno != 0) { 528 if ((fp) && (fclose(fp) != 0)) 529 return (mdsyserror(ep, errno, 530 META_MNSET_NODELIST)); 531 532 return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST)); 533 } 534 535 /* get node name */ 536 (void) snprintf(name, sizeof (name), "%d", nid); 537 sdssc_cm_nid2nm(name); 538 539 /* finally get the private ip address */ 540 (void) snprintf(addr, sizeof (addr), "%s", name); 541 if (sdssc_get_priv_ipaddr(addr, &ipaddr) != SDSSC_OKAY) { 542 if ((fp) && (fclose(fp) != 0)) 543 return (mdsyserror(ep, errno, 544 META_MNSET_NODELIST)); 545 546 return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST)); 547 } 548 549 (void) fprintf(fp, "%d\t%s\t%s\n", nid, name, 550 inet_ntoa(ipaddr)); 551 } 552 553 /* close file */ 554 if ((fp) && (fclose(fp) != 0)) 555 return (mdsyserror(ep, errno, META_MNSET_NODELIST)); 556 557 return (0); 558 } 559 560 /* 561 * Free node list 562 */ 563 void 564 meta_free_nodelist( 565 mndiskset_membershiplist_t *nl 566 ) 567 { 568 mndiskset_membershiplist_t *next = NULL; 569 570 for (/* void */; (nl != NULL); nl = next) { 571 next = nl->next; 572 Free(nl); 573 } 574 } 575 576 /* 577 * FUNCTION: meta_mn_send_setsync() 578 * INPUT: sp - setname 579 * mirnp - mirror name 580 * size - buffer size, 0 if none 581 * OUTPUT: ep - return error pointer 582 * RETURNS: return value from meta_mn_send_command() 583 * PURPOSE: Send a setsync command to all nodes to set resync status 584 */ 585 586 int 587 meta_mn_send_setsync( 588 mdsetname_t *sp, 589 mdname_t *mirnp, 590 daddr_t size, 591 md_error_t *ep 592 ) 593 { 594 md_mn_msg_setsync_t setsyncmsg; 595 int ret; 596 md_mn_result_t *resp = NULL; 597 598 setsyncmsg.setsync_mnum = meta_getminor(mirnp->dev); 599 setsyncmsg.setsync_copysize = size; 600 setsyncmsg.setsync_flags = 0; 601 602 /* 603 * We do not log the metasync command as it will have no effect on the 604 * underlying metadb state. If we have a master change the 605 * reconfiguration process will issue a new 'metasync' to all affected 606 * mirrors, so we would actually end up sending the message twice. 607 * Removing the logging of the message helps reduce the processing 608 * time required. 609 */ 610 ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC, 611 MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 612 (char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep); 613 if (resp != NULL) { 614 free_result(resp); 615 } 616 617 /* 618 * Unlike non-MN sets, the metasync command does not actually 619 * start a resync, it simply updates the state on all of the 620 * nodes. Therefore, to start a resync we send a resync starting 621 * message for the metadevice 622 */ 623 if (ret == 0) 624 ret = meta_mn_send_resync_starting(mirnp, ep); 625 return (ret); 626 } 627 628 /* 629 * FUNCTION: meta_mn_send_metaclear_command() 630 * INPUT: sp - setname 631 * name - metadevice name 632 * options - command options 633 * pflag - clear all soft partitions for a given device 634 * OUTPUT: ep - return error pointer 635 * RETURNS: return value from meta_mn_send_command() 636 * PURPOSE: Send a metaclear command to all nodes with force(-f) and 637 * recurse(-r) options set if required. For hotspare pool and 638 * metadevices, the metadevice name is of the form setname/dxx or 639 * setname/hspxxx so a '-s' argument isn't required. If pflag is set 640 * the name refers to a metadevice or component and in the is case 641 * a '-s' argument is required to define the set. 642 */ 643 644 int 645 meta_mn_send_metaclear_command( 646 mdsetname_t *sp, 647 char *name, 648 mdcmdopts_t options, 649 int pflag, 650 md_error_t *ep 651 ) 652 { 653 int newargc; 654 char **newargv; 655 int ret; 656 657 /* 658 * Allocate an array large enough to hold all of the possible 659 * metaclear arguments 660 */ 661 newargv = Calloc(7, sizeof (char *)); 662 newargv[0] = "metaclear"; 663 newargc = 1; 664 if (pflag) { 665 newargv[newargc] = "-s"; 666 newargc++; 667 newargv[newargc] = sp->setname; 668 newargc++; 669 } 670 if (options & MDCMD_FORCE) { 671 newargv[newargc] = "-f"; 672 newargc++; 673 } 674 if (options & MDCMD_RECURSE) { 675 newargv[newargc] = "-r"; 676 newargc++; 677 } 678 if (pflag) { 679 newargv[newargc] = "-p"; 680 newargc++; 681 } 682 newargv[newargc] = name; 683 newargc++; 684 685 ret = meta_mn_send_command(sp, newargc, newargv, 686 MD_DISP_STDERR, NO_CONTEXT_STRING, ep); 687 688 free(newargv); 689 return (ret); 690 } 691 692 /* 693 * FUNCTION: meta_mn_send_resync_starting() 694 * INPUT: sp - setname 695 * mirnp - mirror name 696 * OUTPUT: ep - return error pointer 697 * RETURNS: return value from mdmn_send_message() 698 * PURPOSE: Send a resync starting message to all nodes. 699 */ 700 701 int 702 meta_mn_send_resync_starting( 703 mdname_t *mirnp, 704 md_error_t *ep 705 ) 706 { 707 int result; 708 md_mn_msg_resync_t resyncmsg; 709 md_mn_result_t *resp = NULL; 710 minor_t mnum = meta_getminor(mirnp->dev); 711 712 /* 713 * This message is never directly issued. 714 * So we launch it with a suspend override flag. 715 * If the commd is suspended, and this message comes 716 * along it must be sent due to replaying a command or similar. 717 * In that case we don't want this message to be blocked. 718 * If the commd is not suspended, the flag does no harm. 719 */ 720 resyncmsg.msg_resync_mnum = mnum; 721 result = mdmn_send_message(MD_MIN2SET(mnum), 722 MD_MN_MSG_RESYNC_STARTING, 723 MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 724 (char *)&resyncmsg, sizeof (resyncmsg), &resp, ep); 725 726 if (resp != NULL) { 727 free_result(resp); 728 } 729 return (result); 730 } 731 732 /* 733 * FUNCTION: meta_mn_change_owner() 734 * INPUT: opp - pointer to parameter block 735 * setno - set number of mirror metadevice 736 * mnum - minor number of mirror metadevice 737 * owner - node ID of mirror owner 738 * flags - flag field for ioctl 739 * OUTPUT: opp - parameter block used to send ioctl 740 * RETURNS: int - 0 success, -1 error 741 * PURPOSE: issue an ioctl to change the ownership of the specified mirror 742 * to our node ID. We need to be the owner before any watermarks 743 * are committed to the device otherwise we'll enter a deadly 744 * embrace when attempting to write the watermark. 745 * This function can also be used so set the owner on a node to 746 * NULL. In this case the change is only made on the local node. 747 * In addition by setting the MD_MN_MM_CHOOSE_OWNER flag, the 748 * function can also be used to choose a mirror resync owner. This 749 * function should only be called on the master and it will 750 * select the owner and request it to become the owner. 751 */ 752 int 753 meta_mn_change_owner( 754 md_set_mmown_params_t **opp, /* Returned parameter block */ 755 set_t setno, /* Mirror set number */ 756 uint_t mnum, /* Minor number */ 757 uint_t owner, /* Node ID of mirror owner */ 758 uint_t flags /* Flags */ 759 ) 760 { 761 md_set_mmown_params_t *ownpar = *opp; 762 md_mn_own_status_t *ownstat = NULL; 763 struct timeval tvs, tve; 764 int n = 0; 765 int rval; 766 767 if (ownpar != NULL) { 768 (void) memset(ownpar, 0, sizeof (*ownpar)); 769 } else { 770 ownpar = Zalloc(sizeof (*ownpar)); 771 } 772 ownstat = Zalloc(sizeof (*ownstat)); 773 774 ownpar->d.mnum = mnum; 775 ownpar->d.owner = owner; 776 ownpar->d.flags = flags; 777 MD_SETDRIVERNAME(ownpar, MD_MIRROR, setno); 778 MD_SETDRIVERNAME(ownstat, MD_MIRROR, setno); 779 780 /* 781 * Attempt to change the ownership to the specified node. We retry this 782 * up to 10 times if we receive EAGAIN from the metadevice. This only 783 * happens if the underlying metadevice is busy with outstanding i/o 784 * that requires ownership change. 785 */ 786 while ((rval = metaioctl(MD_MN_SET_MM_OWNER, ownpar, &ownpar->mde, 787 NULL)) != 0) { 788 md_sys_error_t *ip = 789 &ownpar->mde.info.md_error_info_t_u.sys_error; 790 if (ip->errnum != EAGAIN) 791 break; 792 if (n++ >= 10) 793 break; 794 (void) sleep(1); 795 } 796 797 /* 798 * There is no need to wait for the ioctl completion if we are setting 799 * the owner to NULL or requesting the master to choose the owner 800 */ 801 if ((owner == 0) || (flags & MD_MN_MM_CHOOSE_OWNER)) { 802 Free(ownstat); 803 *opp = ownpar; 804 return (0); 805 } 806 807 /* 808 * Wait for ioctl completion or a timeout to occur. If we 809 * timeout we fail the i/o request. 810 */ 811 ownstat->mnum = ownpar->d.mnum; 812 (void) gettimeofday(&tvs, NULL); 813 814 while ((rval == 0) && !(ownstat->flags & MD_MN_MM_RESULT)) { 815 while ((rval = metaioctl(MD_MN_MM_OWNER_STATUS, ownstat, 816 &ownstat->mde, NULL)) != 0) { 817 (void) gettimeofday(&tve, NULL); 818 if ((tve.tv_sec - tvs.tv_sec) > OWNER_TIMEOUT) { 819 rval = -1; 820 break; 821 } 822 (void) sleep(1); 823 } 824 } 825 826 /* we did not not timeout but ioctl failed set rval */ 827 828 if (rval == 0) { 829 rval = (ownstat->flags & MD_MN_MM_RES_FAIL) ? -1 : 0; 830 } 831 832 Free(ownstat); 833 *opp = ownpar; 834 return (rval); 835 } 836 /* 837 * special handling is required when running on a single node 838 * non-SC3.x environment. This function determines tests 839 * for that case. 840 * 841 * Return values: 842 * 0 - no nodes or joined or in a SC3.x env 843 * 1 - 1 node and not in SC3.x env 844 */ 845 846 int 847 meta_mn_singlenode() 848 { 849 md_error_t xep = mdnullerror; 850 int nodecnt; 851 int mnset_single_node = 0; 852 mndiskset_membershiplist_t *nl; 853 854 /* 855 * If running on SunCluster, then don't validate MN sets, 856 * this is done during a reconfig cycle since all nodes must 857 * take the same action. 858 * 859 * Only cleanup in case of a single node situation 860 * when not running on SunCluster. This single node 861 * situation occurs when the nodelist only contains 862 * this node and the MN setrecords only contain this 863 * node. 864 */ 865 if (meta_read_nodelist(&nodecnt, &nl, &xep) == -1) { 866 nodecnt = 0; /* no nodes are alive */ 867 nl = NULL; 868 mdclrerror(&xep); 869 } else { 870 /* 871 * If only 1 node in nodelist and not running 872 * on SunCluster, set single_node flag. 873 */ 874 if ((nodecnt == 1) && 875 (strcmp(nl->msl_node_name, mynode()) == 0) && 876 ((sdssc_bind_library()) != SDSSC_OKAY)) { 877 mnset_single_node = 1; 878 } 879 meta_free_nodelist(nl); 880 } 881 return (mnset_single_node); 882 } 883 884 /* 885 * FUNCTION: meta_mn_send_get_tstate() 886 * INPUT: dev - dev_t of device 887 * OUTPUT: tstatep - tstate value 888 * ep - return error pointer 889 * RETURNS: return value from mdmn_send_message() 890 * PURPOSE: Send a message to the master to get ui_tstate for a given device. 891 */ 892 893 int 894 meta_mn_send_get_tstate( 895 md_dev64_t dev, 896 uint_t *tstatep, 897 md_error_t *ep 898 ) 899 { 900 int result; 901 md_mn_msg_gettstate_t tstatemsg; 902 md_mn_result_t *resp = NULL; 903 minor_t mnum = meta_getminor(dev); 904 905 tstatemsg.gettstate_dev = dev; 906 result = mdmn_send_message(MD_MIN2SET(mnum), 907 MD_MN_MSG_GET_TSTATE, 908 MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 909 (char *)&tstatemsg, sizeof (tstatemsg), &resp, ep); 910 911 if (result == 0) 912 *tstatep = resp->mmr_exitval; 913 else 914 /* If some error occurred set tstate to 0 */ 915 *tstatep = 0; 916 917 if (resp != NULL) { 918 free_result(resp); 919 } 920 return (result); 921 } 922