10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 50Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 60Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 70Sstevel@tonic-gate * with the License. 80Sstevel@tonic-gate * 90Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 100Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 110Sstevel@tonic-gate * See the License for the specific language governing permissions 120Sstevel@tonic-gate * and limitations under the License. 130Sstevel@tonic-gate * 140Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 150Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 160Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 170Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 180Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 190Sstevel@tonic-gate * 200Sstevel@tonic-gate * CDDL HEADER END 210Sstevel@tonic-gate */ 220Sstevel@tonic-gate /* 2362Sjeanm * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 240Sstevel@tonic-gate * Use is subject to license terms. 
250Sstevel@tonic-gate */ 260Sstevel@tonic-gate 270Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 280Sstevel@tonic-gate 290Sstevel@tonic-gate #include <unistd.h> 300Sstevel@tonic-gate #include <sys/types.h> 310Sstevel@tonic-gate #include <sys/stat.h> 320Sstevel@tonic-gate #include <sys/statvfs.h> 330Sstevel@tonic-gate #include <sys/uadmin.h> 340Sstevel@tonic-gate #include <fcntl.h> 350Sstevel@tonic-gate #include <stdio.h> 360Sstevel@tonic-gate #include <thread.h> 370Sstevel@tonic-gate #include <meta.h> 380Sstevel@tonic-gate #include <sdssc.h> 390Sstevel@tonic-gate #include <mdmn_changelog.h> 400Sstevel@tonic-gate #include "mdmn_subr.h" 410Sstevel@tonic-gate 420Sstevel@tonic-gate /* 430Sstevel@tonic-gate * This is the communication daemon for SVM Multi Node Disksets. 440Sstevel@tonic-gate * It runs on every node and provides the following rpc services: 450Sstevel@tonic-gate * - mdmn_send_svc_1 460Sstevel@tonic-gate * - mdmn_work_svc_1 470Sstevel@tonic-gate * - mdmn_wakeup_initiator_svc_1 480Sstevel@tonic-gate * - mdmn_wakeup_master_svc_1 490Sstevel@tonic-gate * - mdmn_comm_lock_svc_1 500Sstevel@tonic-gate * - mdmn_comm_unlock_svc_1 510Sstevel@tonic-gate * - mdmn_comm_suspend_svc_1 520Sstevel@tonic-gate * - mdmn_comm_resume_svc_1 530Sstevel@tonic-gate * - mdmn_comm_reinit_set_svc_1 540Sstevel@tonic-gate * where send, lock, unlock and reinit are meant for external use, 550Sstevel@tonic-gate * work and the two wakeups are for internal use only. 560Sstevel@tonic-gate * 570Sstevel@tonic-gate * NOTE: 580Sstevel@tonic-gate * On every node only one of those xxx_1 functions can be active at the 590Sstevel@tonic-gate * same time because the daemon is single threaded. 600Sstevel@tonic-gate * 610Sstevel@tonic-gate * 620Sstevel@tonic-gate * In case an event occurs that has to be propagated to all the nodes... 
630Sstevel@tonic-gate * 640Sstevel@tonic-gate * One node (the initiator) 650Sstevel@tonic-gate * calls the libmeta function mdmn_send_message() 660Sstevel@tonic-gate * This function calls the local daemon thru mdmn_send_svc_1. 670Sstevel@tonic-gate * 680Sstevel@tonic-gate * On the initiator: 690Sstevel@tonic-gate * mdmn_send_svc_1() 700Sstevel@tonic-gate * - starts a thread -> mdmn_send_to_work() and returns. 710Sstevel@tonic-gate * mdmn_send_to_work() 720Sstevel@tonic-gate * - sends this message over to the master of the diskset. 730Sstevel@tonic-gate * This is done by calling mdmn_work_svc_1 on the master. 740Sstevel@tonic-gate * - registers to the initiator_table 750Sstevel@tonic-gate * - exits without doing a svc_sendreply() for the call to 760Sstevel@tonic-gate * mdmn_send_svc_1. This means that call is blocked until somebody 770Sstevel@tonic-gate * (see end of this comment) does a svc_sendreply(). 780Sstevel@tonic-gate * This means mdmn_send_message() does not yet return. 790Sstevel@tonic-gate * - A timeout surveillance is started at this point. 800Sstevel@tonic-gate * This means in case the master doesn't reply at all in an 810Sstevel@tonic-gate * appropriate time, an error condition is returned 820Sstevel@tonic-gate * to the caller. 
830Sstevel@tonic-gate * 840Sstevel@tonic-gate * On the master: 850Sstevel@tonic-gate * mdmn_work_svc_1() 860Sstevel@tonic-gate * - starts a thread -> mdmn_master_process_msg() and returns 870Sstevel@tonic-gate * mdmn_master_process_msg() 880Sstevel@tonic-gate * - logs the message to the change log 890Sstevel@tonic-gate * - executes the message locally 900Sstevel@tonic-gate * - flags the message in the change log 910Sstevel@tonic-gate * - sends the message to mdmn_work_svc_1() on all the 920Sstevel@tonic-gate * other nodes (slaves) 930Sstevel@tonic-gate * after each call to mdmn_work_svc_1 the thread goes to sleep and 940Sstevel@tonic-gate * will be woken up by mdmn_wakeup_master_svc_1() as soon as the 950Sstevel@tonic-gate * slave node is done with this message. 960Sstevel@tonic-gate * - In case the slave doesn't respond in an appropriate time, an error 970Sstevel@tonic-gate * is assumed to ensure the master doesn't wait forever. 980Sstevel@tonic-gate * 990Sstevel@tonic-gate * On a slave: 1000Sstevel@tonic-gate * mdmn_work_svc_1() 1010Sstevel@tonic-gate * - starts a thread -> mdmn_slave_process_msg() and returns 1020Sstevel@tonic-gate * mdmn_slave_process_msg() 1030Sstevel@tonic-gate * - processes this message locally by calling the appropriate message 1040Sstevel@tonic-gate * handler, that creates some result. 1050Sstevel@tonic-gate * - sends that result thru a call to mdmn_wakeup_master_svc_1() to 1060Sstevel@tonic-gate * the master. 1070Sstevel@tonic-gate * 1080Sstevel@tonic-gate * Back on the master: 1090Sstevel@tonic-gate * mdmn_wakeup_master_svc_1() 1100Sstevel@tonic-gate * - stores the result into the master_table. 1110Sstevel@tonic-gate * - signals the mdmn_master_process_msg-thread. 
1120Sstevel@tonic-gate * - returns 1130Sstevel@tonic-gate * mdmn_master_process_msg() 1140Sstevel@tonic-gate * - after getting the results from all nodes 1150Sstevel@tonic-gate * - sends them back to the initiating node thru a call to 1160Sstevel@tonic-gate * mdmn_wakeup_initiator_svc_1. 1170Sstevel@tonic-gate * 1180Sstevel@tonic-gate * Back on the initiator: 1190Sstevel@tonic-gate * mdmn_wakeup_initiator_svc_1() 1200Sstevel@tonic-gate * - calls svc_sendreply() which makes the call to mdmn_send_svc_1() 1210Sstevel@tonic-gate * return. 1220Sstevel@tonic-gate * which allows the initial mdmn_send_message() call to return. 1230Sstevel@tonic-gate */ 1240Sstevel@tonic-gate 1250Sstevel@tonic-gate FILE *commdout; /* debug output for the commd */ 1260Sstevel@tonic-gate char *commdoutfile; /* file name for the above output */ 1270Sstevel@tonic-gate /* want at least 10 MB free space when logging into a file */ 1280Sstevel@tonic-gate #define MIN_FS_SPACE (10LL * 1024 * 1024) 1290Sstevel@tonic-gate 1300Sstevel@tonic-gate /* 1310Sstevel@tonic-gate * Number of outstanding messages that were initiated by this node. 
1320Sstevel@tonic-gate * If zero, check_timeouts goes to sleep 1330Sstevel@tonic-gate */ 1340Sstevel@tonic-gate uint_t messages_on_their_way; 1350Sstevel@tonic-gate mutex_t check_timeout_mutex; /* need mutex to protect above */ 1360Sstevel@tonic-gate cond_t check_timeout_cv; /* trigger for check_timeouts */ 1370Sstevel@tonic-gate 1380Sstevel@tonic-gate /* for printing out time stamps */ 1390Sstevel@tonic-gate hrtime_t __savetime; 1400Sstevel@tonic-gate 1410Sstevel@tonic-gate /* RPC clients for every set and every node and their protecting locks */ 1420Sstevel@tonic-gate CLIENT *client[MD_MAXSETS][NNODES]; 1430Sstevel@tonic-gate rwlock_t client_rwlock[MD_MAXSETS]; 1440Sstevel@tonic-gate 1450Sstevel@tonic-gate /* the descriptors of all possible sets and their protectors */ 1460Sstevel@tonic-gate struct md_set_desc *set_descriptor[MD_MAXSETS]; 1470Sstevel@tonic-gate rwlock_t set_desc_rwlock[MD_MAXSETS]; 1480Sstevel@tonic-gate 1490Sstevel@tonic-gate /* the daemon to daemon communication has to timeout quickly */ 1500Sstevel@tonic-gate static struct timeval FOUR_SECS = { 4, 0 }; 1510Sstevel@tonic-gate 1520Sstevel@tonic-gate /* These indicate if a set has already been setup */ 1530Sstevel@tonic-gate int md_mn_set_inited[MD_MAXSETS]; 1540Sstevel@tonic-gate 1550Sstevel@tonic-gate /* For every set we have a message completion table and protecting mutexes */ 1560Sstevel@tonic-gate md_mn_mct_t *mct[MD_MAXSETS]; 1570Sstevel@tonic-gate mutex_t mct_mutex[MD_MAXSETS][MD_MN_NCLASSES]; 1580Sstevel@tonic-gate 1590Sstevel@tonic-gate /* Stuff to describe the global status of the commd on one node */ 1600Sstevel@tonic-gate #define MD_CGS_INITED 0x0001 1610Sstevel@tonic-gate #define MD_CGS_ABORTED 0x0002 /* return everything with MDMNE_ABORT */ 1620Sstevel@tonic-gate uint_t md_commd_global_state = 0; /* No state when starting up */ 1630Sstevel@tonic-gate 1640Sstevel@tonic-gate /* 1650Sstevel@tonic-gate * Global verbosity level for the daemon 1660Sstevel@tonic-gate */ 
1670Sstevel@tonic-gate uint_t md_commd_global_verb; 1680Sstevel@tonic-gate 1690Sstevel@tonic-gate /* 1700Sstevel@tonic-gate * libmeta doesn't like multiple threads in metaget_setdesc(). 1710Sstevel@tonic-gate * So we must protect access to it with a global lock 1720Sstevel@tonic-gate */ 1730Sstevel@tonic-gate mutex_t get_setdesc_mutex; 1740Sstevel@tonic-gate 1750Sstevel@tonic-gate /* 1760Sstevel@tonic-gate * Need a way to block single message types, 1770Sstevel@tonic-gate * hence an array with a status for every message type 1780Sstevel@tonic-gate */ 1790Sstevel@tonic-gate uint_t msgtype_lock_state[MD_MN_NMESSAGES]; 1800Sstevel@tonic-gate 1810Sstevel@tonic-gate /* for reading in the config file */ 1820Sstevel@tonic-gate #define MAX_LINE_SIZE 1024 1830Sstevel@tonic-gate 1840Sstevel@tonic-gate extern char *commd_get_outfile(void); 1850Sstevel@tonic-gate extern uint_t commd_get_verbosity(void); 1860Sstevel@tonic-gate 1870Sstevel@tonic-gate /* 1880Sstevel@tonic-gate * mdmn_clnt_create is a helper function for meta_client_create_retry. It 1890Sstevel@tonic-gate * merely needs to call clnt_create_timed, and meta_client_create_retry 1900Sstevel@tonic-gate * will take care of the rest. 
1910Sstevel@tonic-gate */ 1920Sstevel@tonic-gate /* ARGSUSED */ 1930Sstevel@tonic-gate static CLIENT * 1940Sstevel@tonic-gate mdmn_clnt_create(char *ignore, void *data, struct timeval *time_out) 1950Sstevel@tonic-gate { 1960Sstevel@tonic-gate md_mnnode_desc *node = (md_mnnode_desc *)data; 1970Sstevel@tonic-gate 1980Sstevel@tonic-gate return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, ONE, "tcp", 1990Sstevel@tonic-gate time_out)); 2000Sstevel@tonic-gate } 2010Sstevel@tonic-gate 2020Sstevel@tonic-gate #define FLUSH_DEBUGFILE() \ 2030Sstevel@tonic-gate if (commdout != (FILE *)NULL) { \ 2040Sstevel@tonic-gate fflush(commdout); \ 2050Sstevel@tonic-gate fsync(fileno(commdout)); \ 2060Sstevel@tonic-gate } 2070Sstevel@tonic-gate 2080Sstevel@tonic-gate static void 2090Sstevel@tonic-gate panic_system(int nid, md_mn_msgtype_t type, int master_err, int master_exitval, 2100Sstevel@tonic-gate md_mn_result_t *slave_result) 2110Sstevel@tonic-gate { 2120Sstevel@tonic-gate md_mn_commd_err_t commd_err; 2130Sstevel@tonic-gate md_error_t mne = mdnullerror; 2140Sstevel@tonic-gate char *msg_buf; 2150Sstevel@tonic-gate 2160Sstevel@tonic-gate msg_buf = (char *)calloc(MAXPATHLEN + 1, sizeof (char)); 2170Sstevel@tonic-gate 2180Sstevel@tonic-gate FLUSH_DEBUGFILE(); 2190Sstevel@tonic-gate 2200Sstevel@tonic-gate if (master_err != MDMNE_ACK) { 2210Sstevel@tonic-gate snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on master " 2220Sstevel@tonic-gate "when processing message type %d\n", type); 2230Sstevel@tonic-gate } else if (slave_result == NULL) { 2240Sstevel@tonic-gate snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on node " 2250Sstevel@tonic-gate "%d when processing message type %d\n", nid, type); 2260Sstevel@tonic-gate } else { 2270Sstevel@tonic-gate snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: Inconsistent " 2280Sstevel@tonic-gate "return value from node %d when processing message " 2290Sstevel@tonic-gate "type %d. 
Master exitval = %d, Slave exitval = %d\n", 2300Sstevel@tonic-gate nid, type, master_exitval, slave_result->mmr_exitval); 2310Sstevel@tonic-gate } 2320Sstevel@tonic-gate commd_err.size = strlen(msg_buf); 23362Sjeanm commd_err.md_message = (uint64_t)(uintptr_t)&msg_buf[0]; 2340Sstevel@tonic-gate 2350Sstevel@tonic-gate metaioctl(MD_MN_COMMD_ERR, &commd_err, &mne, "rpc.mdcommd"); 2360Sstevel@tonic-gate (void) uadmin(A_DUMP, AD_BOOT, NULL); 2370Sstevel@tonic-gate } 2380Sstevel@tonic-gate 2390Sstevel@tonic-gate static void 2400Sstevel@tonic-gate flush_fcout() 2410Sstevel@tonic-gate { 2420Sstevel@tonic-gate struct statvfs64 vfsbuf; 2430Sstevel@tonic-gate long long avail_bytes; 2440Sstevel@tonic-gate int warned = 0; 2450Sstevel@tonic-gate 2460Sstevel@tonic-gate for (; ; ) { 2470Sstevel@tonic-gate sleep(10); 2480Sstevel@tonic-gate /* No output file, nothing to do */ 2490Sstevel@tonic-gate if (commdout == (FILE *)NULL) 2500Sstevel@tonic-gate continue; 2510Sstevel@tonic-gate 2520Sstevel@tonic-gate /* 2530Sstevel@tonic-gate * stat the appropriate filesystem to check for available space. 2540Sstevel@tonic-gate */ 2550Sstevel@tonic-gate if (statvfs64(commdoutfile, &vfsbuf)) { 2560Sstevel@tonic-gate continue; 2570Sstevel@tonic-gate } 2580Sstevel@tonic-gate 2590Sstevel@tonic-gate avail_bytes = vfsbuf.f_frsize * vfsbuf.f_bavail; 2600Sstevel@tonic-gate /* 2610Sstevel@tonic-gate * If we don't have enough space, we print out a warning. 2620Sstevel@tonic-gate * And we drop the verbosity level to NULL 2630Sstevel@tonic-gate * In case the condtion doesn't go away, we don't repeat 2640Sstevel@tonic-gate * the warning. 
2650Sstevel@tonic-gate */ 2660Sstevel@tonic-gate if (avail_bytes < MIN_FS_SPACE) { 2670Sstevel@tonic-gate if (warned) { 2680Sstevel@tonic-gate continue; 2690Sstevel@tonic-gate } 2700Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 2710Sstevel@tonic-gate "NOT enough space available for logging\n"); 2720Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 2730Sstevel@tonic-gate "Have %lld bytes, need %lld bytes\n", 2740Sstevel@tonic-gate avail_bytes, MIN_FS_SPACE); 2750Sstevel@tonic-gate warned = 1; 2760Sstevel@tonic-gate md_commd_global_verb = MD_MMV_NULL; 2770Sstevel@tonic-gate } else { 2780Sstevel@tonic-gate warned = 0; 2790Sstevel@tonic-gate } 2800Sstevel@tonic-gate 2810Sstevel@tonic-gate fflush(commdout); 2820Sstevel@tonic-gate } 2830Sstevel@tonic-gate } 2840Sstevel@tonic-gate 2850Sstevel@tonic-gate /* safer version of clnt_destroy. If clnt is NULL don't do anything */ 2860Sstevel@tonic-gate #define mdmn_clnt_destroy(clnt) { \ 2870Sstevel@tonic-gate if (clnt) \ 2880Sstevel@tonic-gate clnt_destroy(clnt); \ 2890Sstevel@tonic-gate } 2900Sstevel@tonic-gate 2910Sstevel@tonic-gate /* 2920Sstevel@tonic-gate * Own version of svc_sendreply that checks the integrity of the transport 2930Sstevel@tonic-gate * handle and so prevents us from core dumps in the real svc_sendreply() 2940Sstevel@tonic-gate */ 2950Sstevel@tonic-gate void 2960Sstevel@tonic-gate mdmn_svc_sendreply(SVCXPRT *transp, xdrproc_t xdr, caddr_t data) 2970Sstevel@tonic-gate { 2980Sstevel@tonic-gate if (SVC_STAT(transp) == XPRT_DIED) { 2990Sstevel@tonic-gate commd_debug(MD_MMV_MISC, 3000Sstevel@tonic-gate "mdmn_svc_sendreply: XPRT_DIED\n"); 3010Sstevel@tonic-gate return; 3020Sstevel@tonic-gate } 3030Sstevel@tonic-gate (void) svc_sendreply(transp, xdr, data); 3040Sstevel@tonic-gate } 3050Sstevel@tonic-gate 3060Sstevel@tonic-gate /* 3070Sstevel@tonic-gate * timeout_initiator(set, class) 3080Sstevel@tonic-gate * 3090Sstevel@tonic-gate * Alas, I sent a message and didn't get a response back in aproppriate time. 
3100Sstevel@tonic-gate * 3110Sstevel@tonic-gate * timeout_initiator() takes care for doing the needed svc_sendreply() to the 3120Sstevel@tonic-gate * calling mdmn_send_message, so that guy doesn't wait forever 3130Sstevel@tonic-gate * What is done here is pretty much the same as what is done in 3140Sstevel@tonic-gate * wakeup initiator. The difference is that we cannot provide for any results, 3150Sstevel@tonic-gate * of course and we set the comm_state to MDMNE_TIMEOUT. 3160Sstevel@tonic-gate * 3170Sstevel@tonic-gate * By doing so, mdmn_send_message can decide if a retry would make sense or not. 3180Sstevel@tonic-gate * It's not our's to decide that here. 3190Sstevel@tonic-gate */ 3200Sstevel@tonic-gate void 3210Sstevel@tonic-gate timeout_initiator(set_t setno, md_mn_msgclass_t class) 3220Sstevel@tonic-gate { 3230Sstevel@tonic-gate SVCXPRT *transp; 3240Sstevel@tonic-gate md_mn_msgid_t mid; 3250Sstevel@tonic-gate md_mn_result_t *resultp; 3260Sstevel@tonic-gate 3270Sstevel@tonic-gate resultp = Zalloc(sizeof (md_mn_result_t)); 3280Sstevel@tonic-gate resultp->mmr_comm_state = MDMNE_TIMEOUT; 3290Sstevel@tonic-gate 3300Sstevel@tonic-gate commd_debug(MD_MMV_MISC, 3310Sstevel@tonic-gate "timeout_initiator set = %d, class = %d\n", setno, class); 3320Sstevel@tonic-gate 3330Sstevel@tonic-gate transp = mdmn_get_initiator_table_transp(setno, class); 3340Sstevel@tonic-gate mdmn_get_initiator_table_id(setno, class, &mid); 3350Sstevel@tonic-gate 3360Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n", 3370Sstevel@tonic-gate MSGID_ELEMS(mid)); 3380Sstevel@tonic-gate 3390Sstevel@tonic-gate /* return to mdmn_send_message() and let it deal with the situation */ 3400Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 3410Sstevel@tonic-gate 3420Sstevel@tonic-gate free(resultp); 3430Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n"); 3440Sstevel@tonic-gate mdmn_unregister_initiator_table(setno, class); 
3450Sstevel@tonic-gate } 3460Sstevel@tonic-gate 3470Sstevel@tonic-gate 3480Sstevel@tonic-gate /* 3490Sstevel@tonic-gate * check_timeouts - thread 3500Sstevel@tonic-gate * 3510Sstevel@tonic-gate * This implements a timeout surveillance for messages sent from the 3520Sstevel@tonic-gate * initiator to the master. 3530Sstevel@tonic-gate * 3540Sstevel@tonic-gate * If a message is started, this thread is triggered thru 3550Sstevel@tonic-gate * cond_signal(&check_timeout_cv) and we keep track of the numbers of 3560Sstevel@tonic-gate * messages that are outstanding (messages_on_their_way). 3570Sstevel@tonic-gate * 3580Sstevel@tonic-gate * As long as there are messages on their way, this thread never goes to sleep. 3590Sstevel@tonic-gate * It'll keep checking all class/set combinations for outstanding messages. 3600Sstevel@tonic-gate * If one is found, it's checked if this message is overdue. In that case, 3610Sstevel@tonic-gate * timeout_initiator() is called to wakeup the calling mdmn_send_message and 3620Sstevel@tonic-gate * to clean up the mess. 3630Sstevel@tonic-gate * 3640Sstevel@tonic-gate * If the result from the master arrives later, this message is considered 3650Sstevel@tonic-gate * to be unsolicited. And will be ignored. 
3660Sstevel@tonic-gate */ 3670Sstevel@tonic-gate 3680Sstevel@tonic-gate void 3690Sstevel@tonic-gate check_timeouts() 3700Sstevel@tonic-gate { 3710Sstevel@tonic-gate set_t setno; 3720Sstevel@tonic-gate time_t now, then; 3730Sstevel@tonic-gate mutex_t *mx; 3740Sstevel@tonic-gate md_mn_msgclass_t class; 3750Sstevel@tonic-gate 3760Sstevel@tonic-gate for (; ; ) { 3770Sstevel@tonic-gate now = time((time_t *)NULL); 3780Sstevel@tonic-gate for (setno = 1; setno < MD_MAXSETS; setno++) { 3790Sstevel@tonic-gate if (md_mn_set_inited[setno] != MDMN_SET_READY) { 3800Sstevel@tonic-gate continue; 3810Sstevel@tonic-gate } 3820Sstevel@tonic-gate for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; 3830Sstevel@tonic-gate class++) { 3840Sstevel@tonic-gate mx = mdmn_get_initiator_table_mx(setno, class); 3850Sstevel@tonic-gate mutex_lock(mx); 3860Sstevel@tonic-gate 3870Sstevel@tonic-gate /* then is the registered time */ 3880Sstevel@tonic-gate then = 3890Sstevel@tonic-gate mdmn_get_initiator_table_time(setno, class); 3900Sstevel@tonic-gate if ((then != 0) && (now > then)) { 3910Sstevel@tonic-gate timeout_initiator(setno, class); 3920Sstevel@tonic-gate } 3930Sstevel@tonic-gate mutex_unlock(mx); 3940Sstevel@tonic-gate } 3950Sstevel@tonic-gate } 3960Sstevel@tonic-gate /* it's ok to check only once per second */ 3970Sstevel@tonic-gate sleep(1); 3980Sstevel@tonic-gate 3990Sstevel@tonic-gate /* is there work to do? 
*/ 4000Sstevel@tonic-gate mutex_lock(&check_timeout_mutex); 4010Sstevel@tonic-gate if (messages_on_their_way == 0) { 4020Sstevel@tonic-gate cond_wait(&check_timeout_cv, &check_timeout_mutex); 4030Sstevel@tonic-gate } 4040Sstevel@tonic-gate mutex_unlock(&check_timeout_mutex); 4050Sstevel@tonic-gate } 4060Sstevel@tonic-gate } 4070Sstevel@tonic-gate 4080Sstevel@tonic-gate void 4090Sstevel@tonic-gate setup_debug(void) 4100Sstevel@tonic-gate { 4110Sstevel@tonic-gate char *tmp_dir; 4120Sstevel@tonic-gate 4130Sstevel@tonic-gate /* Read in the debug-controlling tokens from runtime.cf */ 4140Sstevel@tonic-gate md_commd_global_verb = commd_get_verbosity(); 4150Sstevel@tonic-gate /* 4160Sstevel@tonic-gate * If the user didn't specify a verbosity level in runtime.cf 4170Sstevel@tonic-gate * we can safely return here. As we don't intend to printout 4180Sstevel@tonic-gate * debug messages, we don't need to check for the output file. 4190Sstevel@tonic-gate */ 4200Sstevel@tonic-gate if (md_commd_global_verb == 0) { 4210Sstevel@tonic-gate return; 4220Sstevel@tonic-gate } 4230Sstevel@tonic-gate 4240Sstevel@tonic-gate /* if commdout is non-NULL it is an open FILE, we'd better close it */ 4250Sstevel@tonic-gate if (commdout != (FILE *)NULL) { 4260Sstevel@tonic-gate fclose(commdout); 4270Sstevel@tonic-gate } 4280Sstevel@tonic-gate 4290Sstevel@tonic-gate commdoutfile = commd_get_outfile(); 4300Sstevel@tonic-gate 4310Sstevel@tonic-gate /* setup the debug output */ 4320Sstevel@tonic-gate if (commdoutfile == (char *)NULL) { 4330Sstevel@tonic-gate /* if no valid file was specified, use the default */ 4340Sstevel@tonic-gate commdoutfile = "/var/run/commd.out"; 4350Sstevel@tonic-gate commdout = fopen(commdoutfile, "a"); 4360Sstevel@tonic-gate } else { 4370Sstevel@tonic-gate /* check if the directory exists and is writable */ 4380Sstevel@tonic-gate tmp_dir = strdup(commdoutfile); 4390Sstevel@tonic-gate if ((access(dirname(tmp_dir), X_OK|W_OK)) || 4400Sstevel@tonic-gate ((commdout = 
fopen(commdoutfile, "a")) == (FILE *)NULL)) { 4410Sstevel@tonic-gate syslog(LOG_ERR, 4420Sstevel@tonic-gate "Can't write to specified output file %s,\n" 4430Sstevel@tonic-gate "using /var/run/commd.out instead\n", commdoutfile); 4440Sstevel@tonic-gate free(commdoutfile); 4450Sstevel@tonic-gate commdoutfile = "/var/run/commd.out"; 4460Sstevel@tonic-gate commdout = fopen(commdoutfile, "a"); 4470Sstevel@tonic-gate } 4480Sstevel@tonic-gate free(tmp_dir); 4490Sstevel@tonic-gate } 4500Sstevel@tonic-gate 4510Sstevel@tonic-gate if (commdout == (FILE *)NULL) { 4520Sstevel@tonic-gate syslog(LOG_ERR, "Can't write to debug output file %s\n", 4530Sstevel@tonic-gate commdoutfile); 4540Sstevel@tonic-gate } 4550Sstevel@tonic-gate } 456*393Sskamm 457*393Sskamm /* 458*393Sskamm * mdmn_is_node_dead checks to see if a node is dead using 459*393Sskamm * the SunCluster infrastructure which is a stable interface. 460*393Sskamm * If unable to contact SunCuster the node is assumed to be alive. 461*393Sskamm * Return values: 462*393Sskamm * 1 - node is dead 463*393Sskamm * 0 - node is alive 464*393Sskamm */ 465*393Sskamm int 466*393Sskamm mdmn_is_node_dead(md_mnnode_desc *node) 467*393Sskamm { 468*393Sskamm char *fmt = "/usr/cluster/bin/scha_cluster_get -O NODESTATE_NODE "; 469*393Sskamm char *cmd; 470*393Sskamm size_t size; 471*393Sskamm char buf[10]; 472*393Sskamm FILE *ptr; 473*393Sskamm int retval = 0; 474*393Sskamm 475*393Sskamm /* I know that I'm alive */ 476*393Sskamm if (strcmp(node->nd_nodename, mynode()) == 0) 477*393Sskamm return (retval); 478*393Sskamm 479*393Sskamm size = strlen(fmt) + strlen(node->nd_nodename) + 1; 480*393Sskamm cmd = Zalloc(size); 481*393Sskamm (void) strlcat(cmd, fmt, size); 482*393Sskamm (void) strlcat(cmd, node->nd_nodename, size); 483*393Sskamm 484*393Sskamm if ((ptr = popen(cmd, "r")) != NULL) { 485*393Sskamm if (fgets(buf, sizeof (buf), ptr) != NULL) { 486*393Sskamm /* If scha_cluster_get returned DOWN - return dead */ 487*393Sskamm if (strncmp(buf, 
"DOWN", 4) == 0) 488*393Sskamm retval = 1; 489*393Sskamm } 490*393Sskamm (void) pclose(ptr); 491*393Sskamm } 492*393Sskamm Free(cmd); 493*393Sskamm return (retval); 494*393Sskamm } 495*393Sskamm 4960Sstevel@tonic-gate /* 4970Sstevel@tonic-gate * global_init() 4980Sstevel@tonic-gate * 4990Sstevel@tonic-gate * Perform some global initializations. 5000Sstevel@tonic-gate * 5010Sstevel@tonic-gate * the following routines have to call this before operation can start: 5020Sstevel@tonic-gate * - mdmn_send_svc_1 5030Sstevel@tonic-gate * - mdmn_work_svc_1 5040Sstevel@tonic-gate * - mdmn_comm_lock_svc_1 5050Sstevel@tonic-gate * - mdmn_comm_unlock_svc_1 5060Sstevel@tonic-gate * - mdmn_comm_suspend_svc_1 5070Sstevel@tonic-gate * - mdmn_comm_resume_svc_1 5080Sstevel@tonic-gate * - mdmn_comm_reinit_set_svc_1 5090Sstevel@tonic-gate * 5100Sstevel@tonic-gate * This is a single threaded daemon, so it can only be in one of the above 5110Sstevel@tonic-gate * routines at the same time. 5120Sstevel@tonic-gate * This means, global_init() cannot be called more than once at the same time. 5130Sstevel@tonic-gate * Hence, no lock is needed. 
5140Sstevel@tonic-gate */ 5150Sstevel@tonic-gate void 5160Sstevel@tonic-gate global_init(void) 5170Sstevel@tonic-gate { 5180Sstevel@tonic-gate set_t set; 5190Sstevel@tonic-gate md_mn_msgclass_t class; 5200Sstevel@tonic-gate struct sigaction sighandler; 5210Sstevel@tonic-gate time_t clock_val; 5220Sstevel@tonic-gate 5230Sstevel@tonic-gate /* Do these global initializations only once */ 5240Sstevel@tonic-gate if (md_commd_global_state & MD_CGS_INITED) { 5250Sstevel@tonic-gate return; 5260Sstevel@tonic-gate } 5270Sstevel@tonic-gate (void) sdssc_bind_library(); 5280Sstevel@tonic-gate 5290Sstevel@tonic-gate /* setup the debug options from the config file */ 5300Sstevel@tonic-gate setup_debug(); 5310Sstevel@tonic-gate 5320Sstevel@tonic-gate /* Make setup_debug() be the action in case of SIGHUP */ 5330Sstevel@tonic-gate sighandler.sa_flags = 0; 5340Sstevel@tonic-gate sigfillset(&sighandler.sa_mask); 5350Sstevel@tonic-gate sighandler.sa_handler = (void (*)(int)) setup_debug; 5360Sstevel@tonic-gate sigaction(SIGHUP, &sighandler, NULL); 5370Sstevel@tonic-gate 5380Sstevel@tonic-gate __savetime = gethrtime(); 5390Sstevel@tonic-gate (void) time(&clock_val); 5400Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "global init called %s\n", 5410Sstevel@tonic-gate ctime(&clock_val)); 5420Sstevel@tonic-gate 5430Sstevel@tonic-gate /* start a thread that flushes out the debug on a regular basis */ 5440Sstevel@tonic-gate thr_create(NULL, 0, (void *(*)(void *))flush_fcout, 5450Sstevel@tonic-gate (void *) NULL, THR_DETACHED, NULL); 5460Sstevel@tonic-gate 5470Sstevel@tonic-gate /* global rwlock's / mutex's / cond_t's go here */ 5480Sstevel@tonic-gate mutex_init(&check_timeout_mutex, USYNC_THREAD, NULL); 5490Sstevel@tonic-gate cond_init(&check_timeout_cv, USYNC_THREAD, NULL); 5500Sstevel@tonic-gate mutex_init(&get_setdesc_mutex, USYNC_THREAD, NULL); 5510Sstevel@tonic-gate 5520Sstevel@tonic-gate /* Make sure the initiator table is initialized correctly */ 5530Sstevel@tonic-gate for (set = 0; set < 
MD_MAXSETS; set++) { 5540Sstevel@tonic-gate for (class = 0; class < MD_MN_NCLASSES; class++) { 5550Sstevel@tonic-gate mdmn_unregister_initiator_table(set, class); 5560Sstevel@tonic-gate } 5570Sstevel@tonic-gate } 5580Sstevel@tonic-gate 5590Sstevel@tonic-gate 5600Sstevel@tonic-gate /* setup the check for timeouts */ 5610Sstevel@tonic-gate thr_create(NULL, 0, (void *(*)(void *))check_timeouts, 5620Sstevel@tonic-gate (void *) NULL, THR_DETACHED, NULL); 5630Sstevel@tonic-gate 5640Sstevel@tonic-gate md_commd_global_state |= MD_CGS_INITED; 5650Sstevel@tonic-gate } 5660Sstevel@tonic-gate 5670Sstevel@tonic-gate 5680Sstevel@tonic-gate /* 5690Sstevel@tonic-gate * mdmn_init_client(setno, nodeid) 5700Sstevel@tonic-gate * called if client[setno][nodeid] is NULL 5710Sstevel@tonic-gate * 5720Sstevel@tonic-gate * NOTE: Must be called with set_desc_rwlock held as a reader 5730Sstevel@tonic-gate * NOTE: Must be called with client_rwlock held as a writer 5740Sstevel@tonic-gate * 5750Sstevel@tonic-gate * If the rpc client for this node has not been setup for any set, we do it now. 5760Sstevel@tonic-gate * 5770Sstevel@tonic-gate * Returns 0 on success (node found in set, rpc client setup) 5780Sstevel@tonic-gate * -1 if metaget_setdesc failed, 5790Sstevel@tonic-gate * -2 if node not part of set 5800Sstevel@tonic-gate * -3 if clnt_create fails 5810Sstevel@tonic-gate */ 5820Sstevel@tonic-gate static int 5830Sstevel@tonic-gate mdmn_init_client(set_t setno, md_mn_nodeid_t nid) 5840Sstevel@tonic-gate { 5850Sstevel@tonic-gate md_error_t ep = mdnullerror; 5860Sstevel@tonic-gate md_mnnode_desc *node; 5870Sstevel@tonic-gate md_set_desc *sd; /* just an abbr for set_descriptor[setno] */ 5880Sstevel@tonic-gate 5890Sstevel@tonic-gate sd = set_descriptor[setno]; 5900Sstevel@tonic-gate 5910Sstevel@tonic-gate /* 5920Sstevel@tonic-gate * Is the appropriate set_descriptor already initialized ? 
5930Sstevel@tonic-gate * Can't think of a scenario where this is not the case, but we'd better 5940Sstevel@tonic-gate * check for it anyway. 5950Sstevel@tonic-gate */ 5960Sstevel@tonic-gate if (sd == NULL) { 5970Sstevel@tonic-gate mdsetname_t *sp; 5980Sstevel@tonic-gate 5990Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); /* readlock -> writelock */ 6000Sstevel@tonic-gate rw_wrlock(&set_desc_rwlock[setno]); 6010Sstevel@tonic-gate sp = metasetnosetname(setno, &ep); 6020Sstevel@tonic-gate /* Only one thread is supposed to be in metaget_setdesc() */ 6030Sstevel@tonic-gate mutex_lock(&get_setdesc_mutex); 6040Sstevel@tonic-gate sd = metaget_setdesc(sp, &ep); 6050Sstevel@tonic-gate mutex_unlock(&get_setdesc_mutex); 6060Sstevel@tonic-gate if (sd == NULL) { 6070Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); /* back to ... */ 6080Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); /* ... readlock */ 6090Sstevel@tonic-gate return (-1); 6100Sstevel@tonic-gate } 6110Sstevel@tonic-gate set_descriptor[setno] = sd; 6120Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); /* back to readlock */ 6130Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); 6140Sstevel@tonic-gate } 6150Sstevel@tonic-gate 6160Sstevel@tonic-gate /* first we have to find the node name for this node id */ 6170Sstevel@tonic-gate for (node = sd->sd_nodelist; node; node = node->nd_next) { 6180Sstevel@tonic-gate if (node->nd_nodeid == nid) 6190Sstevel@tonic-gate break; /* we found our node in this set */ 6200Sstevel@tonic-gate } 6210Sstevel@tonic-gate 6220Sstevel@tonic-gate 6230Sstevel@tonic-gate if (node == (md_mnnode_desc *)NULL) { 6240Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 6250Sstevel@tonic-gate "FATAL: node %d not found in set %d\n", nid, setno); 6260Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 6270Sstevel@tonic-gate return (-2); 6280Sstevel@tonic-gate } 6290Sstevel@tonic-gate 6300Sstevel@tonic-gate commd_debug(MD_MMV_INIT, "init: %s has the flags: 0x%x\n", 
6310Sstevel@tonic-gate node->nd_nodename ? node->nd_nodename : "NULL", node->nd_flags); 6320Sstevel@tonic-gate 6330Sstevel@tonic-gate /* Did this node join the diskset? */ 6340Sstevel@tonic-gate if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 6350Sstevel@tonic-gate commd_debug(MD_MMV_INIT, "init: %s didn't join set %d\n", 6360Sstevel@tonic-gate node->nd_nodename ? node->nd_nodename : "NULL", setno); 6370Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 6380Sstevel@tonic-gate return (-2); 6390Sstevel@tonic-gate } 6400Sstevel@tonic-gate 6410Sstevel@tonic-gate /* if clnt_create has not been done for that node, do it now */ 6420Sstevel@tonic-gate if (client[setno][nid] == (CLIENT *) NULL) { 643*393Sskamm time_t tout = 0; 644*393Sskamm 645*393Sskamm /* 646*393Sskamm * While trying to create a connection to a node, 647*393Sskamm * periodically check to see if the node has been marked 648*393Sskamm * dead by the SunCluster infrastructure. 649*393Sskamm * This periodic check is needed since a non-responsive 650*393Sskamm * rpc.mdcommd (while it is attempting to create a connection 651*393Sskamm * to a dead node) can lead to large delays and/or failures 652*393Sskamm * in the reconfig steps. 653*393Sskamm */ 654*393Sskamm while ((client[setno][nid] == (CLIENT *) NULL) && 655*393Sskamm (tout < MD_CLNT_CREATE_TOUT)) { 656*393Sskamm client[setno][nid] = meta_client_create_retry 657*393Sskamm (node->nd_nodename, mdmn_clnt_create, 658*393Sskamm (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); 659*393Sskamm /* Is the node dead? 
*/ 660*393Sskamm if (mdmn_is_node_dead(node) == 1) { 661*393Sskamm commd_debug(MD_MMV_SYSLOG, 662*393Sskamm "rpc.mdcommd: no client for dead node %s\n", 663*393Sskamm node->nd_nodename); 664*393Sskamm break; 665*393Sskamm } else 666*393Sskamm tout += MD_CLNT_CREATE_SUBTIMEOUT; 667*393Sskamm } 668*393Sskamm 6690Sstevel@tonic-gate if (client[setno][nid] == (CLIENT *) NULL) { 6700Sstevel@tonic-gate clnt_pcreateerror(node->nd_nodename); 6710Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 6720Sstevel@tonic-gate return (-3); 6730Sstevel@tonic-gate } 6740Sstevel@tonic-gate /* this node has the license to send */ 6750Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "init_client: calling add_lic\n"); 6760Sstevel@tonic-gate add_license(node); 6770Sstevel@tonic-gate 6780Sstevel@tonic-gate /* set the timeout value */ 6790Sstevel@tonic-gate clnt_control(client[setno][nid], CLSET_TIMEOUT, 6800Sstevel@tonic-gate (char *)&FOUR_SECS); 6810Sstevel@tonic-gate 6820Sstevel@tonic-gate } 6830Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 6840Sstevel@tonic-gate return (0); 6850Sstevel@tonic-gate } 6860Sstevel@tonic-gate 6870Sstevel@tonic-gate /* 6880Sstevel@tonic-gate * check_client(setno, nodeid) 6890Sstevel@tonic-gate * 6900Sstevel@tonic-gate * must be called with reader lock held for set_desc_rwlock[setno] 6910Sstevel@tonic-gate * and must be called with reader lock held for client_rwlock[setno] 6920Sstevel@tonic-gate * Checks if the client for this set/node combination is already setup 6930Sstevel@tonic-gate * if not it upgrades the lock to a writer lock 6940Sstevel@tonic-gate * and tries to initialize the client. 
6950Sstevel@tonic-gate * Finally it's checked if the client nulled out again due to some race 6960Sstevel@tonic-gate * 6970Sstevel@tonic-gate * returns 0 if there is a usable client 6980Sstevel@tonic-gate * returns MDMNE_RPC_FAIL otherwise 6990Sstevel@tonic-gate */ 7000Sstevel@tonic-gate static int 7010Sstevel@tonic-gate check_client(set_t setno, md_mn_nodeid_t nodeid) 7020Sstevel@tonic-gate { 7030Sstevel@tonic-gate int ret = 0; 7040Sstevel@tonic-gate 7050Sstevel@tonic-gate while ((client[setno][nodeid] == (CLIENT *)NULL) && (ret == 0)) { 7060Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); /* upgrade reader ... */ 7070Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); /* ... to writer lock. */ 7080Sstevel@tonic-gate if (mdmn_init_client(setno, nodeid) != 0) { 7090Sstevel@tonic-gate ret = MDMNE_RPC_FAIL; 7100Sstevel@tonic-gate } 7110Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); /* downgrade writer ... */ 7120Sstevel@tonic-gate rw_rdlock(&client_rwlock[setno]); /* ... back to reader lock. */ 7130Sstevel@tonic-gate } 7140Sstevel@tonic-gate return (ret); 7150Sstevel@tonic-gate } 7160Sstevel@tonic-gate 7170Sstevel@tonic-gate /* 7180Sstevel@tonic-gate * mdmn_init_set(setno, todo) 7190Sstevel@tonic-gate * setno is the number of the set to be initialized. 7200Sstevel@tonic-gate * todo is one of the MDMN_SET_* thingies or MDMN_SET_READY 7210Sstevel@tonic-gate * If called with MDMN_SET_READY everything is initialized. 
7220Sstevel@tonic-gate * 7230Sstevel@tonic-gate * If the set mutexes are already initialized, the caller has to hold 7240Sstevel@tonic-gate * both set_desc_rwlock[setno] and client_rwlock[setno] as a writer, before 7250Sstevel@tonic-gate * calling mdmn_init_set() 7260Sstevel@tonic-gate */ 7270Sstevel@tonic-gate int 7280Sstevel@tonic-gate mdmn_init_set(set_t setno, int todo) 7290Sstevel@tonic-gate { 7300Sstevel@tonic-gate int class; 7310Sstevel@tonic-gate md_mnnode_desc *node; 7320Sstevel@tonic-gate md_set_desc *sd; /* just an abbr for set_descriptor[setno] */ 7330Sstevel@tonic-gate mdsetname_t *sp; 7340Sstevel@tonic-gate md_error_t ep = mdnullerror; 7350Sstevel@tonic-gate md_mn_nodeid_t nid; 7360Sstevel@tonic-gate 7370Sstevel@tonic-gate /* 7380Sstevel@tonic-gate * Check if we are told to setup the mutexes and 7390Sstevel@tonic-gate * if these are not yet setup 7400Sstevel@tonic-gate */ 7410Sstevel@tonic-gate if ((todo & MDMN_SET_MUTEXES) && 7420Sstevel@tonic-gate ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0)) { 7430Sstevel@tonic-gate mutex_init(&mdmn_busy_mutex[setno], USYNC_THREAD, NULL); 7440Sstevel@tonic-gate cond_init(&mdmn_busy_cv[setno], USYNC_THREAD, NULL); 7450Sstevel@tonic-gate rwlock_init(&client_rwlock[setno], USYNC_THREAD, NULL); 7460Sstevel@tonic-gate rwlock_init(&set_desc_rwlock[setno], USYNC_THREAD, NULL); 7470Sstevel@tonic-gate 7480Sstevel@tonic-gate for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 7490Sstevel@tonic-gate mutex_init(mdmn_get_master_table_mx(setno, class), 7500Sstevel@tonic-gate USYNC_THREAD, NULL); 7510Sstevel@tonic-gate cond_init(mdmn_get_master_table_cv(setno, class), 7520Sstevel@tonic-gate USYNC_THREAD, NULL); 7530Sstevel@tonic-gate mutex_init(mdmn_get_initiator_table_mx(setno, class), 7540Sstevel@tonic-gate USYNC_THREAD, NULL); 7550Sstevel@tonic-gate } 7560Sstevel@tonic-gate md_mn_set_inited[setno] |= MDMN_SET_MUTEXES; 7570Sstevel@tonic-gate } 7580Sstevel@tonic-gate if ((todo & MDMN_SET_MCT) && 
7590Sstevel@tonic-gate ((md_mn_set_inited[setno] & MDMN_SET_MCT) == 0)) { 7600Sstevel@tonic-gate int fd; 7610Sstevel@tonic-gate size_t filesize; 7620Sstevel@tonic-gate caddr_t addr; 7630Sstevel@tonic-gate char table_name[32]; 7640Sstevel@tonic-gate 7650Sstevel@tonic-gate filesize = (sizeof (md_mn_mct_t)); 7660Sstevel@tonic-gate (void) snprintf(table_name, sizeof (table_name), "%s%d", 7670Sstevel@tonic-gate MD_MN_MSG_COMP_TABLE, setno); 7680Sstevel@tonic-gate /* 7690Sstevel@tonic-gate * If the mct file exists we map it into memory. 7700Sstevel@tonic-gate * Otherwise we create an empty file of appropriate 7710Sstevel@tonic-gate * size and map that into memory. 7720Sstevel@tonic-gate * The mapped areas are stored in mct[setno]. 7730Sstevel@tonic-gate */ 7740Sstevel@tonic-gate fd = open(table_name, O_RDWR|O_CREAT|O_DSYNC, 0600); 7750Sstevel@tonic-gate if (fd < 0) { 7760Sstevel@tonic-gate commd_debug(MD_MMV_MISC, 7770Sstevel@tonic-gate "init_set: Can't open MCT\n"); 7780Sstevel@tonic-gate return (-1); 7790Sstevel@tonic-gate } 7800Sstevel@tonic-gate /* 7810Sstevel@tonic-gate * To ensure that the file has the appropriate size, 7820Sstevel@tonic-gate * we write a byte at the end of the file. 
7830Sstevel@tonic-gate */ 7840Sstevel@tonic-gate lseek(fd, filesize + 1, SEEK_SET); 7850Sstevel@tonic-gate write(fd, "\0", 1); 7860Sstevel@tonic-gate 7870Sstevel@tonic-gate /* at this point we have a file in place that we can mmap */ 7880Sstevel@tonic-gate addr = mmap(0, filesize, PROT_READ | PROT_WRITE, 7890Sstevel@tonic-gate MAP_SHARED, fd, (off_t)0); 7900Sstevel@tonic-gate if (addr == MAP_FAILED) { 7910Sstevel@tonic-gate commd_debug(MD_MMV_INIT, 7920Sstevel@tonic-gate "init_set: mmap mct error %d\n", 7930Sstevel@tonic-gate errno); 7940Sstevel@tonic-gate return (-1); 7950Sstevel@tonic-gate } 7960Sstevel@tonic-gate /* LINTED pointer alignment */ 7970Sstevel@tonic-gate mct[setno] = (md_mn_mct_t *)addr; 7980Sstevel@tonic-gate 7990Sstevel@tonic-gate /* finally we initialize the mutexes that protect the mct */ 8000Sstevel@tonic-gate for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 8010Sstevel@tonic-gate mutex_init(&(mct_mutex[setno][class]), 8020Sstevel@tonic-gate USYNC_THREAD, NULL); 8030Sstevel@tonic-gate } 8040Sstevel@tonic-gate 8050Sstevel@tonic-gate md_mn_set_inited[setno] |= MDMN_SET_MCT; 8060Sstevel@tonic-gate } 8070Sstevel@tonic-gate /* 8080Sstevel@tonic-gate * Check if we are told to setup the nodes and 8090Sstevel@tonic-gate * if these are not yet setup 8100Sstevel@tonic-gate * (Attention: negative logic here compared to above!) 
8110Sstevel@tonic-gate */ 8120Sstevel@tonic-gate if (((todo & MDMN_SET_NODES) == 0) || 8130Sstevel@tonic-gate (md_mn_set_inited[setno] & MDMN_SET_NODES)) { 8140Sstevel@tonic-gate return (0); /* success */ 8150Sstevel@tonic-gate } 8160Sstevel@tonic-gate 8170Sstevel@tonic-gate if ((sp = metasetnosetname(setno, &ep)) == NULL) { 8180Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 8190Sstevel@tonic-gate "metasetnosetname(%d) returned NULL\n", setno); 8200Sstevel@tonic-gate return (MDMNE_NOT_JOINED); 8210Sstevel@tonic-gate } 8220Sstevel@tonic-gate 8230Sstevel@tonic-gate /* flush local copy of rpc.metad data */ 8240Sstevel@tonic-gate metaflushsetname(sp); 8250Sstevel@tonic-gate 8260Sstevel@tonic-gate mutex_lock(&get_setdesc_mutex); 8270Sstevel@tonic-gate sd = metaget_setdesc(sp, &ep); 8280Sstevel@tonic-gate mutex_unlock(&get_setdesc_mutex); 8290Sstevel@tonic-gate 8300Sstevel@tonic-gate if (sd == NULL) { 8310Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 8320Sstevel@tonic-gate "metaget_setdesc(%d) returned NULL\n", setno); 8330Sstevel@tonic-gate return (MDMNE_NOT_JOINED); 8340Sstevel@tonic-gate } 8350Sstevel@tonic-gate 8360Sstevel@tonic-gate /* 8370Sstevel@tonic-gate * if this set is not a multinode set or 8380Sstevel@tonic-gate * this node didn't join yet the diskset, better don't do anything 8390Sstevel@tonic-gate */ 8400Sstevel@tonic-gate if ((MD_MNSET_DESC(sd) == 0) || 8410Sstevel@tonic-gate (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN) == 0) { 8420Sstevel@tonic-gate commd_debug(MD_MMV_INIT, "didn't yet join set %d\n", setno); 8430Sstevel@tonic-gate return (MDMNE_NOT_JOINED); 8440Sstevel@tonic-gate } 8450Sstevel@tonic-gate 8460Sstevel@tonic-gate for (node = sd->sd_nodelist; node != NULL; node = node->nd_next) { 847*393Sskamm time_t tout = 0; 8480Sstevel@tonic-gate nid = node->nd_nodeid; 8490Sstevel@tonic-gate 8500Sstevel@tonic-gate commd_debug(MD_MMV_INIT, 8510Sstevel@tonic-gate "setting up: node=%s, priv_ic=%s, flags=0x%x\n", 8520Sstevel@tonic-gate node->nd_nodename ? 
node->nd_nodename : "NULL", 8530Sstevel@tonic-gate node->nd_priv_ic ? node->nd_priv_ic : "NULL", 8540Sstevel@tonic-gate node->nd_flags); 8550Sstevel@tonic-gate 8560Sstevel@tonic-gate if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 8570Sstevel@tonic-gate commd_debug(MD_MMV_INIT, 8580Sstevel@tonic-gate "init: %s didn't join set %d\n", 8590Sstevel@tonic-gate node->nd_nodename ? node->nd_nodename : "NULL", 8600Sstevel@tonic-gate setno); 8610Sstevel@tonic-gate continue; 8620Sstevel@tonic-gate } 8630Sstevel@tonic-gate 8640Sstevel@tonic-gate if (client[setno][nid] != (CLIENT *) NULL) { 8650Sstevel@tonic-gate /* already inited */ 8660Sstevel@tonic-gate commd_debug(MD_MMV_INIT, "init: already: node=%s\n", 8670Sstevel@tonic-gate node->nd_nodename ? node->nd_nodename : "NULL"); 8680Sstevel@tonic-gate continue; 8690Sstevel@tonic-gate } 870*393Sskamm 871*393Sskamm /* 872*393Sskamm * While trying to create a connection to a node, 873*393Sskamm * periodically check to see if the node has been marked 874*393Sskamm * dead by the SunCluster infrastructure. 875*393Sskamm * This periodic check is needed since a non-responsive 876*393Sskamm * rpc.mdcommd (while it is attempting to create a connection 877*393Sskamm * to a dead node) can lead to large delays and/or failures 878*393Sskamm * in the reconfig steps. 879*393Sskamm */ 880*393Sskamm while ((client[setno][nid] == (CLIENT *) NULL) && 881*393Sskamm (tout < MD_CLNT_CREATE_TOUT)) { 882*393Sskamm client[setno][nid] = meta_client_create_retry 883*393Sskamm (node->nd_nodename, mdmn_clnt_create, 884*393Sskamm (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); 885*393Sskamm /* Is the node dead? 
*/ 886*393Sskamm if (mdmn_is_node_dead(node) == 1) { 887*393Sskamm commd_debug(MD_MMV_SYSLOG, 888*393Sskamm "rpc.mdcommd: no client for dead node %s\n", 889*393Sskamm node->nd_nodename); 890*393Sskamm break; 891*393Sskamm } else 892*393Sskamm tout += MD_CLNT_CREATE_SUBTIMEOUT; 893*393Sskamm } 8940Sstevel@tonic-gate 8950Sstevel@tonic-gate if (client[setno][nid] == (CLIENT *) NULL) { 8960Sstevel@tonic-gate clnt_pcreateerror(node->nd_nodename); 8970Sstevel@tonic-gate /* 8980Sstevel@tonic-gate * If we cannot connect to a single node 8990Sstevel@tonic-gate * (maybe because it is down) we mark this node as not 9000Sstevel@tonic-gate * owned and continue with the next node in the list. 9010Sstevel@tonic-gate * This is better than failing the entire starting up 9020Sstevel@tonic-gate * of the commd system. 9030Sstevel@tonic-gate */ 9040Sstevel@tonic-gate node->nd_flags &= ~MD_MN_NODE_OWN; 9050Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 9060Sstevel@tonic-gate "WARNING couldn't create client for %s\n" 9070Sstevel@tonic-gate "Reconfig cycle required\n", 9080Sstevel@tonic-gate node->nd_nodename); 9090Sstevel@tonic-gate commd_debug(MD_MMV_INIT, 9100Sstevel@tonic-gate "WARNING couldn't create client for %s\n" 9110Sstevel@tonic-gate "Reconfig cycle required\n", 9120Sstevel@tonic-gate node->nd_nodename); 9130Sstevel@tonic-gate continue; 9140Sstevel@tonic-gate } 9150Sstevel@tonic-gate /* this node has the license to send */ 9160Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "init_set: calling add_lic\n"); 9170Sstevel@tonic-gate add_license(node); 9180Sstevel@tonic-gate 9190Sstevel@tonic-gate /* set the timeout value */ 9200Sstevel@tonic-gate clnt_control(client[setno][nid], CLSET_TIMEOUT, 9210Sstevel@tonic-gate (char *)&FOUR_SECS); 9220Sstevel@tonic-gate 9230Sstevel@tonic-gate commd_debug(MD_MMV_INIT, "init: done: node=%s\n", 9240Sstevel@tonic-gate node->nd_nodename ? 
node->nd_nodename : "NULL"); 9250Sstevel@tonic-gate } 9260Sstevel@tonic-gate 9270Sstevel@tonic-gate set_descriptor[setno] = sd; 9280Sstevel@tonic-gate md_mn_set_inited[setno] |= MDMN_SET_NODES; 9290Sstevel@tonic-gate return (0); /* success */ 9300Sstevel@tonic-gate } 9310Sstevel@tonic-gate 9320Sstevel@tonic-gate void * 9330Sstevel@tonic-gate mdmn_send_to_work(void *arg) 9340Sstevel@tonic-gate { 9350Sstevel@tonic-gate int *rpc_err; 9360Sstevel@tonic-gate int success; 9370Sstevel@tonic-gate int try_master; 9380Sstevel@tonic-gate set_t setno; 9390Sstevel@tonic-gate mutex_t *mx; /* protection for initiator_table */ 9400Sstevel@tonic-gate SVCXPRT *transp; 9410Sstevel@tonic-gate md_mn_msg_t *msg; 9420Sstevel@tonic-gate md_mn_nodeid_t set_master; 9430Sstevel@tonic-gate md_mn_msgclass_t class; 9440Sstevel@tonic-gate md_mn_msg_and_transp_t *matp = (md_mn_msg_and_transp_t *)arg; 9450Sstevel@tonic-gate 9460Sstevel@tonic-gate msg = matp->mat_msg; 9470Sstevel@tonic-gate transp = matp->mat_transp; 9480Sstevel@tonic-gate 9490Sstevel@tonic-gate /* the alloc was done in mdmn_send_svc_1 */ 9500Sstevel@tonic-gate free(matp); 9510Sstevel@tonic-gate 9520Sstevel@tonic-gate class = mdmn_get_message_class(msg->msg_type); 9530Sstevel@tonic-gate setno = msg->msg_setno; 9540Sstevel@tonic-gate 9550Sstevel@tonic-gate /* set the sender, so the master knows who to send the results */ 9560Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); 9570Sstevel@tonic-gate msg->msg_sender = set_descriptor[setno]->sd_mn_mynode->nd_nodeid; 9580Sstevel@tonic-gate set_master = set_descriptor[setno]->sd_mn_master_nodeid; 9590Sstevel@tonic-gate 9600Sstevel@tonic-gate mx = mdmn_get_initiator_table_mx(setno, class); 9610Sstevel@tonic-gate mutex_lock(mx); 9620Sstevel@tonic-gate 9630Sstevel@tonic-gate /* 9640Sstevel@tonic-gate * Here we check, if the initiator table slot for this set/class 9650Sstevel@tonic-gate * combination is free to use. 
9660Sstevel@tonic-gate * If this is not the case, we return CLASS_BUSY forcing the 9670Sstevel@tonic-gate * initiating send_message call to retry 9680Sstevel@tonic-gate */ 9690Sstevel@tonic-gate success = mdmn_check_initiator_table(setno, class); 9700Sstevel@tonic-gate if (success == MDMNE_CLASS_BUSY) { 9710Sstevel@tonic-gate md_mn_msgid_t active_mid; 9720Sstevel@tonic-gate 9730Sstevel@tonic-gate mdmn_get_initiator_table_id(setno, class, 9740Sstevel@tonic-gate &active_mid); 9750Sstevel@tonic-gate 9760Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 9770Sstevel@tonic-gate "send_to_work: received but locally busy " 9780Sstevel@tonic-gate "(%d, 0x%llx-%d), set=%d, class=%d, type=%d, " 9790Sstevel@tonic-gate "active msg=(%d, 0x%llx-%d)\n", 9800Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), setno, class, 9810Sstevel@tonic-gate msg->msg_type, MSGID_ELEMS(active_mid)); 9820Sstevel@tonic-gate } else { 9830Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 9840Sstevel@tonic-gate "send_to_work: received (%d, 0x%llx-%d), " 9850Sstevel@tonic-gate "set=%d, class=%d, type=%d\n", 9860Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 9870Sstevel@tonic-gate } 9880Sstevel@tonic-gate 9890Sstevel@tonic-gate try_master = 2; /* return failure after two retries */ 9900Sstevel@tonic-gate while ((success == MDMNE_ACK) && (try_master--)) { 9910Sstevel@tonic-gate rw_rdlock(&client_rwlock[setno]); 9920Sstevel@tonic-gate /* is the rpc client to the master still around ? 
*/ 9930Sstevel@tonic-gate if (check_client(setno, set_master)) { 9940Sstevel@tonic-gate success = MDMNE_RPC_FAIL; 9950Sstevel@tonic-gate FLUSH_DEBUGFILE(); 9960Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 9970Sstevel@tonic-gate break; /* out of try_master-loop */ 9980Sstevel@tonic-gate } 9990Sstevel@tonic-gate 10000Sstevel@tonic-gate /* 10010Sstevel@tonic-gate * Send the request to the work function on the master 10020Sstevel@tonic-gate * this call will return immediately 10030Sstevel@tonic-gate */ 10040Sstevel@tonic-gate rpc_err = mdmn_work_1(msg, client[setno][set_master]); 10050Sstevel@tonic-gate 10060Sstevel@tonic-gate /* Everything's Ok? */ 10070Sstevel@tonic-gate if (rpc_err == NULL) { 10080Sstevel@tonic-gate success = MDMNE_RPC_FAIL; 10090Sstevel@tonic-gate /* 10100Sstevel@tonic-gate * Probably something happened to the daemon on the 10110Sstevel@tonic-gate * master. Kill the client, and try again... 10120Sstevel@tonic-gate */ 10130Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 10140Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 10150Sstevel@tonic-gate mdmn_clnt_destroy(client[setno][set_master]); 10160Sstevel@tonic-gate if (client[setno][set_master] != (CLIENT *)NULL) { 10170Sstevel@tonic-gate client[setno][set_master] = (CLIENT *)NULL; 10180Sstevel@tonic-gate } 10190Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 10200Sstevel@tonic-gate continue; 10210Sstevel@tonic-gate 10220Sstevel@tonic-gate } else if (*rpc_err != MDMNE_ACK) { 10230Sstevel@tonic-gate /* something went wrong, break out */ 10240Sstevel@tonic-gate success = *rpc_err; 10250Sstevel@tonic-gate free(rpc_err); 10260Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 10270Sstevel@tonic-gate break; /* out of try_master-loop */ 10280Sstevel@tonic-gate } 10290Sstevel@tonic-gate 10300Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 10310Sstevel@tonic-gate free(rpc_err); 10320Sstevel@tonic-gate 10330Sstevel@tonic-gate /* 10340Sstevel@tonic-gate * If we are here, we 
sucessfully delivered the message. 10350Sstevel@tonic-gate * We register the initiator_table, so that 10360Sstevel@tonic-gate * wakeup_initiator_1 can do the sendreply with the 10370Sstevel@tonic-gate * results for us. 10380Sstevel@tonic-gate */ 10390Sstevel@tonic-gate success = MDMNE_ACK; 10400Sstevel@tonic-gate mdmn_register_initiator_table(setno, class, msg, transp); 10410Sstevel@tonic-gate 10420Sstevel@tonic-gate /* tell check_timeouts, there's work to do */ 10430Sstevel@tonic-gate mutex_lock(&check_timeout_mutex); 10440Sstevel@tonic-gate messages_on_their_way++; 10450Sstevel@tonic-gate cond_signal(&check_timeout_cv); 10460Sstevel@tonic-gate mutex_unlock(&check_timeout_mutex); 10470Sstevel@tonic-gate break; /* out of try_master-loop */ 10480Sstevel@tonic-gate } 10490Sstevel@tonic-gate 10500Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 10510Sstevel@tonic-gate 10520Sstevel@tonic-gate if (success == MDMNE_ACK) { 10530Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 10540Sstevel@tonic-gate "send_to_work: registered (%d, 0x%llx-%d)\n", 10550Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 10560Sstevel@tonic-gate } else { 10570Sstevel@tonic-gate /* In case of failure do the sendreply now */ 10580Sstevel@tonic-gate md_mn_result_t *resultp; 10590Sstevel@tonic-gate resultp = Zalloc(sizeof (md_mn_result_t)); 10600Sstevel@tonic-gate resultp->mmr_comm_state = success; 10610Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 10620Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 10630Sstevel@tonic-gate "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n", 10640Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), success); 10650Sstevel@tonic-gate free_result(resultp); 10660Sstevel@tonic-gate 10670Sstevel@tonic-gate } 10680Sstevel@tonic-gate 10690Sstevel@tonic-gate free_msg(msg); 10700Sstevel@tonic-gate mutex_unlock(mx); 10710Sstevel@tonic-gate return (NULL); 10720Sstevel@tonic-gate 10730Sstevel@tonic-gate } 10740Sstevel@tonic-gate 
10750Sstevel@tonic-gate /* 10760Sstevel@tonic-gate * do_message_locally(msg, result) 10770Sstevel@tonic-gate * Process a message locally on the master 10780Sstevel@tonic-gate * Lookup the MCT if the message has already been processed. 10790Sstevel@tonic-gate * If not, call the handler and store the result 10800Sstevel@tonic-gate * If yes, retrieve the result from the MCT. 10810Sstevel@tonic-gate * Return: 10820Sstevel@tonic-gate * MDMNE_ACK in case of success 10830Sstevel@tonic-gate * MDMNE_LOG_FAIL if the MCT could not be checked 10840Sstevel@tonic-gate */ 10850Sstevel@tonic-gate static int 10860Sstevel@tonic-gate do_message_locally(md_mn_msg_t *msg, md_mn_result_t *result) 10870Sstevel@tonic-gate { 10880Sstevel@tonic-gate int completed; 10890Sstevel@tonic-gate set_t setno; 10900Sstevel@tonic-gate md_mn_msgtype_t msgtype = msg->msg_type; 10910Sstevel@tonic-gate md_mn_msgclass_t class; 10920Sstevel@tonic-gate 10930Sstevel@tonic-gate void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res); 10940Sstevel@tonic-gate 10950Sstevel@tonic-gate handler = mdmn_get_handler(msgtype); 10960Sstevel@tonic-gate if (handler == NULL) { 10970Sstevel@tonic-gate result->mmr_exitval = 0; 10980Sstevel@tonic-gate /* let the sender decide if this is an error or not */ 10990Sstevel@tonic-gate result->mmr_comm_state = MDMNE_NO_HANDLER; 11000Sstevel@tonic-gate return (MDMNE_NO_HANDLER); 11010Sstevel@tonic-gate } 11020Sstevel@tonic-gate 11030Sstevel@tonic-gate class = mdmn_get_message_class(msg->msg_type); 11040Sstevel@tonic-gate setno = msg->msg_setno; 11050Sstevel@tonic-gate 11060Sstevel@tonic-gate result->mmr_msgtype = msgtype; 11070Sstevel@tonic-gate result->mmr_flags = msg->msg_flags; 11080Sstevel@tonic-gate MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid)); 11090Sstevel@tonic-gate 11100Sstevel@tonic-gate mutex_lock(&mct_mutex[setno][class]); 11110Sstevel@tonic-gate completed = mdmn_check_completion(msg, result); 11120Sstevel@tonic-gate if (completed == MDMN_MCT_NOT_DONE) { 
11130Sstevel@tonic-gate /* message not yet processed locally */ 11140Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 11150Sstevel@tonic-gate "calling handler for (%d,0x%llx-%d) type %d\n", 11160Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 11170Sstevel@tonic-gate 11180Sstevel@tonic-gate /* 11190Sstevel@tonic-gate * Mark the message as being currently processed, 11200Sstevel@tonic-gate * so we won't start a second handler for it 11210Sstevel@tonic-gate */ 11220Sstevel@tonic-gate (void) mdmn_mark_completion(msg, NULL, MDMN_MCT_IN_PROGRESS); 11230Sstevel@tonic-gate mutex_unlock(&mct_mutex[setno][class]); 11240Sstevel@tonic-gate 11250Sstevel@tonic-gate /* here we actually process the message on the master */ 11260Sstevel@tonic-gate (*handler)(msg, MD_MSGF_ON_MASTER, result); 11270Sstevel@tonic-gate 11280Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 11290Sstevel@tonic-gate "finished handler for (%d,0x%llx-%d) type %d\n", 11300Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 11310Sstevel@tonic-gate 11320Sstevel@tonic-gate /* Mark the message as fully processed, store the result */ 11330Sstevel@tonic-gate mutex_lock(&mct_mutex[setno][class]); 11340Sstevel@tonic-gate (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE); 11350Sstevel@tonic-gate } else if (completed == MDMN_MCT_DONE) { 11360Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 11370Sstevel@tonic-gate "result for (%d, 0x%llx-%d) from MCT\n", 11380Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 11390Sstevel@tonic-gate } else if (completed == MDMN_MCT_IN_PROGRESS) { 11400Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 11410Sstevel@tonic-gate "(%d, 0x%llx-%d) is currently being processed\n", 11420Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 11430Sstevel@tonic-gate } else { 11440Sstevel@tonic-gate /* MCT error occurred (should never happen) */ 11450Sstevel@tonic-gate mutex_unlock(&mct_mutex[setno][class]); 
11460Sstevel@tonic-gate result->mmr_comm_state = MDMNE_LOG_FAIL; 11470Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, "WARNING " 11480Sstevel@tonic-gate "mdmn_check_completion returned %d " 11490Sstevel@tonic-gate "for (%d,0x%llx-%d)\n", completed, 11500Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 11510Sstevel@tonic-gate return (MDMNE_LOG_FAIL); 11520Sstevel@tonic-gate } 11530Sstevel@tonic-gate mutex_unlock(&mct_mutex[setno][class]); 11540Sstevel@tonic-gate return (MDMNE_ACK); 11550Sstevel@tonic-gate 11560Sstevel@tonic-gate } 11570Sstevel@tonic-gate 11580Sstevel@tonic-gate /* 11590Sstevel@tonic-gate * do_send_message(msg, node) 11600Sstevel@tonic-gate * 11610Sstevel@tonic-gate * Send a message to a given node and wait for a acknowledgment, that the 11620Sstevel@tonic-gate * message has arrived on the remote node. 11630Sstevel@tonic-gate * Make sure that the client for the set is setup correctly. 11640Sstevel@tonic-gate * If no ACK arrives, destroy and recreate the RPC client and retry the 11650Sstevel@tonic-gate * message one time 11660Sstevel@tonic-gate * After actually sending wait no longer than the appropriate number of 11670Sstevel@tonic-gate * before timing out the message. 
11680Sstevel@tonic-gate * 11690Sstevel@tonic-gate * Note must be called with set_desc_wrlock held in reader mode 11700Sstevel@tonic-gate */ 11710Sstevel@tonic-gate static int 11720Sstevel@tonic-gate do_send_message(md_mn_msg_t *msg, md_mnnode_desc *node) 11730Sstevel@tonic-gate { 11740Sstevel@tonic-gate int err; 11750Sstevel@tonic-gate int rpc_retries; 11760Sstevel@tonic-gate int timeout_retries = 0; 11770Sstevel@tonic-gate int *ret = NULL; 11780Sstevel@tonic-gate set_t setno; 11790Sstevel@tonic-gate cond_t *cv; /* see mdmn_wakeup_master_svc_1 */ 11800Sstevel@tonic-gate mutex_t *mx; /* protection for class_busy */ 11810Sstevel@tonic-gate timestruc_t timeout; /* surveillance for remote daemon */ 11820Sstevel@tonic-gate md_mn_nodeid_t nid; 11830Sstevel@tonic-gate md_mn_msgtype_t msgtype; 11840Sstevel@tonic-gate md_mn_msgclass_t class; 11850Sstevel@tonic-gate 11860Sstevel@tonic-gate nid = node->nd_nodeid; 11870Sstevel@tonic-gate msgtype = msg->msg_type; 11880Sstevel@tonic-gate setno = msg->msg_setno; 11890Sstevel@tonic-gate class = mdmn_get_message_class(msgtype); 11900Sstevel@tonic-gate mx = mdmn_get_master_table_mx(setno, class); 11910Sstevel@tonic-gate cv = mdmn_get_master_table_cv(setno, class); 11920Sstevel@tonic-gate 11930Sstevel@tonic-gate retry_rpc: 11940Sstevel@tonic-gate 11950Sstevel@tonic-gate /* We try two times to send the message */ 11960Sstevel@tonic-gate rpc_retries = 2; 11970Sstevel@tonic-gate 11980Sstevel@tonic-gate /* 11990Sstevel@tonic-gate * if sending the message doesn't succeed the first time due to a 12000Sstevel@tonic-gate * RPC problem, we retry one time 12010Sstevel@tonic-gate */ 12020Sstevel@tonic-gate while ((rpc_retries != 0) && (ret == NULL)) { 12030Sstevel@tonic-gate /* in abort state, we error out immediately */ 12040Sstevel@tonic-gate if (md_commd_global_state & MD_CGS_ABORTED) { 12050Sstevel@tonic-gate return (MDMNE_ABORT); 12060Sstevel@tonic-gate } 12070Sstevel@tonic-gate 12080Sstevel@tonic-gate rw_rdlock(&client_rwlock[setno]); 
12090Sstevel@tonic-gate /* unable to create client? Ignore it */ 12100Sstevel@tonic-gate if (check_client(setno, nid)) { 12110Sstevel@tonic-gate /* 12120Sstevel@tonic-gate * In case we cannot establish an RPC client, we 12130Sstevel@tonic-gate * take this node out of our considerations. 12140Sstevel@tonic-gate * This will be reset by a reconfig 12150Sstevel@tonic-gate * cycle that should come pretty soon. 12160Sstevel@tonic-gate * MNISSUE: Should a reconfig cycle 12170Sstevel@tonic-gate * be forced on SunCluster? 12180Sstevel@tonic-gate */ 12190Sstevel@tonic-gate node->nd_flags &= ~MD_MN_NODE_OWN; 12200Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 12210Sstevel@tonic-gate "WARNING couldn't create client for %s\n" 12220Sstevel@tonic-gate "Reconfig cycle required\n", 12230Sstevel@tonic-gate node->nd_nodename); 12240Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: (%d,0x%llx-%d) " 12250Sstevel@tonic-gate "WARNING couldn't create client for %s\n", 12260Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), node->nd_nodename); 12270Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 12280Sstevel@tonic-gate return (MDMNE_IGNORE_NODE); 12290Sstevel@tonic-gate } 12300Sstevel@tonic-gate /* let's be paranoid and check again before sending */ 12310Sstevel@tonic-gate if (client[setno][nid] == NULL) { 12320Sstevel@tonic-gate /* 12330Sstevel@tonic-gate * if this is true, strange enough, we catch our breath, 12340Sstevel@tonic-gate * and then continue, so that the client is set up 12350Sstevel@tonic-gate * once again. 
12360Sstevel@tonic-gate */ 12370Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "client is NULL\n"); 12380Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 12390Sstevel@tonic-gate sleep(1); 12400Sstevel@tonic-gate continue; 12410Sstevel@tonic-gate } 12420Sstevel@tonic-gate 12430Sstevel@tonic-gate /* send it over, it will return immediately */ 12440Sstevel@tonic-gate ret = mdmn_work_1(msg, client[setno][nid]); 12450Sstevel@tonic-gate 12460Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 12470Sstevel@tonic-gate 12480Sstevel@tonic-gate if (ret != NULL) { 12490Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 12500Sstevel@tonic-gate "proc_mas: sending (%d,0x%llx-%d) to %d returned " 12510Sstevel@tonic-gate " 0x%x\n", 12520Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), nid, *ret); 12530Sstevel@tonic-gate } else { 12540Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 12550Sstevel@tonic-gate "proc_mas: sending (%d,0x%llx-%d) to %d returned " 12560Sstevel@tonic-gate " NULL \n", 12570Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), nid); 12580Sstevel@tonic-gate } 12590Sstevel@tonic-gate 12600Sstevel@tonic-gate if ((ret == NULL) || (*ret == MDMNE_CANNOT_CONNECT) || 12610Sstevel@tonic-gate (*ret == MDMNE_THR_CREATE_FAIL)) { 12620Sstevel@tonic-gate /* 12630Sstevel@tonic-gate * Something happened to the daemon on the other side. 12640Sstevel@tonic-gate * Kill the client, and try again. 12650Sstevel@tonic-gate * check_client() will create a new client 12660Sstevel@tonic-gate */ 12670Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 12680Sstevel@tonic-gate mdmn_clnt_destroy(client[setno][nid]); 12690Sstevel@tonic-gate if (client[setno][nid] != (CLIENT *)NULL) { 12700Sstevel@tonic-gate client[setno][nid] = (CLIENT *)NULL; 12710Sstevel@tonic-gate } 12720Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 12730Sstevel@tonic-gate 12740Sstevel@tonic-gate /* ... 
but don't try infinitely */ 12750Sstevel@tonic-gate --rpc_retries; 12760Sstevel@tonic-gate continue; 12770Sstevel@tonic-gate } 12780Sstevel@tonic-gate /* 12790Sstevel@tonic-gate * If the class is locked on the other node, keep trying. 12800Sstevel@tonic-gate * This situation will go away automatically, 12810Sstevel@tonic-gate * if we wait long enough 12820Sstevel@tonic-gate */ 12830Sstevel@tonic-gate if (*ret == MDMNE_CLASS_LOCKED) { 12840Sstevel@tonic-gate sleep(1); 12850Sstevel@tonic-gate free(ret); 12860Sstevel@tonic-gate ret = NULL; 12870Sstevel@tonic-gate continue; 12880Sstevel@tonic-gate } 12890Sstevel@tonic-gate } 12900Sstevel@tonic-gate if (ret == NULL) { 12910Sstevel@tonic-gate return (MDMNE_RPC_FAIL); 12920Sstevel@tonic-gate } 12930Sstevel@tonic-gate 12940Sstevel@tonic-gate 12950Sstevel@tonic-gate /* if the slave is in abort state, we just ignore it. */ 12960Sstevel@tonic-gate if (*ret == MDMNE_ABORT) { 12970Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 12980Sstevel@tonic-gate "proc_mas: work(%d,0x%llx-%d) returned " 12990Sstevel@tonic-gate "MDMNE_ABORT\n", 13000Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 13010Sstevel@tonic-gate free(ret); 13020Sstevel@tonic-gate return (MDMNE_IGNORE_NODE); 13030Sstevel@tonic-gate } 13040Sstevel@tonic-gate 13050Sstevel@tonic-gate /* Did the remote processing succeed? */ 13060Sstevel@tonic-gate if (*ret != MDMNE_ACK) { 13070Sstevel@tonic-gate /* 13080Sstevel@tonic-gate * Some commd failure in the middle of sending the msg 13090Sstevel@tonic-gate * to the nodes. We don't continue here. 
13100Sstevel@tonic-gate */ 13110Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 13120Sstevel@tonic-gate "proc_mas: work(%d,0x%llx-%d) returns %d\n", 13130Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), *ret); 13140Sstevel@tonic-gate free(ret); 13150Sstevel@tonic-gate return (MDMNE_RPC_FAIL); 13160Sstevel@tonic-gate } 13170Sstevel@tonic-gate free(ret); 13180Sstevel@tonic-gate ret = NULL; 13190Sstevel@tonic-gate 13200Sstevel@tonic-gate /* 13210Sstevel@tonic-gate * When we are here, we have sent the message to the other node and 13220Sstevel@tonic-gate * we know that node has accepted it. 13230Sstevel@tonic-gate * We go to sleep and have trust to be woken up by wakeup. 13240Sstevel@tonic-gate * If we wakeup due to a timeout, or a signal, no result has been 13250Sstevel@tonic-gate * placed in the appropriate slot. 13260Sstevel@tonic-gate * If we timeout, it is likely that this is because the node has 13270Sstevel@tonic-gate * gone away, so we will destroy the client and try it again in the 13280Sstevel@tonic-gate * expectation that the rpc will fail and we will return 13290Sstevel@tonic-gate * MDMNE_IGNORE_NODE. If that is not the case, the message must still 13300Sstevel@tonic-gate * be being processed on the slave. In this case just timeout for 4 13310Sstevel@tonic-gate * more seconds and then return RPC_FAIL if the message is not complete. 13320Sstevel@tonic-gate */ 13330Sstevel@tonic-gate timeout.tv_nsec = 0; 13340Sstevel@tonic-gate timeout.tv_sec = (timeout_retries == 0) ? 
mdmn_get_timeout(msgtype) : 13350Sstevel@tonic-gate FOUR_SECS.tv_sec; 13360Sstevel@tonic-gate err = cond_reltimedwait(cv, mx, &timeout); 13370Sstevel@tonic-gate 13380Sstevel@tonic-gate if (err == 0) { 13390Sstevel@tonic-gate /* everything's fine, return success */ 13400Sstevel@tonic-gate return (MDMNE_ACK); 13410Sstevel@tonic-gate } 13420Sstevel@tonic-gate 13430Sstevel@tonic-gate if (err == ETIME) { 13440Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 13450Sstevel@tonic-gate "timeout occured, set=%d, class=%d, " 13460Sstevel@tonic-gate "msgid=(%d, 0x%llx-%d), timeout_retries=%d\n", 13470Sstevel@tonic-gate setno, class, MSGID_ELEMS(msg->msg_msgid), timeout_retries); 13480Sstevel@tonic-gate if (timeout_retries == 0) { 13490Sstevel@tonic-gate timeout_retries++; 13500Sstevel@tonic-gate /* 13510Sstevel@tonic-gate * Destroy the client and try the rpc call again 13520Sstevel@tonic-gate */ 13530Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 13540Sstevel@tonic-gate mdmn_clnt_destroy(client[setno][nid]); 13550Sstevel@tonic-gate client[setno][nid] = (CLIENT *)NULL; 13560Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 13570Sstevel@tonic-gate goto retry_rpc; 13580Sstevel@tonic-gate } 13590Sstevel@tonic-gate } else if (err == EINTR) { 13600Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 13610Sstevel@tonic-gate "commd signalled, set=%d, class=%d, " 13620Sstevel@tonic-gate "msgid=(%d, 0x%llx-%d)\n", 13630Sstevel@tonic-gate setno, class, MSGID_ELEMS(msg->msg_msgid)); 13640Sstevel@tonic-gate } else { 13650Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 13660Sstevel@tonic-gate "cond_reltimedwait err=%d, set=%d, " 13670Sstevel@tonic-gate "class=%d, msgid=(%d, 0x%llx-%d)\n", 13680Sstevel@tonic-gate err, setno, class, 13690Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 13700Sstevel@tonic-gate } 13710Sstevel@tonic-gate 13720Sstevel@tonic-gate /* some failure happened */ 13730Sstevel@tonic-gate return (MDMNE_RPC_FAIL); 13740Sstevel@tonic-gate 
} 13750Sstevel@tonic-gate 13760Sstevel@tonic-gate /* 13770Sstevel@tonic-gate * before we return we have to 13780Sstevel@tonic-gate * free_msg(msg); because we are working on a copied message 13790Sstevel@tonic-gate */ 13800Sstevel@tonic-gate void 13810Sstevel@tonic-gate mdmn_master_process_msg(md_mn_msg_t *msg) 13820Sstevel@tonic-gate { 13830Sstevel@tonic-gate int *ret; 13840Sstevel@tonic-gate int err; 13850Sstevel@tonic-gate int nmsgs; /* total number of msgs */ 13860Sstevel@tonic-gate int curmsg; /* index of current msg */ 13870Sstevel@tonic-gate set_t setno; 13880Sstevel@tonic-gate uint_t inherit_flags = 0; 13890Sstevel@tonic-gate uint_t secdiff, usecdiff; /* runtime of this message */ 13900Sstevel@tonic-gate md_error_t mde = mdnullerror; 13910Sstevel@tonic-gate md_mn_msg_t *msglist[MAX_SUBMESSAGES]; /* all msgs to process */ 13920Sstevel@tonic-gate md_mn_msg_t *cmsg; /* current msg */ 13930Sstevel@tonic-gate md_mn_msgid_t dummyid; 13940Sstevel@tonic-gate md_mn_result_t *result; 13950Sstevel@tonic-gate md_mn_result_t *slave_result; 13960Sstevel@tonic-gate md_mn_nodeid_t sender; 13970Sstevel@tonic-gate md_mn_nodeid_t set_master; 13980Sstevel@tonic-gate md_mnnode_desc *node; 13990Sstevel@tonic-gate md_mn_msgtype_t orig_type; /* type of the original message */ 14000Sstevel@tonic-gate md_mn_msgtype_t msgtype; /* type of the current message */ 14010Sstevel@tonic-gate md_mn_msgclass_t orig_class; /* class of the original message */ 14020Sstevel@tonic-gate md_mn_msgclass_t class; /* class of the current message */ 14030Sstevel@tonic-gate 14040Sstevel@tonic-gate int (*smgen)(md_mn_msg_t *msg, md_mn_msg_t **msglist); 14050Sstevel@tonic-gate 14060Sstevel@tonic-gate orig_type = msgtype = msg->msg_type; 14070Sstevel@tonic-gate sender = msg->msg_sender; 14080Sstevel@tonic-gate setno = msg->msg_setno; 14090Sstevel@tonic-gate 14100Sstevel@tonic-gate result = Zalloc(sizeof (md_mn_result_t)); 14110Sstevel@tonic-gate result->mmr_setno = setno; 14120Sstevel@tonic-gate 
result->mmr_msgtype = msgtype; 14130Sstevel@tonic-gate MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid)); 14140Sstevel@tonic-gate 14150Sstevel@tonic-gate orig_class = mdmn_get_message_class(msgtype); 14160Sstevel@tonic-gate 14170Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 14180Sstevel@tonic-gate "proc_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 14190Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), setno, orig_class, msgtype); 14200Sstevel@tonic-gate 14210Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); 14220Sstevel@tonic-gate set_master = set_descriptor[setno]->sd_mn_master_nodeid; 14230Sstevel@tonic-gate result->mmr_sender = set_master; 14240Sstevel@tonic-gate /* 14250Sstevel@tonic-gate * Put message into the change log unless told otherwise 14260Sstevel@tonic-gate * Note that we only log original messages. 14270Sstevel@tonic-gate * If they are generated by some smgen, we don't log them! 14280Sstevel@tonic-gate * Replay messages aren't logged either. 14290Sstevel@tonic-gate * Note, that replay messages are unlogged on completion. 
14300Sstevel@tonic-gate */ 14310Sstevel@tonic-gate if ((msg->msg_flags & (MD_MSGF_NO_LOG | MD_MSGF_REPLAY_MSG)) == 0) { 14320Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 14330Sstevel@tonic-gate "proc_mas: calling log_msg for (%d,0x%llx-%d) type %d\n", 14340Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 14350Sstevel@tonic-gate err = mdmn_log_msg(msg); 14360Sstevel@tonic-gate if (err == MDMNE_NULL) { 14370Sstevel@tonic-gate /* msg logged successfully */ 14380Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 14390Sstevel@tonic-gate "done log_msg for (%d,0x%llx-%d) type %d\n", 14400Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 14410Sstevel@tonic-gate goto proceed; 14420Sstevel@tonic-gate } 14430Sstevel@tonic-gate if (err == MDMNE_ACK) { 14440Sstevel@tonic-gate /* Same msg in the slot, proceed */ 14450Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: " 14460Sstevel@tonic-gate "already logged (%d,0x%llx-%d) type %d\n", 14470Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 14480Sstevel@tonic-gate goto proceed; 14490Sstevel@tonic-gate } 14500Sstevel@tonic-gate if (err == MDMNE_LOG_FAIL) { 14510Sstevel@tonic-gate /* Oh, bad, the log is non functional. */ 14520Sstevel@tonic-gate result->mmr_comm_state = MDMNE_LOG_FAIL; 14530Sstevel@tonic-gate /* 14540Sstevel@tonic-gate * Note that the mark_busy was already done by 14550Sstevel@tonic-gate * mdmn_work_svc_1() 14560Sstevel@tonic-gate */ 14570Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 14580Sstevel@tonic-gate mdmn_mark_class_unbusy(setno, orig_class); 14590Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 14600Sstevel@tonic-gate 14610Sstevel@tonic-gate } 14620Sstevel@tonic-gate if (err == MDMNE_CLASS_BUSY) { 14630Sstevel@tonic-gate /* 14640Sstevel@tonic-gate * The log is occupied with a different message 14650Sstevel@tonic-gate * that needs to be played first. 
14660Sstevel@tonic-gate * We reject the current message with MDMNE_CLASS_BUSY 14670Sstevel@tonic-gate * to the initiator and do not unbusy the set/class, 14680Sstevel@tonic-gate * because we will proceed with the logged message, 14690Sstevel@tonic-gate * which has the same set/class combination 14700Sstevel@tonic-gate */ 14710Sstevel@tonic-gate result->mmr_comm_state = MDMNE_CLASS_BUSY; 14720Sstevel@tonic-gate } 14730Sstevel@tonic-gate ret = (int *)NULL; 14740Sstevel@tonic-gate rw_rdlock(&client_rwlock[setno]); 14750Sstevel@tonic-gate 14760Sstevel@tonic-gate if (check_client(setno, sender)) { 14770Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 14780Sstevel@tonic-gate "proc_mas: No client for initiator \n"); 14790Sstevel@tonic-gate } else { 14800Sstevel@tonic-gate ret = mdmn_wakeup_initiator_1(result, 14810Sstevel@tonic-gate client[setno][sender]); 14820Sstevel@tonic-gate } 14830Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 14840Sstevel@tonic-gate 14850Sstevel@tonic-gate if (ret == (int *)NULL) { 14860Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 14870Sstevel@tonic-gate "proc_mas: couldn't wakeup_initiator \n"); 14880Sstevel@tonic-gate } else { 14890Sstevel@tonic-gate if (*ret != MDMNE_ACK) { 14900Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, "proc_mas: " 14910Sstevel@tonic-gate "wakeup_initiator returned %d\n", *ret); 14920Sstevel@tonic-gate } 14930Sstevel@tonic-gate free(ret); 14940Sstevel@tonic-gate } 14950Sstevel@tonic-gate free_msg(msg); 14960Sstevel@tonic-gate 14970Sstevel@tonic-gate if (err == MDMNE_LOG_FAIL) { 14980Sstevel@tonic-gate /* we can't proceed here */ 14990Sstevel@tonic-gate free_result(result); 15000Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 15010Sstevel@tonic-gate return; 15020Sstevel@tonic-gate } else if (err == MDMNE_CLASS_BUSY) { 15030Sstevel@tonic-gate mdmn_changelog_record_t *lr; 15040Sstevel@tonic-gate lr = mdmn_get_changelogrec(setno, orig_class); 15050Sstevel@tonic-gate assert(lr != NULL); 15060Sstevel@tonic-gate 
15070Sstevel@tonic-gate /* proceed with the logged message */ 15080Sstevel@tonic-gate msg = copy_msg(&(lr->lr_msg), NULL); 15090Sstevel@tonic-gate 15100Sstevel@tonic-gate /* 15110Sstevel@tonic-gate * The logged message has to have the same class but 15120Sstevel@tonic-gate * type and sender can be different 15130Sstevel@tonic-gate */ 15140Sstevel@tonic-gate orig_type = msgtype = msg->msg_type; 15150Sstevel@tonic-gate sender = msg->msg_sender; 15160Sstevel@tonic-gate 15170Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 15180Sstevel@tonic-gate "proc_mas: Got new message from change log: " 15190Sstevel@tonic-gate "(%d,0x%llx-%d) type %d\n", 15200Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 15210Sstevel@tonic-gate 15220Sstevel@tonic-gate /* continue normal operation with this message */ 15230Sstevel@tonic-gate } 15240Sstevel@tonic-gate } 15250Sstevel@tonic-gate 15260Sstevel@tonic-gate proceed: 15270Sstevel@tonic-gate smgen = mdmn_get_submessage_generator(msgtype); 15280Sstevel@tonic-gate if (smgen == NULL) { 15290Sstevel@tonic-gate /* no submessages to create, just use the original message */ 15300Sstevel@tonic-gate msglist[0] = msg; 15310Sstevel@tonic-gate nmsgs = 1; 15320Sstevel@tonic-gate } else { 15330Sstevel@tonic-gate /* some bits are passed on to submessages */ 15340Sstevel@tonic-gate inherit_flags = msg->msg_flags & MD_MSGF_INHERIT_BITS; 15350Sstevel@tonic-gate 15360Sstevel@tonic-gate nmsgs = smgen(msg, msglist); 15370Sstevel@tonic-gate 15380Sstevel@tonic-gate /* some settings for the submessages */ 15390Sstevel@tonic-gate for (curmsg = 0; curmsg < nmsgs; curmsg++) { 15400Sstevel@tonic-gate cmsg = msglist[curmsg]; 15410Sstevel@tonic-gate 15420Sstevel@tonic-gate /* Apply the inherited flags */ 15430Sstevel@tonic-gate cmsg->msg_flags |= inherit_flags; 15440Sstevel@tonic-gate 15450Sstevel@tonic-gate /* 15460Sstevel@tonic-gate * Make sure the submessage ID is set correctly 15470Sstevel@tonic-gate * Note: first submessage has mid_smid of 1 (not 0) 
15480Sstevel@tonic-gate */ 15490Sstevel@tonic-gate cmsg->msg_msgid.mid_smid = curmsg + 1; 15500Sstevel@tonic-gate 15510Sstevel@tonic-gate /* need the original class set in msgID (for MCT) */ 15520Sstevel@tonic-gate cmsg->msg_msgid.mid_oclass = orig_class; 15530Sstevel@tonic-gate } 15540Sstevel@tonic-gate 15550Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 15560Sstevel@tonic-gate "smgen generated %d submsgs, origclass = %d\n", 15570Sstevel@tonic-gate nmsgs, orig_class); 15580Sstevel@tonic-gate } 15590Sstevel@tonic-gate /* 15600Sstevel@tonic-gate * This big loop does the following. 15610Sstevel@tonic-gate * For all messages: 15620Sstevel@tonic-gate * process message on the master first (a message completion 15630Sstevel@tonic-gate * table MCT ensures a message is not processed twice) 15640Sstevel@tonic-gate * in case of an error break out of message loop 15650Sstevel@tonic-gate * for all nodes -- unless MD_MSGF_NO_BCAST is set -- 15660Sstevel@tonic-gate * send message to node until that succeeds 15670Sstevel@tonic-gate * merge result -- not yet implemented 15680Sstevel@tonic-gate * respect MD_MSGF_STOP_ON_ERROR 15690Sstevel@tonic-gate */ 15700Sstevel@tonic-gate for (curmsg = 0; curmsg < nmsgs; curmsg++) { 15710Sstevel@tonic-gate int break_msg_loop = 0; 15720Sstevel@tonic-gate mutex_t *mx; /* protection for class_busy */ 15730Sstevel@tonic-gate int master_err; 15740Sstevel@tonic-gate int master_exitval = -1; 15750Sstevel@tonic-gate 15760Sstevel@tonic-gate cmsg = msglist[curmsg]; 15770Sstevel@tonic-gate msgtype = cmsg->msg_type; 15780Sstevel@tonic-gate class = mdmn_get_message_class(msgtype); 15790Sstevel@tonic-gate node = NULL; 15800Sstevel@tonic-gate mx = mdmn_get_master_table_mx(setno, class); 15810Sstevel@tonic-gate 15820Sstevel@tonic-gate /* If we are in the abort state, we error out immediately */ 15830Sstevel@tonic-gate if (md_commd_global_state & MD_CGS_ABORTED) { 15840Sstevel@tonic-gate break; /* out of the message loop */ 15850Sstevel@tonic-gate } 
15860Sstevel@tonic-gate 15870Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "class=%d, orig_class=%d\n", 15880Sstevel@tonic-gate class, orig_class); 15890Sstevel@tonic-gate /* 15900Sstevel@tonic-gate * If the current class is different from the original class, 15910Sstevel@tonic-gate * we have to lock it down. 15920Sstevel@tonic-gate * The original class is already marked busy. 15930Sstevel@tonic-gate * At this point we cannot refuse the message because the 15940Sstevel@tonic-gate * class is busy right now, so we wait until the class becomes 15950Sstevel@tonic-gate * available again. As soon as something changes for this set 15960Sstevel@tonic-gate * we will be cond_signal'ed (in mdmn_mark_class_unbusy) 15970Sstevel@tonic-gate * 15980Sstevel@tonic-gate * Granularity could be finer (setno/class) 15990Sstevel@tonic-gate */ 16000Sstevel@tonic-gate if (class != orig_class) { 16010Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 16020Sstevel@tonic-gate while (mdmn_mark_class_busy(setno, class) == FALSE) { 16030Sstevel@tonic-gate cond_wait(&mdmn_busy_cv[setno], 16040Sstevel@tonic-gate &mdmn_busy_mutex[setno]); 16050Sstevel@tonic-gate } 16060Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 16070Sstevel@tonic-gate } 16080Sstevel@tonic-gate 16090Sstevel@tonic-gate master_err = do_message_locally(cmsg, result); 16100Sstevel@tonic-gate 16110Sstevel@tonic-gate if ((master_err != MDMNE_ACK) || 16120Sstevel@tonic-gate ((master_err == MDMNE_ACK) && (result->mmr_exitval != 0))) { 16130Sstevel@tonic-gate result->mmr_failing_node = set_master; 16140Sstevel@tonic-gate if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 16150Sstevel@tonic-gate /* 16160Sstevel@tonic-gate * if appropriate, unbusy the class and 16170Sstevel@tonic-gate * break out of the message loop 16180Sstevel@tonic-gate */ 16190Sstevel@tonic-gate if (class != orig_class) { 16200Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 16210Sstevel@tonic-gate mdmn_mark_class_unbusy(setno, class); 
16220Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 16230Sstevel@tonic-gate } 16240Sstevel@tonic-gate break; 16250Sstevel@tonic-gate } 16260Sstevel@tonic-gate } 16270Sstevel@tonic-gate 16280Sstevel@tonic-gate if (master_err == MDMNE_ACK) 16290Sstevel@tonic-gate master_exitval = result->mmr_exitval; 16300Sstevel@tonic-gate 16310Sstevel@tonic-gate /* No broadcast? => next message */ 16320Sstevel@tonic-gate if (cmsg->msg_flags & MD_MSGF_NO_BCAST) { 16330Sstevel@tonic-gate /* if appropriate, unbusy the class */ 16340Sstevel@tonic-gate if (class != orig_class) { 16350Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 16360Sstevel@tonic-gate mdmn_mark_class_unbusy(setno, class); 16370Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 16380Sstevel@tonic-gate } 16390Sstevel@tonic-gate continue; 16400Sstevel@tonic-gate } 16410Sstevel@tonic-gate 16420Sstevel@tonic-gate 16430Sstevel@tonic-gate /* fake sender, so we get notified when the results are avail */ 16440Sstevel@tonic-gate cmsg->msg_sender = set_master; 16450Sstevel@tonic-gate /* 16460Sstevel@tonic-gate * register to the master_table. It's needed by wakeup_master to 16470Sstevel@tonic-gate * wakeup the sleeping thread. 
16480Sstevel@tonic-gate * Access is protected by the class lock: mdmn_mark_class_busy() 16490Sstevel@tonic-gate */ 16500Sstevel@tonic-gate mdmn_set_master_table_id(setno, class, &(cmsg->msg_msgid)); 16510Sstevel@tonic-gate 16520Sstevel@tonic-gate 16530Sstevel@tonic-gate 16540Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); 16550Sstevel@tonic-gate /* Send the message to all other nodes */ 16560Sstevel@tonic-gate for (node = set_descriptor[setno]->sd_nodelist; node; 16570Sstevel@tonic-gate node = node->nd_next) { 16580Sstevel@tonic-gate md_mn_nodeid_t nid = node->nd_nodeid; 16590Sstevel@tonic-gate 16600Sstevel@tonic-gate /* We are master and have already processed the msg */ 16610Sstevel@tonic-gate if (node == set_descriptor[setno]->sd_mn_masternode) { 16620Sstevel@tonic-gate continue; 16630Sstevel@tonic-gate } 16640Sstevel@tonic-gate 16650Sstevel@tonic-gate /* If this node didn't join the disk set, ignore it */ 16660Sstevel@tonic-gate if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 16670Sstevel@tonic-gate continue; 16680Sstevel@tonic-gate } 16690Sstevel@tonic-gate 16700Sstevel@tonic-gate mutex_lock(mx); 16710Sstevel@tonic-gate /* 16720Sstevel@tonic-gate * Register the node that is addressed, 16730Sstevel@tonic-gate * so we can detect unsolicited messages 16740Sstevel@tonic-gate */ 16750Sstevel@tonic-gate mdmn_set_master_table_addr(setno, class, nid); 16760Sstevel@tonic-gate slave_result = (md_mn_result_t *)NULL; 16770Sstevel@tonic-gate 16780Sstevel@tonic-gate /* 16790Sstevel@tonic-gate * Now send it. 
do_send_message() will return if 16800Sstevel@tonic-gate * a failure occurs or 16810Sstevel@tonic-gate * the results are available 16820Sstevel@tonic-gate */ 16830Sstevel@tonic-gate err = do_send_message(cmsg, node); 16840Sstevel@tonic-gate 16850Sstevel@tonic-gate /* in abort state, we error out immediately */ 16860Sstevel@tonic-gate if (md_commd_global_state & MD_CGS_ABORTED) { 16870Sstevel@tonic-gate break; 16880Sstevel@tonic-gate } 16890Sstevel@tonic-gate 16900Sstevel@tonic-gate if (err == MDMNE_ACK) { 16910Sstevel@tonic-gate slave_result = 16920Sstevel@tonic-gate mdmn_get_master_table_res(setno, class); 16930Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 16940Sstevel@tonic-gate "proc_mas: got result for (%d,0x%llx-%d)\n", 16950Sstevel@tonic-gate MSGID_ELEMS(cmsg->msg_msgid)); 16960Sstevel@tonic-gate } else if (err == MDMNE_IGNORE_NODE) { 16970Sstevel@tonic-gate mutex_unlock(mx); 16980Sstevel@tonic-gate continue; /* send to next node */ 16990Sstevel@tonic-gate } 17000Sstevel@tonic-gate mutex_unlock(mx); 17010Sstevel@tonic-gate 17020Sstevel@tonic-gate 17030Sstevel@tonic-gate /* 17040Sstevel@tonic-gate * If the result is NULL, or err doesn't show success, 17050Sstevel@tonic-gate * something went wrong with this RPC call. 
17060Sstevel@tonic-gate */ 17070Sstevel@tonic-gate if ((slave_result == NULL) || (err != MDMNE_ACK)) { 17080Sstevel@tonic-gate /* 17090Sstevel@tonic-gate * If PANIC_WHEN_INCONSISTENT set, 17100Sstevel@tonic-gate * panic if the master succeeded while 17110Sstevel@tonic-gate * this node failed 17120Sstevel@tonic-gate */ 17130Sstevel@tonic-gate if ((cmsg->msg_flags & 17140Sstevel@tonic-gate MD_MSGF_PANIC_WHEN_INCONSISTENT) && 17150Sstevel@tonic-gate (master_err == MDMNE_ACK)) 17160Sstevel@tonic-gate panic_system(nid, cmsg->msg_type, 17170Sstevel@tonic-gate master_err, master_exitval, 17180Sstevel@tonic-gate slave_result); 17190Sstevel@tonic-gate 17200Sstevel@tonic-gate result->mmr_failing_node = nid; 17210Sstevel@tonic-gate /* are we supposed to stop in case of error? */ 17220Sstevel@tonic-gate if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 17230Sstevel@tonic-gate result->mmr_exitval = MDMNE_RPC_FAIL; 17240Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, "proc_mas: " 17250Sstevel@tonic-gate "result (%d,0x%llx-%d) is NULL\n", 17260Sstevel@tonic-gate MSGID_ELEMS(cmsg->msg_msgid)); 17270Sstevel@tonic-gate FLUSH_DEBUGFILE(); 17280Sstevel@tonic-gate break_msg_loop = 1; 17290Sstevel@tonic-gate break; /* out of node loop first */ 17300Sstevel@tonic-gate } else { 17310Sstevel@tonic-gate /* send msg to the next node */ 17320Sstevel@tonic-gate continue; 17330Sstevel@tonic-gate } 17340Sstevel@tonic-gate 17350Sstevel@tonic-gate } 17360Sstevel@tonic-gate 17370Sstevel@tonic-gate /* 17380Sstevel@tonic-gate * Message processed on remote node. 
17390Sstevel@tonic-gate * If PANIC_WHEN_INCONSISTENT set, panic if the 17400Sstevel@tonic-gate * result is different on this node from the result 17410Sstevel@tonic-gate * on the master 17420Sstevel@tonic-gate */ 17430Sstevel@tonic-gate if ((cmsg->msg_flags & 17440Sstevel@tonic-gate MD_MSGF_PANIC_WHEN_INCONSISTENT) && 17450Sstevel@tonic-gate ((master_err != MDMNE_ACK) || 17460Sstevel@tonic-gate (slave_result->mmr_exitval != master_exitval))) 17470Sstevel@tonic-gate panic_system(nid, cmsg->msg_type, master_err, 17480Sstevel@tonic-gate master_exitval, slave_result); 17490Sstevel@tonic-gate 17500Sstevel@tonic-gate /* 17510Sstevel@tonic-gate * At this point we know we have a message that was 17520Sstevel@tonic-gate * processed on the remote node. 17530Sstevel@tonic-gate * We now check if the exitval is non zero. 17540Sstevel@tonic-gate * In that case we discard the previous result and 17550Sstevel@tonic-gate * rather use the current. 17560Sstevel@tonic-gate * This means: If a message fails on no node, 17570Sstevel@tonic-gate * the result from the master will be returned. 
17580Sstevel@tonic-gate * There's currently no such thing as merge of results 17590Sstevel@tonic-gate * If additionally STOP_ON_ERROR is set, we bail out 17600Sstevel@tonic-gate */ 17610Sstevel@tonic-gate if (slave_result->mmr_exitval != 0) { 17620Sstevel@tonic-gate /* throw away the previously allocated result */ 17630Sstevel@tonic-gate free_result(result); 17640Sstevel@tonic-gate 17650Sstevel@tonic-gate /* copy_result() allocates new memory */ 17660Sstevel@tonic-gate result = copy_result(slave_result); 17670Sstevel@tonic-gate free_result(slave_result); 17680Sstevel@tonic-gate 17690Sstevel@tonic-gate dump_result(MD_MMV_PROC_M, "proc_mas", result); 17700Sstevel@tonic-gate 17710Sstevel@tonic-gate result->mmr_failing_node = nid; 17720Sstevel@tonic-gate if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 17730Sstevel@tonic-gate break_msg_loop = 1; 17740Sstevel@tonic-gate break; /* out of node loop */ 17750Sstevel@tonic-gate } 17760Sstevel@tonic-gate continue; /* try next node */ 17770Sstevel@tonic-gate 17780Sstevel@tonic-gate } else { 17790Sstevel@tonic-gate /* 17800Sstevel@tonic-gate * MNIssue: may want to merge the results 17810Sstevel@tonic-gate * from all slaves. Currently only report 17820Sstevel@tonic-gate * the results from the master. 17830Sstevel@tonic-gate */ 17840Sstevel@tonic-gate free_result(slave_result); 17850Sstevel@tonic-gate } 17860Sstevel@tonic-gate 17870Sstevel@tonic-gate } /* End of loop over the nodes */ 17880Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 17890Sstevel@tonic-gate 17900Sstevel@tonic-gate 17910Sstevel@tonic-gate /* release the current class again */ 17920Sstevel@tonic-gate if (class != orig_class) { 17930Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 17940Sstevel@tonic-gate mdmn_mark_class_unbusy(setno, class); 17950Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 17960Sstevel@tonic-gate } 17970Sstevel@tonic-gate 17980Sstevel@tonic-gate /* are we supposed to quit entirely ? 
*/ 17990Sstevel@tonic-gate if (break_msg_loop || 18000Sstevel@tonic-gate (md_commd_global_state & MD_CGS_ABORTED)) { 18010Sstevel@tonic-gate break; /* out of msg loop */ 18020Sstevel@tonic-gate } 18030Sstevel@tonic-gate 18040Sstevel@tonic-gate } /* End of loop over the messages */ 18050Sstevel@tonic-gate /* 18060Sstevel@tonic-gate * If we are here, there's two possibilities: 18070Sstevel@tonic-gate * - we processed all messages on all nodes without an error. 18080Sstevel@tonic-gate * In this case we return the result from the master. 18090Sstevel@tonic-gate * (to be implemented: return the merged result) 18100Sstevel@tonic-gate * - we encountered an error in which case result has been 18110Sstevel@tonic-gate * set accordingly already. 18120Sstevel@tonic-gate */ 18130Sstevel@tonic-gate 18140Sstevel@tonic-gate if (md_commd_global_state & MD_CGS_ABORTED) { 18150Sstevel@tonic-gate result->mmr_comm_state = MDMNE_ABORT; 18160Sstevel@tonic-gate } 18170Sstevel@tonic-gate 18180Sstevel@tonic-gate /* 18190Sstevel@tonic-gate * This message has been processed completely. 18200Sstevel@tonic-gate * Remove it from the changelog. 18210Sstevel@tonic-gate * Do this for replay messages too. 18220Sstevel@tonic-gate * Note that the message is unlogged before waking up the 18230Sstevel@tonic-gate * initiator. This is done for two reasons. 18240Sstevel@tonic-gate * 1. Remove a race condition that occurs when back to back 18250Sstevel@tonic-gate * messages are sent for the same class, the registeration is 18260Sstevel@tonic-gate * is lost. 18270Sstevel@tonic-gate * 2. If the initiator died but the action was completed on all the 18280Sstevel@tonic-gate * the nodes, we want that to be marked "done" quickly. 
18290Sstevel@tonic-gate */ 18300Sstevel@tonic-gate 18310Sstevel@tonic-gate if ((msg->msg_flags & MD_MSGF_NO_LOG) == 0) { 18320Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 18330Sstevel@tonic-gate "proc_mas: calling unlog_msg for (%d,0x%llx-%d) type %d\n", 18340Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 18350Sstevel@tonic-gate mdmn_unlog_msg(msg); 18360Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 18370Sstevel@tonic-gate "proc_mas: done unlog_msg for (%d,0x%llx-%d) type %d\n", 18380Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 18390Sstevel@tonic-gate } 18400Sstevel@tonic-gate 18410Sstevel@tonic-gate /* 18420Sstevel@tonic-gate * In case of submessages, we increased the submessage ID in the 18430Sstevel@tonic-gate * result structure. We restore the message ID to the value that 18440Sstevel@tonic-gate * the initiator is waiting for. 18450Sstevel@tonic-gate */ 18460Sstevel@tonic-gate result->mmr_msgid.mid_smid = 0; 18470Sstevel@tonic-gate result->mmr_msgtype = orig_type; 18480Sstevel@tonic-gate result->mmr_sender = set_master; 18490Sstevel@tonic-gate 18500Sstevel@tonic-gate /* if we have an inited client, send result */ 18510Sstevel@tonic-gate ret = (int *)NULL; 18520Sstevel@tonic-gate 18530Sstevel@tonic-gate rw_rdlock(&client_rwlock[setno]); 18540Sstevel@tonic-gate if (check_client(setno, sender)) { 18550Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 18560Sstevel@tonic-gate "proc_mas: unable to create client for initiator\n"); 18570Sstevel@tonic-gate } else { 18580Sstevel@tonic-gate ret = mdmn_wakeup_initiator_1(result, client[setno][sender]); 18590Sstevel@tonic-gate } 18600Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 18610Sstevel@tonic-gate 18620Sstevel@tonic-gate if (ret == (int *)NULL) { 18630Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 18640Sstevel@tonic-gate "proc_mas: couldn't wakeup initiator\n"); 18650Sstevel@tonic-gate } else { 18660Sstevel@tonic-gate if (*ret != MDMNE_ACK) { 18670Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, 
18680Sstevel@tonic-gate "proc_mas: wakeup_initiator returned %d\n", 18690Sstevel@tonic-gate *ret); 18700Sstevel@tonic-gate } 18710Sstevel@tonic-gate free(ret); 18720Sstevel@tonic-gate } 18730Sstevel@tonic-gate 18740Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 18750Sstevel@tonic-gate /* Free all submessages, if there were any */ 18760Sstevel@tonic-gate if (nmsgs > 1) { 18770Sstevel@tonic-gate for (curmsg = 0; curmsg < nmsgs; curmsg++) { 18780Sstevel@tonic-gate free_msg(msglist[curmsg]); 18790Sstevel@tonic-gate } 18800Sstevel@tonic-gate } 18810Sstevel@tonic-gate /* Free the result */ 18820Sstevel@tonic-gate free_result(result); 18830Sstevel@tonic-gate 18840Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 18850Sstevel@tonic-gate mdmn_mark_class_unbusy(setno, orig_class); 18860Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 18870Sstevel@tonic-gate 18880Sstevel@tonic-gate 18890Sstevel@tonic-gate /* 18900Sstevel@tonic-gate * We use this ioctl just to get the time in the same format as used in 18910Sstevel@tonic-gate * the messageID. If it fails, all we get is a bad runtime output. 
18920Sstevel@tonic-gate */ 18930Sstevel@tonic-gate (void) metaioctl(MD_IOCGUNIQMSGID, &dummyid, &mde, NULL); 18940Sstevel@tonic-gate secdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) >> 32; 18950Sstevel@tonic-gate usecdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) & 0xfffff; 18960Sstevel@tonic-gate 18970Sstevel@tonic-gate /* catching possible overflow */ 18980Sstevel@tonic-gate if (usecdiff >= 1000000) { 18990Sstevel@tonic-gate usecdiff -= 1000000; 19000Sstevel@tonic-gate secdiff++; 19010Sstevel@tonic-gate } 19020Sstevel@tonic-gate 19030Sstevel@tonic-gate 19040Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_mas: done (%d, 0x%llx-%d) type=%02d " 19050Sstevel@tonic-gate "%5d.%06d secs runtime\n", 19060Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), orig_type, secdiff, usecdiff); 19070Sstevel@tonic-gate 19080Sstevel@tonic-gate /* Free the original message */ 19090Sstevel@tonic-gate free_msg(msg); 19100Sstevel@tonic-gate } 19110Sstevel@tonic-gate 19120Sstevel@tonic-gate void 19130Sstevel@tonic-gate mdmn_slave_process_msg(md_mn_msg_t *msg) 19140Sstevel@tonic-gate { 19150Sstevel@tonic-gate int *ret = NULL; 19160Sstevel@tonic-gate int completed; 19170Sstevel@tonic-gate int retries; 19180Sstevel@tonic-gate int successfully_returned; 19190Sstevel@tonic-gate set_t setno; 19200Sstevel@tonic-gate md_mn_result_t *result; 19210Sstevel@tonic-gate md_mn_nodeid_t sender; 19220Sstevel@tonic-gate md_mn_nodeid_t whoami; 19230Sstevel@tonic-gate md_mn_msgtype_t msgtype; 19240Sstevel@tonic-gate md_mn_msgclass_t class; 19250Sstevel@tonic-gate 19260Sstevel@tonic-gate void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res); 19270Sstevel@tonic-gate 19280Sstevel@tonic-gate setno = msg->msg_setno; 19290Sstevel@tonic-gate sender = msg->msg_sender; /* this is always the master of the set */ 19300Sstevel@tonic-gate msgtype = msg->msg_type; 19310Sstevel@tonic-gate 19320Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); 19330Sstevel@tonic-gate whoami = 
set_descriptor[setno]->sd_mn_mynode->nd_nodeid; 19340Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 19350Sstevel@tonic-gate 19360Sstevel@tonic-gate result = Zalloc(sizeof (md_mn_result_t)); 19370Sstevel@tonic-gate result->mmr_flags = msg->msg_flags; 19380Sstevel@tonic-gate result->mmr_setno = setno; 19390Sstevel@tonic-gate result->mmr_msgtype = msgtype; 19400Sstevel@tonic-gate result->mmr_sender = whoami; 19410Sstevel@tonic-gate result->mmr_comm_state = MDMNE_ACK; /* Ok state */ 19420Sstevel@tonic-gate MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid)); 19430Sstevel@tonic-gate class = mdmn_get_message_class(msgtype); 19440Sstevel@tonic-gate 19450Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, 19460Sstevel@tonic-gate "proc_sla: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 19470Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), setno, class, msgtype); 19480Sstevel@tonic-gate 19490Sstevel@tonic-gate handler = mdmn_get_handler(msgtype); 19500Sstevel@tonic-gate 19510Sstevel@tonic-gate if (handler == NULL) { 19520Sstevel@tonic-gate result->mmr_exitval = 0; 19530Sstevel@tonic-gate /* let the sender decide if this is an error or not */ 19540Sstevel@tonic-gate result->mmr_comm_state = MDMNE_NO_HANDLER; 19550Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, 19560Sstevel@tonic-gate "proc_sla: No handler for (%d, 0x%llx-%d)\n", 19570Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 19580Sstevel@tonic-gate } else { 19590Sstevel@tonic-gate 19600Sstevel@tonic-gate /* Did we already process this message ? 
*/ 19610Sstevel@tonic-gate mutex_lock(&mct_mutex[setno][class]); 19620Sstevel@tonic-gate completed = mdmn_check_completion(msg, result); 19630Sstevel@tonic-gate 19640Sstevel@tonic-gate if (completed == MDMN_MCT_NOT_DONE) { 19650Sstevel@tonic-gate /* message not yet processed locally */ 19660Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, 19670Sstevel@tonic-gate "proc_sla: calling handler for (%d, 0x%llx-%d)\n", 19680Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 19690Sstevel@tonic-gate 19700Sstevel@tonic-gate /* 19710Sstevel@tonic-gate * Mark the message as being currently processed, 19720Sstevel@tonic-gate * so we won't start a second handler for it 19730Sstevel@tonic-gate */ 19740Sstevel@tonic-gate (void) mdmn_mark_completion(msg, NULL, 19750Sstevel@tonic-gate MDMN_MCT_IN_PROGRESS); 19760Sstevel@tonic-gate 19770Sstevel@tonic-gate mutex_unlock(&mct_mutex[setno][class]); 19780Sstevel@tonic-gate (*handler)(msg, MD_MSGF_ON_SLAVE, result); 19790Sstevel@tonic-gate 19800Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, 19810Sstevel@tonic-gate "proc_sla: finished handler for (%d, 0x%llx-%d)\n", 19820Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 19830Sstevel@tonic-gate 19840Sstevel@tonic-gate mutex_lock(&mct_mutex[setno][class]); 19850Sstevel@tonic-gate /* Mark the message as fully done, store the result */ 19860Sstevel@tonic-gate (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE); 19870Sstevel@tonic-gate 19880Sstevel@tonic-gate } else if (completed == MDMN_MCT_DONE) { 19890Sstevel@tonic-gate /* message processed previously, got result from MCT */ 19900Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, 19910Sstevel@tonic-gate "proc_sla: result for (%d, 0x%llx-%d) from MCT\n", 19920Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 19930Sstevel@tonic-gate } else if (completed == MDMN_MCT_IN_PROGRESS) { 19940Sstevel@tonic-gate /* 19950Sstevel@tonic-gate * If the message is curruntly being processed, 19960Sstevel@tonic-gate * we can return here, without sending a result back. 
19970Sstevel@tonic-gate * This will be done by the initial message handling 19980Sstevel@tonic-gate * thread 19990Sstevel@tonic-gate */ 20000Sstevel@tonic-gate mutex_unlock(&mct_mutex[setno][class]); 20010Sstevel@tonic-gate commd_debug(MD_MMV_PROC_M, "proc_sla: " 20020Sstevel@tonic-gate "(%d, 0x%llx-%d) is currently being processed\n", 20030Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), msgtype); 20040Sstevel@tonic-gate 20050Sstevel@tonic-gate free_msg(msg); 20060Sstevel@tonic-gate free_result(result); 20070Sstevel@tonic-gate return; 20080Sstevel@tonic-gate } else { 20090Sstevel@tonic-gate /* MCT error occurred (should never happen) */ 20100Sstevel@tonic-gate result->mmr_comm_state = MDMNE_LOG_FAIL; 20110Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, 20120Sstevel@tonic-gate "proc_sla: MCT error for (%d, 0x%llx-%d)\n", 20130Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 20140Sstevel@tonic-gate } 20150Sstevel@tonic-gate mutex_unlock(&mct_mutex[setno][class]); 20160Sstevel@tonic-gate } 20170Sstevel@tonic-gate 20180Sstevel@tonic-gate /* 20190Sstevel@tonic-gate * At this point we have a result (even in an error case) 20200Sstevel@tonic-gate * that we return to the master. 20210Sstevel@tonic-gate */ 20220Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); 20230Sstevel@tonic-gate retries = 2; /* we will try two times to send the results */ 20240Sstevel@tonic-gate successfully_returned = 0; 20250Sstevel@tonic-gate 20260Sstevel@tonic-gate while (!successfully_returned && (retries != 0)) { 20270Sstevel@tonic-gate ret = (int *)NULL; 20280Sstevel@tonic-gate rw_rdlock(&client_rwlock[setno]); 20290Sstevel@tonic-gate if (check_client(setno, sender)) { 20300Sstevel@tonic-gate /* 20310Sstevel@tonic-gate * If we cannot setup the rpc connection to the master, 20320Sstevel@tonic-gate * we can't do anything besides logging this fact. 
20330Sstevel@tonic-gate */ 20340Sstevel@tonic-gate commd_debug(MD_MMV_SYSLOG, 20350Sstevel@tonic-gate "proc_mas: unable to create client for master\n"); 20360Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 20370Sstevel@tonic-gate break; 20380Sstevel@tonic-gate } else { 20390Sstevel@tonic-gate ret = mdmn_wakeup_master_1(result, 20400Sstevel@tonic-gate client[setno][sender]); 20410Sstevel@tonic-gate /* 20420Sstevel@tonic-gate * if mdmn_wakeup_master_1 returns NULL, it can be that 20430Sstevel@tonic-gate * the master (or the commd on the master) had died. 20440Sstevel@tonic-gate * In that case, we destroy the client to the master 20450Sstevel@tonic-gate * and retry. 20460Sstevel@tonic-gate * If mdmn_wakeup_master_1 doesn't return MDMNE_ACK, 20470Sstevel@tonic-gate * the commd on the master is alive but 20480Sstevel@tonic-gate * something else is wrong, 20490Sstevel@tonic-gate * in that case a retry doesn't make sense => break out 20500Sstevel@tonic-gate */ 20510Sstevel@tonic-gate if (ret == (int *)NULL) { 20520Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, 20530Sstevel@tonic-gate "proc_sla: wakeup_master returned NULL\n"); 20540Sstevel@tonic-gate /* release reader lock, grab writer lock */ 20550Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 20560Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 20570Sstevel@tonic-gate mdmn_clnt_destroy(client[setno][sender]); 20580Sstevel@tonic-gate if (client[setno][sender] != (CLIENT *)NULL) { 20590Sstevel@tonic-gate client[setno][sender] = (CLIENT *)NULL; 20600Sstevel@tonic-gate } 20610Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 20620Sstevel@tonic-gate retries--; 20630Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, 20640Sstevel@tonic-gate "retries = %d\n", retries); 20650Sstevel@tonic-gate continue; 20660Sstevel@tonic-gate } 20670Sstevel@tonic-gate if (*ret != MDMNE_ACK) { 20680Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, "proc_sla: " 20690Sstevel@tonic-gate "wakeup_master returned %d\n", *ret); 
20700Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 20710Sstevel@tonic-gate break; 20720Sstevel@tonic-gate } else { /* Good case */ 20730Sstevel@tonic-gate successfully_returned = 1; 20740Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 20750Sstevel@tonic-gate } 20760Sstevel@tonic-gate } 20770Sstevel@tonic-gate } 20780Sstevel@tonic-gate 20790Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 20800Sstevel@tonic-gate commd_debug(MD_MMV_PROC_S, "proc_sla: done (%d, 0x%llx-%d)\n", 20810Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 20820Sstevel@tonic-gate 20830Sstevel@tonic-gate if (ret != (int *)NULL) 20840Sstevel@tonic-gate free(ret); 20850Sstevel@tonic-gate free_msg(msg); 20860Sstevel@tonic-gate free_result(result); 20870Sstevel@tonic-gate } 20880Sstevel@tonic-gate 20890Sstevel@tonic-gate 20900Sstevel@tonic-gate md_mn_result_t * 20910Sstevel@tonic-gate mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp) 20920Sstevel@tonic-gate { 20930Sstevel@tonic-gate int err; 20940Sstevel@tonic-gate set_t setno; 20950Sstevel@tonic-gate SVCXPRT *transp = rqstp->rq_xprt; 20960Sstevel@tonic-gate md_mn_msg_t *msg; 20970Sstevel@tonic-gate md_mn_result_t *resultp; 20980Sstevel@tonic-gate md_mn_msgclass_t class; 20990Sstevel@tonic-gate md_mn_msg_and_transp_t *matp; 21000Sstevel@tonic-gate 21010Sstevel@tonic-gate msg = copy_msg(omsg, NULL); 21020Sstevel@tonic-gate xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 21030Sstevel@tonic-gate 21040Sstevel@tonic-gate setno = msg->msg_setno; 21050Sstevel@tonic-gate class = mdmn_get_message_class(msg->msg_type); 21060Sstevel@tonic-gate 21070Sstevel@tonic-gate /* If we are in the abort state, we error out immediately */ 21080Sstevel@tonic-gate if (md_commd_global_state & MD_CGS_ABORTED) { 21090Sstevel@tonic-gate resultp = Zalloc(sizeof (md_mn_result_t)); 21100Sstevel@tonic-gate resultp->mmr_comm_state = MDMNE_ABORT; 21110Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 21120Sstevel@tonic-gate 
free_result(resultp); 21130Sstevel@tonic-gate svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 21140Sstevel@tonic-gate return (NULL); 21150Sstevel@tonic-gate } 21160Sstevel@tonic-gate 21170Sstevel@tonic-gate /* check if the global initialization is done */ 21180Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 21190Sstevel@tonic-gate global_init(); 21200Sstevel@tonic-gate } 21210Sstevel@tonic-gate 21220Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 21230Sstevel@tonic-gate "send: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n", 21240Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 21250Sstevel@tonic-gate 21260Sstevel@tonic-gate /* Check for verbosity related message */ 21270Sstevel@tonic-gate if (msg->msg_type == MD_MN_MSG_VERBOSITY) { 21280Sstevel@tonic-gate md_mn_verbose_t *d; 21290Sstevel@tonic-gate 21300Sstevel@tonic-gate d = (md_mn_verbose_t *)((void *)(msg->msg_event_data)); 21310Sstevel@tonic-gate md_commd_global_verb = d->mmv_what; 21320Sstevel@tonic-gate /* everytime the bitmask is set, we reset the timer */ 21330Sstevel@tonic-gate __savetime = gethrtime(); 21340Sstevel@tonic-gate /* 21350Sstevel@tonic-gate * If local-only-flag is set, we are done here, 21360Sstevel@tonic-gate * otherwise we pass that message on to the master. 21370Sstevel@tonic-gate */ 21380Sstevel@tonic-gate if (msg->msg_flags & MD_MSGF_LOCAL_ONLY) { 21390Sstevel@tonic-gate resultp = Zalloc(sizeof (md_mn_result_t)); 21400Sstevel@tonic-gate resultp->mmr_comm_state = MDMNE_ACK; 21410Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, 21420Sstevel@tonic-gate (char *)resultp); 21430Sstevel@tonic-gate free_result(resultp); 21440Sstevel@tonic-gate svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 21450Sstevel@tonic-gate return (NULL); 21460Sstevel@tonic-gate } 21470Sstevel@tonic-gate } 21480Sstevel@tonic-gate 21490Sstevel@tonic-gate /* 21500Sstevel@tonic-gate * Are we entering the abort state? 
21510Sstevel@tonic-gate * Here we don't even need to check for MD_MSGF_LOCAL_ONLY, because 21520Sstevel@tonic-gate * this message cannot be distributed anyway. 21530Sstevel@tonic-gate * So, it's safe to return immediately. 21540Sstevel@tonic-gate */ 21550Sstevel@tonic-gate if (msg->msg_type == MD_MN_MSG_ABORT) { 21560Sstevel@tonic-gate md_commd_global_state |= MD_CGS_ABORTED; 21570Sstevel@tonic-gate resultp = Zalloc(sizeof (md_mn_result_t)); 21580Sstevel@tonic-gate resultp->mmr_comm_state = MDMNE_ACK; 21590Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 21600Sstevel@tonic-gate free_result(resultp); 21610Sstevel@tonic-gate svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 21620Sstevel@tonic-gate return (NULL); 21630Sstevel@tonic-gate } 21640Sstevel@tonic-gate 21650Sstevel@tonic-gate 21660Sstevel@tonic-gate /* 21670Sstevel@tonic-gate * Is this message type blocked? 21680Sstevel@tonic-gate * If so we return MDMNE_CLASS_LOCKED, immediately 21690Sstevel@tonic-gate */ 21700Sstevel@tonic-gate if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) { 21710Sstevel@tonic-gate resultp = Zalloc(sizeof (md_mn_result_t)); 21720Sstevel@tonic-gate resultp->mmr_comm_state = MDMNE_CLASS_LOCKED; 21730Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 21740Sstevel@tonic-gate free_result(resultp); 21750Sstevel@tonic-gate svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 21760Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 21770Sstevel@tonic-gate "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, " 21780Sstevel@tonic-gate "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class, 21790Sstevel@tonic-gate msg->msg_type); 21800Sstevel@tonic-gate return (NULL); 21810Sstevel@tonic-gate } 21820Sstevel@tonic-gate 21830Sstevel@tonic-gate 21840Sstevel@tonic-gate if (md_mn_set_inited[setno] != MDMN_SET_READY) { 21850Sstevel@tonic-gate /* Can only use the appropriate mutexes if they are inited */ 21860Sstevel@tonic-gate if 
(md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 21870Sstevel@tonic-gate rw_wrlock(&set_desc_rwlock[setno]); 21880Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 21890Sstevel@tonic-gate err = mdmn_init_set(setno, MDMN_SET_READY); 21900Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 21910Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 21920Sstevel@tonic-gate } else { 21930Sstevel@tonic-gate err = mdmn_init_set(setno, MDMN_SET_READY); 21940Sstevel@tonic-gate } 21950Sstevel@tonic-gate 21960Sstevel@tonic-gate if (err) { 21970Sstevel@tonic-gate /* couldn't initialize connections, cannot proceed */ 21980Sstevel@tonic-gate resultp = Zalloc(sizeof (md_mn_result_t)); 21990Sstevel@tonic-gate resultp->mmr_comm_state = err; 22000Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, 22010Sstevel@tonic-gate (char *)resultp); 22020Sstevel@tonic-gate svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 22030Sstevel@tonic-gate free_result(resultp); 22040Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 22050Sstevel@tonic-gate "send: init err = %d\n", err); 22060Sstevel@tonic-gate return (NULL); 22070Sstevel@tonic-gate } 22080Sstevel@tonic-gate } 22090Sstevel@tonic-gate 22100Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 22110Sstevel@tonic-gate if ((mdmn_is_class_suspended(setno, class) == TRUE) && 22120Sstevel@tonic-gate ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) { 22130Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 22140Sstevel@tonic-gate resultp = Zalloc(sizeof (md_mn_result_t)); 22150Sstevel@tonic-gate resultp->mmr_comm_state = MDMNE_SUSPENDED; 22160Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 22170Sstevel@tonic-gate svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 22180Sstevel@tonic-gate free_result(resultp); 22190Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 22200Sstevel@tonic-gate "send: class suspended (%d, 0x%llx-%d), set=%d, " 22210Sstevel@tonic-gate "class=%d, type=%d\n", 
MSGID_ELEMS(msg->msg_msgid), 22220Sstevel@tonic-gate setno, class, msg->msg_type); 22230Sstevel@tonic-gate return (NULL); 22240Sstevel@tonic-gate } 22250Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 22260Sstevel@tonic-gate 22270Sstevel@tonic-gate /* is this rpc request coming from the local node? */ 22280Sstevel@tonic-gate if (check_license(rqstp, 0) == FALSE) { 22290Sstevel@tonic-gate svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 22300Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 22310Sstevel@tonic-gate "send: check licence fail(%d, 0x%llx-%d), set=%d, " 22320Sstevel@tonic-gate "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), 22330Sstevel@tonic-gate setno, class, msg->msg_type); 22340Sstevel@tonic-gate return (NULL); 22350Sstevel@tonic-gate } 22360Sstevel@tonic-gate 22370Sstevel@tonic-gate 22380Sstevel@tonic-gate /* 22390Sstevel@tonic-gate * We allocate a structure that can take two pointers in order to pass 22400Sstevel@tonic-gate * both the message and the transp into thread_create. 22410Sstevel@tonic-gate * The free for this alloc is done in mdmn_send_to_work() 22420Sstevel@tonic-gate */ 22430Sstevel@tonic-gate matp = Malloc(sizeof (md_mn_msg_and_transp_t)); 22440Sstevel@tonic-gate matp->mat_msg = msg; 22450Sstevel@tonic-gate matp->mat_transp = transp; 22460Sstevel@tonic-gate 22470Sstevel@tonic-gate /* 22480Sstevel@tonic-gate * create a thread here that calls work on the master. 22490Sstevel@tonic-gate * If we are already on the master, this would block if running 22500Sstevel@tonic-gate * in the same context. 
(our service is single threaded)( 22510Sstevel@tonic-gate * Make it a detached thread because it will not communicate with 22520Sstevel@tonic-gate * anybody thru thr_* mechanisms 22530Sstevel@tonic-gate */ 22540Sstevel@tonic-gate thr_create(NULL, 0, mdmn_send_to_work, (void *) matp, THR_DETACHED, 22550Sstevel@tonic-gate NULL); 22560Sstevel@tonic-gate 22570Sstevel@tonic-gate commd_debug(MD_MMV_SEND, "send: done (%d, 0x%llx-%d)\n", 22580Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid)); 22590Sstevel@tonic-gate /* 22600Sstevel@tonic-gate * We return here without sending results. This will be done by 22610Sstevel@tonic-gate * mdmn_wakeup_initiator_svc_1() as soon as the results are available. 22620Sstevel@tonic-gate * Until then the calling send_message will be blocked, while we 22630Sstevel@tonic-gate * are able to take calls. 22640Sstevel@tonic-gate */ 22650Sstevel@tonic-gate 22660Sstevel@tonic-gate return (NULL); 22670Sstevel@tonic-gate } 22680Sstevel@tonic-gate 22690Sstevel@tonic-gate /* ARGSUSED */ 22700Sstevel@tonic-gate int * 22710Sstevel@tonic-gate mdmn_work_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp) 22720Sstevel@tonic-gate { 22730Sstevel@tonic-gate int err; 22740Sstevel@tonic-gate set_t setno; 22750Sstevel@tonic-gate thread_t tid; 22760Sstevel@tonic-gate int *retval; 22770Sstevel@tonic-gate md_mn_msg_t *msg; 22780Sstevel@tonic-gate md_mn_msgclass_t class; 22790Sstevel@tonic-gate 22800Sstevel@tonic-gate retval = Malloc(sizeof (int)); 22810Sstevel@tonic-gate 22820Sstevel@tonic-gate /* If we are in the abort state, we error out immediately */ 22830Sstevel@tonic-gate if (md_commd_global_state & MD_CGS_ABORTED) { 22840Sstevel@tonic-gate xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 22850Sstevel@tonic-gate *retval = MDMNE_ABORT; 22860Sstevel@tonic-gate return (retval); 22870Sstevel@tonic-gate } 22880Sstevel@tonic-gate 22890Sstevel@tonic-gate msg = copy_msg(omsg, NULL); 22900Sstevel@tonic-gate xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 22910Sstevel@tonic-gate 
22920Sstevel@tonic-gate /* 22930Sstevel@tonic-gate * Is this message type blocked? 22940Sstevel@tonic-gate * If so we return MDMNE_CLASS_LOCKED, immediately. 22950Sstevel@tonic-gate * This check is performed on master and slave. 22960Sstevel@tonic-gate */ 22970Sstevel@tonic-gate if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) { 22980Sstevel@tonic-gate *retval = MDMNE_CLASS_LOCKED; 22990Sstevel@tonic-gate return (retval); 23000Sstevel@tonic-gate } 23010Sstevel@tonic-gate 23020Sstevel@tonic-gate /* check if the global initialization is done */ 23030Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 23040Sstevel@tonic-gate global_init(); 23050Sstevel@tonic-gate } 23060Sstevel@tonic-gate 23070Sstevel@tonic-gate class = mdmn_get_message_class(msg->msg_type); 23080Sstevel@tonic-gate setno = msg->msg_setno; 23090Sstevel@tonic-gate 23100Sstevel@tonic-gate if (md_mn_set_inited[setno] != MDMN_SET_READY) { 23110Sstevel@tonic-gate /* Can only use the appropriate mutexes if they are inited */ 23120Sstevel@tonic-gate if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 23130Sstevel@tonic-gate rw_wrlock(&set_desc_rwlock[setno]); 23140Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 23150Sstevel@tonic-gate err = mdmn_init_set(setno, MDMN_SET_READY); 23160Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 23170Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 23180Sstevel@tonic-gate } else { 23190Sstevel@tonic-gate err = mdmn_init_set(setno, MDMN_SET_READY); 23200Sstevel@tonic-gate } 23210Sstevel@tonic-gate 23220Sstevel@tonic-gate if (err) { 23230Sstevel@tonic-gate *retval = MDMNE_CANNOT_CONNECT; 23240Sstevel@tonic-gate free_msg(msg); 23250Sstevel@tonic-gate return (retval); 23260Sstevel@tonic-gate } 23270Sstevel@tonic-gate } 23280Sstevel@tonic-gate 23290Sstevel@tonic-gate /* is this rpc request coming from a licensed node? 
*/ 23300Sstevel@tonic-gate if (check_license(rqstp, msg->msg_sender) == FALSE) { 23310Sstevel@tonic-gate free_msg(msg); 23320Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 23330Sstevel@tonic-gate return (retval); 23340Sstevel@tonic-gate } 23350Sstevel@tonic-gate 23360Sstevel@tonic-gate commd_debug(MD_MMV_WORK, 23370Sstevel@tonic-gate "work: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d, " 23380Sstevel@tonic-gate "flags=0x%x\n", 23390Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type, 23400Sstevel@tonic-gate msg->msg_flags); 23410Sstevel@tonic-gate 23420Sstevel@tonic-gate /* Check for various CLASS0 message types */ 23430Sstevel@tonic-gate if (msg->msg_type == MD_MN_MSG_VERBOSITY) { 23440Sstevel@tonic-gate md_mn_verbose_t *d; 23450Sstevel@tonic-gate 23460Sstevel@tonic-gate d = (md_mn_verbose_t *)((void *)(msg->msg_event_data)); 23470Sstevel@tonic-gate /* for now we ignore set / class in md_mn_verbose_t */ 23480Sstevel@tonic-gate md_commd_global_verb = d->mmv_what; 23490Sstevel@tonic-gate /* everytime the bitmask is set, we reset the timer */ 23500Sstevel@tonic-gate __savetime = gethrtime(); 23510Sstevel@tonic-gate } 23520Sstevel@tonic-gate 23530Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 23540Sstevel@tonic-gate 23550Sstevel@tonic-gate /* check if class is locked via a call to mdmn_comm_lock_svc_1 */ 23560Sstevel@tonic-gate if (mdmn_is_class_locked(setno, class) == TRUE) { 23570Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 23580Sstevel@tonic-gate *retval = MDMNE_CLASS_LOCKED; 23590Sstevel@tonic-gate free_msg(msg); 23600Sstevel@tonic-gate return (retval); 23610Sstevel@tonic-gate } 23620Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 23630Sstevel@tonic-gate 23640Sstevel@tonic-gate /* Check if the class is busy right now. 
Do it only on the master */ 23650Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); 23660Sstevel@tonic-gate if (set_descriptor[setno]->sd_mn_am_i_master) { 23670Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 23680Sstevel@tonic-gate /* 23690Sstevel@tonic-gate * If the class is currently suspended, don't accept new 23700Sstevel@tonic-gate * messages, unless they are flagged with an override bit. 23710Sstevel@tonic-gate */ 23720Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 23730Sstevel@tonic-gate if ((mdmn_is_class_suspended(setno, class) == TRUE) && 23740Sstevel@tonic-gate ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) { 23750Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 23760Sstevel@tonic-gate *retval = MDMNE_SUSPENDED; 23770Sstevel@tonic-gate commd_debug(MD_MMV_SEND, 23780Sstevel@tonic-gate "send: set %d is suspended\n", setno); 23790Sstevel@tonic-gate free_msg(msg); 23800Sstevel@tonic-gate return (retval); 23810Sstevel@tonic-gate } 23820Sstevel@tonic-gate if (mdmn_mark_class_busy(setno, class) == FALSE) { 23830Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 23840Sstevel@tonic-gate *retval = MDMNE_CLASS_BUSY; 23850Sstevel@tonic-gate free_msg(msg); 23860Sstevel@tonic-gate return (retval); 23870Sstevel@tonic-gate } 23880Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 23890Sstevel@tonic-gate /* 23900Sstevel@tonic-gate * Because the real processing of the message takes time we 23910Sstevel@tonic-gate * create a thread for it. So the master thread can continue 23920Sstevel@tonic-gate * to run and accept further messages. 
23930Sstevel@tonic-gate */ 23940Sstevel@tonic-gate *retval = thr_create(NULL, 0, 23950Sstevel@tonic-gate (void *(*)(void *))mdmn_master_process_msg, (void *)msg, 23960Sstevel@tonic-gate THR_DETACHED|THR_SUSPENDED, &tid); 23970Sstevel@tonic-gate } else { 23980Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 23990Sstevel@tonic-gate *retval = thr_create(NULL, 0, 24000Sstevel@tonic-gate (void *(*)(void *)) mdmn_slave_process_msg, (void *)msg, 24010Sstevel@tonic-gate THR_DETACHED|THR_SUSPENDED, &tid); 24020Sstevel@tonic-gate } 24030Sstevel@tonic-gate 24040Sstevel@tonic-gate if (*retval != 0) { 24050Sstevel@tonic-gate *retval = MDMNE_THR_CREATE_FAIL; 24060Sstevel@tonic-gate free_msg(msg); 24070Sstevel@tonic-gate return (retval); 24080Sstevel@tonic-gate } 24090Sstevel@tonic-gate 24100Sstevel@tonic-gate /* Now run the new thread */ 24110Sstevel@tonic-gate thr_continue(tid); 24120Sstevel@tonic-gate 24130Sstevel@tonic-gate commd_debug(MD_MMV_WORK, 24140Sstevel@tonic-gate "work: done (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n", 24150Sstevel@tonic-gate MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 24160Sstevel@tonic-gate 24170Sstevel@tonic-gate *retval = MDMNE_ACK; /* this means success */ 24180Sstevel@tonic-gate return (retval); 24190Sstevel@tonic-gate } 24200Sstevel@tonic-gate 24210Sstevel@tonic-gate /* ARGSUSED */ 24220Sstevel@tonic-gate int * 24230Sstevel@tonic-gate mdmn_wakeup_initiator_svc_1(md_mn_result_t *res, struct svc_req *rqstp) 24240Sstevel@tonic-gate { 24250Sstevel@tonic-gate 24260Sstevel@tonic-gate int *retval; 24270Sstevel@tonic-gate int err; 24280Sstevel@tonic-gate set_t setno; 24290Sstevel@tonic-gate mutex_t *mx; /* protection of initiator_table */ 24300Sstevel@tonic-gate SVCXPRT *transp; 24310Sstevel@tonic-gate md_mn_msgid_t initiator_table_id; 24320Sstevel@tonic-gate md_mn_msgclass_t class; 24330Sstevel@tonic-gate 24340Sstevel@tonic-gate retval = Malloc(sizeof (int)); 24350Sstevel@tonic-gate 24360Sstevel@tonic-gate /* check if the 
global initialization is done */ 24370Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 24380Sstevel@tonic-gate global_init(); 24390Sstevel@tonic-gate } 24400Sstevel@tonic-gate 24410Sstevel@tonic-gate setno = res->mmr_setno; 24420Sstevel@tonic-gate 24430Sstevel@tonic-gate if (md_mn_set_inited[setno] != MDMN_SET_READY) { 24440Sstevel@tonic-gate /* set not ready means we just crashed are restarted now */ 24450Sstevel@tonic-gate /* Can only use the appropriate mutexes if they are inited */ 24460Sstevel@tonic-gate if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 24470Sstevel@tonic-gate rw_wrlock(&set_desc_rwlock[setno]); 24480Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 24490Sstevel@tonic-gate err = mdmn_init_set(setno, MDMN_SET_READY); 24500Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 24510Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 24520Sstevel@tonic-gate } else { 24530Sstevel@tonic-gate err = mdmn_init_set(setno, MDMN_SET_READY); 24540Sstevel@tonic-gate } 24550Sstevel@tonic-gate 24560Sstevel@tonic-gate if (err) { 24570Sstevel@tonic-gate *retval = MDMNE_CANNOT_CONNECT; 24580Sstevel@tonic-gate xdr_free(xdr_md_mn_result_t, (caddr_t)res); 24590Sstevel@tonic-gate return (retval); 24600Sstevel@tonic-gate } 24610Sstevel@tonic-gate } 24620Sstevel@tonic-gate 24630Sstevel@tonic-gate /* is this rpc request coming from a licensed node? 
*/ 24640Sstevel@tonic-gate if (check_license(rqstp, res->mmr_sender) == FALSE) { 24650Sstevel@tonic-gate xdr_free(xdr_md_mn_result_t, (caddr_t)res); 24660Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 24670Sstevel@tonic-gate return (retval); 24680Sstevel@tonic-gate } 24690Sstevel@tonic-gate 24700Sstevel@tonic-gate 24710Sstevel@tonic-gate class = mdmn_get_message_class(res->mmr_msgtype); 24720Sstevel@tonic-gate mx = mdmn_get_initiator_table_mx(setno, class); 24730Sstevel@tonic-gate 24740Sstevel@tonic-gate commd_debug(MD_MMV_WAKE_I, 24750Sstevel@tonic-gate "wake_ini: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 24760Sstevel@tonic-gate MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype); 24770Sstevel@tonic-gate 24780Sstevel@tonic-gate mutex_lock(mx); 24790Sstevel@tonic-gate 24800Sstevel@tonic-gate /* 24810Sstevel@tonic-gate * Search the initiator wakeup table. 24820Sstevel@tonic-gate * If we find an entry here (which should always be true) 24830Sstevel@tonic-gate * we are on the initiating node and we wakeup the original 24840Sstevel@tonic-gate * local rpc call 24850Sstevel@tonic-gate */ 24860Sstevel@tonic-gate mdmn_get_initiator_table_id(setno, class, &initiator_table_id); 24870Sstevel@tonic-gate 24880Sstevel@tonic-gate if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) { 24890Sstevel@tonic-gate transp = mdmn_get_initiator_table_transp(setno, class); 24900Sstevel@tonic-gate mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res); 24910Sstevel@tonic-gate mdmn_unregister_initiator_table(setno, class); 24920Sstevel@tonic-gate *retval = MDMNE_ACK; 24930Sstevel@tonic-gate 24940Sstevel@tonic-gate commd_debug(MD_MMV_WAKE_I, 24950Sstevel@tonic-gate "wake_ini: replied (%d, 0x%llx-%d)\n", 24960Sstevel@tonic-gate MSGID_ELEMS(res->mmr_msgid)); 24970Sstevel@tonic-gate } else { 24980Sstevel@tonic-gate commd_debug(MD_MMV_WAKE_I, 24990Sstevel@tonic-gate "wakeup initiator: unsolicited message (%d, 0x%llx-%d)\n", 25000Sstevel@tonic-gate 
MSGID_ELEMS(res->mmr_msgid)); 25010Sstevel@tonic-gate *retval = MDMNE_NO_WAKEUP_ENTRY; 25020Sstevel@tonic-gate } 25030Sstevel@tonic-gate mutex_unlock(mx); 25040Sstevel@tonic-gate /* less work for check_timeouts */ 25050Sstevel@tonic-gate mutex_lock(&check_timeout_mutex); 25060Sstevel@tonic-gate if (messages_on_their_way == 0) { 25070Sstevel@tonic-gate commd_debug(MD_MMV_WAKE_I, 25080Sstevel@tonic-gate "Oops, messages_on_their_way < 0 (%d, 0x%llx-%d)\n", 25090Sstevel@tonic-gate MSGID_ELEMS(res->mmr_msgid)); 25100Sstevel@tonic-gate } else { 25110Sstevel@tonic-gate messages_on_their_way--; 25120Sstevel@tonic-gate } 25130Sstevel@tonic-gate mutex_unlock(&check_timeout_mutex); 25140Sstevel@tonic-gate xdr_free(xdr_md_mn_result_t, (caddr_t)res); 25150Sstevel@tonic-gate 25160Sstevel@tonic-gate return (retval); 25170Sstevel@tonic-gate } 25180Sstevel@tonic-gate 25190Sstevel@tonic-gate 25200Sstevel@tonic-gate /* 25210Sstevel@tonic-gate * res must be free'd by the thread we wake up 25220Sstevel@tonic-gate */ 25230Sstevel@tonic-gate /* ARGSUSED */ 25240Sstevel@tonic-gate int * 25250Sstevel@tonic-gate mdmn_wakeup_master_svc_1(md_mn_result_t *ores, struct svc_req *rqstp) 25260Sstevel@tonic-gate { 25270Sstevel@tonic-gate 25280Sstevel@tonic-gate int *retval; 25290Sstevel@tonic-gate int err; 25300Sstevel@tonic-gate set_t setno; 25310Sstevel@tonic-gate cond_t *cv; 25320Sstevel@tonic-gate mutex_t *mx; 25330Sstevel@tonic-gate md_mn_msgid_t master_table_id; 25340Sstevel@tonic-gate md_mn_nodeid_t sender; 25350Sstevel@tonic-gate md_mn_result_t *res; 25360Sstevel@tonic-gate md_mn_msgclass_t class; 25370Sstevel@tonic-gate 25380Sstevel@tonic-gate retval = Malloc(sizeof (int)); 25390Sstevel@tonic-gate 25400Sstevel@tonic-gate /* check if the global initialization is done */ 25410Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 25420Sstevel@tonic-gate global_init(); 25430Sstevel@tonic-gate } 25440Sstevel@tonic-gate 25450Sstevel@tonic-gate /* Need to copy the results here, 
as they are static for RPC */ 25460Sstevel@tonic-gate res = copy_result(ores); 25470Sstevel@tonic-gate xdr_free(xdr_md_mn_result_t, (caddr_t)ores); 25480Sstevel@tonic-gate 25490Sstevel@tonic-gate class = mdmn_get_message_class(res->mmr_msgtype); 25500Sstevel@tonic-gate setno = res->mmr_setno; 25510Sstevel@tonic-gate 25520Sstevel@tonic-gate if (md_mn_set_inited[setno] != MDMN_SET_READY) { 25530Sstevel@tonic-gate /* set not ready means we just crashed are restarted now */ 25540Sstevel@tonic-gate /* Can only use the appropriate mutexes if they are inited */ 25550Sstevel@tonic-gate if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 25560Sstevel@tonic-gate rw_wrlock(&set_desc_rwlock[setno]); 25570Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 25580Sstevel@tonic-gate err = mdmn_init_set(setno, MDMN_SET_READY); 25590Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 25600Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 25610Sstevel@tonic-gate } else { 25620Sstevel@tonic-gate err = mdmn_init_set(setno, MDMN_SET_READY); 25630Sstevel@tonic-gate } 25640Sstevel@tonic-gate 25650Sstevel@tonic-gate if (err) { 25660Sstevel@tonic-gate *retval = MDMNE_CANNOT_CONNECT; 25670Sstevel@tonic-gate xdr_free(xdr_md_mn_result_t, (caddr_t)res); 25680Sstevel@tonic-gate return (retval); 25690Sstevel@tonic-gate } 25700Sstevel@tonic-gate } 25710Sstevel@tonic-gate 25720Sstevel@tonic-gate /* is this rpc request coming from a licensed node? 
*/ 25730Sstevel@tonic-gate if (check_license(rqstp, res->mmr_sender) == FALSE) { 25740Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 25750Sstevel@tonic-gate xdr_free(xdr_md_mn_result_t, (caddr_t)res); 25760Sstevel@tonic-gate return (retval); 25770Sstevel@tonic-gate } 25780Sstevel@tonic-gate 25790Sstevel@tonic-gate 25800Sstevel@tonic-gate commd_debug(MD_MMV_WAKE_M, 25810Sstevel@tonic-gate "wake_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d " 25820Sstevel@tonic-gate "from %d\n", 25830Sstevel@tonic-gate MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype, 25840Sstevel@tonic-gate res->mmr_sender); 25850Sstevel@tonic-gate /* 25860Sstevel@tonic-gate * The mutex and cv are needed for waking up the thread 25870Sstevel@tonic-gate * sleeping in mdmn_master_process_msg() 25880Sstevel@tonic-gate */ 25890Sstevel@tonic-gate mx = mdmn_get_master_table_mx(setno, class); 25900Sstevel@tonic-gate cv = mdmn_get_master_table_cv(setno, class); 25910Sstevel@tonic-gate 25920Sstevel@tonic-gate /* 25930Sstevel@tonic-gate * lookup the master wakeup table 25940Sstevel@tonic-gate * If we find our message, we are on the master and 25950Sstevel@tonic-gate * called by a slave that finished processing a message. 25960Sstevel@tonic-gate * We store the results in the appropriate slot and 25970Sstevel@tonic-gate * wakeup the thread (mdmn_master_process_msg()) waiting for them. 
25980Sstevel@tonic-gate */ 25990Sstevel@tonic-gate mutex_lock(mx); 26000Sstevel@tonic-gate mdmn_get_master_table_id(setno, class, &master_table_id); 26010Sstevel@tonic-gate sender = mdmn_get_master_table_addr(setno, class); 26020Sstevel@tonic-gate 26030Sstevel@tonic-gate if (MSGID_CMP(&(master_table_id), &(res->mmr_msgid))) { 26040Sstevel@tonic-gate if (sender == res->mmr_sender) { 26050Sstevel@tonic-gate mdmn_set_master_table_res(setno, class, res); 26060Sstevel@tonic-gate cond_signal(cv); 26070Sstevel@tonic-gate *retval = MDMNE_ACK; 26080Sstevel@tonic-gate } else { 26090Sstevel@tonic-gate /* id is correct but wrong sender (I smell a timeout) */ 26100Sstevel@tonic-gate commd_debug(MD_MMV_WAKE_M, 26110Sstevel@tonic-gate "wakeup master got unsolicited message: " 26120Sstevel@tonic-gate "(%d, 0x%llx-%d) from %d\n", 26130Sstevel@tonic-gate MSGID_ELEMS(res->mmr_msgid), res->mmr_sender); 26140Sstevel@tonic-gate free_result(res); 26150Sstevel@tonic-gate *retval = MDMNE_TIMEOUT; 26160Sstevel@tonic-gate } 26170Sstevel@tonic-gate } else { 26180Sstevel@tonic-gate /* id is wrong, smells like a very late timeout */ 26190Sstevel@tonic-gate commd_debug(MD_MMV_WAKE_M, 26200Sstevel@tonic-gate "wakeup master got unsolicited message: " 26210Sstevel@tonic-gate "(%d, 0x%llx-%d) from %d, expected (%d, 0x%llx-%d)\n", 26220Sstevel@tonic-gate MSGID_ELEMS(res->mmr_msgid), res->mmr_sender, 26230Sstevel@tonic-gate MSGID_ELEMS(master_table_id)); 26240Sstevel@tonic-gate free_result(res); 26250Sstevel@tonic-gate *retval = MDMNE_NO_WAKEUP_ENTRY; 26260Sstevel@tonic-gate } 26270Sstevel@tonic-gate 26280Sstevel@tonic-gate mutex_unlock(mx); 26290Sstevel@tonic-gate 26300Sstevel@tonic-gate return (retval); 26310Sstevel@tonic-gate } 26320Sstevel@tonic-gate 26330Sstevel@tonic-gate /* 26340Sstevel@tonic-gate * Lock a set/class combination. 26350Sstevel@tonic-gate * This is mainly done for debug purpose. 
26360Sstevel@tonic-gate * This set/class combination immediately is blocked, 26370Sstevel@tonic-gate * even in the middle of sending messages to multiple slaves. 26380Sstevel@tonic-gate * This remains until the user issues a mdmn_comm_unlock_svc_1 for the same 26390Sstevel@tonic-gate * set/class combination. 26400Sstevel@tonic-gate * 26410Sstevel@tonic-gate * Special messages of class MD_MSG_CLASS0 can never be locked. 26420Sstevel@tonic-gate * e.g. MD_MN_MSG_VERBOSITY, MD_MN_MSG_ABORT 26430Sstevel@tonic-gate * 26440Sstevel@tonic-gate * That means, if MD_MSG_CLASS0 is specified, we lock all classes from 26450Sstevel@tonic-gate * >= MD_MSG_CLASS1 to < MD_MN_NCLASSES 26460Sstevel@tonic-gate * 26470Sstevel@tonic-gate * set must be between 1 and MD_MAXSETS 26480Sstevel@tonic-gate * class can be: 26490Sstevel@tonic-gate * MD_MSG_CLASS0 which means all other classes in this case 26500Sstevel@tonic-gate * or one specific class (< MD_MN_NCLASSES) 26510Sstevel@tonic-gate * 26520Sstevel@tonic-gate * Returns: 26530Sstevel@tonic-gate * MDMNE_ACK on sucess (locking a locked class is Ok) 26540Sstevel@tonic-gate * MDMNE_EINVAL if a parameter is out of range 26550Sstevel@tonic-gate */ 26560Sstevel@tonic-gate 26570Sstevel@tonic-gate /* ARGSUSED */ 26580Sstevel@tonic-gate int * 26590Sstevel@tonic-gate mdmn_comm_lock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 26600Sstevel@tonic-gate { 26610Sstevel@tonic-gate int *retval; 26620Sstevel@tonic-gate set_t setno = msc->msc_set; 26630Sstevel@tonic-gate md_mn_msgclass_t class = msc->msc_class; 26640Sstevel@tonic-gate 26650Sstevel@tonic-gate retval = Malloc(sizeof (int)); 26660Sstevel@tonic-gate 26670Sstevel@tonic-gate /* check if the global initialization is done */ 26680Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 26690Sstevel@tonic-gate global_init(); 26700Sstevel@tonic-gate } 26710Sstevel@tonic-gate 26720Sstevel@tonic-gate /* is this rpc request coming from the local node ? 
*/ 26730Sstevel@tonic-gate if (check_license(rqstp, 0) == FALSE) { 26740Sstevel@tonic-gate xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 26750Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 26760Sstevel@tonic-gate return (retval); 26770Sstevel@tonic-gate } 26780Sstevel@tonic-gate 26790Sstevel@tonic-gate /* Perform some range checking */ 26800Sstevel@tonic-gate if ((setno == 0) || (setno >= MD_MAXSETS) || 26810Sstevel@tonic-gate (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) { 26820Sstevel@tonic-gate *retval = MDMNE_EINVAL; 26830Sstevel@tonic-gate return (retval); 26840Sstevel@tonic-gate } 26850Sstevel@tonic-gate 26860Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "lock: set=%d, class=%d\n", setno, class); 26870Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 26880Sstevel@tonic-gate if (class != MD_MSG_CLASS0) { 26890Sstevel@tonic-gate mdmn_mark_class_locked(setno, class); 26900Sstevel@tonic-gate } else { 26910Sstevel@tonic-gate /* MD_MSG_CLASS0 is used as a wild card for all classes */ 26920Sstevel@tonic-gate for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 26930Sstevel@tonic-gate mdmn_mark_class_locked(setno, class); 26940Sstevel@tonic-gate } 26950Sstevel@tonic-gate } 26960Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 26970Sstevel@tonic-gate 26980Sstevel@tonic-gate *retval = MDMNE_ACK; 26990Sstevel@tonic-gate return (retval); 27000Sstevel@tonic-gate } 27010Sstevel@tonic-gate 27020Sstevel@tonic-gate /* 27030Sstevel@tonic-gate * Unlock a set/class combination. 
27040Sstevel@tonic-gate * set must be between 1 and MD_MAXSETS 27050Sstevel@tonic-gate * class can be: 27060Sstevel@tonic-gate * MD_MSG_CLASS0 which means all other classes in this case (like above) 27070Sstevel@tonic-gate * or one specific class (< MD_MN_NCLASSES) 27080Sstevel@tonic-gate * 27090Sstevel@tonic-gate * Returns: 27100Sstevel@tonic-gate * MDMNE_ACK on sucess (unlocking an unlocked class is Ok) 27110Sstevel@tonic-gate * MDMNE_EINVAL if a parameter is out of range 27120Sstevel@tonic-gate */ 27130Sstevel@tonic-gate /* ARGSUSED */ 27140Sstevel@tonic-gate int * 27150Sstevel@tonic-gate mdmn_comm_unlock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 27160Sstevel@tonic-gate { 27170Sstevel@tonic-gate int *retval; 27180Sstevel@tonic-gate set_t setno = msc->msc_set; 27190Sstevel@tonic-gate md_mn_msgclass_t class = msc->msc_class; 27200Sstevel@tonic-gate 27210Sstevel@tonic-gate retval = Malloc(sizeof (int)); 27220Sstevel@tonic-gate 27230Sstevel@tonic-gate /* check if the global initialization is done */ 27240Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 27250Sstevel@tonic-gate global_init(); 27260Sstevel@tonic-gate } 27270Sstevel@tonic-gate 27280Sstevel@tonic-gate /* is this rpc request coming from the local node ? 
*/ 27290Sstevel@tonic-gate if (check_license(rqstp, 0) == FALSE) { 27300Sstevel@tonic-gate xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 27310Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 27320Sstevel@tonic-gate return (retval); 27330Sstevel@tonic-gate } 27340Sstevel@tonic-gate 27350Sstevel@tonic-gate /* Perform some range checking */ 27360Sstevel@tonic-gate if ((setno == 0) || (setno >= MD_MAXSETS) || 27370Sstevel@tonic-gate (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) { 27380Sstevel@tonic-gate *retval = MDMNE_EINVAL; 27390Sstevel@tonic-gate return (retval); 27400Sstevel@tonic-gate } 27410Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "unlock: set=%d, class=%d\n", setno, class); 27420Sstevel@tonic-gate 27430Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 27440Sstevel@tonic-gate if (class != MD_MSG_CLASS0) { 27450Sstevel@tonic-gate mdmn_mark_class_unlocked(setno, class); 27460Sstevel@tonic-gate } else { 27470Sstevel@tonic-gate /* MD_MSG_CLASS0 is used as a wild card for all classes */ 27480Sstevel@tonic-gate for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 27490Sstevel@tonic-gate mdmn_mark_class_unlocked(setno, class); 27500Sstevel@tonic-gate } 27510Sstevel@tonic-gate } 27520Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 27530Sstevel@tonic-gate 27540Sstevel@tonic-gate *retval = MDMNE_ACK; 27550Sstevel@tonic-gate return (retval); 27560Sstevel@tonic-gate } 27570Sstevel@tonic-gate 27580Sstevel@tonic-gate /* 27590Sstevel@tonic-gate * mdmn_comm_suspend_svc_1(setno, class) 27600Sstevel@tonic-gate * 27610Sstevel@tonic-gate * Drain all outstanding messages for a given set/class combination 27620Sstevel@tonic-gate * and don't allow new messages to be processed. 27630Sstevel@tonic-gate * 27640Sstevel@tonic-gate * Special messages of class MD_MSG_CLASS0 can never be locked. 27650Sstevel@tonic-gate * e.g. 
MD_MN_MSG_VERBOSITY 27660Sstevel@tonic-gate * 27670Sstevel@tonic-gate * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS 27680Sstevel@tonic-gate * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES 27690Sstevel@tonic-gate * 27700Sstevel@tonic-gate * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this 27710Sstevel@tonic-gate * one class as being suspended. 27720Sstevel@tonic-gate * If messages for this class are currently on their way, 27730Sstevel@tonic-gate * MDMNE_SET_NOT_DRAINED is returned. Otherwise MDMNE_ACK is returned. 27740Sstevel@tonic-gate * 27750Sstevel@tonic-gate * If class _is_ MD_COMM_ALL_CLASSES we drain all classes of this set. 27760Sstevel@tonic-gate * Messages must be generated in ascending order. 27770Sstevel@tonic-gate * This means, a message cannot create submessages with the same or lower class. 27780Sstevel@tonic-gate * Draining messages must go from 1 to NCLASSES in order to ensure we don't 27790Sstevel@tonic-gate * generate a hanging situation here. 27800Sstevel@tonic-gate * We mark class 1 as being suspended. 27810Sstevel@tonic-gate * if the class is not busy, we proceed with class 2 27820Sstevel@tonic-gate * and so on 27830Sstevel@tonic-gate * if a class *is* busy, we cannot continue here, but return 27840Sstevel@tonic-gate * MDMNE_SET_NOT_DRAINED. 27850Sstevel@tonic-gate * We expect the caller to hold on for some seconds and try again. 27860Sstevel@tonic-gate * When that message, that held the class busy is done in 27870Sstevel@tonic-gate * mdmn_master_process_msg(), mdmn_mark_class_unbusy() called. 27880Sstevel@tonic-gate * There it is checked if the class is about to drain. 27890Sstevel@tonic-gate * In that case it tries to drain all higher classes there. 27900Sstevel@tonic-gate * 27910Sstevel@tonic-gate * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets. 
27920Sstevel@tonic-gate * In that case we return MDMNE_SET_NOT_DRAINED if not all sets are 27930Sstevel@tonic-gate * completely drained. 27940Sstevel@tonic-gate * 27950Sstevel@tonic-gate * Returns: 27960Sstevel@tonic-gate * MDMNE_ACK on sucess (set is drained, no outstanding messages) 27970Sstevel@tonic-gate * MDMNE_SET_NOT_DRAINED if drain process is started, but there are 27980Sstevel@tonic-gate * still outstanding messages for this set(s) 27990Sstevel@tonic-gate * MDMNE_EINVAL if setno is out of range 28000Sstevel@tonic-gate * MDMNE_NOT_JOINED if the set is not yet initialized on this node 28010Sstevel@tonic-gate */ 28020Sstevel@tonic-gate 28030Sstevel@tonic-gate /* ARGSUSED */ 28040Sstevel@tonic-gate int * 28050Sstevel@tonic-gate mdmn_comm_suspend_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 28060Sstevel@tonic-gate { 28070Sstevel@tonic-gate int *retval; 28080Sstevel@tonic-gate int failure = 0; 28090Sstevel@tonic-gate set_t startset, endset; 28100Sstevel@tonic-gate set_t setno = msc->msc_set; 28110Sstevel@tonic-gate md_mn_msgclass_t oclass = msc->msc_class; 28120Sstevel@tonic-gate #ifdef NOT_YET_NEEDED 28130Sstevel@tonic-gate uint_t flags = msc->msc_flags; 28140Sstevel@tonic-gate #endif /* NOT_YET_NEEDED */ 28150Sstevel@tonic-gate md_mn_msgclass_t class; 28160Sstevel@tonic-gate 28170Sstevel@tonic-gate retval = Malloc(sizeof (int)); 28180Sstevel@tonic-gate 28190Sstevel@tonic-gate /* check if the global initialization is done */ 28200Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 28210Sstevel@tonic-gate global_init(); 28220Sstevel@tonic-gate } 28230Sstevel@tonic-gate 28240Sstevel@tonic-gate /* is this rpc request coming from the local node ? 
*/ 28250Sstevel@tonic-gate if (check_license(rqstp, 0) == FALSE) { 28260Sstevel@tonic-gate xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 28270Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 28280Sstevel@tonic-gate return (retval); 28290Sstevel@tonic-gate } 28300Sstevel@tonic-gate 28310Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "suspend: called for set=%d class=%d\n", 28320Sstevel@tonic-gate setno, oclass); 28330Sstevel@tonic-gate 28340Sstevel@tonic-gate /* Perform some range checking */ 28350Sstevel@tonic-gate if (setno >= MD_MAXSETS) { 28360Sstevel@tonic-gate *retval = MDMNE_EINVAL; 28370Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "suspend: returning MDMNE_EINVAL\n"); 28380Sstevel@tonic-gate return (retval); 28390Sstevel@tonic-gate } 28400Sstevel@tonic-gate 28410Sstevel@tonic-gate /* setno == MD_COMM_ALL_SETS means: we walk thru all possible sets. */ 28420Sstevel@tonic-gate if (setno == MD_COMM_ALL_SETS) { 28430Sstevel@tonic-gate startset = 1; 28440Sstevel@tonic-gate endset = MD_MAXSETS - 1; 28450Sstevel@tonic-gate } else { 28460Sstevel@tonic-gate startset = setno; 28470Sstevel@tonic-gate endset = setno; 28480Sstevel@tonic-gate } 28490Sstevel@tonic-gate 28500Sstevel@tonic-gate for (setno = startset; setno <= endset; setno++) { 28510Sstevel@tonic-gate /* Here we need the mutexes for the set to be setup */ 28520Sstevel@tonic-gate if (md_mn_set_inited[setno] != MDMN_SET_MUTEXES) { 28530Sstevel@tonic-gate (void) mdmn_init_set(setno, MDMN_SET_MUTEXES); 28540Sstevel@tonic-gate } 28550Sstevel@tonic-gate 28560Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 28570Sstevel@tonic-gate /* shall we drain all classes of this set? 
*/ 28580Sstevel@tonic-gate if (oclass == MD_COMM_ALL_CLASSES) { 28590Sstevel@tonic-gate for (class = 1; class < MD_MN_NCLASSES; class ++) { 28600Sstevel@tonic-gate commd_debug(MD_MMV_MISC, 28610Sstevel@tonic-gate "suspend: suspending set %d, class %d\n", 28620Sstevel@tonic-gate setno, class); 28630Sstevel@tonic-gate *retval = mdmn_mark_class_suspended(setno, 28640Sstevel@tonic-gate class, MDMN_SUSPEND_ALL); 28650Sstevel@tonic-gate if (*retval == MDMNE_SET_NOT_DRAINED) { 28660Sstevel@tonic-gate failure++; 28670Sstevel@tonic-gate } 28680Sstevel@tonic-gate } 28690Sstevel@tonic-gate } else { 28700Sstevel@tonic-gate /* only drain one specific class */ 28710Sstevel@tonic-gate commd_debug(MD_MMV_MISC, 28720Sstevel@tonic-gate "suspend: suspending set=%d class=%d\n", 28730Sstevel@tonic-gate setno, oclass); 28740Sstevel@tonic-gate *retval = mdmn_mark_class_suspended(setno, oclass, 28750Sstevel@tonic-gate MDMN_SUSPEND_1); 28760Sstevel@tonic-gate if (*retval == MDMNE_SET_NOT_DRAINED) { 28770Sstevel@tonic-gate failure++; 28780Sstevel@tonic-gate } 28790Sstevel@tonic-gate } 28800Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 28810Sstevel@tonic-gate } 28820Sstevel@tonic-gate /* If one or more sets are not entirely drained, failure is non-zero */ 28830Sstevel@tonic-gate if (failure != 0) { 28840Sstevel@tonic-gate *retval = MDMNE_SET_NOT_DRAINED; 28850Sstevel@tonic-gate commd_debug(MD_MMV_MISC, 28860Sstevel@tonic-gate "suspend: returning MDMNE_SET_NOT_DRAINED\n"); 28870Sstevel@tonic-gate } else { 28880Sstevel@tonic-gate *retval = MDMNE_ACK; 28890Sstevel@tonic-gate } 28900Sstevel@tonic-gate 28910Sstevel@tonic-gate return (retval); 28920Sstevel@tonic-gate } 28930Sstevel@tonic-gate 28940Sstevel@tonic-gate /* 28950Sstevel@tonic-gate * mdmn_comm_resume_svc_1(setno, class) 28960Sstevel@tonic-gate * 28970Sstevel@tonic-gate * Resume processing messages for a given set. 28980Sstevel@tonic-gate * This incorporates the repeal of a previous suspend operation. 
28990Sstevel@tonic-gate * 29000Sstevel@tonic-gate * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS 29010Sstevel@tonic-gate * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES 29020Sstevel@tonic-gate * 29030Sstevel@tonic-gate * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this 29040Sstevel@tonic-gate * one class as being resumed. 29050Sstevel@tonic-gate * 29060Sstevel@tonic-gate * If class _is_ MD_COMM_ALL_CLASSES we resume all classes of this set. 29070Sstevel@tonic-gate * 29080Sstevel@tonic-gate * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets. 29090Sstevel@tonic-gate * 29100Sstevel@tonic-gate * If both setno is MD_COMM_ALL_SETS and class is MD_COMM_ALL_CLASSES we also 29110Sstevel@tonic-gate * reset any ABORT flag from the global state. 29120Sstevel@tonic-gate * 29130Sstevel@tonic-gate * Returns: 29140Sstevel@tonic-gate * MDMNE_ACK on sucess (resuming an unlocked set is Ok) 29150Sstevel@tonic-gate * MDMNE_EINVAL if setno is out of range 29160Sstevel@tonic-gate * MDMNE_NOT_JOINED if the set is not yet initialized on this node 29170Sstevel@tonic-gate */ 29180Sstevel@tonic-gate /* ARGSUSED */ 29190Sstevel@tonic-gate int * 29200Sstevel@tonic-gate mdmn_comm_resume_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 29210Sstevel@tonic-gate { 29220Sstevel@tonic-gate int *retval; 29230Sstevel@tonic-gate set_t startset, endset; 29240Sstevel@tonic-gate set_t setno = msc->msc_set; 29250Sstevel@tonic-gate md_mn_msgclass_t oclass = msc->msc_class; 29260Sstevel@tonic-gate uint_t flags = msc->msc_flags; 29270Sstevel@tonic-gate md_mn_msgclass_t class; 29280Sstevel@tonic-gate 29290Sstevel@tonic-gate retval = Malloc(sizeof (int)); 29300Sstevel@tonic-gate 29310Sstevel@tonic-gate /* check if the global initialization is done */ 29320Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 29330Sstevel@tonic-gate global_init(); 29340Sstevel@tonic-gate } 29350Sstevel@tonic-gate 29360Sstevel@tonic-gate /* is this 
rpc request coming from the local node ? */ 29370Sstevel@tonic-gate if (check_license(rqstp, 0) == FALSE) { 29380Sstevel@tonic-gate xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 29390Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 29400Sstevel@tonic-gate return (retval); 29410Sstevel@tonic-gate } 29420Sstevel@tonic-gate 29430Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "resume: called for set=%d class=%d\n", 29440Sstevel@tonic-gate setno, oclass); 29450Sstevel@tonic-gate 29460Sstevel@tonic-gate /* Perform some range checking */ 29470Sstevel@tonic-gate if (setno > MD_MAXSETS) { 29480Sstevel@tonic-gate *retval = MDMNE_EINVAL; 29490Sstevel@tonic-gate return (retval); 29500Sstevel@tonic-gate } 29510Sstevel@tonic-gate 29520Sstevel@tonic-gate if (setno == MD_COMM_ALL_SETS) { 29530Sstevel@tonic-gate startset = 1; 29540Sstevel@tonic-gate endset = MD_MAXSETS - 1; 29550Sstevel@tonic-gate if (oclass == MD_COMM_ALL_CLASSES) { 29560Sstevel@tonic-gate /* This is the point where we "unabort" the commd */ 29570Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "resume: resetting ABORT\n"); 29580Sstevel@tonic-gate md_commd_global_state &= ~MD_CGS_ABORTED; 29590Sstevel@tonic-gate } 29600Sstevel@tonic-gate } else { 29610Sstevel@tonic-gate startset = setno; 29620Sstevel@tonic-gate endset = setno; 29630Sstevel@tonic-gate } 29640Sstevel@tonic-gate 29650Sstevel@tonic-gate for (setno = startset; setno <= endset; setno++) { 29660Sstevel@tonic-gate 29670Sstevel@tonic-gate /* Here we need the mutexes for the set to be setup */ 29680Sstevel@tonic-gate if ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0) { 29690Sstevel@tonic-gate (void) mdmn_init_set(setno, MDMN_SET_MUTEXES); 29700Sstevel@tonic-gate } 29710Sstevel@tonic-gate 29720Sstevel@tonic-gate mutex_lock(&mdmn_busy_mutex[setno]); 29730Sstevel@tonic-gate 29740Sstevel@tonic-gate if (oclass == MD_COMM_ALL_CLASSES) { 29750Sstevel@tonic-gate int end_class = 1; 29760Sstevel@tonic-gate /* 29770Sstevel@tonic-gate * When SUSPENDing all classes, we go 
29780Sstevel@tonic-gate * from 1 to MD_MN_NCLASSES-1 29790Sstevel@tonic-gate * The correct reverse action is RESUMing 29800Sstevel@tonic-gate * from MD_MN_NCLASSES-1 to 1 (or 2) 29810Sstevel@tonic-gate */ 29820Sstevel@tonic-gate 29830Sstevel@tonic-gate if (flags & MD_MSCF_DONT_RESUME_CLASS1) { 29840Sstevel@tonic-gate end_class = 2; 29850Sstevel@tonic-gate } 29860Sstevel@tonic-gate 29870Sstevel@tonic-gate /* 29880Sstevel@tonic-gate * Then mark all classes of this set as no longer 29890Sstevel@tonic-gate * suspended. This supersedes any previous suspend(1) 29900Sstevel@tonic-gate * calls and resumes the set entirely. 29910Sstevel@tonic-gate */ 29920Sstevel@tonic-gate for (class = MD_MN_NCLASSES - 1; class >= end_class; 29930Sstevel@tonic-gate class --) { 29940Sstevel@tonic-gate commd_debug(MD_MMV_MISC, 29950Sstevel@tonic-gate "resume: resuming set=%d class=%d\n", 29960Sstevel@tonic-gate setno, class); 29970Sstevel@tonic-gate mdmn_mark_class_resumed(setno, class, 29980Sstevel@tonic-gate (MDMN_SUSPEND_ALL | MDMN_SUSPEND_1)); 29990Sstevel@tonic-gate } 30000Sstevel@tonic-gate } else { 30010Sstevel@tonic-gate /* 30020Sstevel@tonic-gate * In this case only one class is marked as not 30030Sstevel@tonic-gate * suspended. If a suspend(all) is currently active for 30040Sstevel@tonic-gate * this set, this class will still be suspended. 
30050Sstevel@tonic-gate * That state will be cleared by a suspend(all) 30060Sstevel@tonic-gate * (see above) 30070Sstevel@tonic-gate */ 30080Sstevel@tonic-gate commd_debug(MD_MMV_MISC, 30090Sstevel@tonic-gate "resume: resuming set=%d class=%d\n", 30100Sstevel@tonic-gate setno, oclass); 30110Sstevel@tonic-gate mdmn_mark_class_resumed(setno, oclass, MDMN_SUSPEND_1); 30120Sstevel@tonic-gate } 30130Sstevel@tonic-gate 30140Sstevel@tonic-gate mutex_unlock(&mdmn_busy_mutex[setno]); 30150Sstevel@tonic-gate } 30160Sstevel@tonic-gate 30170Sstevel@tonic-gate *retval = MDMNE_ACK; 30180Sstevel@tonic-gate return (retval); 30190Sstevel@tonic-gate } 30200Sstevel@tonic-gate /* ARGSUSED */ 30210Sstevel@tonic-gate int * 30220Sstevel@tonic-gate mdmn_comm_reinit_set_svc_1(set_t *setnop, struct svc_req *rqstp) 30230Sstevel@tonic-gate { 30240Sstevel@tonic-gate int *retval; 30250Sstevel@tonic-gate md_mnnode_desc *node; 30260Sstevel@tonic-gate set_t setno = *setnop; 30270Sstevel@tonic-gate 30280Sstevel@tonic-gate retval = Malloc(sizeof (int)); 30290Sstevel@tonic-gate 30300Sstevel@tonic-gate /* check if the global initialization is done */ 30310Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 30320Sstevel@tonic-gate global_init(); 30330Sstevel@tonic-gate } 30340Sstevel@tonic-gate 30350Sstevel@tonic-gate /* is this rpc request coming from the local node ? */ 30360Sstevel@tonic-gate if (check_license(rqstp, 0) == FALSE) { 30370Sstevel@tonic-gate xdr_free(xdr_set_t, (caddr_t)setnop); 30380Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 30390Sstevel@tonic-gate return (retval); 30400Sstevel@tonic-gate } 30410Sstevel@tonic-gate 30420Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "reinit: set=%d\n", setno); 30430Sstevel@tonic-gate 30440Sstevel@tonic-gate rw_rdlock(&set_desc_rwlock[setno]); 30450Sstevel@tonic-gate /* 30460Sstevel@tonic-gate * We assume, that all messages have been suspended previously. 
30470Sstevel@tonic-gate * 30480Sstevel@tonic-gate * As we are modifying lots of clients here we grab the client_rwlock 30490Sstevel@tonic-gate * in writer mode. This ensures, no new messages come in. 30500Sstevel@tonic-gate */ 30510Sstevel@tonic-gate rw_wrlock(&client_rwlock[setno]); 30520Sstevel@tonic-gate /* This set is no longer initialized */ 30530Sstevel@tonic-gate 30540Sstevel@tonic-gate if ((set_descriptor[setno] != NULL) && 30550Sstevel@tonic-gate (md_mn_set_inited[setno] & MDMN_SET_NODES)) { 30560Sstevel@tonic-gate /* destroy all rpc clients from this set */ 30570Sstevel@tonic-gate for (node = set_descriptor[setno]->sd_nodelist; node; 30580Sstevel@tonic-gate node = node->nd_next) { 30590Sstevel@tonic-gate mdmn_clnt_destroy(client[setno][node->nd_nodeid]); 30600Sstevel@tonic-gate if (client[setno][node->nd_nodeid] != (CLIENT *)NULL) { 30610Sstevel@tonic-gate client[setno][node->nd_nodeid] = (CLIENT *)NULL; 30620Sstevel@tonic-gate } 30630Sstevel@tonic-gate } 30640Sstevel@tonic-gate md_mn_set_inited[setno] &= ~MDMN_SET_NODES; 30650Sstevel@tonic-gate } 30660Sstevel@tonic-gate 30670Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "reinit: done init_set(%d)\n", setno); 30680Sstevel@tonic-gate 30690Sstevel@tonic-gate rw_unlock(&client_rwlock[setno]); 30700Sstevel@tonic-gate rw_unlock(&set_desc_rwlock[setno]); 30710Sstevel@tonic-gate *retval = MDMNE_ACK; 30720Sstevel@tonic-gate return (retval); 30730Sstevel@tonic-gate } 30740Sstevel@tonic-gate 30750Sstevel@tonic-gate /* 30760Sstevel@tonic-gate * This is just an interface for testing purpose. 30770Sstevel@tonic-gate * Here we can disable single message types. 30780Sstevel@tonic-gate * If we block a message type, this is valid for all MN sets. 30790Sstevel@tonic-gate * If a message arrives later, and it's message type is blocked, it will 30800Sstevel@tonic-gate * be returned immediately with MDMNE_CLASS_LOCKED, which causes the sender to 30810Sstevel@tonic-gate * resend this message over and over again. 
30820Sstevel@tonic-gate */ 30830Sstevel@tonic-gate 30840Sstevel@tonic-gate /* ARGSUSED */ 30850Sstevel@tonic-gate int * 30860Sstevel@tonic-gate mdmn_comm_msglock_svc_1(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp) 30870Sstevel@tonic-gate { 30880Sstevel@tonic-gate int *retval; 30890Sstevel@tonic-gate md_mn_msgtype_t type = mmtl->mmtl_type; 30900Sstevel@tonic-gate uint_t lock = mmtl->mmtl_lock; 30910Sstevel@tonic-gate 30920Sstevel@tonic-gate retval = Malloc(sizeof (int)); 30930Sstevel@tonic-gate 30940Sstevel@tonic-gate /* check if the global initialization is done */ 30950Sstevel@tonic-gate if ((md_commd_global_state & MD_CGS_INITED) == 0) { 30960Sstevel@tonic-gate global_init(); 30970Sstevel@tonic-gate } 30980Sstevel@tonic-gate 30990Sstevel@tonic-gate /* is this rpc request coming from the local node ? */ 31000Sstevel@tonic-gate if (check_license(rqstp, 0) == FALSE) { 31010Sstevel@tonic-gate xdr_free(xdr_md_mn_type_and_lock_t, (caddr_t)mmtl); 31020Sstevel@tonic-gate *retval = MDMNE_RPC_FAIL; 31030Sstevel@tonic-gate return (retval); 31040Sstevel@tonic-gate } 31050Sstevel@tonic-gate 31060Sstevel@tonic-gate /* Perform some range checking */ 31070Sstevel@tonic-gate if ((type == 0) || (type >= MD_MN_NMESSAGES)) { 31080Sstevel@tonic-gate *retval = MDMNE_EINVAL; 31090Sstevel@tonic-gate return (retval); 31100Sstevel@tonic-gate } 31110Sstevel@tonic-gate 31120Sstevel@tonic-gate commd_debug(MD_MMV_MISC, "msglock: type=%d, lock=%d\n", type, lock); 31130Sstevel@tonic-gate msgtype_lock_state[type] = lock; 31140Sstevel@tonic-gate 31150Sstevel@tonic-gate *retval = MDMNE_ACK; 31160Sstevel@tonic-gate return (retval); 31170Sstevel@tonic-gate } 3118