xref: /onnv-gate/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * CDDL HEADER START
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*0Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*0Sstevel@tonic-gate  * with the License.
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*0Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*0Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*0Sstevel@tonic-gate  * and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*0Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*0Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*0Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*0Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*0Sstevel@tonic-gate  *
20*0Sstevel@tonic-gate  * CDDL HEADER END
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate /*
23*0Sstevel@tonic-gate  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24*0Sstevel@tonic-gate  * Use is subject to license terms.
25*0Sstevel@tonic-gate  */
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*0Sstevel@tonic-gate 
29*0Sstevel@tonic-gate #include <unistd.h>
30*0Sstevel@tonic-gate #include <sys/types.h>
31*0Sstevel@tonic-gate #include <sys/stat.h>
32*0Sstevel@tonic-gate #include <sys/statvfs.h>
33*0Sstevel@tonic-gate #include <sys/uadmin.h>
34*0Sstevel@tonic-gate #include <fcntl.h>
35*0Sstevel@tonic-gate #include <stdio.h>
36*0Sstevel@tonic-gate #include <thread.h>
37*0Sstevel@tonic-gate #include <meta.h>
38*0Sstevel@tonic-gate #include <sdssc.h>
39*0Sstevel@tonic-gate #include <mdmn_changelog.h>
40*0Sstevel@tonic-gate #include "mdmn_subr.h"
41*0Sstevel@tonic-gate 
42*0Sstevel@tonic-gate /*
43*0Sstevel@tonic-gate  * This is the communication daemon for SVM Multi Node Disksets.
44*0Sstevel@tonic-gate  * It runs on every node and provides the following rpc services:
45*0Sstevel@tonic-gate  *  - mdmn_send_svc_1
46*0Sstevel@tonic-gate  *  - mdmn_work_svc_1
47*0Sstevel@tonic-gate  *  - mdmn_wakeup_initiator_svc_1
48*0Sstevel@tonic-gate  *  - mdmn_wakeup_master_svc_1
49*0Sstevel@tonic-gate  *  - mdmn_comm_lock_svc_1
50*0Sstevel@tonic-gate  *  - mdmn_comm_unlock_svc_1
51*0Sstevel@tonic-gate  *  - mdmn_comm_suspend_svc_1
52*0Sstevel@tonic-gate  *  - mdmn_comm_resume_svc_1
53*0Sstevel@tonic-gate  *  - mdmn_comm_reinit_set_svc_1
54*0Sstevel@tonic-gate  * where send, lock, unlock and reinit are meant for external use,
55*0Sstevel@tonic-gate  * work and the two wakeups are for internal use only.
56*0Sstevel@tonic-gate  *
57*0Sstevel@tonic-gate  * NOTE:
58*0Sstevel@tonic-gate  * On every node only one of those xxx_1 functions can be active at the
59*0Sstevel@tonic-gate  * same time because the daemon is single threaded.
60*0Sstevel@tonic-gate  *
61*0Sstevel@tonic-gate  *
62*0Sstevel@tonic-gate  * In case an event occurs that has to be propagated to all the nodes...
63*0Sstevel@tonic-gate  *
64*0Sstevel@tonic-gate  * One node (the initiator)
65*0Sstevel@tonic-gate  *	calls the libmeta function mdmn_send_message()
66*0Sstevel@tonic-gate  *	This function calls the local daemon thru mdmn_send_svc_1.
67*0Sstevel@tonic-gate  *
68*0Sstevel@tonic-gate  * On the initiator:
69*0Sstevel@tonic-gate  *	mdmn_send_svc_1()
70*0Sstevel@tonic-gate  *	    - starts a thread -> mdmn_send_to_work() and returns.
71*0Sstevel@tonic-gate  *	mdmn_send_to_work()
72*0Sstevel@tonic-gate  *	    - sends this message over to the master of the diskset.
73*0Sstevel@tonic-gate  *	      This is done by calling mdmn_work_svc_1 on the master.
74*0Sstevel@tonic-gate  *	    - registers to the initiator_table
75*0Sstevel@tonic-gate  *	    - exits without doing a svc_sendreply() for the call to
76*0Sstevel@tonic-gate  *	      mdmn_send_svc_1. This means that call is blocked until somebody
77*0Sstevel@tonic-gate  *	      (see end of this comment) does a svc_sendreply().
78*0Sstevel@tonic-gate  *	      This means mdmn_send_message() does not yet return.
79*0Sstevel@tonic-gate  *	    - A timeout surveillance is started at this point.
 *	      This means in case the master doesn't reply at all in an
 *	      appropriate time, an error condition is returned
82*0Sstevel@tonic-gate  *	      to the caller.
83*0Sstevel@tonic-gate  *
84*0Sstevel@tonic-gate  * On the master:
85*0Sstevel@tonic-gate  *	mdmn_work_svc_1()
86*0Sstevel@tonic-gate  *	    - starts a thread -> mdmn_master_process_msg() and returns
87*0Sstevel@tonic-gate  *	mdmn_master_process_msg()
88*0Sstevel@tonic-gate  *	    - logs the message to the change log
89*0Sstevel@tonic-gate  *	    - executes the message locally
90*0Sstevel@tonic-gate  *	    - flags the message in the change log
91*0Sstevel@tonic-gate  *	    - sends the message to mdmn_work_svc_1() on all the
92*0Sstevel@tonic-gate  *	      other nodes (slaves)
93*0Sstevel@tonic-gate  *	      after each call to mdmn_work_svc_1 the thread goes to sleep and
94*0Sstevel@tonic-gate  *	      will be woken up by mdmn_wakeup_master_svc_1() as soon as the
95*0Sstevel@tonic-gate  *	      slave node is done with this message.
 *	    - In case the slave doesn't respond in an appropriate time, an
 *	      error is assumed to ensure the master doesn't wait forever.
98*0Sstevel@tonic-gate  *
99*0Sstevel@tonic-gate  * On a slave:
100*0Sstevel@tonic-gate  *	mdmn_work_svc_1()
101*0Sstevel@tonic-gate  *	    - starts a thread -> mdmn_slave_process_msg() and returns
102*0Sstevel@tonic-gate  *	mdmn_slave_process_msg()
103*0Sstevel@tonic-gate  *	    - processes this message locally by calling the appropriate message
104*0Sstevel@tonic-gate  *	      handler, that creates some result.
105*0Sstevel@tonic-gate  *	    - sends that result thru a call to mdmn_wakeup_master_svc_1() to
106*0Sstevel@tonic-gate  *	      the master.
107*0Sstevel@tonic-gate  *
108*0Sstevel@tonic-gate  * Back on the master:
109*0Sstevel@tonic-gate  *	mdmn_wakeup_master_svc_1()
110*0Sstevel@tonic-gate  *	    - stores the result into the master_table.
111*0Sstevel@tonic-gate  *	    - signals the mdmn_master_process_msg-thread.
112*0Sstevel@tonic-gate  *	    - returns
113*0Sstevel@tonic-gate  *	mdmn_master_process_msg()
114*0Sstevel@tonic-gate  *	    - after getting the results from all nodes
115*0Sstevel@tonic-gate  *	    - sends them back to the initiating node thru a call to
116*0Sstevel@tonic-gate  *	      mdmn_wakeup_initiator_svc_1.
117*0Sstevel@tonic-gate  *
118*0Sstevel@tonic-gate  * Back on the initiator:
119*0Sstevel@tonic-gate  *	mdmn_wakeup_initiator_svc_1()
120*0Sstevel@tonic-gate  *	    - calls svc_sendreply() which makes the call to mdmn_send_svc_1()
121*0Sstevel@tonic-gate  *	      return.
122*0Sstevel@tonic-gate  *	      which allows the initial mdmn_send_message() call to return.
123*0Sstevel@tonic-gate  */
124*0Sstevel@tonic-gate 
/*
 * Debug output stream of the commd.  NULL means "no debug file open";
 * flush_fcout(), FLUSH_DEBUGFILE() and setup_debug() all test for that.
 */
FILE *commdout;		/* debug output for the commd */
char *commdoutfile;	/* file name for the above output */
/*
 * want at least 10 MB free space when logging into a file
 * (checked periodically by flush_fcout())
 */
#define	MIN_FS_SPACE	(10LL * 1024 * 1024)

/*
 * Number of outstanding messages that were initiated by this node.
 * If zero, check_timeouts goes to sleep on check_timeout_cv.
 */
uint_t	messages_on_their_way;
mutex_t	check_timeout_mutex;	/* need mutex to protect above */
cond_t	check_timeout_cv;	/* trigger for check_timeouts */

/* for printing out time stamps; set from gethrtime() in global_init() */
hrtime_t __savetime;

/*
 * RPC clients for every set and every node and their protecting locks.
 * An entry is NULL until mdmn_init_client() has created the client.
 */
CLIENT	*client[MD_MAXSETS][NNODES];
rwlock_t client_rwlock[MD_MAXSETS];

/*
 * The descriptors of all possible sets and their protectors.
 * An entry is NULL until it has been filled in from metaget_setdesc().
 */
struct md_set_desc *set_descriptor[MD_MAXSETS];
rwlock_t set_desc_rwlock[MD_MAXSETS];

/*
 * The daemon to daemon communication has to timeout quickly.
 * Installed on every new client via clnt_control(CLSET_TIMEOUT).
 */
static struct timeval FOUR_SECS = { 4, 0 };

/*
 * These indicate if a set has already been setup.
 * check_timeouts() only looks at sets whose entry is MDMN_SET_READY.
 */
int md_mn_set_inited[MD_MAXSETS];

/* For every set we have a message completion table and protecting mutexes */
md_mn_mct_t *mct[MD_MAXSETS];
mutex_t	mct_mutex[MD_MAXSETS][MD_MN_NCLASSES];

/*
 * Stuff to describe the global status of the commd on one node.
 * MD_CGS_INITED is set at the end of global_init() so that the one-time
 * initialization is not repeated.
 */
#define	MD_CGS_INITED		0x0001
#define	MD_CGS_ABORTED		0x0002	/* return everything with MDMNE_ABORT */
uint_t md_commd_global_state = 0;	/* No state when starting up */

/*
 * Global verbosity level for the daemon.
 * Loaded by setup_debug() from commd_get_verbosity(); flush_fcout() drops
 * it to MD_MMV_NULL when the log filesystem runs short of space.
 */
uint_t md_commd_global_verb;

/*
 * libmeta doesn't like multiple threads in metaget_setdesc().
 * So we must protect access to it with a global lock
 */
mutex_t get_setdesc_mutex;

/*
 * Need a way to block single message types,
 * hence an array with a status for every message type
 */
uint_t msgtype_lock_state[MD_MN_NMESSAGES];

/* for reading in the config file */
#define	MAX_LINE_SIZE 1024

/*
 * Accessors for the debug settings; defined outside this part of the file.
 * setup_debug() treats a NULL outfile as "use the default" and free()s a
 * non-NULL one it cannot use, so the returned string is presumably
 * heap-allocated -- TODO confirm against the implementation.
 */
extern char *commd_get_outfile(void);
extern uint_t commd_get_verbosity(void);
187*0Sstevel@tonic-gate /*
188*0Sstevel@tonic-gate  * mdmn_clnt_create is a helper function for meta_client_create_retry.  It
189*0Sstevel@tonic-gate  * merely needs to call clnt_create_timed, and meta_client_create_retry
190*0Sstevel@tonic-gate  * will take care of the rest.
191*0Sstevel@tonic-gate  */
192*0Sstevel@tonic-gate /* ARGSUSED */
193*0Sstevel@tonic-gate static CLIENT *
194*0Sstevel@tonic-gate mdmn_clnt_create(char *ignore, void *data, struct timeval *time_out)
195*0Sstevel@tonic-gate {
196*0Sstevel@tonic-gate 	md_mnnode_desc	*node = (md_mnnode_desc *)data;
197*0Sstevel@tonic-gate 
198*0Sstevel@tonic-gate 	return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, ONE, "tcp",
199*0Sstevel@tonic-gate 		time_out));
200*0Sstevel@tonic-gate }
201*0Sstevel@tonic-gate 
/*
 * FLUSH_DEBUGFILE()
 *
 * If a debug output stream is open (commdout != NULL), flush the stdio
 * buffer and fsync() the underlying file descriptor.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and "FLUSH_DEBUGFILE();" can be used safely as the
 * body of an unbraced if/else.
 *
 * The results of fflush()/fsync() are deliberately ignored, this is a
 * best-effort flush.
 */
#define	FLUSH_DEBUGFILE() \
	do { \
		if (commdout != (FILE *)NULL) { \
			(void) fflush(commdout); \
			(void) fsync(fileno(commdout)); \
		} \
	} while (0)
207*0Sstevel@tonic-gate 
208*0Sstevel@tonic-gate static void
209*0Sstevel@tonic-gate panic_system(int nid, md_mn_msgtype_t type, int master_err, int master_exitval,
210*0Sstevel@tonic-gate     md_mn_result_t *slave_result)
211*0Sstevel@tonic-gate {
212*0Sstevel@tonic-gate 	md_mn_commd_err_t	commd_err;
213*0Sstevel@tonic-gate 	md_error_t		mne = mdnullerror;
214*0Sstevel@tonic-gate 	char			*msg_buf;
215*0Sstevel@tonic-gate 
216*0Sstevel@tonic-gate 	msg_buf = (char *)calloc(MAXPATHLEN + 1, sizeof (char));
217*0Sstevel@tonic-gate 
218*0Sstevel@tonic-gate 	FLUSH_DEBUGFILE();
219*0Sstevel@tonic-gate 
220*0Sstevel@tonic-gate 	if (master_err != MDMNE_ACK) {
221*0Sstevel@tonic-gate 		snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on master "
222*0Sstevel@tonic-gate 			"when processing message type %d\n", type);
223*0Sstevel@tonic-gate 	} else if (slave_result == NULL) {
224*0Sstevel@tonic-gate 		snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on node "
225*0Sstevel@tonic-gate 			"%d when processing message type %d\n", nid, type);
226*0Sstevel@tonic-gate 	} else {
227*0Sstevel@tonic-gate 		snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: Inconsistent "
228*0Sstevel@tonic-gate 			"return value from node %d when processing message "
229*0Sstevel@tonic-gate 			"type %d. Master exitval = %d, Slave exitval = %d\n",
230*0Sstevel@tonic-gate 			nid, type, master_exitval, slave_result->mmr_exitval);
231*0Sstevel@tonic-gate 	}
232*0Sstevel@tonic-gate 	commd_err.size = strlen(msg_buf);
233*0Sstevel@tonic-gate 	commd_err.md_message = (uint64_t)&msg_buf[0];
234*0Sstevel@tonic-gate 
235*0Sstevel@tonic-gate 	metaioctl(MD_MN_COMMD_ERR, &commd_err, &mne, "rpc.mdcommd");
236*0Sstevel@tonic-gate 	(void) uadmin(A_DUMP, AD_BOOT, NULL);
237*0Sstevel@tonic-gate }
238*0Sstevel@tonic-gate 
239*0Sstevel@tonic-gate static void
240*0Sstevel@tonic-gate flush_fcout()
241*0Sstevel@tonic-gate {
242*0Sstevel@tonic-gate 	struct statvfs64 vfsbuf;
243*0Sstevel@tonic-gate 	long long avail_bytes;
244*0Sstevel@tonic-gate 	int warned = 0;
245*0Sstevel@tonic-gate 
246*0Sstevel@tonic-gate 	for (; ; ) {
247*0Sstevel@tonic-gate 		sleep(10);
248*0Sstevel@tonic-gate 		/* No output file, nothing to do */
249*0Sstevel@tonic-gate 		if (commdout == (FILE *)NULL)
250*0Sstevel@tonic-gate 			continue;
251*0Sstevel@tonic-gate 
252*0Sstevel@tonic-gate 		/*
253*0Sstevel@tonic-gate 		 * stat the appropriate filesystem to check for available space.
254*0Sstevel@tonic-gate 		 */
255*0Sstevel@tonic-gate 		if (statvfs64(commdoutfile, &vfsbuf)) {
256*0Sstevel@tonic-gate 			continue;
257*0Sstevel@tonic-gate 		}
258*0Sstevel@tonic-gate 
259*0Sstevel@tonic-gate 		avail_bytes = vfsbuf.f_frsize * vfsbuf.f_bavail;
260*0Sstevel@tonic-gate 		/*
261*0Sstevel@tonic-gate 		 * If we don't have enough space, we print out a warning.
262*0Sstevel@tonic-gate 		 * And we drop the verbosity level to NULL
263*0Sstevel@tonic-gate 		 * In case the condtion doesn't go away, we don't repeat
264*0Sstevel@tonic-gate 		 * the warning.
265*0Sstevel@tonic-gate 		 */
266*0Sstevel@tonic-gate 		if (avail_bytes < MIN_FS_SPACE) {
267*0Sstevel@tonic-gate 			if (warned) {
268*0Sstevel@tonic-gate 				continue;
269*0Sstevel@tonic-gate 			}
270*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SYSLOG,
271*0Sstevel@tonic-gate 			    "NOT enough space available for logging\n");
272*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SYSLOG,
273*0Sstevel@tonic-gate 			    "Have %lld bytes, need %lld bytes\n",
274*0Sstevel@tonic-gate 			    avail_bytes, MIN_FS_SPACE);
275*0Sstevel@tonic-gate 			warned = 1;
276*0Sstevel@tonic-gate 			md_commd_global_verb = MD_MMV_NULL;
277*0Sstevel@tonic-gate 		} else {
278*0Sstevel@tonic-gate 			warned = 0;
279*0Sstevel@tonic-gate 		}
280*0Sstevel@tonic-gate 
281*0Sstevel@tonic-gate 		fflush(commdout);
282*0Sstevel@tonic-gate 	}
283*0Sstevel@tonic-gate }
284*0Sstevel@tonic-gate 
/*
 * Safer version of clnt_destroy. If clnt is NULL don't do anything.
 *
 * Wrapped in do { } while (0) so that "mdmn_clnt_destroy(x);" is a single
 * statement and can be used as the body of an unbraced if/else.
 * Note that clnt is evaluated more than once; don't pass an expression
 * with side effects.
 */
#define	mdmn_clnt_destroy(clnt) \
	do { \
		if ((clnt) != NULL) { \
			clnt_destroy(clnt); \
		} \
	} while (0)
290*0Sstevel@tonic-gate 
291*0Sstevel@tonic-gate /*
292*0Sstevel@tonic-gate  * Own version of svc_sendreply that checks the integrity of the transport
293*0Sstevel@tonic-gate  * handle and so prevents us from core dumps in the real svc_sendreply()
294*0Sstevel@tonic-gate  */
295*0Sstevel@tonic-gate void
296*0Sstevel@tonic-gate mdmn_svc_sendreply(SVCXPRT *transp, xdrproc_t xdr, caddr_t data)
297*0Sstevel@tonic-gate {
298*0Sstevel@tonic-gate 	if (SVC_STAT(transp) == XPRT_DIED) {
299*0Sstevel@tonic-gate 		commd_debug(MD_MMV_MISC,
300*0Sstevel@tonic-gate 		    "mdmn_svc_sendreply: XPRT_DIED\n");
301*0Sstevel@tonic-gate 		return;
302*0Sstevel@tonic-gate 	}
303*0Sstevel@tonic-gate 	(void) svc_sendreply(transp, xdr, data);
304*0Sstevel@tonic-gate }
305*0Sstevel@tonic-gate 
306*0Sstevel@tonic-gate /*
307*0Sstevel@tonic-gate  * timeout_initiator(set, class)
308*0Sstevel@tonic-gate  *
309*0Sstevel@tonic-gate  * Alas, I sent a message and didn't get a response back in aproppriate time.
310*0Sstevel@tonic-gate  *
311*0Sstevel@tonic-gate  * timeout_initiator() takes care for doing the needed svc_sendreply() to the
312*0Sstevel@tonic-gate  * calling mdmn_send_message, so that guy doesn't wait forever
313*0Sstevel@tonic-gate  * What is done here is pretty much the same as what is done in
314*0Sstevel@tonic-gate  * wakeup initiator. The difference is that we cannot provide for any results,
315*0Sstevel@tonic-gate  * of course and we set the comm_state to MDMNE_TIMEOUT.
316*0Sstevel@tonic-gate  *
317*0Sstevel@tonic-gate  * By doing so, mdmn_send_message can decide if a retry would make sense or not.
318*0Sstevel@tonic-gate  * It's not our's to decide that here.
319*0Sstevel@tonic-gate  */
320*0Sstevel@tonic-gate void
321*0Sstevel@tonic-gate timeout_initiator(set_t setno, md_mn_msgclass_t class)
322*0Sstevel@tonic-gate {
323*0Sstevel@tonic-gate 	SVCXPRT		*transp;
324*0Sstevel@tonic-gate 	md_mn_msgid_t	mid;
325*0Sstevel@tonic-gate 	md_mn_result_t *resultp;
326*0Sstevel@tonic-gate 
327*0Sstevel@tonic-gate 	resultp = Zalloc(sizeof (md_mn_result_t));
328*0Sstevel@tonic-gate 	resultp->mmr_comm_state	= MDMNE_TIMEOUT;
329*0Sstevel@tonic-gate 
330*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC,
331*0Sstevel@tonic-gate 	    "timeout_initiator set = %d, class = %d\n", setno, class);
332*0Sstevel@tonic-gate 
333*0Sstevel@tonic-gate 	transp = mdmn_get_initiator_table_transp(setno, class);
334*0Sstevel@tonic-gate 	mdmn_get_initiator_table_id(setno, class, &mid);
335*0Sstevel@tonic-gate 
336*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n",
337*0Sstevel@tonic-gate 	    MSGID_ELEMS(mid));
338*0Sstevel@tonic-gate 
339*0Sstevel@tonic-gate 	/* return to mdmn_send_message() and let it deal with the situation */
340*0Sstevel@tonic-gate 	mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
341*0Sstevel@tonic-gate 
342*0Sstevel@tonic-gate 	free(resultp);
343*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n");
344*0Sstevel@tonic-gate 	mdmn_unregister_initiator_table(setno, class);
345*0Sstevel@tonic-gate }
346*0Sstevel@tonic-gate 
347*0Sstevel@tonic-gate 
348*0Sstevel@tonic-gate /*
349*0Sstevel@tonic-gate  * check_timeouts - thread
350*0Sstevel@tonic-gate  *
351*0Sstevel@tonic-gate  * This implements a timeout surveillance for messages sent from the
352*0Sstevel@tonic-gate  * initiator to the master.
353*0Sstevel@tonic-gate  *
354*0Sstevel@tonic-gate  * If a message is started, this thread is triggered thru
355*0Sstevel@tonic-gate  * cond_signal(&check_timeout_cv) and we keep track of the numbers of
356*0Sstevel@tonic-gate  * messages that are outstanding (messages_on_their_way).
357*0Sstevel@tonic-gate  *
358*0Sstevel@tonic-gate  * As long as there are messages on their way, this thread never goes to sleep.
359*0Sstevel@tonic-gate  * It'll keep checking all class/set combinations for outstanding messages.
360*0Sstevel@tonic-gate  * If one is found, it's checked if this message is overdue. In that case,
361*0Sstevel@tonic-gate  * timeout_initiator() is called to wakeup the calling mdmn_send_message and
362*0Sstevel@tonic-gate  * to clean up the mess.
363*0Sstevel@tonic-gate  *
364*0Sstevel@tonic-gate  * If the result from the master arrives later, this message is considered
365*0Sstevel@tonic-gate  * to be unsolicited. And will be ignored.
366*0Sstevel@tonic-gate  */
367*0Sstevel@tonic-gate 
368*0Sstevel@tonic-gate void
369*0Sstevel@tonic-gate check_timeouts()
370*0Sstevel@tonic-gate {
371*0Sstevel@tonic-gate 	set_t			setno;
372*0Sstevel@tonic-gate 	time_t			now, then;
373*0Sstevel@tonic-gate 	mutex_t			*mx;
374*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
375*0Sstevel@tonic-gate 
376*0Sstevel@tonic-gate 	for (; ; ) {
377*0Sstevel@tonic-gate 		now = time((time_t *)NULL);
378*0Sstevel@tonic-gate 		for (setno = 1; setno < MD_MAXSETS; setno++) {
379*0Sstevel@tonic-gate 			if (md_mn_set_inited[setno] != MDMN_SET_READY) {
380*0Sstevel@tonic-gate 				continue;
381*0Sstevel@tonic-gate 			}
382*0Sstevel@tonic-gate 			for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES;
383*0Sstevel@tonic-gate 			    class++) {
384*0Sstevel@tonic-gate 				mx = mdmn_get_initiator_table_mx(setno, class);
385*0Sstevel@tonic-gate 				mutex_lock(mx);
386*0Sstevel@tonic-gate 
387*0Sstevel@tonic-gate 				/* then is the registered time */
388*0Sstevel@tonic-gate 				then =
389*0Sstevel@tonic-gate 				    mdmn_get_initiator_table_time(setno, class);
390*0Sstevel@tonic-gate 				if ((then != 0) && (now > then)) {
391*0Sstevel@tonic-gate 					timeout_initiator(setno, class);
392*0Sstevel@tonic-gate 				}
393*0Sstevel@tonic-gate 				mutex_unlock(mx);
394*0Sstevel@tonic-gate 			}
395*0Sstevel@tonic-gate 		}
396*0Sstevel@tonic-gate 		/* it's ok to check only once per second */
397*0Sstevel@tonic-gate 		sleep(1);
398*0Sstevel@tonic-gate 
399*0Sstevel@tonic-gate 		/* is there work to do? */
400*0Sstevel@tonic-gate 		mutex_lock(&check_timeout_mutex);
401*0Sstevel@tonic-gate 		if (messages_on_their_way == 0) {
402*0Sstevel@tonic-gate 			cond_wait(&check_timeout_cv, &check_timeout_mutex);
403*0Sstevel@tonic-gate 		}
404*0Sstevel@tonic-gate 		mutex_unlock(&check_timeout_mutex);
405*0Sstevel@tonic-gate 	}
406*0Sstevel@tonic-gate }
407*0Sstevel@tonic-gate 
408*0Sstevel@tonic-gate void
409*0Sstevel@tonic-gate setup_debug(void)
410*0Sstevel@tonic-gate {
411*0Sstevel@tonic-gate 	char	*tmp_dir;
412*0Sstevel@tonic-gate 
413*0Sstevel@tonic-gate 	/* Read in the debug-controlling tokens from runtime.cf */
414*0Sstevel@tonic-gate 	md_commd_global_verb = commd_get_verbosity();
415*0Sstevel@tonic-gate 	/*
416*0Sstevel@tonic-gate 	 * If the user didn't specify a verbosity level in runtime.cf
417*0Sstevel@tonic-gate 	 * we can safely return here. As we don't intend to printout
418*0Sstevel@tonic-gate 	 * debug messages, we don't need to check for the output file.
419*0Sstevel@tonic-gate 	 */
420*0Sstevel@tonic-gate 	if (md_commd_global_verb == 0) {
421*0Sstevel@tonic-gate 		return;
422*0Sstevel@tonic-gate 	}
423*0Sstevel@tonic-gate 
424*0Sstevel@tonic-gate 	/* if commdout is non-NULL it is an open FILE, we'd better close it */
425*0Sstevel@tonic-gate 	if (commdout != (FILE *)NULL) {
426*0Sstevel@tonic-gate 		fclose(commdout);
427*0Sstevel@tonic-gate 	}
428*0Sstevel@tonic-gate 
429*0Sstevel@tonic-gate 	commdoutfile = commd_get_outfile();
430*0Sstevel@tonic-gate 
431*0Sstevel@tonic-gate 	/* setup the debug output */
432*0Sstevel@tonic-gate 	if (commdoutfile == (char *)NULL) {
433*0Sstevel@tonic-gate 		/* if no valid file was specified, use the default */
434*0Sstevel@tonic-gate 		commdoutfile = "/var/run/commd.out";
435*0Sstevel@tonic-gate 		commdout = fopen(commdoutfile, "a");
436*0Sstevel@tonic-gate 	} else {
437*0Sstevel@tonic-gate 		/* check if the directory exists and is writable */
438*0Sstevel@tonic-gate 		tmp_dir = strdup(commdoutfile);
439*0Sstevel@tonic-gate 		if ((access(dirname(tmp_dir), X_OK|W_OK)) ||
440*0Sstevel@tonic-gate 		    ((commdout = fopen(commdoutfile, "a")) == (FILE *)NULL)) {
441*0Sstevel@tonic-gate 			syslog(LOG_ERR,
442*0Sstevel@tonic-gate 			    "Can't write to specified output file %s,\n"
443*0Sstevel@tonic-gate 			    "using /var/run/commd.out instead\n", commdoutfile);
444*0Sstevel@tonic-gate 			free(commdoutfile);
445*0Sstevel@tonic-gate 			commdoutfile = "/var/run/commd.out";
446*0Sstevel@tonic-gate 			commdout = fopen(commdoutfile, "a");
447*0Sstevel@tonic-gate 		}
448*0Sstevel@tonic-gate 		free(tmp_dir);
449*0Sstevel@tonic-gate 	}
450*0Sstevel@tonic-gate 
451*0Sstevel@tonic-gate 	if (commdout == (FILE *)NULL) {
452*0Sstevel@tonic-gate 		syslog(LOG_ERR, "Can't write to debug output file %s\n",
453*0Sstevel@tonic-gate 		    commdoutfile);
454*0Sstevel@tonic-gate 	}
455*0Sstevel@tonic-gate }
456*0Sstevel@tonic-gate /*
457*0Sstevel@tonic-gate  * global_init()
458*0Sstevel@tonic-gate  *
459*0Sstevel@tonic-gate  * Perform some global initializations.
460*0Sstevel@tonic-gate  *
461*0Sstevel@tonic-gate  * the following routines have to call this before operation can start:
462*0Sstevel@tonic-gate  *  - mdmn_send_svc_1
463*0Sstevel@tonic-gate  *  - mdmn_work_svc_1
464*0Sstevel@tonic-gate  *  - mdmn_comm_lock_svc_1
465*0Sstevel@tonic-gate  *  - mdmn_comm_unlock_svc_1
466*0Sstevel@tonic-gate  *  - mdmn_comm_suspend_svc_1
467*0Sstevel@tonic-gate  *  - mdmn_comm_resume_svc_1
468*0Sstevel@tonic-gate  *  - mdmn_comm_reinit_set_svc_1
469*0Sstevel@tonic-gate  *
470*0Sstevel@tonic-gate  * This is a single threaded daemon, so it can only be in one of the above
471*0Sstevel@tonic-gate  * routines at the same time.
472*0Sstevel@tonic-gate  * This means, global_init() cannot be called more than once at the same time.
473*0Sstevel@tonic-gate  * Hence, no lock is needed.
474*0Sstevel@tonic-gate  */
475*0Sstevel@tonic-gate void
476*0Sstevel@tonic-gate global_init(void)
477*0Sstevel@tonic-gate {
478*0Sstevel@tonic-gate 	set_t			set;
479*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
480*0Sstevel@tonic-gate 	struct sigaction	sighandler;
481*0Sstevel@tonic-gate 	time_t			clock_val;
482*0Sstevel@tonic-gate 
483*0Sstevel@tonic-gate 	/* Do these global initializations only once */
484*0Sstevel@tonic-gate 	if (md_commd_global_state & MD_CGS_INITED) {
485*0Sstevel@tonic-gate 		return;
486*0Sstevel@tonic-gate 	}
487*0Sstevel@tonic-gate 	(void) sdssc_bind_library();
488*0Sstevel@tonic-gate 
489*0Sstevel@tonic-gate 	/* setup the debug options from the config file */
490*0Sstevel@tonic-gate 	setup_debug();
491*0Sstevel@tonic-gate 
492*0Sstevel@tonic-gate 	/* Make setup_debug() be the action in case of SIGHUP */
493*0Sstevel@tonic-gate 	sighandler.sa_flags = 0;
494*0Sstevel@tonic-gate 	sigfillset(&sighandler.sa_mask);
495*0Sstevel@tonic-gate 	sighandler.sa_handler = (void (*)(int)) setup_debug;
496*0Sstevel@tonic-gate 	sigaction(SIGHUP, &sighandler, NULL);
497*0Sstevel@tonic-gate 
498*0Sstevel@tonic-gate 	__savetime = gethrtime();
499*0Sstevel@tonic-gate 	(void) time(&clock_val);
500*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "global init called %s\n",
501*0Sstevel@tonic-gate 			ctime(&clock_val));
502*0Sstevel@tonic-gate 
503*0Sstevel@tonic-gate 	/* start a thread that flushes out the debug on a regular basis */
504*0Sstevel@tonic-gate 	thr_create(NULL, 0, (void *(*)(void *))flush_fcout,
505*0Sstevel@tonic-gate 	    (void *) NULL, THR_DETACHED, NULL);
506*0Sstevel@tonic-gate 
507*0Sstevel@tonic-gate 	/* global rwlock's / mutex's / cond_t's go here */
508*0Sstevel@tonic-gate 	mutex_init(&check_timeout_mutex, USYNC_THREAD, NULL);
509*0Sstevel@tonic-gate 	cond_init(&check_timeout_cv, USYNC_THREAD, NULL);
510*0Sstevel@tonic-gate 	mutex_init(&get_setdesc_mutex, USYNC_THREAD, NULL);
511*0Sstevel@tonic-gate 
512*0Sstevel@tonic-gate 	/* Make sure the initiator table is initialized correctly */
513*0Sstevel@tonic-gate 	for (set = 0; set < MD_MAXSETS; set++) {
514*0Sstevel@tonic-gate 		for (class = 0; class < MD_MN_NCLASSES; class++) {
515*0Sstevel@tonic-gate 			mdmn_unregister_initiator_table(set, class);
516*0Sstevel@tonic-gate 		}
517*0Sstevel@tonic-gate 	}
518*0Sstevel@tonic-gate 
519*0Sstevel@tonic-gate 
520*0Sstevel@tonic-gate 	/* setup the check for timeouts */
521*0Sstevel@tonic-gate 	thr_create(NULL, 0, (void *(*)(void *))check_timeouts,
522*0Sstevel@tonic-gate 	    (void *) NULL, THR_DETACHED, NULL);
523*0Sstevel@tonic-gate 
524*0Sstevel@tonic-gate 	md_commd_global_state |= MD_CGS_INITED;
525*0Sstevel@tonic-gate }
526*0Sstevel@tonic-gate 
527*0Sstevel@tonic-gate 
528*0Sstevel@tonic-gate /*
529*0Sstevel@tonic-gate  * mdmn_init_client(setno, nodeid)
530*0Sstevel@tonic-gate  * called if client[setno][nodeid] is NULL
531*0Sstevel@tonic-gate  *
532*0Sstevel@tonic-gate  * NOTE: Must be called with set_desc_rwlock held as a reader
533*0Sstevel@tonic-gate  * NOTE: Must be called with client_rwlock held as a writer
534*0Sstevel@tonic-gate  *
535*0Sstevel@tonic-gate  * If the rpc client for this node has not been setup for any set, we do it now.
536*0Sstevel@tonic-gate  *
537*0Sstevel@tonic-gate  * Returns	0 on success (node found in set, rpc client setup)
538*0Sstevel@tonic-gate  *		-1 if metaget_setdesc failed,
539*0Sstevel@tonic-gate  *		-2 if node not part of set
540*0Sstevel@tonic-gate  *		-3 if clnt_create fails
541*0Sstevel@tonic-gate  */
542*0Sstevel@tonic-gate static int
543*0Sstevel@tonic-gate mdmn_init_client(set_t setno, md_mn_nodeid_t nid)
544*0Sstevel@tonic-gate {
545*0Sstevel@tonic-gate 	md_error_t	ep = mdnullerror;
546*0Sstevel@tonic-gate 	md_mnnode_desc	*node;
547*0Sstevel@tonic-gate 	md_set_desc	*sd;	/* just an abbr for set_descriptor[setno] */
548*0Sstevel@tonic-gate 
549*0Sstevel@tonic-gate 	sd = set_descriptor[setno];
550*0Sstevel@tonic-gate 
551*0Sstevel@tonic-gate 	/*
552*0Sstevel@tonic-gate 	 * Is the appropriate set_descriptor already initialized ?
553*0Sstevel@tonic-gate 	 * Can't think of a scenario where this is not the case, but we'd better
554*0Sstevel@tonic-gate 	 * check for it anyway.
555*0Sstevel@tonic-gate 	 */
556*0Sstevel@tonic-gate 	if (sd == NULL) {
557*0Sstevel@tonic-gate 		mdsetname_t	*sp;
558*0Sstevel@tonic-gate 
559*0Sstevel@tonic-gate 		rw_unlock(&set_desc_rwlock[setno]); /* readlock -> writelock */
560*0Sstevel@tonic-gate 		rw_wrlock(&set_desc_rwlock[setno]);
561*0Sstevel@tonic-gate 		sp = metasetnosetname(setno, &ep);
562*0Sstevel@tonic-gate 		/* Only one thread is supposed to be in metaget_setdesc() */
563*0Sstevel@tonic-gate 		mutex_lock(&get_setdesc_mutex);
564*0Sstevel@tonic-gate 		sd = metaget_setdesc(sp, &ep);
565*0Sstevel@tonic-gate 		mutex_unlock(&get_setdesc_mutex);
566*0Sstevel@tonic-gate 		if (sd == NULL) {
567*0Sstevel@tonic-gate 			rw_unlock(&set_desc_rwlock[setno]); /* back to ... */
568*0Sstevel@tonic-gate 			rw_rdlock(&set_desc_rwlock[setno]); /* ... readlock */
569*0Sstevel@tonic-gate 			return (-1);
570*0Sstevel@tonic-gate 		}
571*0Sstevel@tonic-gate 		set_descriptor[setno] = sd;
572*0Sstevel@tonic-gate 		rw_unlock(&set_desc_rwlock[setno]); /* back to readlock */
573*0Sstevel@tonic-gate 		rw_rdlock(&set_desc_rwlock[setno]);
574*0Sstevel@tonic-gate 	}
575*0Sstevel@tonic-gate 
576*0Sstevel@tonic-gate 	/* first we have to find the node name for this node id */
577*0Sstevel@tonic-gate 	for (node = sd->sd_nodelist; node; node = node->nd_next) {
578*0Sstevel@tonic-gate 		if (node->nd_nodeid == nid)
579*0Sstevel@tonic-gate 			break; /* we found our node in this set */
580*0Sstevel@tonic-gate 	}
581*0Sstevel@tonic-gate 
582*0Sstevel@tonic-gate 
583*0Sstevel@tonic-gate 	if (node == (md_mnnode_desc *)NULL) {
584*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SYSLOG,
585*0Sstevel@tonic-gate 		    "FATAL: node %d not found in set %d\n", nid, setno);
586*0Sstevel@tonic-gate 		rw_unlock(&set_desc_rwlock[setno]);
587*0Sstevel@tonic-gate 		return (-2);
588*0Sstevel@tonic-gate 	}
589*0Sstevel@tonic-gate 
590*0Sstevel@tonic-gate 	commd_debug(MD_MMV_INIT, "init: %s has the flags: 0x%x\n",
591*0Sstevel@tonic-gate 	    node->nd_nodename ? node->nd_nodename : "NULL", node->nd_flags);
592*0Sstevel@tonic-gate 
593*0Sstevel@tonic-gate 	/* Did this node join the diskset?  */
594*0Sstevel@tonic-gate 	if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
595*0Sstevel@tonic-gate 		commd_debug(MD_MMV_INIT, "init: %s didn't join set %d\n",
596*0Sstevel@tonic-gate 		    node->nd_nodename ? node->nd_nodename : "NULL", setno);
597*0Sstevel@tonic-gate 		rw_unlock(&set_desc_rwlock[setno]);
598*0Sstevel@tonic-gate 		return (-2);
599*0Sstevel@tonic-gate 	}
600*0Sstevel@tonic-gate 
601*0Sstevel@tonic-gate 	/* if clnt_create has not been done for that node, do it now */
602*0Sstevel@tonic-gate 	if (client[setno][nid] == (CLIENT *) NULL) {
603*0Sstevel@tonic-gate 		client[setno][nid] = meta_client_create_retry(node->nd_nodename,
604*0Sstevel@tonic-gate 			mdmn_clnt_create, (void *) node, MD_CLNT_CREATE_TOUT,
605*0Sstevel@tonic-gate 			&ep);
606*0Sstevel@tonic-gate 		if (client[setno][nid] == (CLIENT *) NULL) {
607*0Sstevel@tonic-gate 			clnt_pcreateerror(node->nd_nodename);
608*0Sstevel@tonic-gate 			rw_unlock(&set_desc_rwlock[setno]);
609*0Sstevel@tonic-gate 			return (-3);
610*0Sstevel@tonic-gate 		}
611*0Sstevel@tonic-gate 		/* this node has the license to send */
612*0Sstevel@tonic-gate 		commd_debug(MD_MMV_MISC, "init_client: calling add_lic\n");
613*0Sstevel@tonic-gate 		add_license(node);
614*0Sstevel@tonic-gate 
615*0Sstevel@tonic-gate 		/* set the timeout value */
616*0Sstevel@tonic-gate 		clnt_control(client[setno][nid], CLSET_TIMEOUT,
617*0Sstevel@tonic-gate 		    (char *)&FOUR_SECS);
618*0Sstevel@tonic-gate 
619*0Sstevel@tonic-gate 	}
620*0Sstevel@tonic-gate 	rw_unlock(&set_desc_rwlock[setno]);
621*0Sstevel@tonic-gate 	return (0);
622*0Sstevel@tonic-gate }
623*0Sstevel@tonic-gate 
624*0Sstevel@tonic-gate /*
625*0Sstevel@tonic-gate  * check_client(setno, nodeid)
626*0Sstevel@tonic-gate  *
627*0Sstevel@tonic-gate  * must be called with reader lock held for set_desc_rwlock[setno]
628*0Sstevel@tonic-gate  * and must be called with reader lock held for client_rwlock[setno]
629*0Sstevel@tonic-gate  * Checks if the client for this set/node combination is already setup
630*0Sstevel@tonic-gate  * if not it upgrades the lock to a writer lock
631*0Sstevel@tonic-gate  * and tries to initialize the client.
632*0Sstevel@tonic-gate  * Finally it's checked if the client nulled out again due to some race
633*0Sstevel@tonic-gate  *
634*0Sstevel@tonic-gate  * returns 0 if there is a usable client
635*0Sstevel@tonic-gate  * returns MDMNE_RPC_FAIL otherwise
636*0Sstevel@tonic-gate  */
637*0Sstevel@tonic-gate static int
638*0Sstevel@tonic-gate check_client(set_t setno, md_mn_nodeid_t nodeid)
639*0Sstevel@tonic-gate {
640*0Sstevel@tonic-gate 	int ret = 0;
641*0Sstevel@tonic-gate 
642*0Sstevel@tonic-gate 	while ((client[setno][nodeid] == (CLIENT *)NULL) && (ret == 0)) {
643*0Sstevel@tonic-gate 		rw_unlock(&client_rwlock[setno]); /* upgrade reader ... */
644*0Sstevel@tonic-gate 		rw_wrlock(&client_rwlock[setno]); /* ... to writer lock. */
645*0Sstevel@tonic-gate 		if (mdmn_init_client(setno, nodeid) != 0) {
646*0Sstevel@tonic-gate 			ret = MDMNE_RPC_FAIL;
647*0Sstevel@tonic-gate 		}
648*0Sstevel@tonic-gate 		rw_unlock(&client_rwlock[setno]); /* downgrade writer ... */
649*0Sstevel@tonic-gate 		rw_rdlock(&client_rwlock[setno]); /* ... back to reader lock. */
650*0Sstevel@tonic-gate 	}
651*0Sstevel@tonic-gate 	return (ret);
652*0Sstevel@tonic-gate }
653*0Sstevel@tonic-gate 
654*0Sstevel@tonic-gate /*
655*0Sstevel@tonic-gate  * mdmn_init_set(setno, todo)
656*0Sstevel@tonic-gate  * setno is the number of the set to be initialized.
657*0Sstevel@tonic-gate  * todo is one of the MDMN_SET_* thingies or MDMN_SET_READY
658*0Sstevel@tonic-gate  * If called with MDMN_SET_READY everything is initialized.
659*0Sstevel@tonic-gate  *
660*0Sstevel@tonic-gate  * If the set mutexes are already initialized, the caller has to hold
661*0Sstevel@tonic-gate  * both set_desc_rwlock[setno] and client_rwlock[setno] as a writer, before
662*0Sstevel@tonic-gate  * calling mdmn_init_set()
663*0Sstevel@tonic-gate  */
664*0Sstevel@tonic-gate int
665*0Sstevel@tonic-gate mdmn_init_set(set_t setno, int todo)
666*0Sstevel@tonic-gate {
667*0Sstevel@tonic-gate 	int class;
668*0Sstevel@tonic-gate 	md_mnnode_desc	*node;
669*0Sstevel@tonic-gate 	md_set_desc	*sd; /* just an abbr for set_descriptor[setno] */
670*0Sstevel@tonic-gate 	mdsetname_t	*sp;
671*0Sstevel@tonic-gate 	md_error_t	ep = mdnullerror;
672*0Sstevel@tonic-gate 	md_mn_nodeid_t	nid;
673*0Sstevel@tonic-gate 
674*0Sstevel@tonic-gate 	/*
675*0Sstevel@tonic-gate 	 * Check if we are told to setup the mutexes and
676*0Sstevel@tonic-gate 	 * if these are not yet setup
677*0Sstevel@tonic-gate 	 */
678*0Sstevel@tonic-gate 	if ((todo & MDMN_SET_MUTEXES) &&
679*0Sstevel@tonic-gate 	    ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0)) {
680*0Sstevel@tonic-gate 		mutex_init(&mdmn_busy_mutex[setno], USYNC_THREAD, NULL);
681*0Sstevel@tonic-gate 		cond_init(&mdmn_busy_cv[setno], USYNC_THREAD, NULL);
682*0Sstevel@tonic-gate 		rwlock_init(&client_rwlock[setno], USYNC_THREAD, NULL);
683*0Sstevel@tonic-gate 		rwlock_init(&set_desc_rwlock[setno], USYNC_THREAD, NULL);
684*0Sstevel@tonic-gate 
685*0Sstevel@tonic-gate 		for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
686*0Sstevel@tonic-gate 			mutex_init(mdmn_get_master_table_mx(setno, class),
687*0Sstevel@tonic-gate 			    USYNC_THREAD, NULL);
688*0Sstevel@tonic-gate 			cond_init(mdmn_get_master_table_cv(setno, class),
689*0Sstevel@tonic-gate 			    USYNC_THREAD, NULL);
690*0Sstevel@tonic-gate 			mutex_init(mdmn_get_initiator_table_mx(setno, class),
691*0Sstevel@tonic-gate 			    USYNC_THREAD, NULL);
692*0Sstevel@tonic-gate 		}
693*0Sstevel@tonic-gate 		md_mn_set_inited[setno] |= MDMN_SET_MUTEXES;
694*0Sstevel@tonic-gate 	}
695*0Sstevel@tonic-gate 	if ((todo & MDMN_SET_MCT) &&
696*0Sstevel@tonic-gate 	    ((md_mn_set_inited[setno] & MDMN_SET_MCT) == 0)) {
697*0Sstevel@tonic-gate 		int	fd;
698*0Sstevel@tonic-gate 		size_t	filesize;
699*0Sstevel@tonic-gate 		caddr_t	addr;
700*0Sstevel@tonic-gate 		char table_name[32];
701*0Sstevel@tonic-gate 
702*0Sstevel@tonic-gate 		filesize = (sizeof (md_mn_mct_t));
703*0Sstevel@tonic-gate 		(void) snprintf(table_name, sizeof (table_name), "%s%d",
704*0Sstevel@tonic-gate 		    MD_MN_MSG_COMP_TABLE, setno);
705*0Sstevel@tonic-gate 		/*
706*0Sstevel@tonic-gate 		 * If the mct file exists we map it into memory.
707*0Sstevel@tonic-gate 		 * Otherwise we create an empty file of appropriate
708*0Sstevel@tonic-gate 		 * size and map that into memory.
709*0Sstevel@tonic-gate 		 * The mapped areas are stored in mct[setno].
710*0Sstevel@tonic-gate 		 */
711*0Sstevel@tonic-gate 		fd = open(table_name, O_RDWR|O_CREAT|O_DSYNC, 0600);
712*0Sstevel@tonic-gate 		if (fd < 0) {
713*0Sstevel@tonic-gate 			commd_debug(MD_MMV_MISC,
714*0Sstevel@tonic-gate 			    "init_set: Can't open MCT\n");
715*0Sstevel@tonic-gate 			return (-1);
716*0Sstevel@tonic-gate 		}
717*0Sstevel@tonic-gate 		/*
718*0Sstevel@tonic-gate 		 * To ensure that the file has the appropriate size,
719*0Sstevel@tonic-gate 		 * we write a byte at the end of the file.
720*0Sstevel@tonic-gate 		 */
721*0Sstevel@tonic-gate 		lseek(fd, filesize + 1, SEEK_SET);
722*0Sstevel@tonic-gate 		write(fd, "\0", 1);
723*0Sstevel@tonic-gate 
724*0Sstevel@tonic-gate 		/* at this point we have a file in place that we can mmap */
725*0Sstevel@tonic-gate 		addr = mmap(0, filesize, PROT_READ | PROT_WRITE,
726*0Sstevel@tonic-gate 		    MAP_SHARED, fd, (off_t)0);
727*0Sstevel@tonic-gate 		if (addr == MAP_FAILED) {
728*0Sstevel@tonic-gate 			commd_debug(MD_MMV_INIT,
729*0Sstevel@tonic-gate 			    "init_set: mmap mct error %d\n",
730*0Sstevel@tonic-gate 			    errno);
731*0Sstevel@tonic-gate 			return (-1);
732*0Sstevel@tonic-gate 		}
733*0Sstevel@tonic-gate 		/* LINTED pointer alignment */
734*0Sstevel@tonic-gate 		mct[setno] = (md_mn_mct_t *)addr;
735*0Sstevel@tonic-gate 
736*0Sstevel@tonic-gate 		/* finally we initialize the mutexes that protect the mct */
737*0Sstevel@tonic-gate 		for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
738*0Sstevel@tonic-gate 			mutex_init(&(mct_mutex[setno][class]),
739*0Sstevel@tonic-gate 			    USYNC_THREAD, NULL);
740*0Sstevel@tonic-gate 		}
741*0Sstevel@tonic-gate 
742*0Sstevel@tonic-gate 		md_mn_set_inited[setno] |= MDMN_SET_MCT;
743*0Sstevel@tonic-gate 	}
744*0Sstevel@tonic-gate 	/*
745*0Sstevel@tonic-gate 	 * Check if we are told to setup the nodes and
746*0Sstevel@tonic-gate 	 * if these are not yet setup
747*0Sstevel@tonic-gate 	 * (Attention: negative logic here compared to above!)
748*0Sstevel@tonic-gate 	 */
749*0Sstevel@tonic-gate 	if (((todo & MDMN_SET_NODES) == 0) ||
750*0Sstevel@tonic-gate 	    (md_mn_set_inited[setno] & MDMN_SET_NODES)) {
751*0Sstevel@tonic-gate 		return (0); /* success */
752*0Sstevel@tonic-gate 	}
753*0Sstevel@tonic-gate 
754*0Sstevel@tonic-gate 	if ((sp = metasetnosetname(setno, &ep)) == NULL) {
755*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SYSLOG,
756*0Sstevel@tonic-gate 		    "metasetnosetname(%d) returned NULL\n", setno);
757*0Sstevel@tonic-gate 		return (MDMNE_NOT_JOINED);
758*0Sstevel@tonic-gate 	}
759*0Sstevel@tonic-gate 
760*0Sstevel@tonic-gate 	/* flush local copy of rpc.metad data */
761*0Sstevel@tonic-gate 	metaflushsetname(sp);
762*0Sstevel@tonic-gate 
763*0Sstevel@tonic-gate 	mutex_lock(&get_setdesc_mutex);
764*0Sstevel@tonic-gate 	sd = metaget_setdesc(sp, &ep);
765*0Sstevel@tonic-gate 	mutex_unlock(&get_setdesc_mutex);
766*0Sstevel@tonic-gate 
767*0Sstevel@tonic-gate 	if (sd == NULL) {
768*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SYSLOG,
769*0Sstevel@tonic-gate 		    "metaget_setdesc(%d) returned NULL\n", setno);
770*0Sstevel@tonic-gate 		return (MDMNE_NOT_JOINED);
771*0Sstevel@tonic-gate 	}
772*0Sstevel@tonic-gate 
773*0Sstevel@tonic-gate 	/*
774*0Sstevel@tonic-gate 	 * if this set is not a multinode set or
775*0Sstevel@tonic-gate 	 * this node didn't join yet the diskset, better don't do anything
776*0Sstevel@tonic-gate 	 */
777*0Sstevel@tonic-gate 	if ((MD_MNSET_DESC(sd) == 0) ||
778*0Sstevel@tonic-gate 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN) == 0) {
779*0Sstevel@tonic-gate 		commd_debug(MD_MMV_INIT, "didn't yet join set %d\n", setno);
780*0Sstevel@tonic-gate 		return (MDMNE_NOT_JOINED);
781*0Sstevel@tonic-gate 	}
782*0Sstevel@tonic-gate 
783*0Sstevel@tonic-gate 	for (node = sd->sd_nodelist; node != NULL; node = node->nd_next) {
784*0Sstevel@tonic-gate 		nid = node->nd_nodeid;
785*0Sstevel@tonic-gate 
786*0Sstevel@tonic-gate 		commd_debug(MD_MMV_INIT,
787*0Sstevel@tonic-gate 		    "setting up: node=%s, priv_ic=%s, flags=0x%x\n",
788*0Sstevel@tonic-gate 		    node->nd_nodename ? node->nd_nodename : "NULL",
789*0Sstevel@tonic-gate 		    node->nd_priv_ic ? node->nd_priv_ic : "NULL",
790*0Sstevel@tonic-gate 		    node->nd_flags);
791*0Sstevel@tonic-gate 
792*0Sstevel@tonic-gate 		if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
793*0Sstevel@tonic-gate 			commd_debug(MD_MMV_INIT,
794*0Sstevel@tonic-gate 			    "init: %s didn't join set %d\n",
795*0Sstevel@tonic-gate 			    node->nd_nodename ? node->nd_nodename : "NULL",
796*0Sstevel@tonic-gate 			    setno);
797*0Sstevel@tonic-gate 			continue;
798*0Sstevel@tonic-gate 		}
799*0Sstevel@tonic-gate 
800*0Sstevel@tonic-gate 		if (client[setno][nid] != (CLIENT *) NULL) {
801*0Sstevel@tonic-gate 			/* already inited */
802*0Sstevel@tonic-gate 			commd_debug(MD_MMV_INIT, "init: already: node=%s\n",
803*0Sstevel@tonic-gate 			    node->nd_nodename ? node->nd_nodename : "NULL");
804*0Sstevel@tonic-gate 			continue;
805*0Sstevel@tonic-gate 		}
806*0Sstevel@tonic-gate 		client[setno][nid] = meta_client_create_retry(node->nd_nodename,
807*0Sstevel@tonic-gate 			mdmn_clnt_create, (void *)node, MD_CLNT_CREATE_TOUT,
808*0Sstevel@tonic-gate 			&ep);
809*0Sstevel@tonic-gate 
810*0Sstevel@tonic-gate 		if (client[setno][nid] == (CLIENT *) NULL) {
811*0Sstevel@tonic-gate 			clnt_pcreateerror(node->nd_nodename);
812*0Sstevel@tonic-gate 			/*
813*0Sstevel@tonic-gate 			 * If we cannot connect to a single node
814*0Sstevel@tonic-gate 			 * (maybe because it is down) we mark this node as not
815*0Sstevel@tonic-gate 			 * owned and continue with the next node in the list.
816*0Sstevel@tonic-gate 			 * This is better than failing the entire starting up
817*0Sstevel@tonic-gate 			 * of the commd system.
818*0Sstevel@tonic-gate 			 */
819*0Sstevel@tonic-gate 			node->nd_flags &= ~MD_MN_NODE_OWN;
820*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SYSLOG,
821*0Sstevel@tonic-gate 			    "WARNING couldn't create client for %s\n"
822*0Sstevel@tonic-gate 			    "Reconfig cycle required\n",
823*0Sstevel@tonic-gate 			    node->nd_nodename);
824*0Sstevel@tonic-gate 			commd_debug(MD_MMV_INIT,
825*0Sstevel@tonic-gate 			    "WARNING couldn't create client for %s\n"
826*0Sstevel@tonic-gate 			    "Reconfig cycle required\n",
827*0Sstevel@tonic-gate 			    node->nd_nodename);
828*0Sstevel@tonic-gate 			continue;
829*0Sstevel@tonic-gate 		}
830*0Sstevel@tonic-gate 		/* this node has the license to send */
831*0Sstevel@tonic-gate 		commd_debug(MD_MMV_MISC, "init_set: calling add_lic\n");
832*0Sstevel@tonic-gate 		add_license(node);
833*0Sstevel@tonic-gate 
834*0Sstevel@tonic-gate 		/* set the timeout value */
835*0Sstevel@tonic-gate 		clnt_control(client[setno][nid], CLSET_TIMEOUT,
836*0Sstevel@tonic-gate 		    (char *)&FOUR_SECS);
837*0Sstevel@tonic-gate 
838*0Sstevel@tonic-gate 		commd_debug(MD_MMV_INIT, "init: done: node=%s\n",
839*0Sstevel@tonic-gate 		    node->nd_nodename ? node->nd_nodename : "NULL");
840*0Sstevel@tonic-gate 	}
841*0Sstevel@tonic-gate 
842*0Sstevel@tonic-gate 	set_descriptor[setno] = sd;
843*0Sstevel@tonic-gate 	md_mn_set_inited[setno] |= MDMN_SET_NODES;
844*0Sstevel@tonic-gate 	return (0); /* success */
845*0Sstevel@tonic-gate }
846*0Sstevel@tonic-gate 
847*0Sstevel@tonic-gate void *
848*0Sstevel@tonic-gate mdmn_send_to_work(void *arg)
849*0Sstevel@tonic-gate {
850*0Sstevel@tonic-gate 	int			*rpc_err;
851*0Sstevel@tonic-gate 	int			success;
852*0Sstevel@tonic-gate 	int			try_master;
853*0Sstevel@tonic-gate 	set_t			setno;
854*0Sstevel@tonic-gate 	mutex_t			*mx;	/* protection for initiator_table */
855*0Sstevel@tonic-gate 	SVCXPRT			*transp;
856*0Sstevel@tonic-gate 	md_mn_msg_t		*msg;
857*0Sstevel@tonic-gate 	md_mn_nodeid_t		set_master;
858*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
859*0Sstevel@tonic-gate 	md_mn_msg_and_transp_t	*matp = (md_mn_msg_and_transp_t *)arg;
860*0Sstevel@tonic-gate 
861*0Sstevel@tonic-gate 	msg			= matp->mat_msg;
862*0Sstevel@tonic-gate 	transp			= matp->mat_transp;
863*0Sstevel@tonic-gate 
864*0Sstevel@tonic-gate 	/* the alloc was done in mdmn_send_svc_1 */
865*0Sstevel@tonic-gate 	free(matp);
866*0Sstevel@tonic-gate 
867*0Sstevel@tonic-gate 	class = mdmn_get_message_class(msg->msg_type);
868*0Sstevel@tonic-gate 	setno = msg->msg_setno;
869*0Sstevel@tonic-gate 
870*0Sstevel@tonic-gate 	/* set the sender, so the master knows who to send the results */
871*0Sstevel@tonic-gate 	rw_rdlock(&set_desc_rwlock[setno]);
872*0Sstevel@tonic-gate 	msg->msg_sender = set_descriptor[setno]->sd_mn_mynode->nd_nodeid;
873*0Sstevel@tonic-gate 	set_master	= set_descriptor[setno]->sd_mn_master_nodeid;
874*0Sstevel@tonic-gate 
875*0Sstevel@tonic-gate 	mx = mdmn_get_initiator_table_mx(setno, class);
876*0Sstevel@tonic-gate 	mutex_lock(mx);
877*0Sstevel@tonic-gate 
878*0Sstevel@tonic-gate 	/*
879*0Sstevel@tonic-gate 	 * Here we check, if the initiator table slot for this set/class
880*0Sstevel@tonic-gate 	 * combination is free to use.
881*0Sstevel@tonic-gate 	 * If this is not the case, we return CLASS_BUSY forcing the
882*0Sstevel@tonic-gate 	 * initiating send_message call to retry
883*0Sstevel@tonic-gate 	 */
884*0Sstevel@tonic-gate 	success = mdmn_check_initiator_table(setno, class);
885*0Sstevel@tonic-gate 	if (success == MDMNE_CLASS_BUSY) {
886*0Sstevel@tonic-gate 		md_mn_msgid_t		active_mid;
887*0Sstevel@tonic-gate 
888*0Sstevel@tonic-gate 		mdmn_get_initiator_table_id(setno, class,
889*0Sstevel@tonic-gate 		&active_mid);
890*0Sstevel@tonic-gate 
891*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SEND,
892*0Sstevel@tonic-gate 		    "send_to_work: received but locally busy "
893*0Sstevel@tonic-gate 		    "(%d, 0x%llx-%d), set=%d, class=%d, type=%d, "
894*0Sstevel@tonic-gate 		    "active msg=(%d, 0x%llx-%d)\n",
895*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), setno, class,
896*0Sstevel@tonic-gate 		    msg->msg_type, MSGID_ELEMS(active_mid));
897*0Sstevel@tonic-gate 	} else {
898*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SEND,
899*0Sstevel@tonic-gate 		    "send_to_work: received (%d, 0x%llx-%d), "
900*0Sstevel@tonic-gate 		    "set=%d, class=%d, type=%d\n",
901*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
902*0Sstevel@tonic-gate 	}
903*0Sstevel@tonic-gate 
904*0Sstevel@tonic-gate 	try_master = 2; /* return failure after two retries */
905*0Sstevel@tonic-gate 	while ((success == MDMNE_ACK) && (try_master--)) {
906*0Sstevel@tonic-gate 		rw_rdlock(&client_rwlock[setno]);
907*0Sstevel@tonic-gate 		/* is the rpc client to the master still around ? */
908*0Sstevel@tonic-gate 		if (check_client(setno, set_master)) {
909*0Sstevel@tonic-gate 			success = MDMNE_RPC_FAIL;
910*0Sstevel@tonic-gate 			FLUSH_DEBUGFILE();
911*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
912*0Sstevel@tonic-gate 			break; /* out of try_master-loop */
913*0Sstevel@tonic-gate 		}
914*0Sstevel@tonic-gate 
915*0Sstevel@tonic-gate 		/*
916*0Sstevel@tonic-gate 		 * Send the request to the work function on the master
917*0Sstevel@tonic-gate 		 * this call will return immediately
918*0Sstevel@tonic-gate 		 */
919*0Sstevel@tonic-gate 		rpc_err = mdmn_work_1(msg, client[setno][set_master]);
920*0Sstevel@tonic-gate 
921*0Sstevel@tonic-gate 		/* Everything's Ok? */
922*0Sstevel@tonic-gate 		if (rpc_err == NULL) {
923*0Sstevel@tonic-gate 			success = MDMNE_RPC_FAIL;
924*0Sstevel@tonic-gate 			/*
925*0Sstevel@tonic-gate 			 * Probably something happened to the daemon on the
926*0Sstevel@tonic-gate 			 * master. Kill the client, and try again...
927*0Sstevel@tonic-gate 			 */
928*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
929*0Sstevel@tonic-gate 			rw_wrlock(&client_rwlock[setno]);
930*0Sstevel@tonic-gate 			mdmn_clnt_destroy(client[setno][set_master]);
931*0Sstevel@tonic-gate 			if (client[setno][set_master] != (CLIENT *)NULL) {
932*0Sstevel@tonic-gate 				client[setno][set_master] = (CLIENT *)NULL;
933*0Sstevel@tonic-gate 			}
934*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
935*0Sstevel@tonic-gate 			continue;
936*0Sstevel@tonic-gate 
937*0Sstevel@tonic-gate 		} else  if (*rpc_err != MDMNE_ACK) {
938*0Sstevel@tonic-gate 			/* something went wrong, break out */
939*0Sstevel@tonic-gate 			success = *rpc_err;
940*0Sstevel@tonic-gate 			free(rpc_err);
941*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
942*0Sstevel@tonic-gate 			break; /* out of try_master-loop */
943*0Sstevel@tonic-gate 		}
944*0Sstevel@tonic-gate 
945*0Sstevel@tonic-gate 		rw_unlock(&client_rwlock[setno]);
946*0Sstevel@tonic-gate 		free(rpc_err);
947*0Sstevel@tonic-gate 
948*0Sstevel@tonic-gate 		/*
949*0Sstevel@tonic-gate 		 * If we are here, we sucessfully delivered the message.
950*0Sstevel@tonic-gate 		 * We register the initiator_table, so that
951*0Sstevel@tonic-gate 		 * wakeup_initiator_1  can do the sendreply with the
952*0Sstevel@tonic-gate 		 * results for us.
953*0Sstevel@tonic-gate 		 */
954*0Sstevel@tonic-gate 		success = MDMNE_ACK;
955*0Sstevel@tonic-gate 		mdmn_register_initiator_table(setno, class, msg, transp);
956*0Sstevel@tonic-gate 
957*0Sstevel@tonic-gate 		/* tell check_timeouts, there's work to do */
958*0Sstevel@tonic-gate 		mutex_lock(&check_timeout_mutex);
959*0Sstevel@tonic-gate 		messages_on_their_way++;
960*0Sstevel@tonic-gate 		cond_signal(&check_timeout_cv);
961*0Sstevel@tonic-gate 		mutex_unlock(&check_timeout_mutex);
962*0Sstevel@tonic-gate 		break; /* out of try_master-loop */
963*0Sstevel@tonic-gate 	}
964*0Sstevel@tonic-gate 
965*0Sstevel@tonic-gate 	rw_unlock(&set_desc_rwlock[setno]);
966*0Sstevel@tonic-gate 
967*0Sstevel@tonic-gate 	if (success == MDMNE_ACK) {
968*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SEND,
969*0Sstevel@tonic-gate 		    "send_to_work: registered (%d, 0x%llx-%d)\n",
970*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid));
971*0Sstevel@tonic-gate 	} else {
972*0Sstevel@tonic-gate 		/* In case of failure do the sendreply now */
973*0Sstevel@tonic-gate 		md_mn_result_t *resultp;
974*0Sstevel@tonic-gate 		resultp = Zalloc(sizeof (md_mn_result_t));
975*0Sstevel@tonic-gate 		resultp->mmr_comm_state = success;
976*0Sstevel@tonic-gate 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
977*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SEND,
978*0Sstevel@tonic-gate 		    "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n",
979*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), success);
980*0Sstevel@tonic-gate 		free_result(resultp);
981*0Sstevel@tonic-gate 
982*0Sstevel@tonic-gate 	}
983*0Sstevel@tonic-gate 
984*0Sstevel@tonic-gate 	free_msg(msg);
985*0Sstevel@tonic-gate 	mutex_unlock(mx);
986*0Sstevel@tonic-gate 	return (NULL);
987*0Sstevel@tonic-gate 
988*0Sstevel@tonic-gate }
989*0Sstevel@tonic-gate 
990*0Sstevel@tonic-gate /*
991*0Sstevel@tonic-gate  * do_message_locally(msg, result)
992*0Sstevel@tonic-gate  * Process a message locally on the master
993*0Sstevel@tonic-gate  * Lookup the MCT if the message has already been processed.
994*0Sstevel@tonic-gate  * If not, call the handler and store the result
995*0Sstevel@tonic-gate  * If yes, retrieve the result from the MCT.
996*0Sstevel@tonic-gate  * Return:
997*0Sstevel@tonic-gate  *	MDMNE_ACK in case of success
998*0Sstevel@tonic-gate  *	MDMNE_LOG_FAIL if the MCT could not be checked
999*0Sstevel@tonic-gate  */
1000*0Sstevel@tonic-gate static int
1001*0Sstevel@tonic-gate do_message_locally(md_mn_msg_t *msg, md_mn_result_t *result)
1002*0Sstevel@tonic-gate {
1003*0Sstevel@tonic-gate 	int			completed;
1004*0Sstevel@tonic-gate 	set_t			setno;
1005*0Sstevel@tonic-gate 	md_mn_msgtype_t		msgtype = msg->msg_type;
1006*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
1007*0Sstevel@tonic-gate 
1008*0Sstevel@tonic-gate 	void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res);
1009*0Sstevel@tonic-gate 
1010*0Sstevel@tonic-gate 	handler = mdmn_get_handler(msgtype);
1011*0Sstevel@tonic-gate 	if (handler == NULL) {
1012*0Sstevel@tonic-gate 		result->mmr_exitval = 0;
1013*0Sstevel@tonic-gate 		/* let the sender decide if this is an error or not */
1014*0Sstevel@tonic-gate 		result->mmr_comm_state = MDMNE_NO_HANDLER;
1015*0Sstevel@tonic-gate 		return (MDMNE_NO_HANDLER);
1016*0Sstevel@tonic-gate 	}
1017*0Sstevel@tonic-gate 
1018*0Sstevel@tonic-gate 	class = mdmn_get_message_class(msg->msg_type);
1019*0Sstevel@tonic-gate 	setno = msg->msg_setno;
1020*0Sstevel@tonic-gate 
1021*0Sstevel@tonic-gate 	result->mmr_msgtype	= msgtype;
1022*0Sstevel@tonic-gate 	result->mmr_flags	= msg->msg_flags;
1023*0Sstevel@tonic-gate 	MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
1024*0Sstevel@tonic-gate 
1025*0Sstevel@tonic-gate 	mutex_lock(&mct_mutex[setno][class]);
1026*0Sstevel@tonic-gate 	completed = mdmn_check_completion(msg, result);
1027*0Sstevel@tonic-gate 	if (completed == MDMN_MCT_NOT_DONE) {
1028*0Sstevel@tonic-gate 		/* message not yet processed locally */
1029*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M, "proc_mas: "
1030*0Sstevel@tonic-gate 		    "calling handler for (%d,0x%llx-%d) type %d\n",
1031*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), msgtype);
1032*0Sstevel@tonic-gate 
1033*0Sstevel@tonic-gate 		/*
1034*0Sstevel@tonic-gate 		 * Mark the message as being currently processed,
1035*0Sstevel@tonic-gate 		 * so we won't start a second handler for it
1036*0Sstevel@tonic-gate 		 */
1037*0Sstevel@tonic-gate 		(void) mdmn_mark_completion(msg, NULL, MDMN_MCT_IN_PROGRESS);
1038*0Sstevel@tonic-gate 		mutex_unlock(&mct_mutex[setno][class]);
1039*0Sstevel@tonic-gate 
1040*0Sstevel@tonic-gate 		/* here we actually process the message on the master */
1041*0Sstevel@tonic-gate 		(*handler)(msg, MD_MSGF_ON_MASTER, result);
1042*0Sstevel@tonic-gate 
1043*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M, "proc_mas: "
1044*0Sstevel@tonic-gate 		    "finished handler for (%d,0x%llx-%d) type %d\n",
1045*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), msgtype);
1046*0Sstevel@tonic-gate 
1047*0Sstevel@tonic-gate 		/* Mark the message as fully processed, store the result */
1048*0Sstevel@tonic-gate 		mutex_lock(&mct_mutex[setno][class]);
1049*0Sstevel@tonic-gate 		(void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE);
1050*0Sstevel@tonic-gate 	} else if (completed == MDMN_MCT_DONE) {
1051*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M, "proc_mas: "
1052*0Sstevel@tonic-gate 		    "result for (%d, 0x%llx-%d) from MCT\n",
1053*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), msgtype);
1054*0Sstevel@tonic-gate 	} else if (completed == MDMN_MCT_IN_PROGRESS) {
1055*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M, "proc_mas: "
1056*0Sstevel@tonic-gate 		    "(%d, 0x%llx-%d) is currently being processed\n",
1057*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), msgtype);
1058*0Sstevel@tonic-gate 	} else {
1059*0Sstevel@tonic-gate 		/* MCT error occurred (should never happen) */
1060*0Sstevel@tonic-gate 		mutex_unlock(&mct_mutex[setno][class]);
1061*0Sstevel@tonic-gate 		result->mmr_comm_state = MDMNE_LOG_FAIL;
1062*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SYSLOG, "WARNING "
1063*0Sstevel@tonic-gate 		    "mdmn_check_completion returned %d "
1064*0Sstevel@tonic-gate 		    "for (%d,0x%llx-%d)\n", completed,
1065*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid));
1066*0Sstevel@tonic-gate 		return (MDMNE_LOG_FAIL);
1067*0Sstevel@tonic-gate 	}
1068*0Sstevel@tonic-gate 	mutex_unlock(&mct_mutex[setno][class]);
1069*0Sstevel@tonic-gate 	return (MDMNE_ACK);
1070*0Sstevel@tonic-gate 
1071*0Sstevel@tonic-gate }
1072*0Sstevel@tonic-gate 
1073*0Sstevel@tonic-gate /*
1074*0Sstevel@tonic-gate  * do_send_message(msg, node)
1075*0Sstevel@tonic-gate  *
1076*0Sstevel@tonic-gate  * Send a message to a given node and wait for an acknowledgment that the
1077*0Sstevel@tonic-gate  * message has arrived on the remote node.
1078*0Sstevel@tonic-gate  * Make sure that the client for the set is setup correctly.
1079*0Sstevel@tonic-gate  * If no ACK arrives, destroy and recreate the RPC client and retry the
1080*0Sstevel@tonic-gate  * message one time
1081*0Sstevel@tonic-gate  * After actually sending wait no longer than the appropriate number of
1082*0Sstevel@tonic-gate  * before timing out the message.
1083*0Sstevel@tonic-gate  *
1084*0Sstevel@tonic-gate  * Note: must be called with set_desc_rwlock[setno] held in reader mode
1085*0Sstevel@tonic-gate  */
1086*0Sstevel@tonic-gate static int
1087*0Sstevel@tonic-gate do_send_message(md_mn_msg_t *msg, md_mnnode_desc *node)
1088*0Sstevel@tonic-gate {
1089*0Sstevel@tonic-gate 	int			err;
1090*0Sstevel@tonic-gate 	int			rpc_retries;
1091*0Sstevel@tonic-gate 	int			timeout_retries = 0;
1092*0Sstevel@tonic-gate 	int			*ret = NULL;
1093*0Sstevel@tonic-gate 	set_t			setno;
1094*0Sstevel@tonic-gate 	cond_t			*cv;	/* see mdmn_wakeup_master_svc_1 */
1095*0Sstevel@tonic-gate 	mutex_t			*mx;	/* protection for class_busy */
1096*0Sstevel@tonic-gate 	timestruc_t		timeout; /* surveillance for remote daemon */
1097*0Sstevel@tonic-gate 	md_mn_nodeid_t		nid;
1098*0Sstevel@tonic-gate 	md_mn_msgtype_t		msgtype;
1099*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
1100*0Sstevel@tonic-gate 
1101*0Sstevel@tonic-gate 	nid	= node->nd_nodeid;
1102*0Sstevel@tonic-gate 	msgtype = msg->msg_type;
1103*0Sstevel@tonic-gate 	setno	= msg->msg_setno;
1104*0Sstevel@tonic-gate 	class	= mdmn_get_message_class(msgtype);
1105*0Sstevel@tonic-gate 	mx	= mdmn_get_master_table_mx(setno, class);
1106*0Sstevel@tonic-gate 	cv	= mdmn_get_master_table_cv(setno, class);
1107*0Sstevel@tonic-gate 
1108*0Sstevel@tonic-gate retry_rpc:
1109*0Sstevel@tonic-gate 
1110*0Sstevel@tonic-gate 	/* We try two times to send the message */
1111*0Sstevel@tonic-gate 	rpc_retries = 2;
1112*0Sstevel@tonic-gate 
1113*0Sstevel@tonic-gate 	/*
1114*0Sstevel@tonic-gate 	 * if sending the message doesn't succeed the first time due to a
1115*0Sstevel@tonic-gate 	 * RPC problem, we retry one time
1116*0Sstevel@tonic-gate 	 */
1117*0Sstevel@tonic-gate 	while ((rpc_retries != 0) && (ret == NULL)) {
1118*0Sstevel@tonic-gate 		/*  in abort state, we error out immediately */
1119*0Sstevel@tonic-gate 		if (md_commd_global_state & MD_CGS_ABORTED) {
1120*0Sstevel@tonic-gate 			return (MDMNE_ABORT);
1121*0Sstevel@tonic-gate 		}
1122*0Sstevel@tonic-gate 
1123*0Sstevel@tonic-gate 		rw_rdlock(&client_rwlock[setno]);
1124*0Sstevel@tonic-gate 		/* unable to create client? Ignore it */
1125*0Sstevel@tonic-gate 		if (check_client(setno, nid)) {
1126*0Sstevel@tonic-gate 			/*
1127*0Sstevel@tonic-gate 			 * In case we cannot establish an RPC client, we
1128*0Sstevel@tonic-gate 			 * take this node out of our considerations.
1129*0Sstevel@tonic-gate 			 * This will be reset by a reconfig
1130*0Sstevel@tonic-gate 			 * cycle that should come pretty soon.
1131*0Sstevel@tonic-gate 			 * MNISSUE: Should a reconfig cycle
1132*0Sstevel@tonic-gate 			 * be forced on SunCluster?
1133*0Sstevel@tonic-gate 			 */
1134*0Sstevel@tonic-gate 			node->nd_flags &= ~MD_MN_NODE_OWN;
1135*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SYSLOG,
1136*0Sstevel@tonic-gate 			    "WARNING couldn't create client for %s\n"
1137*0Sstevel@tonic-gate 			    "Reconfig cycle required\n",
1138*0Sstevel@tonic-gate 			    node->nd_nodename);
1139*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M, "proc_mas: (%d,0x%llx-%d) "
1140*0Sstevel@tonic-gate 			    "WARNING couldn't create client for %s\n",
1141*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid), node->nd_nodename);
1142*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
1143*0Sstevel@tonic-gate 			return (MDMNE_IGNORE_NODE);
1144*0Sstevel@tonic-gate 		}
1145*0Sstevel@tonic-gate 		/* let's be paranoid and check again before sending */
1146*0Sstevel@tonic-gate 		if (client[setno][nid] == NULL) {
1147*0Sstevel@tonic-gate 			/*
1148*0Sstevel@tonic-gate 			 * if this is true, strange enough, we catch our breath,
1149*0Sstevel@tonic-gate 			 * and then continue, so that the client is set up
1150*0Sstevel@tonic-gate 			 * once again.
1151*0Sstevel@tonic-gate 			 */
1152*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M, "client is NULL\n");
1153*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
1154*0Sstevel@tonic-gate 			sleep(1);
1155*0Sstevel@tonic-gate 			continue;
1156*0Sstevel@tonic-gate 		}
1157*0Sstevel@tonic-gate 
1158*0Sstevel@tonic-gate 		/* send it over, it will return immediately */
1159*0Sstevel@tonic-gate 		ret = mdmn_work_1(msg, client[setno][nid]);
1160*0Sstevel@tonic-gate 
1161*0Sstevel@tonic-gate 		rw_unlock(&client_rwlock[setno]);
1162*0Sstevel@tonic-gate 
1163*0Sstevel@tonic-gate 		if (ret != NULL) {
1164*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M,
1165*0Sstevel@tonic-gate 			    "proc_mas: sending (%d,0x%llx-%d) to %d returned "
1166*0Sstevel@tonic-gate 			    " 0x%x\n",
1167*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid), nid, *ret);
1168*0Sstevel@tonic-gate 		} else {
1169*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M,
1170*0Sstevel@tonic-gate 			    "proc_mas: sending (%d,0x%llx-%d) to %d returned "
1171*0Sstevel@tonic-gate 			    " NULL \n",
1172*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid), nid);
1173*0Sstevel@tonic-gate 		}
1174*0Sstevel@tonic-gate 
1175*0Sstevel@tonic-gate 		if ((ret == NULL) || (*ret == MDMNE_CANNOT_CONNECT) ||
1176*0Sstevel@tonic-gate 		    (*ret == MDMNE_THR_CREATE_FAIL)) {
1177*0Sstevel@tonic-gate 			/*
1178*0Sstevel@tonic-gate 			 * Something happened to the daemon on the other side.
1179*0Sstevel@tonic-gate 			 * Kill the client, and try again.
1180*0Sstevel@tonic-gate 			 * check_client() will create a new client
1181*0Sstevel@tonic-gate 			 */
1182*0Sstevel@tonic-gate 			rw_wrlock(&client_rwlock[setno]);
1183*0Sstevel@tonic-gate 			mdmn_clnt_destroy(client[setno][nid]);
1184*0Sstevel@tonic-gate 			if (client[setno][nid] != (CLIENT *)NULL) {
1185*0Sstevel@tonic-gate 				client[setno][nid] = (CLIENT *)NULL;
1186*0Sstevel@tonic-gate 			}
1187*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
1188*0Sstevel@tonic-gate 
1189*0Sstevel@tonic-gate 			/* ... but don't try infinitely */
1190*0Sstevel@tonic-gate 			--rpc_retries;
1191*0Sstevel@tonic-gate 			continue;
1192*0Sstevel@tonic-gate 		}
1193*0Sstevel@tonic-gate 		/*
1194*0Sstevel@tonic-gate 		 * If the class is locked on the other node, keep trying.
1195*0Sstevel@tonic-gate 		 * This situation will go away automatically,
1196*0Sstevel@tonic-gate 		 * if we wait long enough
1197*0Sstevel@tonic-gate 		 */
1198*0Sstevel@tonic-gate 		if (*ret == MDMNE_CLASS_LOCKED) {
1199*0Sstevel@tonic-gate 			sleep(1);
1200*0Sstevel@tonic-gate 			free(ret);
1201*0Sstevel@tonic-gate 			ret = NULL;
1202*0Sstevel@tonic-gate 			continue;
1203*0Sstevel@tonic-gate 		}
1204*0Sstevel@tonic-gate 	}
1205*0Sstevel@tonic-gate 	if (ret == NULL) {
1206*0Sstevel@tonic-gate 		return (MDMNE_RPC_FAIL);
1207*0Sstevel@tonic-gate 	}
1208*0Sstevel@tonic-gate 
1209*0Sstevel@tonic-gate 
1210*0Sstevel@tonic-gate 	/* if the slave is in abort state, we just ignore it. */
1211*0Sstevel@tonic-gate 	if (*ret == MDMNE_ABORT) {
1212*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M,
1213*0Sstevel@tonic-gate 		    "proc_mas: work(%d,0x%llx-%d) returned "
1214*0Sstevel@tonic-gate 		    "MDMNE_ABORT\n",
1215*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid));
1216*0Sstevel@tonic-gate 		free(ret);
1217*0Sstevel@tonic-gate 		return (MDMNE_IGNORE_NODE);
1218*0Sstevel@tonic-gate 	}
1219*0Sstevel@tonic-gate 
1220*0Sstevel@tonic-gate 	/* Did the remote processing succeed? */
1221*0Sstevel@tonic-gate 	if (*ret != MDMNE_ACK) {
1222*0Sstevel@tonic-gate 		/*
1223*0Sstevel@tonic-gate 		 * Some commd failure in the middle of sending the msg
1224*0Sstevel@tonic-gate 		 * to the nodes. We don't continue here.
1225*0Sstevel@tonic-gate 		 */
1226*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M,
1227*0Sstevel@tonic-gate 		    "proc_mas: work(%d,0x%llx-%d) returns %d\n",
1228*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), *ret);
1229*0Sstevel@tonic-gate 		free(ret);
1230*0Sstevel@tonic-gate 		return (MDMNE_RPC_FAIL);
1231*0Sstevel@tonic-gate 	}
1232*0Sstevel@tonic-gate 	free(ret);
1233*0Sstevel@tonic-gate 	ret = NULL;
1234*0Sstevel@tonic-gate 
1235*0Sstevel@tonic-gate 	/*
1236*0Sstevel@tonic-gate 	 * When we are here, we have sent the message to the other node and
1237*0Sstevel@tonic-gate 	 * we know that node has accepted it.
1238*0Sstevel@tonic-gate 	 * We go to sleep and have trust to be woken up by wakeup.
1239*0Sstevel@tonic-gate 	 * If we wakeup due to a timeout, or a signal, no result has been
1240*0Sstevel@tonic-gate 	 * placed in the appropriate slot.
1241*0Sstevel@tonic-gate 	 * If we timeout, it is likely that this is because the node has
1242*0Sstevel@tonic-gate 	 * gone away, so we will destroy the client and try it again in the
1243*0Sstevel@tonic-gate 	 * expectation that the rpc will fail and we will return
1244*0Sstevel@tonic-gate 	 * MDMNE_IGNORE_NODE. If that is not the case, the message must still
1245*0Sstevel@tonic-gate 	 * be being processed on the slave. In this case just timeout for 4
1246*0Sstevel@tonic-gate 	 * more seconds and then return RPC_FAIL if the message is not complete.
1247*0Sstevel@tonic-gate 	 */
1248*0Sstevel@tonic-gate 	timeout.tv_nsec = 0;
1249*0Sstevel@tonic-gate 	timeout.tv_sec = (timeout_retries == 0) ? mdmn_get_timeout(msgtype) :
1250*0Sstevel@tonic-gate 	    FOUR_SECS.tv_sec;
1251*0Sstevel@tonic-gate 	err = cond_reltimedwait(cv, mx, &timeout);
1252*0Sstevel@tonic-gate 
1253*0Sstevel@tonic-gate 	if (err == 0) {
1254*0Sstevel@tonic-gate 		/* everything's fine, return success */
1255*0Sstevel@tonic-gate 		return (MDMNE_ACK);
1256*0Sstevel@tonic-gate 	}
1257*0Sstevel@tonic-gate 
1258*0Sstevel@tonic-gate 	if (err == ETIME) {
1259*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M, "proc_mas: "
1260*0Sstevel@tonic-gate 		    "timeout occured, set=%d, class=%d, "
1261*0Sstevel@tonic-gate 		    "msgid=(%d, 0x%llx-%d), timeout_retries=%d\n",
1262*0Sstevel@tonic-gate 		    setno, class, MSGID_ELEMS(msg->msg_msgid), timeout_retries);
1263*0Sstevel@tonic-gate 		if (timeout_retries == 0) {
1264*0Sstevel@tonic-gate 			timeout_retries++;
1265*0Sstevel@tonic-gate 			/*
1266*0Sstevel@tonic-gate 			 * Destroy the client and try the rpc call again
1267*0Sstevel@tonic-gate 			 */
1268*0Sstevel@tonic-gate 			rw_wrlock(&client_rwlock[setno]);
1269*0Sstevel@tonic-gate 			mdmn_clnt_destroy(client[setno][nid]);
1270*0Sstevel@tonic-gate 			client[setno][nid] = (CLIENT *)NULL;
1271*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
1272*0Sstevel@tonic-gate 			goto retry_rpc;
1273*0Sstevel@tonic-gate 		}
1274*0Sstevel@tonic-gate 	} else if (err == EINTR) {
1275*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M, "proc_mas: "
1276*0Sstevel@tonic-gate 		    "commd signalled, set=%d, class=%d, "
1277*0Sstevel@tonic-gate 		    "msgid=(%d, 0x%llx-%d)\n",
1278*0Sstevel@tonic-gate 		    setno, class, MSGID_ELEMS(msg->msg_msgid));
1279*0Sstevel@tonic-gate 	} else {
1280*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M, "proc_mas: "
1281*0Sstevel@tonic-gate 		    "cond_reltimedwait err=%d, set=%d, "
1282*0Sstevel@tonic-gate 		    "class=%d, msgid=(%d, 0x%llx-%d)\n",
1283*0Sstevel@tonic-gate 		    err, setno, class,
1284*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid));
1285*0Sstevel@tonic-gate 	}
1286*0Sstevel@tonic-gate 
1287*0Sstevel@tonic-gate 	/* some failure happened */
1288*0Sstevel@tonic-gate 	return (MDMNE_RPC_FAIL);
1289*0Sstevel@tonic-gate }
1290*0Sstevel@tonic-gate 
1291*0Sstevel@tonic-gate /*
1292*0Sstevel@tonic-gate  * before we return we have to
1293*0Sstevel@tonic-gate  * free_msg(msg); because we are working on a copied message
1294*0Sstevel@tonic-gate  */
1295*0Sstevel@tonic-gate void
1296*0Sstevel@tonic-gate mdmn_master_process_msg(md_mn_msg_t *msg)
1297*0Sstevel@tonic-gate {
1298*0Sstevel@tonic-gate 	int		*ret;
1299*0Sstevel@tonic-gate 	int		err;
1300*0Sstevel@tonic-gate 	int		nmsgs;		/* total number of msgs */
1301*0Sstevel@tonic-gate 	int		curmsg;		/* index of current msg */
1302*0Sstevel@tonic-gate 	set_t		setno;
1303*0Sstevel@tonic-gate 	uint_t		inherit_flags = 0;
1304*0Sstevel@tonic-gate 	uint_t		secdiff, usecdiff; /* runtime of this message */
1305*0Sstevel@tonic-gate 	md_error_t	mde = mdnullerror;
1306*0Sstevel@tonic-gate 	md_mn_msg_t	*msglist[MAX_SUBMESSAGES]; /* all msgs to process */
1307*0Sstevel@tonic-gate 	md_mn_msg_t	*cmsg;		/* current msg */
1308*0Sstevel@tonic-gate 	md_mn_msgid_t	dummyid;
1309*0Sstevel@tonic-gate 	md_mn_result_t	*result;
1310*0Sstevel@tonic-gate 	md_mn_result_t	*slave_result;
1311*0Sstevel@tonic-gate 	md_mn_nodeid_t	sender;
1312*0Sstevel@tonic-gate 	md_mn_nodeid_t	set_master;
1313*0Sstevel@tonic-gate 	md_mnnode_desc	*node;
1314*0Sstevel@tonic-gate 	md_mn_msgtype_t	orig_type;	/* type of the original message */
1315*0Sstevel@tonic-gate 	md_mn_msgtype_t	msgtype;	/* type of the current message */
1316*0Sstevel@tonic-gate 	md_mn_msgclass_t orig_class;	/* class of the original message */
1317*0Sstevel@tonic-gate 	md_mn_msgclass_t class;		/* class of the current message */
1318*0Sstevel@tonic-gate 
1319*0Sstevel@tonic-gate 	int (*smgen)(md_mn_msg_t *msg, md_mn_msg_t **msglist);
1320*0Sstevel@tonic-gate 
1321*0Sstevel@tonic-gate 	orig_type = msgtype = msg->msg_type;
1322*0Sstevel@tonic-gate 	sender	= msg->msg_sender;
1323*0Sstevel@tonic-gate 	setno	= msg->msg_setno;
1324*0Sstevel@tonic-gate 
1325*0Sstevel@tonic-gate 	result = Zalloc(sizeof (md_mn_result_t));
1326*0Sstevel@tonic-gate 	result->mmr_setno	= setno;
1327*0Sstevel@tonic-gate 	result->mmr_msgtype	= msgtype;
1328*0Sstevel@tonic-gate 	MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
1329*0Sstevel@tonic-gate 
1330*0Sstevel@tonic-gate 	orig_class = mdmn_get_message_class(msgtype);
1331*0Sstevel@tonic-gate 
1332*0Sstevel@tonic-gate 	commd_debug(MD_MMV_PROC_M,
1333*0Sstevel@tonic-gate 	    "proc_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
1334*0Sstevel@tonic-gate 	    MSGID_ELEMS(msg->msg_msgid), setno, orig_class, msgtype);
1335*0Sstevel@tonic-gate 
1336*0Sstevel@tonic-gate 	rw_rdlock(&set_desc_rwlock[setno]);
1337*0Sstevel@tonic-gate 	set_master = set_descriptor[setno]->sd_mn_master_nodeid;
1338*0Sstevel@tonic-gate 	result->mmr_sender	= set_master;
1339*0Sstevel@tonic-gate 	/*
1340*0Sstevel@tonic-gate 	 * Put message into the change log unless told otherwise
1341*0Sstevel@tonic-gate 	 * Note that we only log original messages.
1342*0Sstevel@tonic-gate 	 * If they are generated by some smgen, we don't log them!
1343*0Sstevel@tonic-gate 	 * Replay messages aren't logged either.
1344*0Sstevel@tonic-gate 	 * Note, that replay messages are unlogged on completion.
1345*0Sstevel@tonic-gate 	 */
1346*0Sstevel@tonic-gate 	if ((msg->msg_flags & (MD_MSGF_NO_LOG | MD_MSGF_REPLAY_MSG)) == 0) {
1347*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M,
1348*0Sstevel@tonic-gate 		    "proc_mas: calling log_msg for (%d,0x%llx-%d) type %d\n",
1349*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), msgtype);
1350*0Sstevel@tonic-gate 		err = mdmn_log_msg(msg);
1351*0Sstevel@tonic-gate 		if (err == MDMNE_NULL) {
1352*0Sstevel@tonic-gate 			/* msg logged successfully */
1353*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M, "proc_mas: "
1354*0Sstevel@tonic-gate 			    "done log_msg for (%d,0x%llx-%d) type %d\n",
1355*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid), msgtype);
1356*0Sstevel@tonic-gate 			goto proceed;
1357*0Sstevel@tonic-gate 		}
1358*0Sstevel@tonic-gate 		if (err == MDMNE_ACK) {
1359*0Sstevel@tonic-gate 			/* Same msg in the slot, proceed */
1360*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M, "proc_mas: "
1361*0Sstevel@tonic-gate 			    "already logged (%d,0x%llx-%d) type %d\n",
1362*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid), msgtype);
1363*0Sstevel@tonic-gate 			goto proceed;
1364*0Sstevel@tonic-gate 		}
1365*0Sstevel@tonic-gate 		if (err == MDMNE_LOG_FAIL) {
1366*0Sstevel@tonic-gate 			/* Oh, bad, the log is non functional. */
1367*0Sstevel@tonic-gate 			result->mmr_comm_state = MDMNE_LOG_FAIL;
1368*0Sstevel@tonic-gate 			/*
1369*0Sstevel@tonic-gate 			 * Note that the mark_busy was already done by
1370*0Sstevel@tonic-gate 			 * mdmn_work_svc_1()
1371*0Sstevel@tonic-gate 			 */
1372*0Sstevel@tonic-gate 			mutex_lock(&mdmn_busy_mutex[setno]);
1373*0Sstevel@tonic-gate 			mdmn_mark_class_unbusy(setno, orig_class);
1374*0Sstevel@tonic-gate 			mutex_unlock(&mdmn_busy_mutex[setno]);
1375*0Sstevel@tonic-gate 
1376*0Sstevel@tonic-gate 		}
1377*0Sstevel@tonic-gate 		if (err == MDMNE_CLASS_BUSY) {
1378*0Sstevel@tonic-gate 			/*
1379*0Sstevel@tonic-gate 			 * The log is occupied with a different message
1380*0Sstevel@tonic-gate 			 * that needs to be played first.
1381*0Sstevel@tonic-gate 			 * We reject the current message with MDMNE_CLASS_BUSY
1382*0Sstevel@tonic-gate 			 * to the initiator and do not unbusy the set/class,
1383*0Sstevel@tonic-gate 			 * because we will proceed with the logged message,
1384*0Sstevel@tonic-gate 			 * which has the same set/class combination
1385*0Sstevel@tonic-gate 			 */
1386*0Sstevel@tonic-gate 			result->mmr_comm_state = MDMNE_CLASS_BUSY;
1387*0Sstevel@tonic-gate 		}
1388*0Sstevel@tonic-gate 		ret = (int *)NULL;
1389*0Sstevel@tonic-gate 		rw_rdlock(&client_rwlock[setno]);
1390*0Sstevel@tonic-gate 
1391*0Sstevel@tonic-gate 		if (check_client(setno, sender)) {
1392*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SYSLOG,
1393*0Sstevel@tonic-gate 			    "proc_mas: No client for initiator \n");
1394*0Sstevel@tonic-gate 		} else {
1395*0Sstevel@tonic-gate 			ret = mdmn_wakeup_initiator_1(result,
1396*0Sstevel@tonic-gate 			    client[setno][sender]);
1397*0Sstevel@tonic-gate 		}
1398*0Sstevel@tonic-gate 		rw_unlock(&client_rwlock[setno]);
1399*0Sstevel@tonic-gate 
1400*0Sstevel@tonic-gate 		if (ret == (int *)NULL) {
1401*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SYSLOG,
1402*0Sstevel@tonic-gate 			    "proc_mas: couldn't wakeup_initiator \n");
1403*0Sstevel@tonic-gate 		} else {
1404*0Sstevel@tonic-gate 			if (*ret != MDMNE_ACK) {
1405*0Sstevel@tonic-gate 				commd_debug(MD_MMV_SYSLOG, "proc_mas: "
1406*0Sstevel@tonic-gate 				    "wakeup_initiator returned %d\n", *ret);
1407*0Sstevel@tonic-gate 			}
1408*0Sstevel@tonic-gate 			free(ret);
1409*0Sstevel@tonic-gate 		}
1410*0Sstevel@tonic-gate 		free_msg(msg);
1411*0Sstevel@tonic-gate 
1412*0Sstevel@tonic-gate 		if (err == MDMNE_LOG_FAIL) {
1413*0Sstevel@tonic-gate 			/* we can't proceed here */
1414*0Sstevel@tonic-gate 			free_result(result);
1415*0Sstevel@tonic-gate 			rw_unlock(&set_desc_rwlock[setno]);
1416*0Sstevel@tonic-gate 			return;
1417*0Sstevel@tonic-gate 		} else if (err == MDMNE_CLASS_BUSY) {
1418*0Sstevel@tonic-gate 			mdmn_changelog_record_t *lr;
1419*0Sstevel@tonic-gate 			lr = mdmn_get_changelogrec(setno, orig_class);
1420*0Sstevel@tonic-gate 			assert(lr != NULL);
1421*0Sstevel@tonic-gate 
1422*0Sstevel@tonic-gate 			/* proceed with the logged message */
1423*0Sstevel@tonic-gate 			msg = copy_msg(&(lr->lr_msg), NULL);
1424*0Sstevel@tonic-gate 
1425*0Sstevel@tonic-gate 			/*
1426*0Sstevel@tonic-gate 			 * The logged message has to have the same class but
1427*0Sstevel@tonic-gate 			 * type and sender can be different
1428*0Sstevel@tonic-gate 			 */
1429*0Sstevel@tonic-gate 			orig_type = msgtype = msg->msg_type;
1430*0Sstevel@tonic-gate 			sender	= msg->msg_sender;
1431*0Sstevel@tonic-gate 
1432*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M,
1433*0Sstevel@tonic-gate 			    "proc_mas: Got new message from change log: "
1434*0Sstevel@tonic-gate 			    "(%d,0x%llx-%d) type %d\n",
1435*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid), msgtype);
1436*0Sstevel@tonic-gate 
1437*0Sstevel@tonic-gate 			/* continue normal operation with this message */
1438*0Sstevel@tonic-gate 		}
1439*0Sstevel@tonic-gate 	}
1440*0Sstevel@tonic-gate 
1441*0Sstevel@tonic-gate proceed:
1442*0Sstevel@tonic-gate 	smgen = mdmn_get_submessage_generator(msgtype);
1443*0Sstevel@tonic-gate 	if (smgen == NULL) {
1444*0Sstevel@tonic-gate 		/* no submessages to create, just use the original message */
1445*0Sstevel@tonic-gate 		msglist[0] = msg;
1446*0Sstevel@tonic-gate 		nmsgs = 1;
1447*0Sstevel@tonic-gate 	} else {
1448*0Sstevel@tonic-gate 		/* some bits are passed on to submessages */
1449*0Sstevel@tonic-gate 		inherit_flags = msg->msg_flags & MD_MSGF_INHERIT_BITS;
1450*0Sstevel@tonic-gate 
1451*0Sstevel@tonic-gate 		nmsgs = smgen(msg, msglist);
1452*0Sstevel@tonic-gate 
1453*0Sstevel@tonic-gate 		/* some settings for the submessages */
1454*0Sstevel@tonic-gate 		for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1455*0Sstevel@tonic-gate 			cmsg    = msglist[curmsg];
1456*0Sstevel@tonic-gate 
1457*0Sstevel@tonic-gate 			/* Apply the inherited flags */
1458*0Sstevel@tonic-gate 			cmsg->msg_flags |= inherit_flags;
1459*0Sstevel@tonic-gate 
1460*0Sstevel@tonic-gate 			/*
1461*0Sstevel@tonic-gate 			 * Make sure the submessage ID is set correctly
1462*0Sstevel@tonic-gate 			 * Note: first submessage has mid_smid of 1 (not 0)
1463*0Sstevel@tonic-gate 			 */
1464*0Sstevel@tonic-gate 			cmsg->msg_msgid.mid_smid = curmsg + 1;
1465*0Sstevel@tonic-gate 
1466*0Sstevel@tonic-gate 			/* need the original class set in msgID (for MCT) */
1467*0Sstevel@tonic-gate 			cmsg->msg_msgid.mid_oclass = orig_class;
1468*0Sstevel@tonic-gate 		}
1469*0Sstevel@tonic-gate 
1470*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M,
1471*0Sstevel@tonic-gate 		    "smgen generated %d submsgs, origclass = %d\n",
1472*0Sstevel@tonic-gate 		    nmsgs, orig_class);
1473*0Sstevel@tonic-gate 	}
1474*0Sstevel@tonic-gate 	/*
1475*0Sstevel@tonic-gate 	 * This big loop does the following.
1476*0Sstevel@tonic-gate 	 * For all messages:
1477*0Sstevel@tonic-gate 	 *	process message on the master first (a message completion
1478*0Sstevel@tonic-gate 	 *		table MCT ensures a message is not processed twice)
1479*0Sstevel@tonic-gate 	 *	in case of an error break out of message loop
1480*0Sstevel@tonic-gate 	 *	for all nodes -- unless MD_MSGF_NO_BCAST is set --
1481*0Sstevel@tonic-gate 	 *		send message to node until that succeeds
1482*0Sstevel@tonic-gate 	 *		merge result -- not yet implemented
1483*0Sstevel@tonic-gate 	 *		respect MD_MSGF_STOP_ON_ERROR
1484*0Sstevel@tonic-gate 	 */
1485*0Sstevel@tonic-gate 	for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1486*0Sstevel@tonic-gate 		int	break_msg_loop = 0;
1487*0Sstevel@tonic-gate 		mutex_t	*mx;		/* protection for class_busy */
1488*0Sstevel@tonic-gate 		int	master_err;
1489*0Sstevel@tonic-gate 		int	master_exitval = -1;
1490*0Sstevel@tonic-gate 
1491*0Sstevel@tonic-gate 		cmsg	= msglist[curmsg];
1492*0Sstevel@tonic-gate 		msgtype = cmsg->msg_type;
1493*0Sstevel@tonic-gate 		class	= mdmn_get_message_class(msgtype);
1494*0Sstevel@tonic-gate 		node	= NULL;
1495*0Sstevel@tonic-gate 		mx	= mdmn_get_master_table_mx(setno, class);
1496*0Sstevel@tonic-gate 
1497*0Sstevel@tonic-gate 		/* If we are in the abort state, we error out immediately */
1498*0Sstevel@tonic-gate 		if (md_commd_global_state & MD_CGS_ABORTED) {
1499*0Sstevel@tonic-gate 			break; /* out of the message loop */
1500*0Sstevel@tonic-gate 		}
1501*0Sstevel@tonic-gate 
1502*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M, "class=%d, orig_class=%d\n",
1503*0Sstevel@tonic-gate 		    class, orig_class);
1504*0Sstevel@tonic-gate 		/*
1505*0Sstevel@tonic-gate 		 * If the current class is different from the original class,
1506*0Sstevel@tonic-gate 		 * we have to lock it down.
1507*0Sstevel@tonic-gate 		 * The original class is already marked busy.
1508*0Sstevel@tonic-gate 		 * At this point we cannot refuse the message because the
1509*0Sstevel@tonic-gate 		 * class is busy right now, so we wait until the class becomes
1510*0Sstevel@tonic-gate 		 * available again. As soon as something changes for this set
1511*0Sstevel@tonic-gate 		 * we will be cond_signal'ed (in mdmn_mark_class_unbusy)
1512*0Sstevel@tonic-gate 		 *
1513*0Sstevel@tonic-gate 		 * Granularity could be finer (setno/class)
1514*0Sstevel@tonic-gate 		 */
1515*0Sstevel@tonic-gate 		if (class != orig_class) {
1516*0Sstevel@tonic-gate 			mutex_lock(&mdmn_busy_mutex[setno]);
1517*0Sstevel@tonic-gate 			while (mdmn_mark_class_busy(setno, class) == FALSE) {
1518*0Sstevel@tonic-gate 				cond_wait(&mdmn_busy_cv[setno],
1519*0Sstevel@tonic-gate 				    &mdmn_busy_mutex[setno]);
1520*0Sstevel@tonic-gate 			}
1521*0Sstevel@tonic-gate 			mutex_unlock(&mdmn_busy_mutex[setno]);
1522*0Sstevel@tonic-gate 		}
1523*0Sstevel@tonic-gate 
1524*0Sstevel@tonic-gate 		master_err = do_message_locally(cmsg, result);
1525*0Sstevel@tonic-gate 
1526*0Sstevel@tonic-gate 		if ((master_err != MDMNE_ACK) ||
1527*0Sstevel@tonic-gate 		    ((master_err == MDMNE_ACK) && (result->mmr_exitval != 0))) {
1528*0Sstevel@tonic-gate 			result->mmr_failing_node = set_master;
1529*0Sstevel@tonic-gate 			if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1530*0Sstevel@tonic-gate 				/*
1531*0Sstevel@tonic-gate 				 * if appropriate, unbusy the class and
1532*0Sstevel@tonic-gate 				 * break out of the message loop
1533*0Sstevel@tonic-gate 				 */
1534*0Sstevel@tonic-gate 				if (class != orig_class) {
1535*0Sstevel@tonic-gate 					mutex_lock(&mdmn_busy_mutex[setno]);
1536*0Sstevel@tonic-gate 					mdmn_mark_class_unbusy(setno, class);
1537*0Sstevel@tonic-gate 					mutex_unlock(&mdmn_busy_mutex[setno]);
1538*0Sstevel@tonic-gate 				}
1539*0Sstevel@tonic-gate 				break;
1540*0Sstevel@tonic-gate 			}
1541*0Sstevel@tonic-gate 		}
1542*0Sstevel@tonic-gate 
1543*0Sstevel@tonic-gate 		if (master_err == MDMNE_ACK)
1544*0Sstevel@tonic-gate 			master_exitval = result->mmr_exitval;
1545*0Sstevel@tonic-gate 
1546*0Sstevel@tonic-gate 		/* No broadcast? => next message */
1547*0Sstevel@tonic-gate 		if (cmsg->msg_flags & MD_MSGF_NO_BCAST) {
1548*0Sstevel@tonic-gate 			/* if appropriate, unbusy the class */
1549*0Sstevel@tonic-gate 			if (class != orig_class) {
1550*0Sstevel@tonic-gate 				mutex_lock(&mdmn_busy_mutex[setno]);
1551*0Sstevel@tonic-gate 				mdmn_mark_class_unbusy(setno, class);
1552*0Sstevel@tonic-gate 				mutex_unlock(&mdmn_busy_mutex[setno]);
1553*0Sstevel@tonic-gate 			}
1554*0Sstevel@tonic-gate 			continue;
1555*0Sstevel@tonic-gate 		}
1556*0Sstevel@tonic-gate 
1557*0Sstevel@tonic-gate 
1558*0Sstevel@tonic-gate 		/* fake sender, so we get notified when the results are avail */
1559*0Sstevel@tonic-gate 		cmsg->msg_sender = set_master;
1560*0Sstevel@tonic-gate 		/*
1561*0Sstevel@tonic-gate 		 * register to the master_table. It's needed by wakeup_master to
1562*0Sstevel@tonic-gate 		 * wakeup the sleeping thread.
1563*0Sstevel@tonic-gate 		 * Access is protected by the class lock: mdmn_mark_class_busy()
1564*0Sstevel@tonic-gate 		 */
1565*0Sstevel@tonic-gate 		mdmn_set_master_table_id(setno, class, &(cmsg->msg_msgid));
1566*0Sstevel@tonic-gate 
1567*0Sstevel@tonic-gate 
1568*0Sstevel@tonic-gate 
1569*0Sstevel@tonic-gate 		rw_rdlock(&set_desc_rwlock[setno]);
1570*0Sstevel@tonic-gate 		/* Send the message  to all other nodes */
1571*0Sstevel@tonic-gate 		for (node = set_descriptor[setno]->sd_nodelist; node;
1572*0Sstevel@tonic-gate 		    node = node->nd_next) {
1573*0Sstevel@tonic-gate 			md_mn_nodeid_t nid = node->nd_nodeid;
1574*0Sstevel@tonic-gate 
1575*0Sstevel@tonic-gate 			/* We are master and have already processed the msg */
1576*0Sstevel@tonic-gate 			if (node == set_descriptor[setno]->sd_mn_masternode) {
1577*0Sstevel@tonic-gate 				continue;
1578*0Sstevel@tonic-gate 			}
1579*0Sstevel@tonic-gate 
1580*0Sstevel@tonic-gate 			/* If this node didn't join the disk set, ignore it */
1581*0Sstevel@tonic-gate 			if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
1582*0Sstevel@tonic-gate 				continue;
1583*0Sstevel@tonic-gate 			}
1584*0Sstevel@tonic-gate 
1585*0Sstevel@tonic-gate 			mutex_lock(mx);
1586*0Sstevel@tonic-gate 			/*
1587*0Sstevel@tonic-gate 			 * Register the node that is addressed,
1588*0Sstevel@tonic-gate 			 * so we can detect unsolicited messages
1589*0Sstevel@tonic-gate 			 */
1590*0Sstevel@tonic-gate 			mdmn_set_master_table_addr(setno, class, nid);
1591*0Sstevel@tonic-gate 			slave_result = (md_mn_result_t *)NULL;
1592*0Sstevel@tonic-gate 
1593*0Sstevel@tonic-gate 			/*
1594*0Sstevel@tonic-gate 			 * Now send it. do_send_message() will return if
1595*0Sstevel@tonic-gate 			 *	a failure occurs or
1596*0Sstevel@tonic-gate 			 *	the results are available
1597*0Sstevel@tonic-gate 			 */
1598*0Sstevel@tonic-gate 			err = do_send_message(cmsg, node);
1599*0Sstevel@tonic-gate 
1600*0Sstevel@tonic-gate 			/*  in abort state, we error out immediately */
1601*0Sstevel@tonic-gate 			if (md_commd_global_state & MD_CGS_ABORTED) {
1602*0Sstevel@tonic-gate 				break;
1603*0Sstevel@tonic-gate 			}
1604*0Sstevel@tonic-gate 
1605*0Sstevel@tonic-gate 			if (err == MDMNE_ACK) {
1606*0Sstevel@tonic-gate 				slave_result =
1607*0Sstevel@tonic-gate 				    mdmn_get_master_table_res(setno, class);
1608*0Sstevel@tonic-gate 				commd_debug(MD_MMV_PROC_M,
1609*0Sstevel@tonic-gate 				    "proc_mas: got result for (%d,0x%llx-%d)\n",
1610*0Sstevel@tonic-gate 				    MSGID_ELEMS(cmsg->msg_msgid));
1611*0Sstevel@tonic-gate 			} else if (err == MDMNE_IGNORE_NODE) {
1612*0Sstevel@tonic-gate 				mutex_unlock(mx);
1613*0Sstevel@tonic-gate 				continue; /* send to next node */
1614*0Sstevel@tonic-gate 			}
1615*0Sstevel@tonic-gate 			mutex_unlock(mx);
1616*0Sstevel@tonic-gate 
1617*0Sstevel@tonic-gate 
1618*0Sstevel@tonic-gate 			/*
1619*0Sstevel@tonic-gate 			 * If the result is NULL, or err doesn't show success,
1620*0Sstevel@tonic-gate 			 * something went wrong with this RPC call.
1621*0Sstevel@tonic-gate 			 */
1622*0Sstevel@tonic-gate 			if ((slave_result == NULL) || (err != MDMNE_ACK)) {
1623*0Sstevel@tonic-gate 				/*
1624*0Sstevel@tonic-gate 				 * If PANIC_WHEN_INCONSISTENT set,
1625*0Sstevel@tonic-gate 				 * panic if the master succeeded while
1626*0Sstevel@tonic-gate 				 * this node failed
1627*0Sstevel@tonic-gate 				 */
1628*0Sstevel@tonic-gate 				if ((cmsg->msg_flags &
1629*0Sstevel@tonic-gate 				    MD_MSGF_PANIC_WHEN_INCONSISTENT) &&
1630*0Sstevel@tonic-gate 				    (master_err == MDMNE_ACK))
1631*0Sstevel@tonic-gate 					panic_system(nid, cmsg->msg_type,
1632*0Sstevel@tonic-gate 					    master_err, master_exitval,
1633*0Sstevel@tonic-gate 					    slave_result);
1634*0Sstevel@tonic-gate 
1635*0Sstevel@tonic-gate 				result->mmr_failing_node = nid;
1636*0Sstevel@tonic-gate 				/* are we supposed to stop in case of error? */
1637*0Sstevel@tonic-gate 				if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1638*0Sstevel@tonic-gate 					result->mmr_exitval = MDMNE_RPC_FAIL;
1639*0Sstevel@tonic-gate 					commd_debug(MD_MMV_SYSLOG, "proc_mas: "
1640*0Sstevel@tonic-gate 					    "result (%d,0x%llx-%d) is NULL\n",
1641*0Sstevel@tonic-gate 					    MSGID_ELEMS(cmsg->msg_msgid));
1642*0Sstevel@tonic-gate 					FLUSH_DEBUGFILE();
1643*0Sstevel@tonic-gate 					break_msg_loop = 1;
1644*0Sstevel@tonic-gate 					break; /* out of node loop first */
1645*0Sstevel@tonic-gate 				} else {
1646*0Sstevel@tonic-gate 					/* send msg to the next node */
1647*0Sstevel@tonic-gate 					continue;
1648*0Sstevel@tonic-gate 				}
1649*0Sstevel@tonic-gate 
1650*0Sstevel@tonic-gate 			}
1651*0Sstevel@tonic-gate 
1652*0Sstevel@tonic-gate 			/*
1653*0Sstevel@tonic-gate 			 * Message processed on remote node.
1654*0Sstevel@tonic-gate 			 * If PANIC_WHEN_INCONSISTENT set, panic if the
1655*0Sstevel@tonic-gate 			 * result is different on this node from the result
1656*0Sstevel@tonic-gate 			 * on the master
1657*0Sstevel@tonic-gate 			 */
1658*0Sstevel@tonic-gate 			if ((cmsg->msg_flags &
1659*0Sstevel@tonic-gate 			    MD_MSGF_PANIC_WHEN_INCONSISTENT) &&
1660*0Sstevel@tonic-gate 			    ((master_err != MDMNE_ACK) ||
1661*0Sstevel@tonic-gate 			    (slave_result->mmr_exitval != master_exitval)))
1662*0Sstevel@tonic-gate 				panic_system(nid, cmsg->msg_type, master_err,
1663*0Sstevel@tonic-gate 				    master_exitval, slave_result);
1664*0Sstevel@tonic-gate 
1665*0Sstevel@tonic-gate 			/*
1666*0Sstevel@tonic-gate 			 * At this point we know we have a message that was
1667*0Sstevel@tonic-gate 			 * processed on the remote node.
1668*0Sstevel@tonic-gate 			 * We now check if the exitval is non zero.
1669*0Sstevel@tonic-gate 			 * In that case we discard the previous result and
1670*0Sstevel@tonic-gate 			 * rather use the current.
1671*0Sstevel@tonic-gate 			 * This means: If a message fails on no node,
1672*0Sstevel@tonic-gate 			 * the result from the master will be returned.
1673*0Sstevel@tonic-gate 			 * There's currently no such thing as merge of results
1674*0Sstevel@tonic-gate 			 * If additionally STOP_ON_ERROR is set, we bail out
1675*0Sstevel@tonic-gate 			 */
1676*0Sstevel@tonic-gate 			if (slave_result->mmr_exitval != 0) {
1677*0Sstevel@tonic-gate 				/* throw away the previously allocated result */
1678*0Sstevel@tonic-gate 				free_result(result);
1679*0Sstevel@tonic-gate 
1680*0Sstevel@tonic-gate 				/* copy_result() allocates new memory */
1681*0Sstevel@tonic-gate 				result = copy_result(slave_result);
1682*0Sstevel@tonic-gate 				free_result(slave_result);
1683*0Sstevel@tonic-gate 
1684*0Sstevel@tonic-gate 				dump_result(MD_MMV_PROC_M, "proc_mas", result);
1685*0Sstevel@tonic-gate 
1686*0Sstevel@tonic-gate 				result->mmr_failing_node = nid;
1687*0Sstevel@tonic-gate 				if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1688*0Sstevel@tonic-gate 					break_msg_loop = 1;
1689*0Sstevel@tonic-gate 					break; /* out of node loop */
1690*0Sstevel@tonic-gate 				}
1691*0Sstevel@tonic-gate 				continue; /* try next node */
1692*0Sstevel@tonic-gate 
1693*0Sstevel@tonic-gate 			} else {
1694*0Sstevel@tonic-gate 				/*
1695*0Sstevel@tonic-gate 				 * MNIssue: may want to merge the results
1696*0Sstevel@tonic-gate 				 * from all slaves.  Currently only report
1697*0Sstevel@tonic-gate 				 * the results from the master.
1698*0Sstevel@tonic-gate 				 */
1699*0Sstevel@tonic-gate 				free_result(slave_result);
1700*0Sstevel@tonic-gate 			}
1701*0Sstevel@tonic-gate 
1702*0Sstevel@tonic-gate 		} /* End of loop over the nodes */
1703*0Sstevel@tonic-gate 		rw_unlock(&set_desc_rwlock[setno]);
1704*0Sstevel@tonic-gate 
1705*0Sstevel@tonic-gate 
1706*0Sstevel@tonic-gate 		/* release the current class again */
1707*0Sstevel@tonic-gate 		if (class != orig_class) {
1708*0Sstevel@tonic-gate 			mutex_lock(&mdmn_busy_mutex[setno]);
1709*0Sstevel@tonic-gate 			mdmn_mark_class_unbusy(setno, class);
1710*0Sstevel@tonic-gate 			mutex_unlock(&mdmn_busy_mutex[setno]);
1711*0Sstevel@tonic-gate 		}
1712*0Sstevel@tonic-gate 
1713*0Sstevel@tonic-gate 		/* are we supposed to quit entirely ? */
1714*0Sstevel@tonic-gate 		if (break_msg_loop ||
1715*0Sstevel@tonic-gate 		    (md_commd_global_state & MD_CGS_ABORTED)) {
1716*0Sstevel@tonic-gate 			break; /* out of msg loop */
1717*0Sstevel@tonic-gate 		}
1718*0Sstevel@tonic-gate 
1719*0Sstevel@tonic-gate 	} /* End of loop over the messages */
1720*0Sstevel@tonic-gate 	/*
1721*0Sstevel@tonic-gate 	 * If we are here, there are two possibilities:
1722*0Sstevel@tonic-gate 	 * 	- we processed all messages on all nodes without an error.
1723*0Sstevel@tonic-gate 	 *	    In this case we return the result from the master.
1724*0Sstevel@tonic-gate 	 *	    (to be implemented: return the merged result)
1725*0Sstevel@tonic-gate 	 *	- we encountered an error in which case result has been
1726*0Sstevel@tonic-gate 	 *	    set accordingly already.
1727*0Sstevel@tonic-gate 	 */
1728*0Sstevel@tonic-gate 
1729*0Sstevel@tonic-gate 	if (md_commd_global_state & MD_CGS_ABORTED) {
1730*0Sstevel@tonic-gate 		result->mmr_comm_state = MDMNE_ABORT;
1731*0Sstevel@tonic-gate 	}
1732*0Sstevel@tonic-gate 
1733*0Sstevel@tonic-gate 	/*
1734*0Sstevel@tonic-gate 	 * This message has been processed completely.
1735*0Sstevel@tonic-gate 	 * Remove it from the changelog.
1736*0Sstevel@tonic-gate 	 * Do this for replay messages too.
1737*0Sstevel@tonic-gate 	 * Note that the message is unlogged before waking up the
1738*0Sstevel@tonic-gate 	 * initiator.  This is done for two reasons.
1739*0Sstevel@tonic-gate 	 * 1. Remove a race condition that occurs when back to back
1740*0Sstevel@tonic-gate 	 *   messages are sent for the same class, the registration
1741*0Sstevel@tonic-gate 	 *   is lost.
1742*0Sstevel@tonic-gate 	 * 2. If the initiator died but the action was completed on all
1743*0Sstevel@tonic-gate 	 *   the nodes, we want that to be marked "done" quickly.
1744*0Sstevel@tonic-gate 	 */
1745*0Sstevel@tonic-gate 
1746*0Sstevel@tonic-gate 	if ((msg->msg_flags & MD_MSGF_NO_LOG) == 0) {
1747*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M,
1748*0Sstevel@tonic-gate 		    "proc_mas: calling unlog_msg for (%d,0x%llx-%d) type %d\n",
1749*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), msgtype);
1750*0Sstevel@tonic-gate 		mdmn_unlog_msg(msg);
1751*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M,
1752*0Sstevel@tonic-gate 		    "proc_mas: done unlog_msg for (%d,0x%llx-%d) type %d\n",
1753*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid), msgtype);
1754*0Sstevel@tonic-gate 	}
1755*0Sstevel@tonic-gate 
1756*0Sstevel@tonic-gate 	/*
1757*0Sstevel@tonic-gate 	 * In case of submessages, we increased the submessage ID in the
1758*0Sstevel@tonic-gate 	 * result structure. We restore the message ID to the value that
1759*0Sstevel@tonic-gate 	 * the initiator is waiting for.
1760*0Sstevel@tonic-gate 	 */
1761*0Sstevel@tonic-gate 	result->mmr_msgid.mid_smid	= 0;
1762*0Sstevel@tonic-gate 	result->mmr_msgtype		= orig_type;
1763*0Sstevel@tonic-gate 	result->mmr_sender		= set_master;
1764*0Sstevel@tonic-gate 
1765*0Sstevel@tonic-gate 	/* if we have an inited client, send result */
1766*0Sstevel@tonic-gate 	ret = (int *)NULL;
1767*0Sstevel@tonic-gate 
1768*0Sstevel@tonic-gate 	rw_rdlock(&client_rwlock[setno]);
1769*0Sstevel@tonic-gate 	if (check_client(setno, sender)) {
1770*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SYSLOG,
1771*0Sstevel@tonic-gate 		    "proc_mas: unable to create client for initiator\n");
1772*0Sstevel@tonic-gate 	} else {
1773*0Sstevel@tonic-gate 		ret = mdmn_wakeup_initiator_1(result, client[setno][sender]);
1774*0Sstevel@tonic-gate 	}
1775*0Sstevel@tonic-gate 	rw_unlock(&client_rwlock[setno]);
1776*0Sstevel@tonic-gate 
1777*0Sstevel@tonic-gate 	if (ret == (int *)NULL) {
1778*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_M,
1779*0Sstevel@tonic-gate 		    "proc_mas: couldn't wakeup initiator\n");
1780*0Sstevel@tonic-gate 	} else {
1781*0Sstevel@tonic-gate 		if (*ret != MDMNE_ACK) {
1782*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M,
1783*0Sstevel@tonic-gate 			    "proc_mas: wakeup_initiator returned %d\n",
1784*0Sstevel@tonic-gate 			    *ret);
1785*0Sstevel@tonic-gate 		}
1786*0Sstevel@tonic-gate 		free(ret);
1787*0Sstevel@tonic-gate 	}
1788*0Sstevel@tonic-gate 
1789*0Sstevel@tonic-gate 	rw_unlock(&set_desc_rwlock[setno]);
1790*0Sstevel@tonic-gate 	/* Free all submessages, if there were any */
1791*0Sstevel@tonic-gate 	if (nmsgs > 1) {
1792*0Sstevel@tonic-gate 		for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1793*0Sstevel@tonic-gate 			free_msg(msglist[curmsg]);
1794*0Sstevel@tonic-gate 		}
1795*0Sstevel@tonic-gate 	}
1796*0Sstevel@tonic-gate 	/* Free the result */
1797*0Sstevel@tonic-gate 	free_result(result);
1798*0Sstevel@tonic-gate 
1799*0Sstevel@tonic-gate 	mutex_lock(&mdmn_busy_mutex[setno]);
1800*0Sstevel@tonic-gate 	mdmn_mark_class_unbusy(setno, orig_class);
1801*0Sstevel@tonic-gate 	mutex_unlock(&mdmn_busy_mutex[setno]);
1802*0Sstevel@tonic-gate 
1803*0Sstevel@tonic-gate 
1804*0Sstevel@tonic-gate 	/*
1805*0Sstevel@tonic-gate 	 * We use this ioctl just to get the time in the same format as used in
1806*0Sstevel@tonic-gate 	 * the messageID. If it fails, all we get is a bad runtime output.
1807*0Sstevel@tonic-gate 	 */
1808*0Sstevel@tonic-gate 	(void) metaioctl(MD_IOCGUNIQMSGID, &dummyid, &mde, NULL);
1809*0Sstevel@tonic-gate 	secdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) >> 32;
1810*0Sstevel@tonic-gate 	usecdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) & 0xfffff;
1811*0Sstevel@tonic-gate 
1812*0Sstevel@tonic-gate 	/* catching possible overflow */
1813*0Sstevel@tonic-gate 	if (usecdiff >= 1000000) {
1814*0Sstevel@tonic-gate 		usecdiff -= 1000000;
1815*0Sstevel@tonic-gate 		secdiff++;
1816*0Sstevel@tonic-gate 	}
1817*0Sstevel@tonic-gate 
1818*0Sstevel@tonic-gate 
1819*0Sstevel@tonic-gate 	commd_debug(MD_MMV_PROC_M, "proc_mas: done (%d, 0x%llx-%d) type=%02d "
1820*0Sstevel@tonic-gate 	    "%5d.%06d secs runtime\n",
1821*0Sstevel@tonic-gate 	    MSGID_ELEMS(msg->msg_msgid), orig_type, secdiff, usecdiff);
1822*0Sstevel@tonic-gate 
1823*0Sstevel@tonic-gate 	/* Free the original message */
1824*0Sstevel@tonic-gate 	free_msg(msg);
1825*0Sstevel@tonic-gate }
1826*0Sstevel@tonic-gate 
1827*0Sstevel@tonic-gate void
1828*0Sstevel@tonic-gate mdmn_slave_process_msg(md_mn_msg_t *msg)
1829*0Sstevel@tonic-gate {
1830*0Sstevel@tonic-gate 	int			*ret = NULL;
1831*0Sstevel@tonic-gate 	int			completed;
1832*0Sstevel@tonic-gate 	int			retries;
1833*0Sstevel@tonic-gate 	int			successfully_returned;
1834*0Sstevel@tonic-gate 	set_t			setno;
1835*0Sstevel@tonic-gate 	md_mn_result_t		*result;
1836*0Sstevel@tonic-gate 	md_mn_nodeid_t		sender;
1837*0Sstevel@tonic-gate 	md_mn_nodeid_t		whoami;
1838*0Sstevel@tonic-gate 	md_mn_msgtype_t		msgtype;
1839*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
1840*0Sstevel@tonic-gate 
1841*0Sstevel@tonic-gate 	void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res);
1842*0Sstevel@tonic-gate 
1843*0Sstevel@tonic-gate 	setno	= msg->msg_setno;
1844*0Sstevel@tonic-gate 	sender	= msg->msg_sender; /* this is always the master of the set */
1845*0Sstevel@tonic-gate 	msgtype	= msg->msg_type;
1846*0Sstevel@tonic-gate 
1847*0Sstevel@tonic-gate 	rw_rdlock(&set_desc_rwlock[setno]);
1848*0Sstevel@tonic-gate 	whoami		= set_descriptor[setno]->sd_mn_mynode->nd_nodeid;
1849*0Sstevel@tonic-gate 	rw_unlock(&set_desc_rwlock[setno]);
1850*0Sstevel@tonic-gate 
1851*0Sstevel@tonic-gate 	result = Zalloc(sizeof (md_mn_result_t));
1852*0Sstevel@tonic-gate 	result->mmr_flags	= msg->msg_flags;
1853*0Sstevel@tonic-gate 	result->mmr_setno	= setno;
1854*0Sstevel@tonic-gate 	result->mmr_msgtype	= msgtype;
1855*0Sstevel@tonic-gate 	result->mmr_sender	= whoami;
1856*0Sstevel@tonic-gate 	result->mmr_comm_state	= MDMNE_ACK; /* Ok state */
1857*0Sstevel@tonic-gate 	MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
1858*0Sstevel@tonic-gate 	class = mdmn_get_message_class(msgtype);
1859*0Sstevel@tonic-gate 
1860*0Sstevel@tonic-gate 	commd_debug(MD_MMV_PROC_S,
1861*0Sstevel@tonic-gate 	    "proc_sla: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
1862*0Sstevel@tonic-gate 	    MSGID_ELEMS(msg->msg_msgid), setno, class, msgtype);
1863*0Sstevel@tonic-gate 
1864*0Sstevel@tonic-gate 	handler = mdmn_get_handler(msgtype);
1865*0Sstevel@tonic-gate 
1866*0Sstevel@tonic-gate 	if (handler == NULL) {
1867*0Sstevel@tonic-gate 		result->mmr_exitval = 0;
1868*0Sstevel@tonic-gate 		/* let the sender decide if this is an error or not */
1869*0Sstevel@tonic-gate 		result->mmr_comm_state = MDMNE_NO_HANDLER;
1870*0Sstevel@tonic-gate 		commd_debug(MD_MMV_PROC_S,
1871*0Sstevel@tonic-gate 		    "proc_sla: No handler for (%d, 0x%llx-%d)\n",
1872*0Sstevel@tonic-gate 		    MSGID_ELEMS(msg->msg_msgid));
1873*0Sstevel@tonic-gate 	} else {
1874*0Sstevel@tonic-gate 
1875*0Sstevel@tonic-gate 		/* Did we already process this message ? */
1876*0Sstevel@tonic-gate 		mutex_lock(&mct_mutex[setno][class]);
1877*0Sstevel@tonic-gate 		completed = mdmn_check_completion(msg, result);
1878*0Sstevel@tonic-gate 
1879*0Sstevel@tonic-gate 		if (completed == MDMN_MCT_NOT_DONE) {
1880*0Sstevel@tonic-gate 			/* message not yet processed locally */
1881*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_S,
1882*0Sstevel@tonic-gate 			    "proc_sla: calling handler for (%d, 0x%llx-%d)\n",
1883*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid));
1884*0Sstevel@tonic-gate 
1885*0Sstevel@tonic-gate 			/*
1886*0Sstevel@tonic-gate 			 * Mark the message as being currently processed,
1887*0Sstevel@tonic-gate 			 * so we won't start a second handler for it
1888*0Sstevel@tonic-gate 			 */
1889*0Sstevel@tonic-gate 			(void) mdmn_mark_completion(msg, NULL,
1890*0Sstevel@tonic-gate 			    MDMN_MCT_IN_PROGRESS);
1891*0Sstevel@tonic-gate 
1892*0Sstevel@tonic-gate 			mutex_unlock(&mct_mutex[setno][class]);
1893*0Sstevel@tonic-gate 			(*handler)(msg, MD_MSGF_ON_SLAVE, result);
1894*0Sstevel@tonic-gate 
1895*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_S,
1896*0Sstevel@tonic-gate 			    "proc_sla: finished handler for (%d, 0x%llx-%d)\n",
1897*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid));
1898*0Sstevel@tonic-gate 
1899*0Sstevel@tonic-gate 			mutex_lock(&mct_mutex[setno][class]);
1900*0Sstevel@tonic-gate 			/* Mark the message as fully done, store the result */
1901*0Sstevel@tonic-gate 			(void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE);
1902*0Sstevel@tonic-gate 
1903*0Sstevel@tonic-gate 		} else if (completed == MDMN_MCT_DONE) {
1904*0Sstevel@tonic-gate 			/* message processed previously, got result from MCT */
1905*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_S,
1906*0Sstevel@tonic-gate 			    "proc_sla: result for (%d, 0x%llx-%d) from MCT\n",
1907*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid));
1908*0Sstevel@tonic-gate 		} else if (completed == MDMN_MCT_IN_PROGRESS) {
1909*0Sstevel@tonic-gate 			/*
1910*0Sstevel@tonic-gate 			 * If the message is curruntly being processed,
1911*0Sstevel@tonic-gate 			 * we can return here, without sending a result back.
1912*0Sstevel@tonic-gate 			 * This will be done by the initial message handling
1913*0Sstevel@tonic-gate 			 * thread
1914*0Sstevel@tonic-gate 			 */
1915*0Sstevel@tonic-gate 			mutex_unlock(&mct_mutex[setno][class]);
1916*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_M, "proc_sla: "
1917*0Sstevel@tonic-gate 			    "(%d, 0x%llx-%d) is currently being processed\n",
1918*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid), msgtype);
1919*0Sstevel@tonic-gate 
1920*0Sstevel@tonic-gate 			free_msg(msg);
1921*0Sstevel@tonic-gate 			free_result(result);
1922*0Sstevel@tonic-gate 			return;
1923*0Sstevel@tonic-gate 		} else {
1924*0Sstevel@tonic-gate 			/* MCT error occurred (should never happen) */
1925*0Sstevel@tonic-gate 			result->mmr_comm_state = MDMNE_LOG_FAIL;
1926*0Sstevel@tonic-gate 			commd_debug(MD_MMV_PROC_S,
1927*0Sstevel@tonic-gate 			    "proc_sla: MCT error for (%d, 0x%llx-%d)\n",
1928*0Sstevel@tonic-gate 			    MSGID_ELEMS(msg->msg_msgid));
1929*0Sstevel@tonic-gate 		}
1930*0Sstevel@tonic-gate 		mutex_unlock(&mct_mutex[setno][class]);
1931*0Sstevel@tonic-gate 	}
1932*0Sstevel@tonic-gate 
1933*0Sstevel@tonic-gate 	/*
1934*0Sstevel@tonic-gate 	 * At this point we have a result (even in an error case)
1935*0Sstevel@tonic-gate 	 * that we return to the master.
1936*0Sstevel@tonic-gate 	 */
1937*0Sstevel@tonic-gate 	rw_rdlock(&set_desc_rwlock[setno]);
1938*0Sstevel@tonic-gate 	retries = 2; /* we will try two times to send the results */
1939*0Sstevel@tonic-gate 	successfully_returned = 0;
1940*0Sstevel@tonic-gate 
1941*0Sstevel@tonic-gate 	while (!successfully_returned && (retries != 0)) {
1942*0Sstevel@tonic-gate 		ret = (int *)NULL;
1943*0Sstevel@tonic-gate 		rw_rdlock(&client_rwlock[setno]);
1944*0Sstevel@tonic-gate 		if (check_client(setno, sender)) {
1945*0Sstevel@tonic-gate 			/*
1946*0Sstevel@tonic-gate 			 * If we cannot setup the rpc connection to the master,
1947*0Sstevel@tonic-gate 			 * we can't do anything besides logging this fact.
1948*0Sstevel@tonic-gate 			 */
1949*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SYSLOG,
1950*0Sstevel@tonic-gate 			    "proc_mas: unable to create client for master\n");
1951*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
1952*0Sstevel@tonic-gate 			break;
1953*0Sstevel@tonic-gate 		} else {
1954*0Sstevel@tonic-gate 			ret = mdmn_wakeup_master_1(result,
1955*0Sstevel@tonic-gate 			    client[setno][sender]);
1956*0Sstevel@tonic-gate 			/*
1957*0Sstevel@tonic-gate 			 * if mdmn_wakeup_master_1 returns NULL, it can be that
1958*0Sstevel@tonic-gate 			 * the master (or the commd on the master) had died.
1959*0Sstevel@tonic-gate 			 * In that case, we destroy the client to the master
1960*0Sstevel@tonic-gate 			 * and retry.
1961*0Sstevel@tonic-gate 			 * If mdmn_wakeup_master_1 doesn't return MDMNE_ACK,
1962*0Sstevel@tonic-gate 			 * the commd on the master is alive but
1963*0Sstevel@tonic-gate 			 * something else is wrong,
1964*0Sstevel@tonic-gate 			 * in that case a retry doesn't make sense => break out
1965*0Sstevel@tonic-gate 			 */
1966*0Sstevel@tonic-gate 			if (ret == (int *)NULL) {
1967*0Sstevel@tonic-gate 				commd_debug(MD_MMV_PROC_S,
1968*0Sstevel@tonic-gate 				    "proc_sla: wakeup_master returned NULL\n");
1969*0Sstevel@tonic-gate 				/* release reader lock, grab writer lock */
1970*0Sstevel@tonic-gate 				rw_unlock(&client_rwlock[setno]);
1971*0Sstevel@tonic-gate 				rw_wrlock(&client_rwlock[setno]);
1972*0Sstevel@tonic-gate 				mdmn_clnt_destroy(client[setno][sender]);
1973*0Sstevel@tonic-gate 				if (client[setno][sender] != (CLIENT *)NULL) {
1974*0Sstevel@tonic-gate 					client[setno][sender] = (CLIENT *)NULL;
1975*0Sstevel@tonic-gate 				}
1976*0Sstevel@tonic-gate 				rw_unlock(&client_rwlock[setno]);
1977*0Sstevel@tonic-gate 				retries--;
1978*0Sstevel@tonic-gate 				commd_debug(MD_MMV_PROC_S,
1979*0Sstevel@tonic-gate 				    "retries = %d\n", retries);
1980*0Sstevel@tonic-gate 				continue;
1981*0Sstevel@tonic-gate 			}
1982*0Sstevel@tonic-gate 			if (*ret != MDMNE_ACK) {
1983*0Sstevel@tonic-gate 				commd_debug(MD_MMV_PROC_S, "proc_sla: "
1984*0Sstevel@tonic-gate 				    "wakeup_master returned %d\n", *ret);
1985*0Sstevel@tonic-gate 				rw_unlock(&client_rwlock[setno]);
1986*0Sstevel@tonic-gate 				break;
1987*0Sstevel@tonic-gate 			} else { /* Good case */
1988*0Sstevel@tonic-gate 				successfully_returned = 1;
1989*0Sstevel@tonic-gate 				rw_unlock(&client_rwlock[setno]);
1990*0Sstevel@tonic-gate 			}
1991*0Sstevel@tonic-gate 		}
1992*0Sstevel@tonic-gate 	}
1993*0Sstevel@tonic-gate 
1994*0Sstevel@tonic-gate 	rw_unlock(&set_desc_rwlock[setno]);
1995*0Sstevel@tonic-gate 	commd_debug(MD_MMV_PROC_S, "proc_sla: done (%d, 0x%llx-%d)\n",
1996*0Sstevel@tonic-gate 	    MSGID_ELEMS(msg->msg_msgid));
1997*0Sstevel@tonic-gate 
1998*0Sstevel@tonic-gate 	if (ret != (int *)NULL)
1999*0Sstevel@tonic-gate 		free(ret);
2000*0Sstevel@tonic-gate 	free_msg(msg);
2001*0Sstevel@tonic-gate 	free_result(result);
2002*0Sstevel@tonic-gate }
2003*0Sstevel@tonic-gate 
2004*0Sstevel@tonic-gate 
2005*0Sstevel@tonic-gate md_mn_result_t *
2006*0Sstevel@tonic-gate mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
2007*0Sstevel@tonic-gate {
2008*0Sstevel@tonic-gate 	int			err;
2009*0Sstevel@tonic-gate 	set_t			setno;
2010*0Sstevel@tonic-gate 	SVCXPRT			*transp = rqstp->rq_xprt;
2011*0Sstevel@tonic-gate 	md_mn_msg_t		*msg;
2012*0Sstevel@tonic-gate 	md_mn_result_t		*resultp;
2013*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
2014*0Sstevel@tonic-gate 	md_mn_msg_and_transp_t	*matp;
2015*0Sstevel@tonic-gate 
2016*0Sstevel@tonic-gate 	msg = copy_msg(omsg, NULL);
2017*0Sstevel@tonic-gate 	xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2018*0Sstevel@tonic-gate 
2019*0Sstevel@tonic-gate 	setno = msg->msg_setno;
2020*0Sstevel@tonic-gate 	class = mdmn_get_message_class(msg->msg_type);
2021*0Sstevel@tonic-gate 
2022*0Sstevel@tonic-gate 	/* If we are in the abort state, we error out immediately */
2023*0Sstevel@tonic-gate 	if (md_commd_global_state & MD_CGS_ABORTED) {
2024*0Sstevel@tonic-gate 		resultp = Zalloc(sizeof (md_mn_result_t));
2025*0Sstevel@tonic-gate 		resultp->mmr_comm_state = MDMNE_ABORT;
2026*0Sstevel@tonic-gate 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2027*0Sstevel@tonic-gate 		free_result(resultp);
2028*0Sstevel@tonic-gate 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2029*0Sstevel@tonic-gate 		return (NULL);
2030*0Sstevel@tonic-gate 	}
2031*0Sstevel@tonic-gate 
2032*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2033*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2034*0Sstevel@tonic-gate 		global_init();
2035*0Sstevel@tonic-gate 	}
2036*0Sstevel@tonic-gate 
2037*0Sstevel@tonic-gate 	commd_debug(MD_MMV_SEND,
2038*0Sstevel@tonic-gate 	    "send: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n",
2039*0Sstevel@tonic-gate 	    MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
2040*0Sstevel@tonic-gate 
2041*0Sstevel@tonic-gate 	/* Check for verbosity related message */
2042*0Sstevel@tonic-gate 	if (msg->msg_type == MD_MN_MSG_VERBOSITY) {
2043*0Sstevel@tonic-gate 		md_mn_verbose_t *d;
2044*0Sstevel@tonic-gate 
2045*0Sstevel@tonic-gate 		d = (md_mn_verbose_t *)((void *)(msg->msg_event_data));
2046*0Sstevel@tonic-gate 		md_commd_global_verb = d->mmv_what;
2047*0Sstevel@tonic-gate 		/* everytime the bitmask is set, we reset the timer */
2048*0Sstevel@tonic-gate 		__savetime = gethrtime();
2049*0Sstevel@tonic-gate 		/*
2050*0Sstevel@tonic-gate 		 * If local-only-flag is set, we are done here,
2051*0Sstevel@tonic-gate 		 * otherwise we pass that message on to the master.
2052*0Sstevel@tonic-gate 		 */
2053*0Sstevel@tonic-gate 		if (msg->msg_flags & MD_MSGF_LOCAL_ONLY) {
2054*0Sstevel@tonic-gate 			resultp = Zalloc(sizeof (md_mn_result_t));
2055*0Sstevel@tonic-gate 			resultp->mmr_comm_state = MDMNE_ACK;
2056*0Sstevel@tonic-gate 			mdmn_svc_sendreply(transp, xdr_md_mn_result_t,
2057*0Sstevel@tonic-gate 			    (char *)resultp);
2058*0Sstevel@tonic-gate 			free_result(resultp);
2059*0Sstevel@tonic-gate 			svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2060*0Sstevel@tonic-gate 			return (NULL);
2061*0Sstevel@tonic-gate 		}
2062*0Sstevel@tonic-gate 	}
2063*0Sstevel@tonic-gate 
2064*0Sstevel@tonic-gate 	/*
2065*0Sstevel@tonic-gate 	 * Are we entering the abort state?
2066*0Sstevel@tonic-gate 	 * Here we don't even need to check for MD_MSGF_LOCAL_ONLY, because
2067*0Sstevel@tonic-gate 	 * this message cannot be distributed anyway.
2068*0Sstevel@tonic-gate 	 * So, it's safe to return immediately.
2069*0Sstevel@tonic-gate 	 */
2070*0Sstevel@tonic-gate 	if (msg->msg_type == MD_MN_MSG_ABORT) {
2071*0Sstevel@tonic-gate 		md_commd_global_state |= MD_CGS_ABORTED;
2072*0Sstevel@tonic-gate 		resultp = Zalloc(sizeof (md_mn_result_t));
2073*0Sstevel@tonic-gate 		resultp->mmr_comm_state = MDMNE_ACK;
2074*0Sstevel@tonic-gate 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2075*0Sstevel@tonic-gate 		free_result(resultp);
2076*0Sstevel@tonic-gate 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2077*0Sstevel@tonic-gate 		return (NULL);
2078*0Sstevel@tonic-gate 	}
2079*0Sstevel@tonic-gate 
2080*0Sstevel@tonic-gate 
2081*0Sstevel@tonic-gate 	/*
2082*0Sstevel@tonic-gate 	 * Is this message type blocked?
2083*0Sstevel@tonic-gate 	 * If so we return MDMNE_CLASS_LOCKED, immediately
2084*0Sstevel@tonic-gate 	 */
2085*0Sstevel@tonic-gate 	if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) {
2086*0Sstevel@tonic-gate 		resultp = Zalloc(sizeof (md_mn_result_t));
2087*0Sstevel@tonic-gate 		resultp->mmr_comm_state = MDMNE_CLASS_LOCKED;
2088*0Sstevel@tonic-gate 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2089*0Sstevel@tonic-gate 		free_result(resultp);
2090*0Sstevel@tonic-gate 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2091*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SEND,
2092*0Sstevel@tonic-gate 			"send: type locked (%d, 0x%llx-%d), set=%d, class=%d, "
2093*0Sstevel@tonic-gate 			"type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class,
2094*0Sstevel@tonic-gate 			msg->msg_type);
2095*0Sstevel@tonic-gate 		return (NULL);
2096*0Sstevel@tonic-gate 	}
2097*0Sstevel@tonic-gate 
2098*0Sstevel@tonic-gate 
2099*0Sstevel@tonic-gate 	if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2100*0Sstevel@tonic-gate 		/* Can only use the appropriate mutexes if they are inited */
2101*0Sstevel@tonic-gate 		if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2102*0Sstevel@tonic-gate 			rw_wrlock(&set_desc_rwlock[setno]);
2103*0Sstevel@tonic-gate 			rw_wrlock(&client_rwlock[setno]);
2104*0Sstevel@tonic-gate 			err = mdmn_init_set(setno, MDMN_SET_READY);
2105*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
2106*0Sstevel@tonic-gate 			rw_unlock(&set_desc_rwlock[setno]);
2107*0Sstevel@tonic-gate 		} else {
2108*0Sstevel@tonic-gate 			err = mdmn_init_set(setno, MDMN_SET_READY);
2109*0Sstevel@tonic-gate 		}
2110*0Sstevel@tonic-gate 
2111*0Sstevel@tonic-gate 		if (err) {
2112*0Sstevel@tonic-gate 			/* couldn't initialize connections, cannot proceed */
2113*0Sstevel@tonic-gate 			resultp = Zalloc(sizeof (md_mn_result_t));
2114*0Sstevel@tonic-gate 			resultp->mmr_comm_state = err;
2115*0Sstevel@tonic-gate 			mdmn_svc_sendreply(transp, xdr_md_mn_result_t,
2116*0Sstevel@tonic-gate 			    (char *)resultp);
2117*0Sstevel@tonic-gate 			svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2118*0Sstevel@tonic-gate 			free_result(resultp);
2119*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SEND,
2120*0Sstevel@tonic-gate 			    "send: init err = %d\n", err);
2121*0Sstevel@tonic-gate 			return (NULL);
2122*0Sstevel@tonic-gate 		}
2123*0Sstevel@tonic-gate 	}
2124*0Sstevel@tonic-gate 
2125*0Sstevel@tonic-gate 	mutex_lock(&mdmn_busy_mutex[setno]);
2126*0Sstevel@tonic-gate 	if ((mdmn_is_class_suspended(setno, class) == TRUE) &&
2127*0Sstevel@tonic-gate 	    ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) {
2128*0Sstevel@tonic-gate 		mutex_unlock(&mdmn_busy_mutex[setno]);
2129*0Sstevel@tonic-gate 		resultp = Zalloc(sizeof (md_mn_result_t));
2130*0Sstevel@tonic-gate 		resultp->mmr_comm_state = MDMNE_SUSPENDED;
2131*0Sstevel@tonic-gate 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2132*0Sstevel@tonic-gate 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2133*0Sstevel@tonic-gate 		free_result(resultp);
2134*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SEND,
2135*0Sstevel@tonic-gate 			"send: class suspended (%d, 0x%llx-%d), set=%d, "
2136*0Sstevel@tonic-gate 			"class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
2137*0Sstevel@tonic-gate 			setno, class, msg->msg_type);
2138*0Sstevel@tonic-gate 		return (NULL);
2139*0Sstevel@tonic-gate 	}
2140*0Sstevel@tonic-gate 	mutex_unlock(&mdmn_busy_mutex[setno]);
2141*0Sstevel@tonic-gate 
2142*0Sstevel@tonic-gate 	/* is this rpc request coming from the local node? */
2143*0Sstevel@tonic-gate 	if (check_license(rqstp, 0) == FALSE) {
2144*0Sstevel@tonic-gate 		svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2145*0Sstevel@tonic-gate 		commd_debug(MD_MMV_SEND,
2146*0Sstevel@tonic-gate 			"send: check licence fail(%d, 0x%llx-%d), set=%d, "
2147*0Sstevel@tonic-gate 			"class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
2148*0Sstevel@tonic-gate 			setno, class, msg->msg_type);
2149*0Sstevel@tonic-gate 		return (NULL);
2150*0Sstevel@tonic-gate 	}
2151*0Sstevel@tonic-gate 
2152*0Sstevel@tonic-gate 
2153*0Sstevel@tonic-gate 	/*
2154*0Sstevel@tonic-gate 	 * We allocate a structure that can take two pointers in order to pass
2155*0Sstevel@tonic-gate 	 * both the message and the transp into thread_create.
2156*0Sstevel@tonic-gate 	 * The free for this alloc is done in mdmn_send_to_work()
2157*0Sstevel@tonic-gate 	 */
2158*0Sstevel@tonic-gate 	matp = Malloc(sizeof (md_mn_msg_and_transp_t));
2159*0Sstevel@tonic-gate 	matp->mat_msg = msg;
2160*0Sstevel@tonic-gate 	matp->mat_transp = transp;
2161*0Sstevel@tonic-gate 
2162*0Sstevel@tonic-gate 	/*
2163*0Sstevel@tonic-gate 	 * create a thread here that calls work on the master.
2164*0Sstevel@tonic-gate 	 * If we are already on the master, this would block if running
2165*0Sstevel@tonic-gate 	 * in the same context. (our service is single threaded)(
2166*0Sstevel@tonic-gate 	 * Make it a detached thread because it will not communicate with
2167*0Sstevel@tonic-gate 	 * anybody thru thr_* mechanisms
2168*0Sstevel@tonic-gate 	 */
2169*0Sstevel@tonic-gate 	thr_create(NULL, 0, mdmn_send_to_work, (void *) matp, THR_DETACHED,
2170*0Sstevel@tonic-gate 	    NULL);
2171*0Sstevel@tonic-gate 
2172*0Sstevel@tonic-gate 	commd_debug(MD_MMV_SEND, "send: done (%d, 0x%llx-%d)\n",
2173*0Sstevel@tonic-gate 	    MSGID_ELEMS(msg->msg_msgid));
2174*0Sstevel@tonic-gate 	/*
2175*0Sstevel@tonic-gate 	 * We return here without sending results. This will be done by
2176*0Sstevel@tonic-gate 	 * mdmn_wakeup_initiator_svc_1() as soon as the results are available.
2177*0Sstevel@tonic-gate 	 * Until then the calling send_message will be blocked, while we
2178*0Sstevel@tonic-gate 	 * are able to take calls.
2179*0Sstevel@tonic-gate 	 */
2180*0Sstevel@tonic-gate 
2181*0Sstevel@tonic-gate 	return (NULL);
2182*0Sstevel@tonic-gate }
2183*0Sstevel@tonic-gate 
2184*0Sstevel@tonic-gate /* ARGSUSED */
2185*0Sstevel@tonic-gate int *
2186*0Sstevel@tonic-gate mdmn_work_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
2187*0Sstevel@tonic-gate {
2188*0Sstevel@tonic-gate 	int		err;
2189*0Sstevel@tonic-gate 	set_t		setno;
2190*0Sstevel@tonic-gate 	thread_t	tid;
2191*0Sstevel@tonic-gate 	int		*retval;
2192*0Sstevel@tonic-gate 	md_mn_msg_t	*msg;
2193*0Sstevel@tonic-gate 	md_mn_msgclass_t class;
2194*0Sstevel@tonic-gate 
2195*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
2196*0Sstevel@tonic-gate 
2197*0Sstevel@tonic-gate 	/* If we are in the abort state, we error out immediately */
2198*0Sstevel@tonic-gate 	if (md_commd_global_state & MD_CGS_ABORTED) {
2199*0Sstevel@tonic-gate 	xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2200*0Sstevel@tonic-gate 		*retval = MDMNE_ABORT;
2201*0Sstevel@tonic-gate 		return (retval);
2202*0Sstevel@tonic-gate 	}
2203*0Sstevel@tonic-gate 
2204*0Sstevel@tonic-gate 	msg = copy_msg(omsg, NULL);
2205*0Sstevel@tonic-gate 	xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2206*0Sstevel@tonic-gate 
2207*0Sstevel@tonic-gate 	/*
2208*0Sstevel@tonic-gate 	 * Is this message type blocked?
2209*0Sstevel@tonic-gate 	 * If so we return MDMNE_CLASS_LOCKED, immediately.
2210*0Sstevel@tonic-gate 	 * This check is performed on master and slave.
2211*0Sstevel@tonic-gate 	 */
2212*0Sstevel@tonic-gate 	if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) {
2213*0Sstevel@tonic-gate 		*retval = MDMNE_CLASS_LOCKED;
2214*0Sstevel@tonic-gate 		return (retval);
2215*0Sstevel@tonic-gate 	}
2216*0Sstevel@tonic-gate 
2217*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2218*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2219*0Sstevel@tonic-gate 		global_init();
2220*0Sstevel@tonic-gate 	}
2221*0Sstevel@tonic-gate 
2222*0Sstevel@tonic-gate 	class = mdmn_get_message_class(msg->msg_type);
2223*0Sstevel@tonic-gate 	setno = msg->msg_setno;
2224*0Sstevel@tonic-gate 
2225*0Sstevel@tonic-gate 	if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2226*0Sstevel@tonic-gate 		/* Can only use the appropriate mutexes if they are inited */
2227*0Sstevel@tonic-gate 		if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2228*0Sstevel@tonic-gate 			rw_wrlock(&set_desc_rwlock[setno]);
2229*0Sstevel@tonic-gate 			rw_wrlock(&client_rwlock[setno]);
2230*0Sstevel@tonic-gate 			err = mdmn_init_set(setno, MDMN_SET_READY);
2231*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
2232*0Sstevel@tonic-gate 			rw_unlock(&set_desc_rwlock[setno]);
2233*0Sstevel@tonic-gate 		} else {
2234*0Sstevel@tonic-gate 			err = mdmn_init_set(setno, MDMN_SET_READY);
2235*0Sstevel@tonic-gate 		}
2236*0Sstevel@tonic-gate 
2237*0Sstevel@tonic-gate 		if (err) {
2238*0Sstevel@tonic-gate 			*retval = MDMNE_CANNOT_CONNECT;
2239*0Sstevel@tonic-gate 			free_msg(msg);
2240*0Sstevel@tonic-gate 			return (retval);
2241*0Sstevel@tonic-gate 		}
2242*0Sstevel@tonic-gate 	}
2243*0Sstevel@tonic-gate 
2244*0Sstevel@tonic-gate 	/* is this rpc request coming from a licensed node? */
2245*0Sstevel@tonic-gate 	if (check_license(rqstp, msg->msg_sender) == FALSE) {
2246*0Sstevel@tonic-gate 		free_msg(msg);
2247*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
2248*0Sstevel@tonic-gate 		return (retval);
2249*0Sstevel@tonic-gate 	}
2250*0Sstevel@tonic-gate 
2251*0Sstevel@tonic-gate 	commd_debug(MD_MMV_WORK,
2252*0Sstevel@tonic-gate 	    "work: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d, "
2253*0Sstevel@tonic-gate 	    "flags=0x%x\n",
2254*0Sstevel@tonic-gate 	    MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type,
2255*0Sstevel@tonic-gate 	    msg->msg_flags);
2256*0Sstevel@tonic-gate 
2257*0Sstevel@tonic-gate 	/* Check for various CLASS0 message types */
2258*0Sstevel@tonic-gate 	if (msg->msg_type == MD_MN_MSG_VERBOSITY) {
2259*0Sstevel@tonic-gate 		md_mn_verbose_t *d;
2260*0Sstevel@tonic-gate 
2261*0Sstevel@tonic-gate 		d = (md_mn_verbose_t *)((void *)(msg->msg_event_data));
2262*0Sstevel@tonic-gate 		/* for now we ignore set / class in md_mn_verbose_t */
2263*0Sstevel@tonic-gate 		md_commd_global_verb = d->mmv_what;
2264*0Sstevel@tonic-gate 		/* everytime the bitmask is set, we reset the timer */
2265*0Sstevel@tonic-gate 		__savetime = gethrtime();
2266*0Sstevel@tonic-gate 	}
2267*0Sstevel@tonic-gate 
2268*0Sstevel@tonic-gate 	mutex_lock(&mdmn_busy_mutex[setno]);
2269*0Sstevel@tonic-gate 
2270*0Sstevel@tonic-gate 	/* check if class is locked via a call to mdmn_comm_lock_svc_1 */
2271*0Sstevel@tonic-gate 	if (mdmn_is_class_locked(setno, class) == TRUE) {
2272*0Sstevel@tonic-gate 		mutex_unlock(&mdmn_busy_mutex[setno]);
2273*0Sstevel@tonic-gate 		*retval = MDMNE_CLASS_LOCKED;
2274*0Sstevel@tonic-gate 		free_msg(msg);
2275*0Sstevel@tonic-gate 		return (retval);
2276*0Sstevel@tonic-gate 	}
2277*0Sstevel@tonic-gate 	mutex_unlock(&mdmn_busy_mutex[setno]);
2278*0Sstevel@tonic-gate 
2279*0Sstevel@tonic-gate 	/* Check if the class is busy right now. Do it only on the master */
2280*0Sstevel@tonic-gate 	rw_rdlock(&set_desc_rwlock[setno]);
2281*0Sstevel@tonic-gate 	if (set_descriptor[setno]->sd_mn_am_i_master) {
2282*0Sstevel@tonic-gate 		rw_unlock(&set_desc_rwlock[setno]);
2283*0Sstevel@tonic-gate 		/*
2284*0Sstevel@tonic-gate 		 * If the class is currently suspended, don't accept new
2285*0Sstevel@tonic-gate 		 * messages, unless they are flagged with an override bit.
2286*0Sstevel@tonic-gate 		 */
2287*0Sstevel@tonic-gate 		mutex_lock(&mdmn_busy_mutex[setno]);
2288*0Sstevel@tonic-gate 		if ((mdmn_is_class_suspended(setno, class) == TRUE) &&
2289*0Sstevel@tonic-gate 		    ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) {
2290*0Sstevel@tonic-gate 			mutex_unlock(&mdmn_busy_mutex[setno]);
2291*0Sstevel@tonic-gate 			*retval = MDMNE_SUSPENDED;
2292*0Sstevel@tonic-gate 			commd_debug(MD_MMV_SEND,
2293*0Sstevel@tonic-gate 			    "send: set %d is suspended\n", setno);
2294*0Sstevel@tonic-gate 			free_msg(msg);
2295*0Sstevel@tonic-gate 			return (retval);
2296*0Sstevel@tonic-gate 		}
2297*0Sstevel@tonic-gate 		if (mdmn_mark_class_busy(setno, class) == FALSE) {
2298*0Sstevel@tonic-gate 			mutex_unlock(&mdmn_busy_mutex[setno]);
2299*0Sstevel@tonic-gate 			*retval = MDMNE_CLASS_BUSY;
2300*0Sstevel@tonic-gate 			free_msg(msg);
2301*0Sstevel@tonic-gate 			return (retval);
2302*0Sstevel@tonic-gate 		}
2303*0Sstevel@tonic-gate 		mutex_unlock(&mdmn_busy_mutex[setno]);
2304*0Sstevel@tonic-gate 		/*
2305*0Sstevel@tonic-gate 		 * Because the real processing of the message takes time we
2306*0Sstevel@tonic-gate 		 * create a thread for it. So the master thread can continue
2307*0Sstevel@tonic-gate 		 * to run and accept further messages.
2308*0Sstevel@tonic-gate 		 */
2309*0Sstevel@tonic-gate 		*retval = thr_create(NULL, 0,
2310*0Sstevel@tonic-gate 		    (void *(*)(void *))mdmn_master_process_msg, (void *)msg,
2311*0Sstevel@tonic-gate 		    THR_DETACHED|THR_SUSPENDED, &tid);
2312*0Sstevel@tonic-gate 	} else {
2313*0Sstevel@tonic-gate 		rw_unlock(&set_desc_rwlock[setno]);
2314*0Sstevel@tonic-gate 		*retval = thr_create(NULL, 0,
2315*0Sstevel@tonic-gate 		    (void *(*)(void *)) mdmn_slave_process_msg, (void *)msg,
2316*0Sstevel@tonic-gate 		    THR_DETACHED|THR_SUSPENDED, &tid);
2317*0Sstevel@tonic-gate 	}
2318*0Sstevel@tonic-gate 
2319*0Sstevel@tonic-gate 	if (*retval != 0) {
2320*0Sstevel@tonic-gate 		*retval = MDMNE_THR_CREATE_FAIL;
2321*0Sstevel@tonic-gate 		free_msg(msg);
2322*0Sstevel@tonic-gate 		return (retval);
2323*0Sstevel@tonic-gate 	}
2324*0Sstevel@tonic-gate 
2325*0Sstevel@tonic-gate 	/* Now run the new thread */
2326*0Sstevel@tonic-gate 	thr_continue(tid);
2327*0Sstevel@tonic-gate 
2328*0Sstevel@tonic-gate 	commd_debug(MD_MMV_WORK,
2329*0Sstevel@tonic-gate 	    "work: done (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n",
2330*0Sstevel@tonic-gate 	    MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
2331*0Sstevel@tonic-gate 
2332*0Sstevel@tonic-gate 	*retval = MDMNE_ACK; /* this means success */
2333*0Sstevel@tonic-gate 	return (retval);
2334*0Sstevel@tonic-gate }
2335*0Sstevel@tonic-gate 
2336*0Sstevel@tonic-gate /* ARGSUSED */
2337*0Sstevel@tonic-gate int *
2338*0Sstevel@tonic-gate mdmn_wakeup_initiator_svc_1(md_mn_result_t *res, struct svc_req *rqstp)
2339*0Sstevel@tonic-gate {
2340*0Sstevel@tonic-gate 
2341*0Sstevel@tonic-gate 	int		*retval;
2342*0Sstevel@tonic-gate 	int		err;
2343*0Sstevel@tonic-gate 	set_t		setno;
2344*0Sstevel@tonic-gate 	mutex_t		*mx;   /* protection of initiator_table */
2345*0Sstevel@tonic-gate 	SVCXPRT		*transp;
2346*0Sstevel@tonic-gate 	md_mn_msgid_t	initiator_table_id;
2347*0Sstevel@tonic-gate 	md_mn_msgclass_t class;
2348*0Sstevel@tonic-gate 
2349*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
2350*0Sstevel@tonic-gate 
2351*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2352*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2353*0Sstevel@tonic-gate 		global_init();
2354*0Sstevel@tonic-gate 	}
2355*0Sstevel@tonic-gate 
2356*0Sstevel@tonic-gate 	setno	= res->mmr_setno;
2357*0Sstevel@tonic-gate 
2358*0Sstevel@tonic-gate 	if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2359*0Sstevel@tonic-gate 		/* set not ready means we just crashed are restarted now */
2360*0Sstevel@tonic-gate 		/* Can only use the appropriate mutexes if they are inited */
2361*0Sstevel@tonic-gate 		if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2362*0Sstevel@tonic-gate 			rw_wrlock(&set_desc_rwlock[setno]);
2363*0Sstevel@tonic-gate 			rw_wrlock(&client_rwlock[setno]);
2364*0Sstevel@tonic-gate 			err = mdmn_init_set(setno, MDMN_SET_READY);
2365*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
2366*0Sstevel@tonic-gate 			rw_unlock(&set_desc_rwlock[setno]);
2367*0Sstevel@tonic-gate 		} else {
2368*0Sstevel@tonic-gate 			err = mdmn_init_set(setno, MDMN_SET_READY);
2369*0Sstevel@tonic-gate 		}
2370*0Sstevel@tonic-gate 
2371*0Sstevel@tonic-gate 		if (err) {
2372*0Sstevel@tonic-gate 			*retval = MDMNE_CANNOT_CONNECT;
2373*0Sstevel@tonic-gate 			xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2374*0Sstevel@tonic-gate 			return (retval);
2375*0Sstevel@tonic-gate 		}
2376*0Sstevel@tonic-gate 	}
2377*0Sstevel@tonic-gate 
2378*0Sstevel@tonic-gate 	/* is this rpc request coming from a licensed node? */
2379*0Sstevel@tonic-gate 	if (check_license(rqstp, res->mmr_sender) == FALSE) {
2380*0Sstevel@tonic-gate 		xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2381*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
2382*0Sstevel@tonic-gate 		return (retval);
2383*0Sstevel@tonic-gate 	}
2384*0Sstevel@tonic-gate 
2385*0Sstevel@tonic-gate 
2386*0Sstevel@tonic-gate 	class	= mdmn_get_message_class(res->mmr_msgtype);
2387*0Sstevel@tonic-gate 	mx	= mdmn_get_initiator_table_mx(setno, class);
2388*0Sstevel@tonic-gate 
2389*0Sstevel@tonic-gate 	commd_debug(MD_MMV_WAKE_I,
2390*0Sstevel@tonic-gate 	    "wake_ini: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
2391*0Sstevel@tonic-gate 	    MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype);
2392*0Sstevel@tonic-gate 
2393*0Sstevel@tonic-gate 	mutex_lock(mx);
2394*0Sstevel@tonic-gate 
2395*0Sstevel@tonic-gate 	/*
2396*0Sstevel@tonic-gate 	 * Search the initiator wakeup table.
2397*0Sstevel@tonic-gate 	 * If we find an entry here (which should always be true)
2398*0Sstevel@tonic-gate 	 * we are on the initiating node and we wakeup the original
2399*0Sstevel@tonic-gate 	 * local rpc call
2400*0Sstevel@tonic-gate 	 */
2401*0Sstevel@tonic-gate 	mdmn_get_initiator_table_id(setno, class, &initiator_table_id);
2402*0Sstevel@tonic-gate 
2403*0Sstevel@tonic-gate 	if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) {
2404*0Sstevel@tonic-gate 		transp = mdmn_get_initiator_table_transp(setno, class);
2405*0Sstevel@tonic-gate 		mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res);
2406*0Sstevel@tonic-gate 		mdmn_unregister_initiator_table(setno, class);
2407*0Sstevel@tonic-gate 		*retval = MDMNE_ACK;
2408*0Sstevel@tonic-gate 
2409*0Sstevel@tonic-gate 		commd_debug(MD_MMV_WAKE_I,
2410*0Sstevel@tonic-gate 		    "wake_ini: replied (%d, 0x%llx-%d)\n",
2411*0Sstevel@tonic-gate 		    MSGID_ELEMS(res->mmr_msgid));
2412*0Sstevel@tonic-gate 	} else {
2413*0Sstevel@tonic-gate 		commd_debug(MD_MMV_WAKE_I,
2414*0Sstevel@tonic-gate 		    "wakeup initiator: unsolicited message (%d, 0x%llx-%d)\n",
2415*0Sstevel@tonic-gate 		    MSGID_ELEMS(res->mmr_msgid));
2416*0Sstevel@tonic-gate 		*retval = MDMNE_NO_WAKEUP_ENTRY;
2417*0Sstevel@tonic-gate 	}
2418*0Sstevel@tonic-gate 	mutex_unlock(mx);
2419*0Sstevel@tonic-gate 	/* less work for check_timeouts */
2420*0Sstevel@tonic-gate 	mutex_lock(&check_timeout_mutex);
2421*0Sstevel@tonic-gate 	if (messages_on_their_way == 0) {
2422*0Sstevel@tonic-gate 		commd_debug(MD_MMV_WAKE_I,
2423*0Sstevel@tonic-gate 		    "Oops, messages_on_their_way < 0 (%d, 0x%llx-%d)\n",
2424*0Sstevel@tonic-gate 		    MSGID_ELEMS(res->mmr_msgid));
2425*0Sstevel@tonic-gate 	} else {
2426*0Sstevel@tonic-gate 		messages_on_their_way--;
2427*0Sstevel@tonic-gate 	}
2428*0Sstevel@tonic-gate 	mutex_unlock(&check_timeout_mutex);
2429*0Sstevel@tonic-gate 	xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2430*0Sstevel@tonic-gate 
2431*0Sstevel@tonic-gate 	return (retval);
2432*0Sstevel@tonic-gate }
2433*0Sstevel@tonic-gate 
2434*0Sstevel@tonic-gate 
2435*0Sstevel@tonic-gate /*
2436*0Sstevel@tonic-gate  * res must be free'd by the thread we wake up
2437*0Sstevel@tonic-gate  */
2438*0Sstevel@tonic-gate /* ARGSUSED */
2439*0Sstevel@tonic-gate int *
2440*0Sstevel@tonic-gate mdmn_wakeup_master_svc_1(md_mn_result_t *ores, struct svc_req *rqstp)
2441*0Sstevel@tonic-gate {
2442*0Sstevel@tonic-gate 
2443*0Sstevel@tonic-gate 	int		*retval;
2444*0Sstevel@tonic-gate 	int		err;
2445*0Sstevel@tonic-gate 	set_t		setno;
2446*0Sstevel@tonic-gate 	cond_t		*cv;
2447*0Sstevel@tonic-gate 	mutex_t		*mx;
2448*0Sstevel@tonic-gate 	md_mn_msgid_t	master_table_id;
2449*0Sstevel@tonic-gate 	md_mn_nodeid_t	sender;
2450*0Sstevel@tonic-gate 	md_mn_result_t	*res;
2451*0Sstevel@tonic-gate 	md_mn_msgclass_t class;
2452*0Sstevel@tonic-gate 
2453*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
2454*0Sstevel@tonic-gate 
2455*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2456*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2457*0Sstevel@tonic-gate 		global_init();
2458*0Sstevel@tonic-gate 	}
2459*0Sstevel@tonic-gate 
2460*0Sstevel@tonic-gate 	/* Need to copy the results here, as they are static for RPC */
2461*0Sstevel@tonic-gate 	res = copy_result(ores);
2462*0Sstevel@tonic-gate 	xdr_free(xdr_md_mn_result_t, (caddr_t)ores);
2463*0Sstevel@tonic-gate 
2464*0Sstevel@tonic-gate 	class = mdmn_get_message_class(res->mmr_msgtype);
2465*0Sstevel@tonic-gate 	setno = res->mmr_setno;
2466*0Sstevel@tonic-gate 
2467*0Sstevel@tonic-gate 	if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2468*0Sstevel@tonic-gate 		/* set not ready means we just crashed are restarted now */
2469*0Sstevel@tonic-gate 		/* Can only use the appropriate mutexes if they are inited */
2470*0Sstevel@tonic-gate 		if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2471*0Sstevel@tonic-gate 			rw_wrlock(&set_desc_rwlock[setno]);
2472*0Sstevel@tonic-gate 			rw_wrlock(&client_rwlock[setno]);
2473*0Sstevel@tonic-gate 			err = mdmn_init_set(setno, MDMN_SET_READY);
2474*0Sstevel@tonic-gate 			rw_unlock(&client_rwlock[setno]);
2475*0Sstevel@tonic-gate 			rw_unlock(&set_desc_rwlock[setno]);
2476*0Sstevel@tonic-gate 		} else {
2477*0Sstevel@tonic-gate 			err = mdmn_init_set(setno, MDMN_SET_READY);
2478*0Sstevel@tonic-gate 		}
2479*0Sstevel@tonic-gate 
2480*0Sstevel@tonic-gate 		if (err) {
2481*0Sstevel@tonic-gate 			*retval = MDMNE_CANNOT_CONNECT;
2482*0Sstevel@tonic-gate 			xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2483*0Sstevel@tonic-gate 			return (retval);
2484*0Sstevel@tonic-gate 		}
2485*0Sstevel@tonic-gate 	}
2486*0Sstevel@tonic-gate 
2487*0Sstevel@tonic-gate 	/* is this rpc request coming from a licensed node? */
2488*0Sstevel@tonic-gate 	if (check_license(rqstp, res->mmr_sender) == FALSE) {
2489*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
2490*0Sstevel@tonic-gate 		xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2491*0Sstevel@tonic-gate 		return (retval);
2492*0Sstevel@tonic-gate 	}
2493*0Sstevel@tonic-gate 
2494*0Sstevel@tonic-gate 
2495*0Sstevel@tonic-gate 	commd_debug(MD_MMV_WAKE_M,
2496*0Sstevel@tonic-gate 	    "wake_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d "
2497*0Sstevel@tonic-gate 	    "from %d\n",
2498*0Sstevel@tonic-gate 	    MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype,
2499*0Sstevel@tonic-gate 	    res->mmr_sender);
2500*0Sstevel@tonic-gate 	/*
2501*0Sstevel@tonic-gate 	 * The mutex and cv are needed for waking up the thread
2502*0Sstevel@tonic-gate 	 * sleeping in mdmn_master_process_msg()
2503*0Sstevel@tonic-gate 	 */
2504*0Sstevel@tonic-gate 	mx = mdmn_get_master_table_mx(setno, class);
2505*0Sstevel@tonic-gate 	cv = mdmn_get_master_table_cv(setno, class);
2506*0Sstevel@tonic-gate 
2507*0Sstevel@tonic-gate 	/*
2508*0Sstevel@tonic-gate 	 * lookup the master wakeup table
2509*0Sstevel@tonic-gate 	 * If we find our message, we are on the master and
2510*0Sstevel@tonic-gate 	 * called by a slave that finished processing a message.
2511*0Sstevel@tonic-gate 	 * We store the results in the appropriate slot and
2512*0Sstevel@tonic-gate 	 * wakeup the thread (mdmn_master_process_msg()) waiting for them.
2513*0Sstevel@tonic-gate 	 */
2514*0Sstevel@tonic-gate 	mutex_lock(mx);
2515*0Sstevel@tonic-gate 	mdmn_get_master_table_id(setno, class, &master_table_id);
2516*0Sstevel@tonic-gate 	sender = mdmn_get_master_table_addr(setno, class);
2517*0Sstevel@tonic-gate 
2518*0Sstevel@tonic-gate 	if (MSGID_CMP(&(master_table_id), &(res->mmr_msgid))) {
2519*0Sstevel@tonic-gate 		if (sender == res->mmr_sender) {
2520*0Sstevel@tonic-gate 			mdmn_set_master_table_res(setno, class, res);
2521*0Sstevel@tonic-gate 			cond_signal(cv);
2522*0Sstevel@tonic-gate 			*retval = MDMNE_ACK;
2523*0Sstevel@tonic-gate 		} else {
2524*0Sstevel@tonic-gate 			/* id is correct but wrong sender (I smell a timeout) */
2525*0Sstevel@tonic-gate 			commd_debug(MD_MMV_WAKE_M,
2526*0Sstevel@tonic-gate 			    "wakeup master got unsolicited message: "
2527*0Sstevel@tonic-gate 			    "(%d, 0x%llx-%d) from %d\n",
2528*0Sstevel@tonic-gate 			    MSGID_ELEMS(res->mmr_msgid), res->mmr_sender);
2529*0Sstevel@tonic-gate 			free_result(res);
2530*0Sstevel@tonic-gate 			*retval = MDMNE_TIMEOUT;
2531*0Sstevel@tonic-gate 		}
2532*0Sstevel@tonic-gate 	} else {
2533*0Sstevel@tonic-gate 		/* id is wrong, smells like a very late timeout */
2534*0Sstevel@tonic-gate 		commd_debug(MD_MMV_WAKE_M,
2535*0Sstevel@tonic-gate 		    "wakeup master got unsolicited message: "
2536*0Sstevel@tonic-gate 		    "(%d, 0x%llx-%d) from %d, expected (%d, 0x%llx-%d)\n",
2537*0Sstevel@tonic-gate 		    MSGID_ELEMS(res->mmr_msgid), res->mmr_sender,
2538*0Sstevel@tonic-gate 		    MSGID_ELEMS(master_table_id));
2539*0Sstevel@tonic-gate 		free_result(res);
2540*0Sstevel@tonic-gate 		*retval = MDMNE_NO_WAKEUP_ENTRY;
2541*0Sstevel@tonic-gate 	}
2542*0Sstevel@tonic-gate 
2543*0Sstevel@tonic-gate 	mutex_unlock(mx);
2544*0Sstevel@tonic-gate 
2545*0Sstevel@tonic-gate 	return (retval);
2546*0Sstevel@tonic-gate }
2547*0Sstevel@tonic-gate 
2548*0Sstevel@tonic-gate /*
2549*0Sstevel@tonic-gate  * Lock a set/class combination.
2550*0Sstevel@tonic-gate  * This is mainly done for debug purpose.
2551*0Sstevel@tonic-gate  * This set/class combination immediately is blocked,
2552*0Sstevel@tonic-gate  * even in the middle of sending messages to multiple slaves.
2553*0Sstevel@tonic-gate  * This remains until the user issues a mdmn_comm_unlock_svc_1 for the same
2554*0Sstevel@tonic-gate  * set/class combination.
2555*0Sstevel@tonic-gate  *
2556*0Sstevel@tonic-gate  * Special messages of class MD_MSG_CLASS0 can never be locked.
2557*0Sstevel@tonic-gate  * 	e.g. MD_MN_MSG_VERBOSITY, MD_MN_MSG_ABORT
2558*0Sstevel@tonic-gate  *
2559*0Sstevel@tonic-gate  * That means, if MD_MSG_CLASS0 is specified, we lock all classes from
2560*0Sstevel@tonic-gate  * >= MD_MSG_CLASS1 to < MD_MN_NCLASSES
2561*0Sstevel@tonic-gate  *
2562*0Sstevel@tonic-gate  * set must be between 1 and MD_MAXSETS
2563*0Sstevel@tonic-gate  * class can be:
2564*0Sstevel@tonic-gate  *	MD_MSG_CLASS0 which means all other classes in this case
2565*0Sstevel@tonic-gate  *	or one specific class (< MD_MN_NCLASSES)
2566*0Sstevel@tonic-gate  *
2567*0Sstevel@tonic-gate  * Returns:
2568*0Sstevel@tonic-gate  *	MDMNE_ACK on sucess (locking a locked class is Ok)
2569*0Sstevel@tonic-gate  *	MDMNE_EINVAL if a parameter is out of range
2570*0Sstevel@tonic-gate  */
2571*0Sstevel@tonic-gate 
2572*0Sstevel@tonic-gate /* ARGSUSED */
2573*0Sstevel@tonic-gate int *
2574*0Sstevel@tonic-gate mdmn_comm_lock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2575*0Sstevel@tonic-gate {
2576*0Sstevel@tonic-gate 	int			*retval;
2577*0Sstevel@tonic-gate 	set_t			setno = msc->msc_set;
2578*0Sstevel@tonic-gate 	md_mn_msgclass_t	class = msc->msc_class;
2579*0Sstevel@tonic-gate 
2580*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
2581*0Sstevel@tonic-gate 
2582*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2583*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2584*0Sstevel@tonic-gate 		global_init();
2585*0Sstevel@tonic-gate 	}
2586*0Sstevel@tonic-gate 
2587*0Sstevel@tonic-gate 	/* is this rpc request coming from the local node ? */
2588*0Sstevel@tonic-gate 	if (check_license(rqstp, 0) == FALSE) {
2589*0Sstevel@tonic-gate 		xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2590*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
2591*0Sstevel@tonic-gate 		return (retval);
2592*0Sstevel@tonic-gate 	}
2593*0Sstevel@tonic-gate 
2594*0Sstevel@tonic-gate 	/* Perform some range checking */
2595*0Sstevel@tonic-gate 	if ((setno == 0) || (setno >= MD_MAXSETS) ||
2596*0Sstevel@tonic-gate 	    (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) {
2597*0Sstevel@tonic-gate 		*retval = MDMNE_EINVAL;
2598*0Sstevel@tonic-gate 		return (retval);
2599*0Sstevel@tonic-gate 	}
2600*0Sstevel@tonic-gate 
2601*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "lock: set=%d, class=%d\n", setno, class);
2602*0Sstevel@tonic-gate 	mutex_lock(&mdmn_busy_mutex[setno]);
2603*0Sstevel@tonic-gate 	if (class != MD_MSG_CLASS0) {
2604*0Sstevel@tonic-gate 		mdmn_mark_class_locked(setno, class);
2605*0Sstevel@tonic-gate 	} else {
2606*0Sstevel@tonic-gate 		/* MD_MSG_CLASS0 is used as a wild card for all classes */
2607*0Sstevel@tonic-gate 		for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
2608*0Sstevel@tonic-gate 			mdmn_mark_class_locked(setno, class);
2609*0Sstevel@tonic-gate 		}
2610*0Sstevel@tonic-gate 	}
2611*0Sstevel@tonic-gate 	mutex_unlock(&mdmn_busy_mutex[setno]);
2612*0Sstevel@tonic-gate 
2613*0Sstevel@tonic-gate 	*retval = MDMNE_ACK;
2614*0Sstevel@tonic-gate 	return (retval);
2615*0Sstevel@tonic-gate }
2616*0Sstevel@tonic-gate 
2617*0Sstevel@tonic-gate /*
2618*0Sstevel@tonic-gate  * Unlock a set/class combination.
2619*0Sstevel@tonic-gate  * set must be between 1 and MD_MAXSETS
2620*0Sstevel@tonic-gate  * class can be:
2621*0Sstevel@tonic-gate  *	MD_MSG_CLASS0 which means all other classes in this case (like above)
2622*0Sstevel@tonic-gate  *	or one specific class (< MD_MN_NCLASSES)
2623*0Sstevel@tonic-gate  *
2624*0Sstevel@tonic-gate  * Returns:
2625*0Sstevel@tonic-gate  *	MDMNE_ACK on sucess (unlocking an unlocked class is Ok)
2626*0Sstevel@tonic-gate  *	MDMNE_EINVAL if a parameter is out of range
2627*0Sstevel@tonic-gate  */
2628*0Sstevel@tonic-gate /* ARGSUSED */
2629*0Sstevel@tonic-gate int *
2630*0Sstevel@tonic-gate mdmn_comm_unlock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2631*0Sstevel@tonic-gate {
2632*0Sstevel@tonic-gate 	int			*retval;
2633*0Sstevel@tonic-gate 	set_t			setno  = msc->msc_set;
2634*0Sstevel@tonic-gate 	md_mn_msgclass_t	class  = msc->msc_class;
2635*0Sstevel@tonic-gate 
2636*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
2637*0Sstevel@tonic-gate 
2638*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2639*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2640*0Sstevel@tonic-gate 		global_init();
2641*0Sstevel@tonic-gate 	}
2642*0Sstevel@tonic-gate 
2643*0Sstevel@tonic-gate 	/* is this rpc request coming from the local node ? */
2644*0Sstevel@tonic-gate 	if (check_license(rqstp, 0) == FALSE) {
2645*0Sstevel@tonic-gate 		xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2646*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
2647*0Sstevel@tonic-gate 		return (retval);
2648*0Sstevel@tonic-gate 	}
2649*0Sstevel@tonic-gate 
2650*0Sstevel@tonic-gate 	/* Perform some range checking */
2651*0Sstevel@tonic-gate 	if ((setno == 0) || (setno >= MD_MAXSETS) ||
2652*0Sstevel@tonic-gate 	    (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) {
2653*0Sstevel@tonic-gate 		*retval = MDMNE_EINVAL;
2654*0Sstevel@tonic-gate 		return (retval);
2655*0Sstevel@tonic-gate 	}
2656*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "unlock: set=%d, class=%d\n", setno, class);
2657*0Sstevel@tonic-gate 
2658*0Sstevel@tonic-gate 	mutex_lock(&mdmn_busy_mutex[setno]);
2659*0Sstevel@tonic-gate 	if (class != MD_MSG_CLASS0) {
2660*0Sstevel@tonic-gate 		mdmn_mark_class_unlocked(setno, class);
2661*0Sstevel@tonic-gate 	} else {
2662*0Sstevel@tonic-gate 		/* MD_MSG_CLASS0 is used as a wild card for all classes */
2663*0Sstevel@tonic-gate 		for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
2664*0Sstevel@tonic-gate 			mdmn_mark_class_unlocked(setno, class);
2665*0Sstevel@tonic-gate 		}
2666*0Sstevel@tonic-gate 	}
2667*0Sstevel@tonic-gate 	mutex_unlock(&mdmn_busy_mutex[setno]);
2668*0Sstevel@tonic-gate 
2669*0Sstevel@tonic-gate 	*retval = MDMNE_ACK;
2670*0Sstevel@tonic-gate 	return (retval);
2671*0Sstevel@tonic-gate }
2672*0Sstevel@tonic-gate 
2673*0Sstevel@tonic-gate /*
2674*0Sstevel@tonic-gate  * mdmn_comm_suspend_svc_1(setno, class)
2675*0Sstevel@tonic-gate  *
2676*0Sstevel@tonic-gate  * Drain all outstanding messages for a given set/class combination
2677*0Sstevel@tonic-gate  * and don't allow new messages to be processed.
2678*0Sstevel@tonic-gate  *
2679*0Sstevel@tonic-gate  * Special messages of class MD_MSG_CLASS0 can never be locked.
2680*0Sstevel@tonic-gate  * 	e.g. MD_MN_MSG_VERBOSITY
2681*0Sstevel@tonic-gate  *
2682*0Sstevel@tonic-gate  * 1 <= setno < MD_MAXSETS	or setno == MD_COMM_ALL_SETS
2683*0Sstevel@tonic-gate  * 1 <= class < MD_MN_NCLASSES	or class == MD_COMM_ALL_CLASSES
2684*0Sstevel@tonic-gate  *
2685*0Sstevel@tonic-gate  * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
2686*0Sstevel@tonic-gate  * one class as being suspended.
2687*0Sstevel@tonic-gate  * If messages for this class are currently on their way,
2688*0Sstevel@tonic-gate  * MDMNE_SET_NOT_DRAINED is returned. Otherwise MDMNE_ACK is returned.
2689*0Sstevel@tonic-gate  *
2690*0Sstevel@tonic-gate  * If class _is_ MD_COMM_ALL_CLASSES we drain all classes of this set.
2691*0Sstevel@tonic-gate  * Messages must be generated in ascending order.
2692*0Sstevel@tonic-gate  * This means, a message cannot create submessages with the same or lower class.
2693*0Sstevel@tonic-gate  * Draining messages must go from 1 to NCLASSES in order to ensure we don't
2694*0Sstevel@tonic-gate  * generate a hanging situation here.
2695*0Sstevel@tonic-gate  * We mark class 1 as being suspended.
2696*0Sstevel@tonic-gate  * if the class is not busy, we proceed with class 2
2697*0Sstevel@tonic-gate  * and so on
2698*0Sstevel@tonic-gate  * if a class *is* busy, we cannot continue here, but return
2699*0Sstevel@tonic-gate  * MDMNE_SET_NOT_DRAINED.
2700*0Sstevel@tonic-gate  * We expect the caller to hold on for some seconds and try again.
2701*0Sstevel@tonic-gate  * When that message, that held the class busy is done in
2702*0Sstevel@tonic-gate  * mdmn_master_process_msg(), mdmn_mark_class_unbusy() called.
2703*0Sstevel@tonic-gate  * There it is checked if the class is about to drain.
2704*0Sstevel@tonic-gate  * In that case it tries to drain all higher classes there.
2705*0Sstevel@tonic-gate  *
2706*0Sstevel@tonic-gate  * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
2707*0Sstevel@tonic-gate  * In that case we return MDMNE_SET_NOT_DRAINED if not all sets are
2708*0Sstevel@tonic-gate  * completely drained.
2709*0Sstevel@tonic-gate  *
2710*0Sstevel@tonic-gate  * Returns:
2711*0Sstevel@tonic-gate  *	MDMNE_ACK on sucess (set is drained, no outstanding messages)
2712*0Sstevel@tonic-gate  *	MDMNE_SET_NOT_DRAINED  if drain process is started, but there are
2713*0Sstevel@tonic-gate  *		still outstanding messages for this set(s)
2714*0Sstevel@tonic-gate  *	MDMNE_EINVAL if setno is out of range
2715*0Sstevel@tonic-gate  *	MDMNE_NOT_JOINED if the set is not yet initialized on this node
2716*0Sstevel@tonic-gate  */
2717*0Sstevel@tonic-gate 
2718*0Sstevel@tonic-gate /* ARGSUSED */
2719*0Sstevel@tonic-gate int *
2720*0Sstevel@tonic-gate mdmn_comm_suspend_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2721*0Sstevel@tonic-gate {
2722*0Sstevel@tonic-gate 	int			*retval;
2723*0Sstevel@tonic-gate 	int			failure = 0;
2724*0Sstevel@tonic-gate 	set_t			startset, endset;
2725*0Sstevel@tonic-gate 	set_t			setno  = msc->msc_set;
2726*0Sstevel@tonic-gate 	md_mn_msgclass_t	oclass = msc->msc_class;
2727*0Sstevel@tonic-gate #ifdef NOT_YET_NEEDED
2728*0Sstevel@tonic-gate 	uint_t			flags  = msc->msc_flags;
2729*0Sstevel@tonic-gate #endif /* NOT_YET_NEEDED */
2730*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
2731*0Sstevel@tonic-gate 
2732*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
2733*0Sstevel@tonic-gate 
2734*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2735*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2736*0Sstevel@tonic-gate 		global_init();
2737*0Sstevel@tonic-gate 	}
2738*0Sstevel@tonic-gate 
2739*0Sstevel@tonic-gate 	/* is this rpc request coming from the local node ? */
2740*0Sstevel@tonic-gate 	if (check_license(rqstp, 0) == FALSE) {
2741*0Sstevel@tonic-gate 		xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2742*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
2743*0Sstevel@tonic-gate 		return (retval);
2744*0Sstevel@tonic-gate 	}
2745*0Sstevel@tonic-gate 
2746*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "suspend: called for set=%d class=%d\n",
2747*0Sstevel@tonic-gate 	    setno, oclass);
2748*0Sstevel@tonic-gate 
2749*0Sstevel@tonic-gate 	/* Perform some range checking */
2750*0Sstevel@tonic-gate 	if (setno >= MD_MAXSETS) {
2751*0Sstevel@tonic-gate 		*retval = MDMNE_EINVAL;
2752*0Sstevel@tonic-gate 		commd_debug(MD_MMV_MISC, "suspend: returning MDMNE_EINVAL\n");
2753*0Sstevel@tonic-gate 		return (retval);
2754*0Sstevel@tonic-gate 	}
2755*0Sstevel@tonic-gate 
2756*0Sstevel@tonic-gate 	/*  setno == MD_COMM_ALL_SETS means: we walk thru all possible sets. */
2757*0Sstevel@tonic-gate 	if (setno == MD_COMM_ALL_SETS) {
2758*0Sstevel@tonic-gate 		startset = 1;
2759*0Sstevel@tonic-gate 		endset = MD_MAXSETS - 1;
2760*0Sstevel@tonic-gate 	} else {
2761*0Sstevel@tonic-gate 		startset = setno;
2762*0Sstevel@tonic-gate 		endset = setno;
2763*0Sstevel@tonic-gate 	}
2764*0Sstevel@tonic-gate 
2765*0Sstevel@tonic-gate 	for (setno = startset; setno <= endset; setno++) {
2766*0Sstevel@tonic-gate 		/* Here we need the mutexes for the set to be setup */
2767*0Sstevel@tonic-gate 		if (md_mn_set_inited[setno] != MDMN_SET_MUTEXES) {
2768*0Sstevel@tonic-gate 			(void) mdmn_init_set(setno, MDMN_SET_MUTEXES);
2769*0Sstevel@tonic-gate 		}
2770*0Sstevel@tonic-gate 
2771*0Sstevel@tonic-gate 		mutex_lock(&mdmn_busy_mutex[setno]);
2772*0Sstevel@tonic-gate 		/* shall we drain all classes of this set? */
2773*0Sstevel@tonic-gate 		if (oclass == MD_COMM_ALL_CLASSES) {
2774*0Sstevel@tonic-gate 			for (class = 1; class < MD_MN_NCLASSES; class ++) {
2775*0Sstevel@tonic-gate 				commd_debug(MD_MMV_MISC,
2776*0Sstevel@tonic-gate 				    "suspend: suspending set %d, class %d\n",
2777*0Sstevel@tonic-gate 				    setno, class);
2778*0Sstevel@tonic-gate 				*retval = mdmn_mark_class_suspended(setno,
2779*0Sstevel@tonic-gate 				    class, MDMN_SUSPEND_ALL);
2780*0Sstevel@tonic-gate 				if (*retval == MDMNE_SET_NOT_DRAINED) {
2781*0Sstevel@tonic-gate 					failure++;
2782*0Sstevel@tonic-gate 				}
2783*0Sstevel@tonic-gate 			}
2784*0Sstevel@tonic-gate 		} else {
2785*0Sstevel@tonic-gate 			/* only drain one specific class */
2786*0Sstevel@tonic-gate 			commd_debug(MD_MMV_MISC,
2787*0Sstevel@tonic-gate 			    "suspend: suspending set=%d class=%d\n",
2788*0Sstevel@tonic-gate 			    setno, oclass);
2789*0Sstevel@tonic-gate 			*retval = mdmn_mark_class_suspended(setno, oclass,
2790*0Sstevel@tonic-gate 			    MDMN_SUSPEND_1);
2791*0Sstevel@tonic-gate 			if (*retval == MDMNE_SET_NOT_DRAINED) {
2792*0Sstevel@tonic-gate 				failure++;
2793*0Sstevel@tonic-gate 			}
2794*0Sstevel@tonic-gate 		}
2795*0Sstevel@tonic-gate 		mutex_unlock(&mdmn_busy_mutex[setno]);
2796*0Sstevel@tonic-gate 	}
2797*0Sstevel@tonic-gate 	/* If one or more sets are not entirely drained, failure is non-zero */
2798*0Sstevel@tonic-gate 	if (failure != 0) {
2799*0Sstevel@tonic-gate 		*retval = MDMNE_SET_NOT_DRAINED;
2800*0Sstevel@tonic-gate 		commd_debug(MD_MMV_MISC,
2801*0Sstevel@tonic-gate 		    "suspend: returning MDMNE_SET_NOT_DRAINED\n");
2802*0Sstevel@tonic-gate 	} else {
2803*0Sstevel@tonic-gate 		*retval = MDMNE_ACK;
2804*0Sstevel@tonic-gate 	}
2805*0Sstevel@tonic-gate 
2806*0Sstevel@tonic-gate 	return (retval);
2807*0Sstevel@tonic-gate }
2808*0Sstevel@tonic-gate 
2809*0Sstevel@tonic-gate /*
2810*0Sstevel@tonic-gate  * mdmn_comm_resume_svc_1(setno, class)
2811*0Sstevel@tonic-gate  *
2812*0Sstevel@tonic-gate  * Resume processing messages for a given set.
2813*0Sstevel@tonic-gate  * This incorporates the repeal of a previous suspend operation.
2814*0Sstevel@tonic-gate  *
2815*0Sstevel@tonic-gate  * 1 <= setno < MD_MAXSETS	or setno == MD_COMM_ALL_SETS
2816*0Sstevel@tonic-gate  * 1 <= class < MD_MN_NCLASSES	or class == MD_COMM_ALL_CLASSES
2817*0Sstevel@tonic-gate  *
2818*0Sstevel@tonic-gate  * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
2819*0Sstevel@tonic-gate  * one class as being resumed.
2820*0Sstevel@tonic-gate  *
2821*0Sstevel@tonic-gate  * If class _is_ MD_COMM_ALL_CLASSES we resume all classes of this set.
2822*0Sstevel@tonic-gate  *
2823*0Sstevel@tonic-gate  * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
2824*0Sstevel@tonic-gate  *
2825*0Sstevel@tonic-gate  * If both setno is MD_COMM_ALL_SETS and class is MD_COMM_ALL_CLASSES we also
2826*0Sstevel@tonic-gate  * reset any ABORT flag from the global state.
2827*0Sstevel@tonic-gate  *
2828*0Sstevel@tonic-gate  * Returns:
2829*0Sstevel@tonic-gate  *	MDMNE_ACK on sucess (resuming an unlocked set is Ok)
2830*0Sstevel@tonic-gate  *	MDMNE_EINVAL if setno is out of range
2831*0Sstevel@tonic-gate  *	MDMNE_NOT_JOINED if the set is not yet initialized on this node
2832*0Sstevel@tonic-gate  */
2833*0Sstevel@tonic-gate /* ARGSUSED */
2834*0Sstevel@tonic-gate int *
2835*0Sstevel@tonic-gate mdmn_comm_resume_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2836*0Sstevel@tonic-gate {
2837*0Sstevel@tonic-gate 	int			*retval;
2838*0Sstevel@tonic-gate 	set_t			startset, endset;
2839*0Sstevel@tonic-gate 	set_t			setno  = msc->msc_set;
2840*0Sstevel@tonic-gate 	md_mn_msgclass_t	oclass = msc->msc_class;
2841*0Sstevel@tonic-gate 	uint_t			flags  = msc->msc_flags;
2842*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
2843*0Sstevel@tonic-gate 
2844*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
2845*0Sstevel@tonic-gate 
2846*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2847*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2848*0Sstevel@tonic-gate 		global_init();
2849*0Sstevel@tonic-gate 	}
2850*0Sstevel@tonic-gate 
2851*0Sstevel@tonic-gate 	/* is this rpc request coming from the local node ? */
2852*0Sstevel@tonic-gate 	if (check_license(rqstp, 0) == FALSE) {
2853*0Sstevel@tonic-gate 		xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2854*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
2855*0Sstevel@tonic-gate 		return (retval);
2856*0Sstevel@tonic-gate 	}
2857*0Sstevel@tonic-gate 
2858*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "resume: called for set=%d class=%d\n",
2859*0Sstevel@tonic-gate 	    setno, oclass);
2860*0Sstevel@tonic-gate 
2861*0Sstevel@tonic-gate 	/* Perform some range checking */
2862*0Sstevel@tonic-gate 	if (setno > MD_MAXSETS) {
2863*0Sstevel@tonic-gate 		*retval = MDMNE_EINVAL;
2864*0Sstevel@tonic-gate 		return (retval);
2865*0Sstevel@tonic-gate 	}
2866*0Sstevel@tonic-gate 
2867*0Sstevel@tonic-gate 	if (setno == MD_COMM_ALL_SETS) {
2868*0Sstevel@tonic-gate 		startset = 1;
2869*0Sstevel@tonic-gate 		endset = MD_MAXSETS - 1;
2870*0Sstevel@tonic-gate 		if (oclass == MD_COMM_ALL_CLASSES) {
2871*0Sstevel@tonic-gate 			/* This is the point where we "unabort" the commd */
2872*0Sstevel@tonic-gate 			commd_debug(MD_MMV_MISC, "resume: resetting ABORT\n");
2873*0Sstevel@tonic-gate 			md_commd_global_state &= ~MD_CGS_ABORTED;
2874*0Sstevel@tonic-gate 		}
2875*0Sstevel@tonic-gate 	} else {
2876*0Sstevel@tonic-gate 		startset = setno;
2877*0Sstevel@tonic-gate 		endset = setno;
2878*0Sstevel@tonic-gate 	}
2879*0Sstevel@tonic-gate 
2880*0Sstevel@tonic-gate 	for (setno = startset; setno <= endset; setno++) {
2881*0Sstevel@tonic-gate 
2882*0Sstevel@tonic-gate 		/* Here we need the mutexes for the set to be setup */
2883*0Sstevel@tonic-gate 		if ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0) {
2884*0Sstevel@tonic-gate 			(void) mdmn_init_set(setno, MDMN_SET_MUTEXES);
2885*0Sstevel@tonic-gate 		}
2886*0Sstevel@tonic-gate 
2887*0Sstevel@tonic-gate 		mutex_lock(&mdmn_busy_mutex[setno]);
2888*0Sstevel@tonic-gate 
2889*0Sstevel@tonic-gate 		if (oclass == MD_COMM_ALL_CLASSES) {
2890*0Sstevel@tonic-gate 			int end_class = 1;
2891*0Sstevel@tonic-gate 			/*
2892*0Sstevel@tonic-gate 			 * When SUSPENDing all classes, we go
2893*0Sstevel@tonic-gate 			 * from 1 to MD_MN_NCLASSES-1
2894*0Sstevel@tonic-gate 			 * The correct reverse action is RESUMing
2895*0Sstevel@tonic-gate 			 * from MD_MN_NCLASSES-1 to 1 (or 2)
2896*0Sstevel@tonic-gate 			 */
2897*0Sstevel@tonic-gate 
2898*0Sstevel@tonic-gate 			if (flags & MD_MSCF_DONT_RESUME_CLASS1) {
2899*0Sstevel@tonic-gate 				end_class = 2;
2900*0Sstevel@tonic-gate 			}
2901*0Sstevel@tonic-gate 
2902*0Sstevel@tonic-gate 			/*
2903*0Sstevel@tonic-gate 			 * Then mark all classes of this set as no longer
2904*0Sstevel@tonic-gate 			 * suspended. This supersedes any previous suspend(1)
2905*0Sstevel@tonic-gate 			 * calls and resumes the set entirely.
2906*0Sstevel@tonic-gate 			 */
2907*0Sstevel@tonic-gate 			for (class = MD_MN_NCLASSES - 1; class >= end_class;
2908*0Sstevel@tonic-gate 			    class --) {
2909*0Sstevel@tonic-gate 				commd_debug(MD_MMV_MISC,
2910*0Sstevel@tonic-gate 				    "resume: resuming set=%d class=%d\n",
2911*0Sstevel@tonic-gate 				    setno, class);
2912*0Sstevel@tonic-gate 				mdmn_mark_class_resumed(setno, class,
2913*0Sstevel@tonic-gate 				    (MDMN_SUSPEND_ALL | MDMN_SUSPEND_1));
2914*0Sstevel@tonic-gate 			}
2915*0Sstevel@tonic-gate 		} else {
2916*0Sstevel@tonic-gate 			/*
2917*0Sstevel@tonic-gate 			 * In this case only one class is marked as not
2918*0Sstevel@tonic-gate 			 * suspended. If a suspend(all) is currently active for
2919*0Sstevel@tonic-gate 			 * this set, this class will still be suspended.
2920*0Sstevel@tonic-gate 			 * That state will be cleared by a suspend(all)
2921*0Sstevel@tonic-gate 			 * (see above)
2922*0Sstevel@tonic-gate 			 */
2923*0Sstevel@tonic-gate 			commd_debug(MD_MMV_MISC,
2924*0Sstevel@tonic-gate 			    "resume: resuming set=%d class=%d\n",
2925*0Sstevel@tonic-gate 			    setno, oclass);
2926*0Sstevel@tonic-gate 			mdmn_mark_class_resumed(setno, oclass, MDMN_SUSPEND_1);
2927*0Sstevel@tonic-gate 		}
2928*0Sstevel@tonic-gate 
2929*0Sstevel@tonic-gate 		mutex_unlock(&mdmn_busy_mutex[setno]);
2930*0Sstevel@tonic-gate 	}
2931*0Sstevel@tonic-gate 
2932*0Sstevel@tonic-gate 	*retval = MDMNE_ACK;
2933*0Sstevel@tonic-gate 	return (retval);
2934*0Sstevel@tonic-gate }
2935*0Sstevel@tonic-gate /* ARGSUSED */
2936*0Sstevel@tonic-gate int *
2937*0Sstevel@tonic-gate mdmn_comm_reinit_set_svc_1(set_t *setnop, struct svc_req *rqstp)
2938*0Sstevel@tonic-gate {
2939*0Sstevel@tonic-gate 	int		*retval;
2940*0Sstevel@tonic-gate 	md_mnnode_desc	*node;
2941*0Sstevel@tonic-gate 	set_t		 setno = *setnop;
2942*0Sstevel@tonic-gate 
2943*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
2944*0Sstevel@tonic-gate 
2945*0Sstevel@tonic-gate 	/* check if the global initialization is done */
2946*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2947*0Sstevel@tonic-gate 		global_init();
2948*0Sstevel@tonic-gate 	}
2949*0Sstevel@tonic-gate 
2950*0Sstevel@tonic-gate 	/* is this rpc request coming from the local node ? */
2951*0Sstevel@tonic-gate 	if (check_license(rqstp, 0) == FALSE) {
2952*0Sstevel@tonic-gate 		xdr_free(xdr_set_t, (caddr_t)setnop);
2953*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
2954*0Sstevel@tonic-gate 		return (retval);
2955*0Sstevel@tonic-gate 	}
2956*0Sstevel@tonic-gate 
2957*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "reinit: set=%d\n", setno);
2958*0Sstevel@tonic-gate 
2959*0Sstevel@tonic-gate 	rw_rdlock(&set_desc_rwlock[setno]);
2960*0Sstevel@tonic-gate 	/*
2961*0Sstevel@tonic-gate 	 * We assume, that all messages have been suspended previously.
2962*0Sstevel@tonic-gate 	 *
2963*0Sstevel@tonic-gate 	 * As we are modifying lots of clients here we grab the client_rwlock
2964*0Sstevel@tonic-gate 	 * in writer mode. This ensures, no new messages come in.
2965*0Sstevel@tonic-gate 	 */
2966*0Sstevel@tonic-gate 	rw_wrlock(&client_rwlock[setno]);
2967*0Sstevel@tonic-gate 	/* This set is no longer initialized */
2968*0Sstevel@tonic-gate 
2969*0Sstevel@tonic-gate 	if ((set_descriptor[setno] != NULL) &&
2970*0Sstevel@tonic-gate 	    (md_mn_set_inited[setno] & MDMN_SET_NODES)) {
2971*0Sstevel@tonic-gate 		/* destroy all rpc clients from this set */
2972*0Sstevel@tonic-gate 		for (node = set_descriptor[setno]->sd_nodelist; node;
2973*0Sstevel@tonic-gate 		    node = node->nd_next) {
2974*0Sstevel@tonic-gate 			mdmn_clnt_destroy(client[setno][node->nd_nodeid]);
2975*0Sstevel@tonic-gate 			if (client[setno][node->nd_nodeid] != (CLIENT *)NULL) {
2976*0Sstevel@tonic-gate 				client[setno][node->nd_nodeid] = (CLIENT *)NULL;
2977*0Sstevel@tonic-gate 			}
2978*0Sstevel@tonic-gate 		}
2979*0Sstevel@tonic-gate 	md_mn_set_inited[setno] &= ~MDMN_SET_NODES;
2980*0Sstevel@tonic-gate 	}
2981*0Sstevel@tonic-gate 
2982*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "reinit: done init_set(%d)\n", setno);
2983*0Sstevel@tonic-gate 
2984*0Sstevel@tonic-gate 	rw_unlock(&client_rwlock[setno]);
2985*0Sstevel@tonic-gate 	rw_unlock(&set_desc_rwlock[setno]);
2986*0Sstevel@tonic-gate 	*retval = MDMNE_ACK;
2987*0Sstevel@tonic-gate 	return (retval);
2988*0Sstevel@tonic-gate }
2989*0Sstevel@tonic-gate 
2990*0Sstevel@tonic-gate /*
2991*0Sstevel@tonic-gate  * This is just an interface for testing purposes.
2992*0Sstevel@tonic-gate  * Here we can disable single message types.
2993*0Sstevel@tonic-gate  * If we block a message type, this is valid for all MN sets.
2994*0Sstevel@tonic-gate  * If a message arrives later and its message type is blocked, it will
2995*0Sstevel@tonic-gate  * be returned immediately with MDMNE_CLASS_LOCKED, which causes the sender to
2996*0Sstevel@tonic-gate  * resend this message over and over again.
2997*0Sstevel@tonic-gate  */
2998*0Sstevel@tonic-gate 
2999*0Sstevel@tonic-gate /* ARGSUSED */
3000*0Sstevel@tonic-gate int *
3001*0Sstevel@tonic-gate mdmn_comm_msglock_svc_1(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp)
3002*0Sstevel@tonic-gate {
3003*0Sstevel@tonic-gate 	int			*retval;
3004*0Sstevel@tonic-gate 	md_mn_msgtype_t		type = mmtl->mmtl_type;
3005*0Sstevel@tonic-gate 	uint_t			lock = mmtl->mmtl_lock;
3006*0Sstevel@tonic-gate 
3007*0Sstevel@tonic-gate 	retval = Malloc(sizeof (int));
3008*0Sstevel@tonic-gate 
3009*0Sstevel@tonic-gate 	/* check if the global initialization is done */
3010*0Sstevel@tonic-gate 	if ((md_commd_global_state & MD_CGS_INITED) == 0) {
3011*0Sstevel@tonic-gate 		global_init();
3012*0Sstevel@tonic-gate 	}
3013*0Sstevel@tonic-gate 
3014*0Sstevel@tonic-gate 	/* is this rpc request coming from the local node ? */
3015*0Sstevel@tonic-gate 	if (check_license(rqstp, 0) == FALSE) {
3016*0Sstevel@tonic-gate 		xdr_free(xdr_md_mn_type_and_lock_t, (caddr_t)mmtl);
3017*0Sstevel@tonic-gate 		*retval = MDMNE_RPC_FAIL;
3018*0Sstevel@tonic-gate 		return (retval);
3019*0Sstevel@tonic-gate 	}
3020*0Sstevel@tonic-gate 
3021*0Sstevel@tonic-gate 	/* Perform some range checking */
3022*0Sstevel@tonic-gate 	if ((type == 0) || (type >= MD_MN_NMESSAGES)) {
3023*0Sstevel@tonic-gate 		*retval = MDMNE_EINVAL;
3024*0Sstevel@tonic-gate 		return (retval);
3025*0Sstevel@tonic-gate 	}
3026*0Sstevel@tonic-gate 
3027*0Sstevel@tonic-gate 	commd_debug(MD_MMV_MISC, "msglock: type=%d, lock=%d\n", type, lock);
3028*0Sstevel@tonic-gate 	msgtype_lock_state[type] = lock;
3029*0Sstevel@tonic-gate 
3030*0Sstevel@tonic-gate 	*retval = MDMNE_ACK;
3031*0Sstevel@tonic-gate 	return (retval);
3032*0Sstevel@tonic-gate }
3033