xref: /dflybsd-src/sys/kern/kern_dmsg.c (revision 211d4362597aee676ecea315377d5cb13da26bb5)
1 /*-
2  * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * TODO: txcmd CREATE state is deferred by txmsgq, need to calculate
36  *	 a streaming response.  See subr_diskiocom()'s diskiodone().
37  */
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/priv.h>
52 #include <sys/thread.h>
53 #include <sys/globaldata.h>
54 #include <sys/limits.h>
55 
56 #include <sys/dmsg.h>
57 
58 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
59 RB_GENERATE(kdmsg_circuit_tree, kdmsg_circuit, rbnode, kdmsg_circuit_cmp);
60 
61 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
62 static int kdmsg_circ_msgrx(kdmsg_msg_t *msg);
63 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
64 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
65 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
66 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
67 static void kdmsg_state_abort(kdmsg_state_t *state);
68 static void kdmsg_state_free(kdmsg_state_t *state);
69 
70 static void kdmsg_iocom_thread_rd(void *arg);
71 static void kdmsg_iocom_thread_wr(void *arg);
72 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
73 static void kdmsg_autocirc(kdmsg_msg_t *msg);
74 static int kdmsg_autocirc_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
75 
76 static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);
77 
78 void
79 kdmsg_circ_hold(kdmsg_circuit_t *circ)
80 {
81 	atomic_add_int(&circ->refs, 1);
82 }
83 
84 void
85 kdmsg_circ_drop(kdmsg_circuit_t *circ)
86 {
87 	kdmsg_iocom_t *iocom;
88 
89 	if (atomic_fetchadd_int(&circ->refs, -1) == 1) {
90 		KKASSERT(circ->span_state == NULL &&
91 			 circ->circ_state == NULL &&
92 			 circ->rcirc_state == NULL &&
93 			 circ->recorded == 0);
94 		iocom = circ->iocom;
95 		circ->iocom = NULL;
96 		kfree(circ, iocom->mmsg);
97 	}
98 }
99 
100 
101 /*
102  * Initialize the roll-up communications structure for a network
103  * messaging session.  This function does not install the socket.
104  */
105 void
106 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
107 		 struct malloc_type *mmsg,
108 		 int (*rcvmsg)(kdmsg_msg_t *msg))
109 {
110 	bzero(iocom, sizeof(*iocom));
111 	iocom->handle = handle;
112 	iocom->mmsg = mmsg;
113 	iocom->rcvmsg = rcvmsg;
114 	iocom->flags = flags;
115 	lockinit(&iocom->msglk, "h2msg", 0, 0);
116 	TAILQ_INIT(&iocom->msgq);
117 	RB_INIT(&iocom->circ_tree);
118 	RB_INIT(&iocom->staterd_tree);
119 	RB_INIT(&iocom->statewr_tree);
120 }
121 
122 /*
123  * [Re]connect using the passed file pointer.  The caller must ref the
124  * fp for us.  We own that ref now.
125  */
126 void
127 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
128 		      const char *subsysname)
129 {
130 	/*
131 	 * Destroy the current connection
132 	 */
133 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
134 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
135 	while (iocom->msgrd_td || iocom->msgwr_td) {
136 		wakeup(&iocom->msg_ctl);
137 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
138 	}
139 
140 	/*
141 	 * Drop communications descriptor
142 	 */
143 	if (iocom->msg_fp) {
144 		fdrop(iocom->msg_fp);
145 		iocom->msg_fp = NULL;
146 	}
147 
148 	/*
149 	 * Setup new communications descriptor
150 	 */
151 	iocom->msg_ctl = 0;
152 	iocom->msg_fp = fp;
153 	iocom->msg_seq = 0;
154 	iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
155 
156 	lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
157 		    NULL, 0, -1, "%s-msgrd", subsysname);
158 	lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
159 		    NULL, 0, -1, "%s-msgwr", subsysname);
160 	lockmgr(&iocom->msglk, LK_RELEASE);
161 }
162 
163 /*
164  * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
165  * this function to handle the state machine for LNK_CONN and LNK_SPAN.
166  *
167  * NOTE: Caller typically also sets the IOCOMF_AUTOCONN, IOCOMF_AUTOSPAN,
168  *	 and IOCOMF_AUTOCIRC in the kdmsg_iocom_init() call.  Clients
169  *	 typically set IOCOMF_AUTOFORGE to automatically forged circuits
170  *	 for received SPANs.
171  */
172 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
174 
175 void
176 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
177 			 void (*auto_callback)(kdmsg_msg_t *msg))
178 {
179 	kdmsg_msg_t *msg;
180 
181 	iocom->auto_callback = auto_callback;
182 
183 	msg = kdmsg_msg_alloc(iocom, NULL,
184 			      DMSG_LNK_CONN | DMSGF_CREATE,
185 			      kdmsg_lnk_conn_reply, NULL);
186 	iocom->auto_lnk_conn.head = msg->any.head;
187 	msg->any.lnk_conn = iocom->auto_lnk_conn;
188 	iocom->conn_state = msg->state;
189 	kdmsg_msg_write(msg);
190 }
191 
192 static
193 int
194 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
195 {
196 	kdmsg_iocom_t *iocom = state->iocom;
197 	kdmsg_msg_t *rmsg;
198 
199 	if (msg->any.head.cmd & DMSGF_CREATE) {
200 		rmsg = kdmsg_msg_alloc(iocom, NULL,
201 				       DMSG_LNK_SPAN | DMSGF_CREATE,
202 				       kdmsg_lnk_span_reply, NULL);
203 		iocom->auto_lnk_span.head = rmsg->any.head;
204 		rmsg->any.lnk_span = iocom->auto_lnk_span;
205 		kdmsg_msg_write(rmsg);
206 	}
207 
208 	/*
209 	 * Process shim after the CONN is acknowledged and before the CONN
210 	 * transaction is deleted.  For deletions this gives device drivers
211 	 * the ability to interlock new operations on the circuit before
212 	 * it becomes illegal and panics.
213 	 */
214 	if (iocom->auto_callback)
215 		iocom->auto_callback(msg);
216 
217 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
218 	    (msg->any.head.cmd & DMSGF_DELETE)) {
219 		iocom->conn_state = NULL;
220 		kdmsg_msg_reply(msg, 0);
221 	}
222 
223 	return (0);
224 }
225 
226 static
227 int
228 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
229 {
230 	/*
231 	 * Be sure to process shim before terminating the SPAN
232 	 * transaction.  Gives device drivers the ability to
233 	 * interlock new operations on the circuit before it
234 	 * becomes illegal and panics.
235 	 */
236 	if (state->iocom->auto_callback)
237 		state->iocom->auto_callback(msg);
238 
239 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
240 	    (msg->any.head.cmd & DMSGF_DELETE)) {
241 		kdmsg_msg_reply(msg, 0);
242 	}
243 	return (0);
244 }
245 
246 /*
247  * Disconnect and clean up
248  */
249 void
250 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
251 {
252 	/*
253 	 * Ask the cluster controller to go away
254 	 */
255 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
256 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
257 
258 	while (iocom->msgrd_td || iocom->msgwr_td) {
259 		wakeup(&iocom->msg_ctl);
260 		lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
261 	}
262 
263 	/*
264 	 * Drop communications descriptor
265 	 */
266 	if (iocom->msg_fp) {
267 		fdrop(iocom->msg_fp);
268 		iocom->msg_fp = NULL;
269 	}
270 	lockmgr(&iocom->msglk, LK_RELEASE);
271 }
272 
273 /*
274  * Cluster controller thread.  Perform messaging functions.  We have one
275  * thread for the reader and one for the writer.  The writer handles
276  * shutdown requests (which should break the reader thread).
277  */
278 static
279 void
280 kdmsg_iocom_thread_rd(void *arg)
281 {
282 	kdmsg_iocom_t *iocom = arg;
283 	dmsg_hdr_t hdr;
284 	kdmsg_msg_t *msg = NULL;
285 	kdmsg_state_t *state;
286 	size_t hbytes;
287 	size_t abytes;
288 	int error = 0;
289 
290 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0) {
291 		/*
292 		 * Retrieve the message from the pipe or socket.
293 		 */
294 		error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
295 				NULL, 1, UIO_SYSSPACE);
296 		if (error)
297 			break;
298 		if (hdr.magic != DMSG_HDR_MAGIC) {
299 			kprintf("kdmsg: bad magic: %04x\n", hdr.magic);
300 			error = EINVAL;
301 			break;
302 		}
303 		hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
304 		if (hbytes < sizeof(hdr) || hbytes > DMSG_AUX_MAX) {
305 			kprintf("kdmsg: bad header size %zd\n", hbytes);
306 			error = EINVAL;
307 			break;
308 		}
309 		/* XXX messy: mask cmd to avoid allocating state */
310 		msg = kdmsg_msg_alloc(iocom, NULL,
311 				      hdr.cmd & DMSGF_BASECMDMASK,
312 				      NULL, NULL);
313 		msg->any.head = hdr;
314 		msg->hdr_size = hbytes;
315 		if (hbytes > sizeof(hdr)) {
316 			error = fp_read(iocom->msg_fp, &msg->any.head + 1,
317 					hbytes - sizeof(hdr),
318 					NULL, 1, UIO_SYSSPACE);
319 			if (error) {
320 				kprintf("kdmsg: short msg received\n");
321 				error = EINVAL;
322 				break;
323 			}
324 		}
325 		msg->aux_size = hdr.aux_bytes;
326 		if (msg->aux_size > DMSG_AUX_MAX) {
327 			kprintf("kdmsg: illegal msg payload size %zd\n",
328 				msg->aux_size);
329 			error = EINVAL;
330 			break;
331 		}
332 		if (msg->aux_size) {
333 			abytes = DMSG_DOALIGN(msg->aux_size);
334 			msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
335 			msg->flags |= KDMSG_FLAG_AUXALLOC;
336 			error = fp_read(iocom->msg_fp, msg->aux_data,
337 					abytes, NULL, 1, UIO_SYSSPACE);
338 			if (error) {
339 				kprintf("kdmsg: short msg payload received\n");
340 				break;
341 			}
342 		}
343 
344 		(void)kdmsg_circ_msgrx(msg);
345 		error = kdmsg_msg_receive_handling(msg);
346 		msg = NULL;
347 	}
348 
349 	if (error)
350 		kprintf("kdmsg: read failed error %d\n", error);
351 
352 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
353 	if (msg)
354 		kdmsg_msg_free(msg);
355 
356 	if ((state = iocom->freerd_state) != NULL) {
357 		iocom->freerd_state = NULL;
358 		kdmsg_state_free(state);
359 	}
360 
361 	/*
362 	 * Shutdown the socket before waiting for the transmit side.
363 	 *
364 	 * If we are dying due to e.g. a socket disconnect verses being
365 	 * killed explicity we have to set KILL in order to kick the tx
366 	 * side when it might not have any other work to do.  KILL might
367 	 * already be set if we are in an unmount or reconnect.
368 	 */
369 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
370 
371 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
372 	wakeup(&iocom->msg_ctl);
373 
374 	/*
375 	 * Wait for the transmit side to drain remaining messages
376 	 * before cleaning up the rx state.  The transmit side will
377 	 * set KILLTX and wait for the rx side to completely finish
378 	 * (set msgrd_td to NULL) before cleaning up any remaining
379 	 * tx states.
380 	 */
381 	lockmgr(&iocom->msglk, LK_RELEASE);
382 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
383 	wakeup(&iocom->msg_ctl);
384 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0) {
385 		wakeup(&iocom->msg_ctl);
386 		tsleep(iocom, 0, "clstrkw", hz);
387 	}
388 
389 	iocom->msgrd_td = NULL;
390 
391 	/*
392 	 * iocom can be ripped out from under us at this point but
393 	 * wakeup() is safe.
394 	 */
395 	wakeup(iocom);
396 	lwkt_exit();
397 }
398 
399 static
400 void
401 kdmsg_iocom_thread_wr(void *arg)
402 {
403 	kdmsg_iocom_t *iocom = arg;
404 	kdmsg_msg_t *msg;
405 	kdmsg_state_t *state;
406 	ssize_t res;
407 	size_t abytes;
408 	int error = 0;
409 	int retries = 20;
410 
411 	/*
412 	 * Transmit loop
413 	 */
414 	msg = NULL;
415 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
416 
417 	while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0 && error == 0) {
418 		/*
419 		 * Sleep if no messages pending.  Interlock with flag while
420 		 * holding msglk.
421 		 */
422 		if (TAILQ_EMPTY(&iocom->msgq)) {
423 			atomic_set_int(&iocom->msg_ctl,
424 				       KDMSG_CLUSTERCTL_SLEEPING);
425 			lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
426 			atomic_clear_int(&iocom->msg_ctl,
427 					 KDMSG_CLUSTERCTL_SLEEPING);
428 		}
429 
430 		while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
431 			/*
432 			 * Remove msg from the transmit queue and do
433 			 * persist and half-closed state handling.
434 			 */
435 			TAILQ_REMOVE(&iocom->msgq, msg, qentry);
436 			lockmgr(&iocom->msglk, LK_RELEASE);
437 
438 			error = kdmsg_state_msgtx(msg);
439 			if (error == EALREADY) {
440 				error = 0;
441 				kdmsg_msg_free(msg);
442 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
443 				continue;
444 			}
445 			if (error) {
446 				kdmsg_msg_free(msg);
447 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
448 				break;
449 			}
450 
451 			/*
452 			 * Dump the message to the pipe or socket.
453 			 *
454 			 * We have to clean up the message as if the transmit
455 			 * succeeded even if it failed.
456 			 */
457 			error = fp_write(iocom->msg_fp, &msg->any,
458 					 msg->hdr_size, &res, UIO_SYSSPACE);
459 			if (error || res != msg->hdr_size) {
460 				if (error == 0)
461 					error = EINVAL;
462 				kdmsg_state_cleanuptx(msg);
463 				lockmgr(&iocom->msglk, LK_EXCLUSIVE);
464 				break;
465 			}
466 			if (msg->aux_size) {
467 				abytes = DMSG_DOALIGN(msg->aux_size);
468 				error = fp_write(iocom->msg_fp,
469 						 msg->aux_data, abytes,
470 						 &res, UIO_SYSSPACE);
471 				if (error || res != abytes) {
472 					if (error == 0)
473 						error = EINVAL;
474 					kdmsg_state_cleanuptx(msg);
475 					lockmgr(&iocom->msglk, LK_EXCLUSIVE);
476 					break;
477 				}
478 			}
479 			kdmsg_state_cleanuptx(msg);
480 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
481 		}
482 	}
483 
484 	/*
485 	 * Cleanup messages pending transmission and release msgq lock.
486 	 */
487 	if (error)
488 		kprintf("kdmsg: write failed error %d\n", error);
489 	kprintf("thread_wr: Terminating iocom\n");
490 
491 	/*
492 	 * Shutdown the socket.  This will cause the rx thread to get an
493 	 * EOF and ensure that both threads get to a termination state.
494 	 */
495 	fp_shutdown(iocom->msg_fp, SHUT_RDWR);
496 
497 	/*
498 	 * Set KILLTX (which the rx side waits for), then wait for the RX
499 	 * side to completely finish before we clean out any remaining
500 	 * command states.
501 	 */
502 	lockmgr(&iocom->msglk, LK_RELEASE);
503 	atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLTX);
504 	wakeup(&iocom->msg_ctl);
505 	while (iocom->msgrd_td) {
506 		wakeup(&iocom->msg_ctl);
507 		tsleep(iocom, 0, "clstrkw", hz);
508 	}
509 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
510 
511 	/*
512 	 * Simulate received MSGF_DELETE's for any remaining states.
513 	 * (For remote masters).
514 	 *
515 	 * Drain the message queue to handle any device initiated writes
516 	 * due to state callbacks.
517 	 */
518 cleanuprd:
519 	kdmsg_drain_msgq(iocom);
520 	RB_FOREACH(state, kdmsg_state_tree, &iocom->staterd_tree) {
521 		if ((state->rxcmd & DMSGF_DELETE) == 0) {
522 			lockmgr(&iocom->msglk, LK_RELEASE);
523 			kdmsg_state_abort(state);
524 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
525 			goto cleanuprd;
526 		}
527 	}
528 
529 	/*
530 	 * Simulate received MSGF_DELETE's for any remaining states.
531 	 * (For local masters).
532 	 */
533 cleanupwr:
534 	kdmsg_drain_msgq(iocom);
535 	RB_FOREACH(state, kdmsg_state_tree, &iocom->statewr_tree) {
536 		if ((state->rxcmd & DMSGF_DELETE) == 0) {
537 			lockmgr(&iocom->msglk, LK_RELEASE);
538 			kdmsg_state_abort(state);
539 			lockmgr(&iocom->msglk, LK_EXCLUSIVE);
540 			goto cleanupwr;
541 		}
542 	}
543 
544 	/*
545 	 * Retry until all work is done
546 	 */
547 	if (--retries == 0)
548 		panic("kdmsg: comm thread shutdown couldn't drain");
549 	if (TAILQ_FIRST(&iocom->msgq) ||
550 	    RB_ROOT(&iocom->staterd_tree) ||
551 	    RB_ROOT(&iocom->statewr_tree)) {
552 		goto cleanuprd;
553 	}
554 	iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
555 
556 	if ((state = iocom->freewr_state) != NULL) {
557 		iocom->freewr_state = NULL;
558 		kdmsg_state_free(state);
559 	}
560 
561 	lockmgr(&iocom->msglk, LK_RELEASE);
562 
563 	/*
564 	 * The state trees had better be empty now
565 	 */
566 	KKASSERT(RB_EMPTY(&iocom->staterd_tree));
567 	KKASSERT(RB_EMPTY(&iocom->statewr_tree));
568 	KKASSERT(iocom->conn_state == NULL);
569 
570 	if (iocom->exit_func) {
571 		/*
572 		 * iocom is invalid after we call the exit function.
573 		 */
574 		iocom->msgwr_td = NULL;
575 		iocom->exit_func(iocom);
576 	} else {
577 		/*
578 		 * iocom can be ripped out from under us once msgwr_td is
579 		 * set to NULL.  The wakeup is safe.
580 		 */
581 		iocom->msgwr_td = NULL;
582 		wakeup(iocom);
583 	}
584 	lwkt_exit();
585 }
586 
587 /*
588  * This cleans out the pending transmit message queue, adjusting any
589  * persistent states properly in the process.
590  *
591  * Caller must hold pmp->iocom.msglk
592  */
593 void
594 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
595 {
596 	kdmsg_msg_t *msg;
597 
598 	/*
599 	 * Clean out our pending transmit queue, executing the
600 	 * appropriate state adjustments.  If this tries to open
601 	 * any new outgoing transactions we have to loop up and
602 	 * clean them out.
603 	 */
604 	while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
605 		TAILQ_REMOVE(&iocom->msgq, msg, qentry);
606 		lockmgr(&iocom->msglk, LK_RELEASE);
607 		if (kdmsg_state_msgtx(msg))
608 			kdmsg_msg_free(msg);
609 		else
610 			kdmsg_state_cleanuptx(msg);
611 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
612 	}
613 }
614 
615 /*
616  * Do all processing required to handle a freshly received message
617  * after its low level header has been validated.
618  */
619 static
620 int
621 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
622 {
623 	kdmsg_iocom_t *iocom = msg->iocom;
624 	int error;
625 
626 	/*
627 	 * State machine tracking, state assignment for msg,
628 	 * returns error and discard status.  Errors are fatal
629 	 * to the connection except for EALREADY which forces
630 	 * a discard without execution.
631 	 */
632 	error = kdmsg_state_msgrx(msg);
633 	if (error) {
634 		/*
635 		 * Raw protocol or connection error
636 		 */
637 		kdmsg_msg_free(msg);
638 		if (error == EALREADY)
639 			error = 0;
640 	} else if (msg->state && msg->state->func) {
641 		/*
642 		 * Message related to state which already has a
643 		 * handling function installed for it.
644 		 */
645 		error = msg->state->func(msg->state, msg);
646 		kdmsg_state_cleanuprx(msg);
647 	} else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
648 		error = kdmsg_autorxmsg(msg);
649 		kdmsg_state_cleanuprx(msg);
650 	} else {
651 		error = iocom->rcvmsg(msg);
652 		kdmsg_state_cleanuprx(msg);
653 	}
654 	return error;
655 }
656 
657 /*
658  * Process circuit tracking (NEEDS WORK)
659  */
660 static
661 int
662 kdmsg_circ_msgrx(kdmsg_msg_t *msg)
663 {
664 	kdmsg_circuit_t dummy;
665 	kdmsg_circuit_t *circ;
666 	int error = 0;
667 
668 	if (msg->any.head.circuit) {
669 		dummy.msgid = msg->any.head.circuit;
670 		lwkt_gettoken(&kdmsg_token);
671 		circ = RB_FIND(kdmsg_circuit_tree, &msg->iocom->circ_tree,
672 			       &dummy);
673 		if (circ) {
674 			msg->circ = circ;
675 			kdmsg_circ_hold(circ);
676 		}
677 		if (circ == NULL) {
678 			kprintf("KDMSG_CIRC_MSGRX CMD %08x: IOCOM %p "
679 				"Bad circuit %016jx\n",
680 				msg->any.head.cmd,
681 				msg->iocom,
682 				(intmax_t)msg->any.head.circuit);
683 			kprintf("KDMSG_CIRC_MSGRX: Avail circuits: ");
684 			RB_FOREACH(circ, kdmsg_circuit_tree,
685 				   &msg->iocom->circ_tree) {
686 				kprintf(" %016jx", (intmax_t)circ->msgid);
687 			}
688 			kprintf("\n");
689 			error = EINVAL;
690 		}
691 		lwkt_reltoken(&kdmsg_token);
692 	}
693 	return (error);
694 }
695 
696 /*
697  * Process state tracking for a message after reception, prior to
698  * execution.
699  *
700  * Acquires msglk internally.  The msg must not be on any queue.
701  *
702  * All messages are called with dummy state and return actual state.
703  * (One-off messages often just return the same dummy state).
704  *
705  * May request that the caller discard the message by returning EALREADY.
706  * msg->state is not used in this case and is allowed to be NULL.
707  *
708  * --
709  *
710  * These routines handle persistent and command/reply message state via the
711  * CREATE and DELETE flags.  The first message in a command or reply sequence
712  * sets CREATE, the last message in a command or reply sequence sets DELETE.
713  *
714  * There can be any number of intermediate messages belonging to the same
715  * sequence sent inbetween the CREATE message and the DELETE message,
716  * which set neither flag.  This represents a streaming command or reply.
717  *
718  * Any command message received with CREATE set expects a reply sequence to
719  * be returned.  Reply sequences work the same as command sequences except the
720  * REPLY bit is also sent.  Both the command side and reply side can
721  * degenerate into a single message with both CREATE and DELETE set.  Note
722  * that one side can be streaming and the other side not, or neither, or both.
723  *
724  * The msgid is unique for the initiator.  That is, two sides sending a new
725  * message can use the same msgid without colliding.
726  *
727  * --
728  *
729  * ABORT sequences work by setting the ABORT flag along with normal message
730  * state.  However, ABORTs can also be sent on half-closed messages, that is
731  * even if the command or reply side has already sent a DELETE, as long as
732  * the message has not been fully closed it can still send an ABORT+DELETE
733  * to terminate the half-closed message state.
734  *
735  * Since ABORT+DELETEs can race we silently discard ABORT's for message
736  * state which has already been fully closed.  REPLY+ABORT+DELETEs can
737  * also race, and in this situation the other side might have already
738  * initiated a new unrelated command with the same message id.  Since
739  * the abort has not set the CREATE flag the situation can be detected
740  * and the message will also be discarded.
741  *
742  * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
743  * The ABORT request is essentially integrated into the command instead
744  * of being sent later on.  In this situation the command implementation
745  * detects that CREATE and ABORT are both set (vs ABORT alone) and can
746  * special-case non-blocking operation for the command.
747  *
748  * NOTE!  Messages with ABORT set without CREATE or DELETE are considered
749  *	  to be mid-stream aborts for command/reply sequences.  ABORTs on
750  *	  one-way messages are not supported.
751  *
752  * NOTE!  If a command sequence does not support aborts the ABORT flag is
753  *	  simply ignored.
754  *
755  * --
756  *
757  * One-off messages (no reply expected) are sent with neither CREATE or DELETE
758  * set.  One-off messages cannot be aborted and typically aren't processed
759  * by these routines.  The REPLY bit can be used to distinguish whether a
760  * one-off message is a command or reply.  For example, one-off replies
761  * will typically just contain status updates.
762  */
763 static
764 int
765 kdmsg_state_msgrx(kdmsg_msg_t *msg)
766 {
767 	kdmsg_iocom_t *iocom = msg->iocom;
768 	kdmsg_state_t *state;
769 	int error;
770 
771 	/*
772 	 * Make sure a state structure is ready to go in case we need a new
773 	 * one.  This is the only routine which uses freerd_state so no
774 	 * races are possible.
775 	 */
776 	if ((state = iocom->freerd_state) == NULL) {
777 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
778 		state->flags = KDMSG_STATE_DYNAMIC;
779 		iocom->freerd_state = state;
780 	}
781 
782 	/*
783 	 * Lock RB tree and locate existing persistent state, if any.
784 	 *
785 	 * If received msg is a command state is on staterd_tree.
786 	 * If received msg is a reply state is on statewr_tree.
787 	 */
788 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
789 
790 	state->msgid = msg->any.head.msgid;
791 	state->circ = msg->circ;
792 	state->iocom = iocom;
793 	if (msg->any.head.cmd & DMSGF_REPLY)
794 		state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree, state);
795 	else
796 		state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree, state);
797 	msg->state = state;
798 
799 	/*
800 	 * Short-cut one-off or mid-stream messages (state may be NULL).
801 	 */
802 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
803 				  DMSGF_ABORT)) == 0) {
804 		lockmgr(&iocom->msglk, LK_RELEASE);
805 		return(0);
806 	}
807 
808 	/*
809 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
810 	 * inside the case statements.
811 	 */
812 	switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
813 	case DMSGF_CREATE:
814 	case DMSGF_CREATE | DMSGF_DELETE:
815 		/*
816 		 * New persistant command received.
817 		 */
818 		if (state) {
819 			kprintf("kdmsg_state_msgrx: duplicate transaction\n");
820 			error = EINVAL;
821 			break;
822 		}
823 		state = iocom->freerd_state;
824 		iocom->freerd_state = NULL;
825 		msg->state = state;
826 		state->msg = msg;
827 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
828 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
829 		state->txcmd = DMSGF_REPLY;
830 		state->msgid = msg->any.head.msgid;
831 		if ((state->circ = msg->circ) != NULL)
832 			kdmsg_circ_hold(state->circ);
833 		RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
834 		state->flags |= KDMSG_STATE_INSERTED;
835 		error = 0;
836 		break;
837 	case DMSGF_DELETE:
838 		/*
839 		 * Persistent state is expected but might not exist if an
840 		 * ABORT+DELETE races the close.
841 		 */
842 		if (state == NULL) {
843 			if (msg->any.head.cmd & DMSGF_ABORT) {
844 				error = EALREADY;
845 			} else {
846 				kprintf("kdmsg_state_msgrx: "
847 					"no state for DELETE\n");
848 				error = EINVAL;
849 			}
850 			break;
851 		}
852 
853 		/*
854 		 * Handle another ABORT+DELETE case if the msgid has already
855 		 * been reused.
856 		 */
857 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
858 			if (msg->any.head.cmd & DMSGF_ABORT) {
859 				error = EALREADY;
860 			} else {
861 				kprintf("kdmsg_state_msgrx: "
862 					"state reused for DELETE\n");
863 				error = EINVAL;
864 			}
865 			break;
866 		}
867 		error = 0;
868 		break;
869 	default:
870 		/*
871 		 * Check for mid-stream ABORT command received, otherwise
872 		 * allow.
873 		 */
874 		if (msg->any.head.cmd & DMSGF_ABORT) {
875 			if (state == NULL ||
876 			    (state->rxcmd & DMSGF_CREATE) == 0) {
877 				error = EALREADY;
878 				break;
879 			}
880 		}
881 		error = 0;
882 		break;
883 	case DMSGF_REPLY | DMSGF_CREATE:
884 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
885 		/*
886 		 * When receiving a reply with CREATE set the original
887 		 * persistent state message should already exist.
888 		 */
889 		if (state == NULL) {
890 			kprintf("kdmsg_state_msgrx: no state match for "
891 				"REPLY cmd=%08x msgid=%016jx\n",
892 				msg->any.head.cmd,
893 				(intmax_t)msg->any.head.msgid);
894 			error = EINVAL;
895 			break;
896 		}
897 		state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
898 		error = 0;
899 		break;
900 	case DMSGF_REPLY | DMSGF_DELETE:
901 		/*
902 		 * Received REPLY+ABORT+DELETE in case where msgid has
903 		 * already been fully closed, ignore the message.
904 		 */
905 		if (state == NULL) {
906 			if (msg->any.head.cmd & DMSGF_ABORT) {
907 				error = EALREADY;
908 			} else {
909 				kprintf("kdmsg_state_msgrx: no state match "
910 					"for REPLY|DELETE\n");
911 				error = EINVAL;
912 			}
913 			break;
914 		}
915 
916 		/*
917 		 * Received REPLY+ABORT+DELETE in case where msgid has
918 		 * already been reused for an unrelated message,
919 		 * ignore the message.
920 		 */
921 		if ((state->rxcmd & DMSGF_CREATE) == 0) {
922 			if (msg->any.head.cmd & DMSGF_ABORT) {
923 				error = EALREADY;
924 			} else {
925 				kprintf("kdmsg_state_msgrx: state reused "
926 					"for REPLY|DELETE\n");
927 				error = EINVAL;
928 			}
929 			break;
930 		}
931 		error = 0;
932 		break;
933 	case DMSGF_REPLY:
934 		/*
935 		 * Check for mid-stream ABORT reply received to sent command.
936 		 */
937 		if (msg->any.head.cmd & DMSGF_ABORT) {
938 			if (state == NULL ||
939 			    (state->rxcmd & DMSGF_CREATE) == 0) {
940 				error = EALREADY;
941 				break;
942 			}
943 		}
944 		error = 0;
945 		break;
946 	}
947 	lockmgr(&iocom->msglk, LK_RELEASE);
948 	return (error);
949 }
950 
951 /*
952  * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
953  * This routine must call iocom->rcvmsg() for anything not automatically
954  * handled.
955  */
956 static int
957 kdmsg_autorxmsg(kdmsg_msg_t *msg)
958 {
959 	kdmsg_iocom_t *iocom = msg->iocom;
960 	kdmsg_circuit_t *circ;
961 	int error = 0;
962 	uint32_t cmd;
963 
964 	/*
965 	 * Process a combination of the transaction command and the message
966 	 * flags.  For the purposes of this routine, the message command is
967 	 * only relevant when it initiates a transaction (where it is
968 	 * recorded in icmd).
969 	 */
970 	cmd = (msg->state ? msg->state->icmd : msg->any.head.cmd) &
971 	      DMSGF_BASECMDMASK;
972 	cmd |= msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE | DMSGF_REPLY);
973 
974 	switch(cmd) {
975 	case DMSG_LNK_CONN | DMSGF_CREATE:
976 	case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
977 		/*
978 		 * Received LNK_CONN transaction.  Transmit response and
979 		 * leave transaction open, which allows the other end to
980 		 * start to the SPAN protocol.
981 		 *
982 		 * Handle shim after acknowledging the CONN.
983 		 */
984 		if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
985 			if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
986 				kdmsg_msg_result(msg, 0);
987 				if (iocom->auto_callback)
988 					iocom->auto_callback(msg);
989 			} else {
990 				error = iocom->rcvmsg(msg);
991 			}
992 			break;
993 		}
994 		/* fall through */
995 	case DMSG_LNK_CONN | DMSGF_DELETE:
996 		/*
997 		 * This message is usually simulated after a link is lost
998 		 * to clean up the transaction.
999 		 */
1000 		if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1001 			if (iocom->auto_callback)
1002 				iocom->auto_callback(msg);
1003 			kdmsg_msg_reply(msg, 0);
1004 		} else {
1005 			error = iocom->rcvmsg(msg);
1006 		}
1007 		break;
1008 	case DMSG_LNK_SPAN | DMSGF_CREATE:
1009 	case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1010 		/*
1011 		 * Received LNK_SPAN transaction.  We do not have to respond
1012 		 * but we must leave the transaction open.
1013 		 *
1014 		 * If AUTOCIRC is set automatically initiate a virtual circuit
1015 		 * to the received span.  This will attach a kdmsg_circuit
1016 		 * to the SPAN state.  The circuit is lost when the span is
1017 		 * lost.
1018 		 *
1019 		 * Handle shim after acknowledging the SPAN.
1020 		 */
1021 		if (iocom->flags & KDMSG_IOCOMF_AUTOSPAN) {
1022 			if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1023 				if (iocom->flags & KDMSG_IOCOMF_AUTOFORGE)
1024 					kdmsg_autocirc(msg);
1025 				if (iocom->auto_callback)
1026 					iocom->auto_callback(msg);
1027 				break;
1028 			}
1029 			/* fall through */
1030 		} else {
1031 			error = iocom->rcvmsg(msg);
1032 			break;
1033 		}
1034 		/* fall through */
1035 	case DMSG_LNK_SPAN | DMSGF_DELETE:
1036 		/*
1037 		 * Process shims (auto_callback) before cleaning up the
1038 		 * circuit structure and closing the transactions.  Device
1039 		 * driver should ensure that the circuit is not used after
1040 		 * the auto_callback() returns.
1041 		 *
1042 		 * Handle shim before closing the SPAN transaction.
1043 		 */
1044 		if (iocom->flags & KDMSG_IOCOMF_AUTOSPAN) {
1045 			if (iocom->auto_callback)
1046 				iocom->auto_callback(msg);
1047 			if (iocom->flags & KDMSG_IOCOMF_AUTOFORGE)
1048 				kdmsg_autocirc(msg);
1049 			kdmsg_msg_reply(msg, 0);
1050 		} else {
1051 			error = iocom->rcvmsg(msg);
1052 		}
1053 		break;
1054 	case DMSG_LNK_CIRC | DMSGF_CREATE:
1055 	case DMSG_LNK_CIRC | DMSGF_CREATE | DMSGF_DELETE:
1056 		/*
1057 		 * Received LNK_CIRC transaction.  We must respond and should
1058 		 * leave the transaction open, allowing the circuit.  The
1059 		 * remote can start issuing commands to us over the circuit
1060 		 * even before we respond.
1061 		 */
1062 		if (iocom->flags & KDMSG_IOCOMF_AUTOCIRC) {
1063 			if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1064 				circ = kmalloc(sizeof(*circ), iocom->mmsg,
1065 					       M_WAITOK | M_ZERO);
1066 				lwkt_gettoken(&kdmsg_token);
1067 				msg->state->any.circ = circ;
1068 				circ->iocom = iocom;
1069 				circ->rcirc_state = msg->state;
1070 				kdmsg_circ_hold(circ);	/* for rcirc_state */
1071 				circ->weight = 0;
1072 				circ->msgid = circ->rcirc_state->msgid;
1073 				/* XXX no span link for received circuits */
1074 				kdmsg_circ_hold(circ);	/* for circ_state */
1075 
1076 				if (RB_INSERT(kdmsg_circuit_tree,
1077 					      &iocom->circ_tree, circ)) {
1078 					panic("duplicate circuitid allocated");
1079 				}
1080 				lwkt_reltoken(&kdmsg_token);
1081 				kdmsg_msg_result(msg, 0);
1082 
1083 				/*
1084 				 * Handle shim after adding the circuit and
1085 				 * after acknowledging the CIRC.
1086 				 */
1087 				if (iocom->auto_callback)
1088 					iocom->auto_callback(msg);
1089 				break;
1090 			}
1091 			/* fall through */
1092 		} else {
1093 			error = iocom->rcvmsg(msg);
1094 			break;
1095 		}
1096 		/* fall through */
1097 	case DMSG_LNK_CIRC | DMSGF_DELETE:
1098 		if (iocom->flags & KDMSG_IOCOMF_AUTOCIRC) {
1099 			circ = msg->state->any.circ;
1100 			if (circ == NULL)
1101 				break;
1102 
1103 			/*
1104 			 * Handle shim before terminating the circuit.
1105 			 */
1106 #if 0
1107 			kprintf("KDMSG VC: RECEIVE CIRC DELETE "
1108 				"IOCOM %p MSGID %016jx\n",
1109 				msg->iocom, circ->msgid);
1110 #endif
1111 			if (iocom->auto_callback)
1112 				iocom->auto_callback(msg);
1113 
1114 			KKASSERT(circ->rcirc_state == msg->state);
1115 			lwkt_gettoken(&kdmsg_token);
1116 			circ->rcirc_state = NULL;
1117 			msg->state->any.circ = NULL;
1118 			RB_REMOVE(kdmsg_circuit_tree, &iocom->circ_tree, circ);
1119 			lwkt_reltoken(&kdmsg_token);
1120 			kdmsg_circ_drop(circ);	/* for rcirc_state */
1121 			kdmsg_msg_reply(msg, 0);
1122 		} else {
1123 			error = iocom->rcvmsg(msg);
1124 		}
1125 		break;
1126 	default:
1127 		/*
1128 		 * Anything unhandled goes into rcvmsg.
1129 		 *
1130 		 * NOTE: Replies to link-level messages initiated by our side
1131 		 *	 are handled by the state callback, they are NOT
1132 		 *	 handled here.
1133 		 */
1134 		error = iocom->rcvmsg(msg);
1135 		break;
1136 	}
1137 	return (error);
1138 }
1139 
1140 /*
1141  * Handle automatic forging of virtual circuits based on received SPANs.
1142  * (AUTOFORGE).  Note that other code handles tracking received circuit
1143  * transactions (AUTOCIRC).
1144  *
1145  * We can ignore non-transactions here.  Use trans->icmd to test the
1146  * transactional command (once past the CREATE the individual message
1147  * commands are not usually the icmd).
1148  *
1149  * XXX locks
1150  */
1151 static
1152 void
1153 kdmsg_autocirc(kdmsg_msg_t *msg)
1154 {
1155 	kdmsg_iocom_t *iocom = msg->iocom;
1156 	kdmsg_circuit_t *circ;
1157 	kdmsg_msg_t *xmsg;	/* CIRC */
1158 
1159 	if (msg->state == NULL)
1160 		return;
1161 
1162 	/*
1163 	 * Gaining the SPAN, automatically forge a circuit to the target.
1164 	 *
1165 	 * NOTE!! The shim is not executed until we receive an acknowlegement
1166 	 *	  to our forged LNK_CIRC (see kdmsg_autocirc_reply()).
1167 	 */
1168 	if (msg->state->icmd == DMSG_LNK_SPAN &&
1169 	    (msg->any.head.cmd & DMSGF_CREATE)) {
1170 		circ = kmalloc(sizeof(*circ), iocom->mmsg, M_WAITOK | M_ZERO);
1171 		lwkt_gettoken(&kdmsg_token);
1172 		msg->state->any.circ = circ;
1173 		circ->iocom = iocom;
1174 		circ->span_state = msg->state;
1175 		kdmsg_circ_hold(circ);	/* for span_state */
1176 		xmsg = kdmsg_msg_alloc(iocom, NULL,
1177 				       DMSG_LNK_CIRC | DMSGF_CREATE,
1178 				       kdmsg_autocirc_reply, circ);
1179 		circ->circ_state = xmsg->state;
1180 		circ->weight = msg->any.lnk_span.dist;
1181 		circ->msgid = circ->circ_state->msgid;
1182 		kdmsg_circ_hold(circ);	/* for circ_state */
1183 #if 0
1184 		kprintf("KDMSG VC: CREATE SPAN->CIRC IOCOM %p MSGID %016jx\n",
1185 			msg->iocom, circ->msgid);
1186 #endif
1187 
1188 		if (RB_INSERT(kdmsg_circuit_tree, &iocom->circ_tree, circ))
1189 			panic("duplicate circuitid allocated");
1190 		lwkt_reltoken(&kdmsg_token);
1191 
1192 		xmsg->any.lnk_circ.target = msg->any.head.msgid;
1193 		kdmsg_msg_write(xmsg);
1194 	}
1195 
1196 	/*
1197 	 * Losing the SPAN
1198 	 *
1199 	 * NOTE: When losing a SPAN, any circuits using the span should be
1200 	 *	 deleted by the remote end first.  XXX might not be ordered
1201 	 *	 on actual loss of connection.
1202 	 */
1203 	if (msg->state->icmd == DMSG_LNK_SPAN &&
1204 	    (msg->any.head.cmd & DMSGF_DELETE) &&
1205 	    msg->state->any.circ) {
1206 		circ = msg->state->any.circ;
1207 		lwkt_gettoken(&kdmsg_token);
1208 		circ->span_state = NULL;
1209 		msg->state->any.circ = NULL;
1210 		RB_REMOVE(kdmsg_circuit_tree, &iocom->circ_tree, circ);
1211 #if 0
1212 		kprintf("KDMSG VC: DELETE SPAN->CIRC IOCOM %p MSGID %016jx\n",
1213 			msg->iocom, (intmax_t)circ->msgid);
1214 #endif
1215 		kdmsg_circ_drop(circ);	/* for span_state */
1216 		lwkt_reltoken(&kdmsg_token);
1217 	}
1218 }
1219 
1220 static
1221 int
1222 kdmsg_autocirc_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
1223 {
1224 	kdmsg_iocom_t *iocom = state->iocom;
1225 	kdmsg_circuit_t *circ = state->any.circ;
1226 
1227 	/*
1228 	 * Call shim after receiving an acknowlegement to our forged
1229 	 * circuit and before processing a received termination.
1230 	 */
1231 	if (iocom->auto_callback)
1232 		iocom->auto_callback(msg);
1233 
1234 	/*
1235 	 * If the remote is terminating the VC we terminate our side
1236 	 */
1237 	if ((state->txcmd & DMSGF_DELETE) == 0 &&
1238 	    (msg->any.head.cmd & DMSGF_DELETE)) {
1239 #if 0
1240 		kprintf("KDMSG VC: DELETE CIRC FROM REMOTE\n");
1241 #endif
1242 		lwkt_gettoken(&kdmsg_token);
1243 		circ->circ_state = NULL;
1244 		state->any.circ = NULL;
1245 		kdmsg_circ_drop(circ);		/* for circ_state */
1246 		lwkt_reltoken(&kdmsg_token);
1247 		kdmsg_msg_reply(msg, 0);
1248 	}
1249 	return (0);
1250 }
1251 
1252 /*
1253  * Post-receive-handling message and state cleanup.  This routine is called
1254  * after the state function handling/callback to properly dispose of the
1255  * message and update or dispose of the state.
1256  */
1257 static
1258 void
1259 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1260 {
1261 	kdmsg_iocom_t *iocom = msg->iocom;
1262 	kdmsg_state_t *state;
1263 
1264 	if ((state = msg->state) == NULL) {
1265 		kdmsg_msg_free(msg);
1266 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1267 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1268 		KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1269 		state->rxcmd |= DMSGF_DELETE;
1270 		if (state->txcmd & DMSGF_DELETE) {
1271 			KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1272 			if (state->rxcmd & DMSGF_REPLY) {
1273 				KKASSERT(msg->any.head.cmd &
1274 					 DMSGF_REPLY);
1275 				RB_REMOVE(kdmsg_state_tree,
1276 					  &iocom->statewr_tree, state);
1277 			} else {
1278 				KKASSERT((msg->any.head.cmd &
1279 					  DMSGF_REPLY) == 0);
1280 				RB_REMOVE(kdmsg_state_tree,
1281 					  &iocom->staterd_tree, state);
1282 			}
1283 			state->flags &= ~KDMSG_STATE_INSERTED;
1284 			if (msg != state->msg)
1285 				kdmsg_msg_free(msg);
1286 			lockmgr(&iocom->msglk, LK_RELEASE);
1287 			kdmsg_state_free(state);
1288 		} else {
1289 			if (msg != state->msg)
1290 				kdmsg_msg_free(msg);
1291 			lockmgr(&iocom->msglk, LK_RELEASE);
1292 		}
1293 	} else if (msg != state->msg) {
1294 		kdmsg_msg_free(msg);
1295 	}
1296 }
1297 
1298 /*
1299  * Simulate receiving a message which terminates an active transaction
1300  * state.  Our simulated received message must set DELETE and may also
1301  * have to set CREATE.  It must also ensure that all fields are set such
1302  * that the receive handling code can find the state (kdmsg_state_msgrx())
1303  * or an endless loop will ensue.
1304  *
1305  * This is used when the other end of the link or virtual circuit is dead
1306  * so the device driver gets a completed transaction for all pending states.
1307  */
1308 static
1309 void
1310 kdmsg_state_abort(kdmsg_state_t *state)
1311 {
1312 	kdmsg_iocom_t *iocom = state->iocom;
1313 	kdmsg_msg_t *msg;
1314 
1315 	/*
1316 	 * Prevent recursive aborts which could otherwise occur if the
1317 	 * simulated message reception runs state->func which then turns
1318 	 * around and tries to reply to a broken circuit when then calls
1319 	 * the state abort code again.
1320 	 */
1321 	if (state->flags & KDMSG_STATE_ABORTING)
1322 		return;
1323 	state->flags |= KDMSG_STATE_ABORTING;
1324 
1325 	/*
1326 	 * Simulatem essage reception
1327 	 */
1328 	msg = kdmsg_msg_alloc(iocom, state->circ,
1329 			      DMSG_LNK_ERROR,
1330 			      NULL, NULL);
1331 	if ((state->rxcmd & DMSGF_CREATE) == 0)
1332 		msg->any.head.cmd |= DMSGF_CREATE;
1333 	msg->any.head.cmd |= DMSGF_DELETE | (state->rxcmd & DMSGF_REPLY);
1334 	msg->any.head.error = DMSG_ERR_LOSTLINK;
1335 	msg->any.head.msgid = state->msgid;
1336 	msg->state = state;
1337 	kdmsg_msg_receive_handling(msg);
1338 }
1339 
1340 /*
1341  * Process state tracking for a message prior to transmission.
1342  *
1343  * Called with msglk held and the msg dequeued.  Returns non-zero if
1344  * the message is bad and should be deleted by the caller.
1345  *
1346  * One-off messages are usually with dummy state and msg->state may be NULL
1347  * in this situation.
1348  *
1349  * New transactions (when CREATE is set) will insert the state.
1350  *
1351  * May request that caller discard the message by setting *discardp to 1.
1352  * A NULL state may be returned in this case.
1353  */
1354 static
1355 int
1356 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1357 {
1358 	kdmsg_iocom_t *iocom = msg->iocom;
1359 	kdmsg_state_t *state;
1360 	int error;
1361 
1362 	/*
1363 	 * Make sure a state structure is ready to go in case we need a new
1364 	 * one.  This is the only routine which uses freewr_state so no
1365 	 * races are possible.
1366 	 */
1367 	if ((state = iocom->freewr_state) == NULL) {
1368 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1369 		state->flags = KDMSG_STATE_DYNAMIC;
1370 		state->iocom = iocom;
1371 		iocom->freewr_state = state;
1372 	}
1373 
1374 	/*
1375 	 * Lock RB tree.  If persistent state is present it will have already
1376 	 * been assigned to msg.
1377 	 */
1378 	lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1379 	state = msg->state;
1380 
1381 	/*
1382 	 * Short-cut one-off or mid-stream messages (state may be NULL).
1383 	 */
1384 	if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1385 				  DMSGF_ABORT)) == 0) {
1386 		lockmgr(&iocom->msglk, LK_RELEASE);
1387 		return(0);
1388 	}
1389 
1390 
1391 	/*
1392 	 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1393 	 * inside the case statements.
1394 	 */
1395 	switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1396 				    DMSGF_REPLY)) {
1397 	case DMSGF_CREATE:
1398 	case DMSGF_CREATE | DMSGF_DELETE:
1399 		/*
1400 		 * Insert the new persistent message state and mark
1401 		 * half-closed if DELETE is set.  Since this is a new
1402 		 * message it isn't possible to transition into the fully
1403 		 * closed state here.
1404 		 *
1405 		 * XXX state must be assigned and inserted by
1406 		 *     kdmsg_msg_write().  txcmd is assigned by us
1407 		 *     on-transmit.
1408 		 */
1409 		KKASSERT(state != NULL);
1410 		state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1411 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1412 		state->rxcmd = DMSGF_REPLY;
1413 		error = 0;
1414 		break;
1415 	case DMSGF_DELETE:
1416 		/*
1417 		 * Sent ABORT+DELETE in case where msgid has already
1418 		 * been fully closed, ignore the message.
1419 		 */
1420 		if (state == NULL) {
1421 			if (msg->any.head.cmd & DMSGF_ABORT) {
1422 				error = EALREADY;
1423 			} else {
1424 				kprintf("kdmsg_state_msgtx: no state match "
1425 					"for DELETE cmd=%08x msgid=%016jx\n",
1426 					msg->any.head.cmd,
1427 					(intmax_t)msg->any.head.msgid);
1428 				error = EINVAL;
1429 			}
1430 			break;
1431 		}
1432 
1433 		/*
1434 		 * Sent ABORT+DELETE in case where msgid has
1435 		 * already been reused for an unrelated message,
1436 		 * ignore the message.
1437 		 */
1438 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1439 			if (msg->any.head.cmd & DMSGF_ABORT) {
1440 				error = EALREADY;
1441 			} else {
1442 				kprintf("kdmsg_state_msgtx: state reused "
1443 					"for DELETE\n");
1444 				error = EINVAL;
1445 			}
1446 			break;
1447 		}
1448 		error = 0;
1449 		break;
1450 	default:
1451 		/*
1452 		 * Check for mid-stream ABORT command sent
1453 		 */
1454 		if (msg->any.head.cmd & DMSGF_ABORT) {
1455 			if (state == NULL ||
1456 			    (state->txcmd & DMSGF_CREATE) == 0) {
1457 				error = EALREADY;
1458 				break;
1459 			}
1460 		}
1461 		error = 0;
1462 		break;
1463 	case DMSGF_REPLY | DMSGF_CREATE:
1464 	case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1465 		/*
1466 		 * When transmitting a reply with CREATE set the original
1467 		 * persistent state message should already exist.
1468 		 */
1469 		if (state == NULL) {
1470 			kprintf("kdmsg_state_msgtx: no state match "
1471 				"for REPLY | CREATE\n");
1472 			error = EINVAL;
1473 			break;
1474 		}
1475 		state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1476 		error = 0;
1477 		break;
1478 	case DMSGF_REPLY | DMSGF_DELETE:
1479 		/*
1480 		 * When transmitting a reply with DELETE set the original
1481 		 * persistent state message should already exist.
1482 		 *
1483 		 * This is very similar to the REPLY|CREATE|* case except
1484 		 * txcmd is already stored, so we just add the DELETE flag.
1485 		 *
1486 		 * Sent REPLY+ABORT+DELETE in case where msgid has
1487 		 * already been fully closed, ignore the message.
1488 		 */
1489 		if (state == NULL) {
1490 			if (msg->any.head.cmd & DMSGF_ABORT) {
1491 				error = EALREADY;
1492 			} else {
1493 				kprintf("kdmsg_state_msgtx: no state match "
1494 					"for REPLY | DELETE\n");
1495 				error = EINVAL;
1496 			}
1497 			break;
1498 		}
1499 
1500 		/*
1501 		 * Sent REPLY+ABORT+DELETE in case where msgid has already
1502 		 * been reused for an unrelated message, ignore the message.
1503 		 */
1504 		if ((state->txcmd & DMSGF_CREATE) == 0) {
1505 			if (msg->any.head.cmd & DMSGF_ABORT) {
1506 				error = EALREADY;
1507 			} else {
1508 				kprintf("kdmsg_state_msgtx: state reused "
1509 					"for REPLY | DELETE\n");
1510 				error = EINVAL;
1511 			}
1512 			break;
1513 		}
1514 		error = 0;
1515 		break;
1516 	case DMSGF_REPLY:
1517 		/*
1518 		 * Check for mid-stream ABORT reply sent.
1519 		 *
1520 		 * One-off REPLY messages are allowed for e.g. status updates.
1521 		 */
1522 		if (msg->any.head.cmd & DMSGF_ABORT) {
1523 			if (state == NULL ||
1524 			    (state->txcmd & DMSGF_CREATE) == 0) {
1525 				error = EALREADY;
1526 				break;
1527 			}
1528 		}
1529 		error = 0;
1530 		break;
1531 	}
1532 	lockmgr(&iocom->msglk, LK_RELEASE);
1533 	return (error);
1534 }
1535 
1536 static
1537 void
1538 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1539 {
1540 	kdmsg_iocom_t *iocom = msg->iocom;
1541 	kdmsg_state_t *state;
1542 
1543 	if ((state = msg->state) == NULL) {
1544 		kdmsg_msg_free(msg);
1545 	} else if (msg->any.head.cmd & DMSGF_DELETE) {
1546 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1547 		KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1548 		state->txcmd |= DMSGF_DELETE;
1549 		if (state->rxcmd & DMSGF_DELETE) {
1550 			KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1551 			if (state->txcmd & DMSGF_REPLY) {
1552 				KKASSERT(msg->any.head.cmd &
1553 					 DMSGF_REPLY);
1554 				RB_REMOVE(kdmsg_state_tree,
1555 					  &iocom->staterd_tree, state);
1556 			} else {
1557 				KKASSERT((msg->any.head.cmd &
1558 					  DMSGF_REPLY) == 0);
1559 				RB_REMOVE(kdmsg_state_tree,
1560 					  &iocom->statewr_tree, state);
1561 			}
1562 			state->flags &= ~KDMSG_STATE_INSERTED;
1563 			if (msg != state->msg)
1564 				kdmsg_msg_free(msg);
1565 			lockmgr(&iocom->msglk, LK_RELEASE);
1566 			kdmsg_state_free(state);
1567 		} else {
1568 			if (msg != state->msg)
1569 				kdmsg_msg_free(msg);
1570 			lockmgr(&iocom->msglk, LK_RELEASE);
1571 		}
1572 	} else if (msg != state->msg) {
1573 		kdmsg_msg_free(msg);
1574 	}
1575 }
1576 
1577 static
1578 void
1579 kdmsg_state_free(kdmsg_state_t *state)
1580 {
1581 	kdmsg_iocom_t *iocom = state->iocom;
1582 	kdmsg_msg_t *msg;
1583 
1584 	KKASSERT((state->flags & KDMSG_STATE_INSERTED) == 0);
1585 	msg = state->msg;
1586 	state->msg = NULL;
1587 	kfree(state, iocom->mmsg);
1588 	if (msg) {
1589 		msg->state = NULL;
1590 		kdmsg_msg_free(msg);
1591 	}
1592 }
1593 
1594 kdmsg_msg_t *
1595 kdmsg_msg_alloc(kdmsg_iocom_t *iocom, kdmsg_circuit_t *circ, uint32_t cmd,
1596 		int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1597 {
1598 	kdmsg_msg_t *msg;
1599 	kdmsg_state_t *state;
1600 	size_t hbytes;
1601 
1602 	KKASSERT(iocom != NULL);
1603 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1604 	msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1605 		      iocom->mmsg, M_WAITOK | M_ZERO);
1606 	msg->hdr_size = hbytes;
1607 	msg->iocom = iocom;
1608 	msg->any.head.magic = DMSG_HDR_MAGIC;
1609 	msg->any.head.cmd = cmd;
1610 	if (circ) {
1611 		kdmsg_circ_hold(circ);
1612 		msg->circ = circ;
1613 		msg->any.head.circuit = circ->msgid;
1614 	}
1615 
1616 	if (cmd & DMSGF_CREATE) {
1617 		/*
1618 		 * New transaction, requires tracking state and a unique
1619 		 * msgid to be allocated.
1620 		 */
1621 		KKASSERT(msg->state == NULL);
1622 		state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1623 		state->flags = KDMSG_STATE_DYNAMIC;
1624 		state->func = func;
1625 		state->any.any = data;
1626 		state->msg = msg;
1627 		state->msgid = (uint64_t)(uintptr_t)state;
1628 		state->circ = circ;
1629 		state->iocom = iocom;
1630 		msg->state = state;
1631 		if (circ)
1632 			kdmsg_circ_hold(circ);
1633 		/*msg->any.head.msgid = state->msgid;XXX*/
1634 
1635 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1636 		if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1637 			panic("duplicate msgid allocated");
1638 		state->flags |= KDMSG_STATE_INSERTED;
1639 		msg->any.head.msgid = state->msgid;
1640 		lockmgr(&iocom->msglk, LK_RELEASE);
1641 	}
1642 	return (msg);
1643 }
1644 
1645 kdmsg_msg_t *
1646 kdmsg_msg_alloc_state(kdmsg_state_t *state, uint32_t cmd,
1647 		      int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1648 {
1649 	kdmsg_iocom_t *iocom = state->iocom;
1650 	kdmsg_msg_t *msg;
1651 	size_t hbytes;
1652 
1653 	KKASSERT(iocom != NULL);
1654 	hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1655 	msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1656 		      iocom->mmsg, M_WAITOK | M_ZERO);
1657 	msg->hdr_size = hbytes;
1658 	msg->iocom = iocom;
1659 	msg->any.head.magic = DMSG_HDR_MAGIC;
1660 	msg->any.head.cmd = cmd;
1661 	msg->state = state;
1662 	if (state->circ) {
1663 		kdmsg_circ_hold(state->circ);
1664 		msg->circ = state->circ;
1665 		msg->any.head.circuit = state->circ->msgid;
1666 	}
1667 	return(msg);
1668 }
1669 
1670 void
1671 kdmsg_msg_free(kdmsg_msg_t *msg)
1672 {
1673 	kdmsg_iocom_t *iocom = msg->iocom;
1674 
1675 	if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1676 	    msg->aux_data && msg->aux_size) {
1677 		kfree(msg->aux_data, iocom->mmsg);
1678 		msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1679 	}
1680 	if (msg->circ) {
1681 		kdmsg_circ_drop(msg->circ);
1682 		msg->circ = NULL;
1683 	}
1684 	if (msg->state) {
1685 		if (msg->state->msg == msg)
1686 			msg->state->msg = NULL;
1687 		msg->state = NULL;
1688 	}
1689 	msg->aux_data = NULL;
1690 	msg->aux_size = 0;
1691 	msg->iocom = NULL;
1692 	kfree(msg, iocom->mmsg);
1693 }
1694 
1695 /*
1696  * Circuits are tracked in a red-black tree by their circuit id (msgid).
1697  */
1698 int
1699 kdmsg_circuit_cmp(kdmsg_circuit_t *circ1, kdmsg_circuit_t *circ2)
1700 {
1701 	if (circ1->msgid < circ2->msgid)
1702 		return(-1);
1703 	if (circ1->msgid > circ2->msgid)
1704 		return(1);
1705 	return (0);
1706 }
1707 
1708 /*
1709  * Indexed messages are stored in a red-black tree indexed by their
1710  * msgid.  Only persistent messages are indexed.
1711  */
1712 int
1713 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1714 {
1715 	if (state1->iocom < state2->iocom)
1716 		return(-1);
1717 	if (state1->iocom > state2->iocom)
1718 		return(1);
1719 	if (state1->circ < state2->circ)
1720 		return(-1);
1721 	if (state1->circ > state2->circ)
1722 		return(1);
1723 	if (state1->msgid < state2->msgid)
1724 		return(-1);
1725 	if (state1->msgid > state2->msgid)
1726 		return(1);
1727 	return(0);
1728 }
1729 
1730 /*
1731  * Write a message.  All requisit command flags have been set.
1732  *
1733  * If msg->state is non-NULL the message is written to the existing
1734  * transaction.  msgid will be set accordingly.
1735  *
1736  * If msg->state is NULL and CREATE is set new state is allocated and
1737  * (func, data) is installed.  A msgid is assigned.
1738  *
1739  * If msg->state is NULL and CREATE is not set the message is assumed
1740  * to be a one-way message.  The originator must assign the msgid
1741  * (or leave it 0, which is typical.
1742  *
1743  * This function merely queues the message to the management thread, it
1744  * does not write to the message socket/pipe.
1745  */
1746 void
1747 kdmsg_msg_write(kdmsg_msg_t *msg)
1748 {
1749 	kdmsg_iocom_t *iocom = msg->iocom;
1750 	kdmsg_state_t *state;
1751 
1752 	if (msg->state) {
1753 		/*
1754 		 * Continuance or termination of existing transaction.
1755 		 * The transaction could have been initiated by either end.
1756 		 *
1757 		 * (Function callback and aux data for the receive side can
1758 		 * be replaced or left alone).
1759 		 */
1760 		state = msg->state;
1761 		msg->any.head.msgid = state->msgid;
1762 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1763 	} else {
1764 		/*
1765 		 * One-off message (always uses msgid 0 to distinguish
1766 		 * between a possibly lost in-transaction message due to
1767 		 * competing aborts and a real one-off message?)
1768 		 */
1769 		state = NULL;
1770 		msg->any.head.msgid = 0;
1771 		lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1772 	}
1773 
1774 	/*
1775 	 * With AUTOCIRC and AUTOFORGE it is possible for the circuit to
1776 	 * get ripped out in the rxthread while some other thread is
1777 	 * holding a ref on it inbetween allocating and sending a dmsg.
1778 	 */
1779 	if (msg->circ && msg->circ->rcirc_state == NULL &&
1780 	    (msg->circ->span_state == NULL || msg->circ->circ_state == NULL)) {
1781 		kprintf("kdmsg_msg_write: Attempt to write message to "
1782 		        "terminated circuit: msg %08x\n", msg->any.head.cmd);
1783 		lockmgr(&iocom->msglk, LK_RELEASE);
1784 		if (kdmsg_state_msgtx(msg)) {
1785 			if (state == NULL || msg != state->msg)
1786 				kdmsg_msg_free(msg);
1787 		} else if ((msg->state->rxcmd & DMSGF_DELETE) == 0) {
1788 			/* XXX SMP races simulating a response here */
1789 			kdmsg_state_t *state = msg->state;
1790 			kdmsg_state_cleanuptx(msg);
1791 			kdmsg_state_abort(state);
1792 		} else {
1793 			kdmsg_state_cleanuptx(msg);
1794 		}
1795 		return;
1796 	}
1797 
1798 	/*
1799 	 * This flag is not set until after the tx thread has drained
1800 	 * the txmsgq and simulated responses.  After that point the
1801 	 * txthread is dead and can no longer simulate responses.
1802 	 *
1803 	 * Device drivers should never try to send a message once this
1804 	 * flag is set.  They should have detected (through the state
1805 	 * closures) that the link is in trouble.
1806 	 */
1807 	if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1808 		lockmgr(&iocom->msglk, LK_RELEASE);
1809 		panic("kdmsg_msg_write: Attempt to write message to "
1810 		      "terminated iocom\n");
1811 	}
1812 
1813 	/*
1814 	 * Finish up the msg fields.  Note that msg->aux_size and the
1815 	 * aux_bytes stored in the message header represent the unaligned
1816 	 * (actual) bytes of data, but the buffer is sized to an aligned
1817 	 * size and the CRC is generated over the aligned length.
1818 	 */
1819 	msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
1820 	++iocom->msg_seq;
1821 
1822 	if (msg->aux_data && msg->aux_size) {
1823 		uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
1824 
1825 		msg->any.head.aux_bytes = msg->aux_size;
1826 		msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
1827 	}
1828 	msg->any.head.hdr_crc = 0;
1829 	msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
1830 
1831 	TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
1832 
1833 	if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
1834 		atomic_clear_int(&iocom->msg_ctl,
1835 				 KDMSG_CLUSTERCTL_SLEEPING);
1836 		wakeup(&iocom->msg_ctl);
1837 	}
1838 
1839 	lockmgr(&iocom->msglk, LK_RELEASE);
1840 }
1841 
1842 /*
1843  * Reply to a message and terminate our side of the transaction.
1844  *
1845  * If msg->state is non-NULL we are replying to a one-way message.
1846  */
1847 void
1848 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
1849 {
1850 	kdmsg_state_t *state = msg->state;
1851 	kdmsg_msg_t *nmsg;
1852 	uint32_t cmd;
1853 
1854 	/*
1855 	 * Reply with a simple error code and terminate the transaction.
1856 	 */
1857 	cmd = DMSG_LNK_ERROR;
1858 
1859 	/*
1860 	 * Check if our direction has even been initiated yet, set CREATE.
1861 	 *
1862 	 * Check what direction this is (command or reply direction).  Note
1863 	 * that txcmd might not have been initiated yet.
1864 	 *
1865 	 * If our direction has already been closed we just return without
1866 	 * doing anything.
1867 	 */
1868 	if (state) {
1869 		if (state->txcmd & DMSGF_DELETE)
1870 			return;
1871 		if ((state->txcmd & DMSGF_CREATE) == 0)
1872 			cmd |= DMSGF_CREATE;
1873 		if (state->txcmd & DMSGF_REPLY)
1874 			cmd |= DMSGF_REPLY;
1875 		cmd |= DMSGF_DELETE;
1876 	} else {
1877 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1878 			cmd |= DMSGF_REPLY;
1879 	}
1880 
1881 	/* XXX messy mask cmd to avoid allocating state */
1882 	nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
1883 	nmsg->any.head.error = error;
1884 	kdmsg_msg_write(nmsg);
1885 }
1886 
1887 /*
1888  * Reply to a message and continue our side of the transaction.
1889  *
1890  * If msg->state is non-NULL we are replying to a one-way message and this
1891  * function degenerates into the same as kdmsg_msg_reply().
1892  */
1893 void
1894 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
1895 {
1896 	kdmsg_state_t *state = msg->state;
1897 	kdmsg_msg_t *nmsg;
1898 	uint32_t cmd;
1899 
1900 	/*
1901 	 * Return a simple result code, do NOT terminate the transaction.
1902 	 */
1903 	cmd = DMSG_LNK_ERROR;
1904 
1905 	/*
1906 	 * Check if our direction has even been initiated yet, set CREATE.
1907 	 *
1908 	 * Check what direction this is (command or reply direction).  Note
1909 	 * that txcmd might not have been initiated yet.
1910 	 *
1911 	 * If our direction has already been closed we just return without
1912 	 * doing anything.
1913 	 */
1914 	if (state) {
1915 		if (state->txcmd & DMSGF_DELETE)
1916 			return;
1917 		if ((state->txcmd & DMSGF_CREATE) == 0)
1918 			cmd |= DMSGF_CREATE;
1919 		if (state->txcmd & DMSGF_REPLY)
1920 			cmd |= DMSGF_REPLY;
1921 		/* continuing transaction, do not set MSGF_DELETE */
1922 	} else {
1923 		if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1924 			cmd |= DMSGF_REPLY;
1925 	}
1926 
1927 	/* XXX messy mask cmd to avoid allocating state */
1928 	nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
1929 	nmsg->any.head.error = error;
1930 	kdmsg_msg_write(nmsg);
1931 }
1932 
1933 /*
1934  * Reply to a message and terminate our side of the transaction.
1935  *
1936  * If msg->state is non-NULL we are replying to a one-way message.
1937  */
1938 void
1939 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
1940 {
1941 	kdmsg_msg_t *nmsg;
1942 	uint32_t cmd;
1943 
1944 	/*
1945 	 * Reply with a simple error code and terminate the transaction.
1946 	 */
1947 	cmd = DMSG_LNK_ERROR;
1948 
1949 	/*
1950 	 * Check if our direction has even been initiated yet, set CREATE.
1951 	 *
1952 	 * Check what direction this is (command or reply direction).  Note
1953 	 * that txcmd might not have been initiated yet.
1954 	 *
1955 	 * If our direction has already been closed we just return without
1956 	 * doing anything.
1957 	 */
1958 	if (state) {
1959 		if (state->txcmd & DMSGF_DELETE)
1960 			return;
1961 		if ((state->txcmd & DMSGF_CREATE) == 0)
1962 			cmd |= DMSGF_CREATE;
1963 		if (state->txcmd & DMSGF_REPLY)
1964 			cmd |= DMSGF_REPLY;
1965 		cmd |= DMSGF_DELETE;
1966 	} else {
1967 		if ((state->txcmd & DMSGF_REPLY) == 0)
1968 			cmd |= DMSGF_REPLY;
1969 	}
1970 
1971 	/* XXX messy mask cmd to avoid allocating state */
1972 	nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
1973 	nmsg->any.head.error = error;
1974 	kdmsg_msg_write(nmsg);
1975 }
1976 
1977 /*
1978  * Reply to a message and continue our side of the transaction.
1979  *
1980  * If msg->state is non-NULL we are replying to a one-way message and this
1981  * function degenerates into the same as kdmsg_msg_reply().
1982  */
1983 void
1984 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
1985 {
1986 	kdmsg_msg_t *nmsg;
1987 	uint32_t cmd;
1988 
1989 	/*
1990 	 * Return a simple result code, do NOT terminate the transaction.
1991 	 */
1992 	cmd = DMSG_LNK_ERROR;
1993 
1994 	/*
1995 	 * Check if our direction has even been initiated yet, set CREATE.
1996 	 *
1997 	 * Check what direction this is (command or reply direction).  Note
1998 	 * that txcmd might not have been initiated yet.
1999 	 *
2000 	 * If our direction has already been closed we just return without
2001 	 * doing anything.
2002 	 */
2003 	if (state) {
2004 		if (state->txcmd & DMSGF_DELETE)
2005 			return;
2006 		if ((state->txcmd & DMSGF_CREATE) == 0)
2007 			cmd |= DMSGF_CREATE;
2008 		if (state->txcmd & DMSGF_REPLY)
2009 			cmd |= DMSGF_REPLY;
2010 		/* continuing transaction, do not set MSGF_DELETE */
2011 	} else {
2012 		if ((state->txcmd & DMSGF_REPLY) == 0)
2013 			cmd |= DMSGF_REPLY;
2014 	}
2015 
2016 	/* XXX messy mask cmd to avoid allocating state */
2017 	nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
2018 	nmsg->any.head.error = error;
2019 	kdmsg_msg_write(nmsg);
2020 }
2021