xref: /onnv-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_ep.c (revision 4154:bd1265f2f9de)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #pragma ident	"%Z%%M%	%I%	%E% SMI"
76 
77 #include <sys/stream.h>
78 #include <sys/ib/clients/rds/rdsib_cm.h>
79 #include <sys/ib/clients/rds/rdsib_ib.h>
80 #include <sys/ib/clients/rds/rdsib_buf.h>
81 #include <sys/ib/clients/rds/rdsib_ep.h>
82 #include <sys/ib/clients/rds/rds_kstat.h>
83 #include <sys/zone.h>
84 
85 #define	RDS_POLL_CQ_IN_2TICKS	1
86 
87 /*
88  * This File contains the endpoint related calls
89  */
90 
91 extern int rds_get_ibaddr(ipaddr_t, ipaddr_t, ib_gid_t *, ib_gid_t *);
92 extern boolean_t rds_islocal(ipaddr_t addr);
93 extern uint_t rds_wc_signal;
94 
95 static uint8_t
96 rds_is_port_marked(rds_session_t *sp, in_port_t port)
97 {
98 	uint8_t	ret;
99 
100 	if (sp != NULL) {
101 		rw_enter(&sp->session_portmap_lock, RW_READER);
102 		ret = (sp->session_portmap[port/8] & (1 << (port % 8)));
103 		rw_exit(&sp->session_portmap_lock);
104 	} else {
105 		rw_enter(&rds_local_portmap_lock, RW_READER);
106 		ret = (rds_local_portmap[port/8] & (1 << (port % 8)));
107 		rw_exit(&rds_local_portmap_lock);
108 	}
109 
110 	return (ret);
111 }
112 
113 static uint8_t
114 rds_check_n_mark_port(rds_session_t *sp, in_port_t port)
115 {
116 	uint8_t	ret;
117 
118 	if (sp != NULL) {
119 		rw_enter(&sp->session_portmap_lock, RW_WRITER);
120 		ret = (sp->session_portmap[port/8] & (1 << (port % 8)));
121 		if (!ret) {
122 			/* port is not marked, mark it */
123 			sp->session_portmap[port/8] =
124 			    sp->session_portmap[port/8] | (1 << (port % 8));
125 		}
126 		rw_exit(&sp->session_portmap_lock);
127 	} else {
128 		rw_enter(&rds_local_portmap_lock, RW_WRITER);
129 		ret = (rds_local_portmap[port/8] & (1 << (port % 8)));
130 		if (!ret) {
131 			/* port is not marked, mark it */
132 			rds_local_portmap[port/8] =
133 			    rds_local_portmap[port/8] | (1 << (port % 8));
134 		}
135 		rw_exit(&rds_local_portmap_lock);
136 	}
137 
138 	return (ret);
139 }
140 
141 static uint8_t
142 rds_check_n_unmark_port(rds_session_t *sp, in_port_t port)
143 {
144 	uint8_t	ret;
145 
146 	if (sp != NULL) {
147 		rw_enter(&sp->session_portmap_lock, RW_WRITER);
148 		ret = (sp->session_portmap[port/8] & (1 << (port % 8)));
149 		if (ret) {
150 			/* port is marked, unmark it */
151 			sp->session_portmap[port/8] =
152 			    sp->session_portmap[port/8] & ~(1 << (port % 8));
153 		}
154 		rw_exit(&sp->session_portmap_lock);
155 	} else {
156 		rw_enter(&rds_local_portmap_lock, RW_WRITER);
157 		ret = (rds_local_portmap[port/8] & (1 << (port % 8)));
158 		if (ret) {
159 			/* port is marked, unmark it */
160 			rds_local_portmap[port/8] =
161 			    rds_local_portmap[port/8] & ~(1 << (port % 8));
162 		}
163 		rw_exit(&rds_local_portmap_lock);
164 	}
165 
166 	return (ret);
167 }
168 
169 static void
170 rds_mark_all_ports(rds_session_t *sp)
171 {
172 	if (sp != NULL) {
173 		rw_enter(&sp->session_portmap_lock, RW_WRITER);
174 		(void) memset(sp->session_portmap, 0xFF, RDS_PORT_MAP_SIZE);
175 		rw_exit(&sp->session_portmap_lock);
176 	} else {
177 		rw_enter(&rds_local_portmap_lock, RW_WRITER);
178 		(void) memset(rds_local_portmap, 0xFF, RDS_PORT_MAP_SIZE);
179 		rw_exit(&rds_local_portmap_lock);
180 	}
181 }
182 
183 static void
184 rds_unmark_all_ports(rds_session_t *sp)
185 {
186 	if (sp != NULL) {
187 		rw_enter(&sp->session_portmap_lock, RW_WRITER);
188 		bzero(sp->session_portmap, RDS_PORT_MAP_SIZE);
189 		rw_exit(&sp->session_portmap_lock);
190 	} else {
191 		rw_enter(&rds_local_portmap_lock, RW_WRITER);
192 		bzero(rds_local_portmap, RDS_PORT_MAP_SIZE);
193 		rw_exit(&rds_local_portmap_lock);
194 	}
195 }
196 
197 static void
198 rds_add_session(rds_session_t *sp, boolean_t locked)
199 {
200 	RDS_DPRINTF2("rds_add_session", "Enter: SP(%p)", sp);
201 
202 	if (!locked) {
203 		rw_enter(&rdsib_statep->rds_sessionlock, RW_WRITER);
204 	}
205 
206 	sp->session_nextp = rdsib_statep->rds_sessionlistp;
207 	rdsib_statep->rds_sessionlistp = sp;
208 	rdsib_statep->rds_nsessions++;
209 
210 	if (!locked) {
211 		rw_exit(&rdsib_statep->rds_sessionlock);
212 	}
213 	RDS_INCR_SESS();
214 
215 	RDS_DPRINTF2("rds_add_session", "Return: SP(%p)", sp);
216 }
217 
218 /* Session lookup based on destination IP or destination node guid */
219 rds_session_t *
220 rds_session_lkup(rds_state_t *statep, ipaddr_t remoteip, ib_guid_t node_guid)
221 {
222 	rds_session_t	*sp;
223 
224 	RDS_DPRINTF4("rds_session_lkup", "Enter: 0x%p 0x%x 0x%llx", statep,
225 	    remoteip, node_guid);
226 
227 	/* A read/write lock is expected, will panic if none of them are held */
228 	ASSERT(rw_lock_held(&statep->rds_sessionlock));
229 	sp = statep->rds_sessionlistp;
230 	while (sp) {
231 		if ((sp->session_rgid.gid_guid == node_guid) ||
232 		    (sp->session_remip == remoteip)) {
233 			break;
234 		}
235 
236 		sp = sp->session_nextp;
237 	}
238 
239 	RDS_DPRINTF4("rds_session_lkup", "Return: SP(%p)", sp);
240 
241 	return (sp);
242 }
243 
244 static void
245 rds_ep_fini(rds_ep_t *ep)
246 {
247 	RDS_DPRINTF3("rds_ep_fini", "Enter: EP(%p) type: %d", ep, ep->ep_type);
248 
249 	/* free send pool */
250 	rds_free_send_pool(ep);
251 
252 	/* free recv pool */
253 	rds_free_recv_pool(ep);
254 
255 	RDS_DPRINTF3("rds_ep_fini", "Return EP(%p)", ep);
256 }
257 
258 /* Assumes SP write lock is held */
259 int
260 rds_ep_init(rds_ep_t *ep)
261 {
262 	uint_t		ret;
263 
264 	RDS_DPRINTF3("rds_ep_init", "Enter: EP(%p) Type: %d", ep, ep->ep_type);
265 
266 	/* send pool */
267 	ret = rds_init_send_pool(ep);
268 	if (ret != 0) {
269 		RDS_DPRINTF2(LABEL, "EP(%p): rds_init_send_pool failed: %d",
270 		    ep, ret);
271 		return (-1);
272 	}
273 
274 	/* recv pool */
275 	ret = rds_init_recv_pool(ep);
276 	if (ret != 0) {
277 		RDS_DPRINTF2(LABEL, "EP(%p): rds_init_recv_pool failed: %d",
278 		    ep, ret);
279 		rds_free_send_pool(ep);
280 		return (-1);
281 	}
282 
283 	/* reset the ep state */
284 	mutex_enter(&ep->ep_lock);
285 	ep->ep_state = RDS_EP_STATE_UNCONNECTED;
286 	ep->ep_lbufid = NULL;
287 	ep->ep_rbufid = NULL;
288 	ep->ep_segfbp = NULL;
289 	ep->ep_seglbp = NULL;
290 
291 	/* Initialize the WR to send acknowledgements */
292 	ep->ep_ackwr.wr_id = RDS_RDMAW_WRID;
293 	ep->ep_ackwr.wr_flags = IBT_WR_SEND_SOLICIT;
294 	ep->ep_ackwr.wr_trans = IBT_RC_SRV;
295 	ep->ep_ackwr.wr_opcode = IBT_WRC_RDMAW;
296 	ep->ep_ackwr.wr_nds = 1;
297 	ep->ep_ackwr.wr_sgl = &ep->ep_ackds;
298 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = NULL;
299 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = 0;
300 	mutex_exit(&ep->ep_lock);
301 
302 	RDS_DPRINTF3("rds_ep_init", "Return: EP(%p) type: %d", ep, ep->ep_type);
303 
304 	return (0);
305 }
306 
307 static int
308 rds_ep_reinit(rds_ep_t *ep, ib_guid_t hca_guid)
309 {
310 	int	ret;
311 
312 	RDS_DPRINTF3("rds_ep_reinit", "Enter: EP(%p) Type: %d",
313 	    ep, ep->ep_type);
314 
315 	/* Re-initialize send pool */
316 	ret = rds_reinit_send_pool(ep, hca_guid);
317 	if (ret != 0) {
318 		RDS_DPRINTF2("rds_ep_reinit",
319 		    "EP(%p): rds_reinit_send_pool failed: %d", ep, ret);
320 		return (-1);
321 	}
322 
323 	/* free all the receive buffers in the pool */
324 	rds_free_recv_pool(ep);
325 
326 	RDS_DPRINTF3("rds_ep_reinit", "Return: EP(%p) Type: %d",
327 	    ep, ep->ep_type);
328 
329 	return (0);
330 }
331 
332 void
333 rds_session_fini(rds_session_t *sp)
334 {
335 	RDS_DPRINTF2("rds_session_fini", "Enter: SP(0x%p)", sp);
336 
337 	rds_ep_fini(&sp->session_dataep);
338 	rds_ep_fini(&sp->session_ctrlep);
339 
340 	RDS_DPRINTF2("rds_session_fini", "Return: SP(0x%p)", sp);
341 }
342 
343 /*
344  * Allocate and initialize the resources needed for the control and
345  * data channels
346  */
347 int
348 rds_session_init(rds_session_t *sp)
349 {
350 	int		ret;
351 
352 	RDS_DPRINTF2("rds_session_init", "Enter: SP(0x%p)", sp);
353 
354 	/* CALLED WITH SESSION WRITE LOCK */
355 
356 	/* allocate and initialize the ctrl channel */
357 	ret = rds_ep_init(&sp->session_ctrlep);
358 	if (ret != 0) {
359 		RDS_DPRINTF2(LABEL, "SP(%p): Ctrl EP(%p) initialization "
360 		    "failed", sp, &sp->session_ctrlep);
361 		return (-1);
362 	}
363 
364 	RDS_DPRINTF2(LABEL, "SP(%p) Control EP(%p)", sp, &sp->session_ctrlep);
365 
366 	/* allocate and initialize the data channel */
367 	ret = rds_ep_init(&sp->session_dataep);
368 	if (ret != 0) {
369 		RDS_DPRINTF2(LABEL, "SP(%p): Data EP(%p) initialization "
370 		    "failed", sp, &sp->session_dataep);
371 		rds_ep_fini(&sp->session_ctrlep);
372 		return (-1);
373 	}
374 
375 	RDS_DPRINTF2(LABEL, "SP(%p) Data EP(%p)", sp, &sp->session_dataep);
376 
377 	RDS_DPRINTF2("rds_session_init", "Return");
378 
379 	return (0);
380 }
381 
382 /*
383  * This should be called before moving a session from ERROR state to
384  * INIT state. This will update the HCA keys incase the session has moved from
385  * one HCA to another.
386  */
387 int
388 rds_session_reinit(rds_session_t *sp, ib_gid_t lgid)
389 {
390 	rds_hca_t	*hcap, *hcap1;
391 	int		ret;
392 
393 	RDS_DPRINTF2("rds_session_reinit", "Enter: SP(0x%p)", sp);
394 
395 	/* CALLED WITH SESSION WRITE LOCK */
396 
397 	hcap = rds_gid_to_hcap(rdsib_statep, lgid);
398 	if (hcap == NULL) {
399 		RDS_DPRINTF1("rds_session_reinit", "SGID is on an "
400 		    "uninitialized HCA: %llx", lgid.gid_guid);
401 		return (-1);
402 	}
403 
404 	hcap1 = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
405 	if (hcap1 == NULL) {
406 		RDS_DPRINTF1("rds_session_reinit", "Seems like HCA %llx "
407 		    "is unplugged", sp->session_lgid.gid_guid);
408 	} else if (hcap->hca_guid == hcap1->hca_guid) {
409 		/*
410 		 * No action is needed as the session did not move across
411 		 * HCAs
412 		 */
413 		RDS_DPRINTF2("rds_session_reinit", "Failover on the same HCA");
414 		return (0);
415 	}
416 
417 	RDS_DPRINTF2("rds_session_reinit", "Failover across HCAs");
418 
419 	/* re-initialize the control channel */
420 	ret = rds_ep_reinit(&sp->session_ctrlep, hcap->hca_guid);
421 	if (ret != 0) {
422 		RDS_DPRINTF2("rds_session_reinit",
423 		    "SP(%p): Ctrl EP(%p) re-initialization failed",
424 		    sp, &sp->session_ctrlep);
425 		return (-1);
426 	}
427 
428 	RDS_DPRINTF2("rds_session_reinit", "SP(%p) Control EP(%p)",
429 	    sp, &sp->session_ctrlep);
430 
431 	/* re-initialize the data channel */
432 	ret = rds_ep_reinit(&sp->session_dataep, hcap->hca_guid);
433 	if (ret != 0) {
434 		RDS_DPRINTF2("rds_session_reinit",
435 		    "SP(%p): Data EP(%p) re-initialization failed",
436 		    sp, &sp->session_dataep);
437 		return (-1);
438 	}
439 
440 	RDS_DPRINTF2("rds_session_reinit", "SP(%p) Data EP(%p)",
441 	    sp, &sp->session_dataep);
442 
443 	sp->session_lgid = lgid;
444 
445 	RDS_DPRINTF2("rds_session_reinit", "Return: SP(0x%p)", sp);
446 
447 	return (0);
448 }
449 
450 static int
451 rds_session_connect(rds_session_t *sp)
452 {
453 	ibt_channel_hdl_t	ctrlchan, datachan;
454 	rds_ep_t		*ep;
455 	ibt_path_info_t		pinfo;
456 	ibt_path_attr_t		pattr;
457 	ib_gid_t		lgid, rgid;
458 	int			ret;
459 
460 	RDS_DPRINTF2("rds_session_connect", "Enter SP(%p)", sp);
461 
462 	rw_enter(&sp->session_lock, RW_READER);
463 	rgid = sp->session_rgid;
464 	lgid = sp->session_lgid;
465 	rw_exit(&sp->session_lock);
466 
467 	/* get paths to the destination */
468 	bzero(&pattr, sizeof (ibt_path_attr_t));
469 	pattr.pa_dgids = &rgid;
470 	pattr.pa_sgid = lgid;
471 	pattr.pa_sd_flags = IBT_NO_SDATA;
472 	pattr.pa_num_dgids = 1;
473 	ret = ibt_get_paths(rdsib_statep->rds_ibhdl, IBT_PATH_NO_FLAGS,
474 	    &pattr, 1, &pinfo, NULL);
475 	if (ret != IBT_SUCCESS) {
476 		RDS_DPRINTF2(LABEL, "ibt_get_paths failed: %d", ret);
477 		return (-1);
478 	}
479 	pinfo.pi_sid = RDS_SERVICE_ID;
480 
481 	/* Override the packet life time based on the conf file */
482 	if (IBPktLifeTime != 0) {
483 		pinfo.pi_prim_cep_path.cep_cm_opaque1 = IBPktLifeTime;
484 	}
485 
486 	/* Session type may change if we run into peer-to-peer case. */
487 	rw_enter(&sp->session_lock, RW_READER);
488 	if (sp->session_type == RDS_SESSION_PASSIVE) {
489 		RDS_DPRINTF2("rds_session_connect", "SP(%p) is no longer the "
490 		    "active end", sp);
491 		rw_exit(&sp->session_lock);
492 		return (0); /* return success */
493 	}
494 	rw_exit(&sp->session_lock);
495 
496 	/* connect the data ep first */
497 	ep = &sp->session_dataep;
498 	mutex_enter(&ep->ep_lock);
499 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
500 		ep->ep_state = RDS_EP_STATE_ACTIVE_PENDING;
501 		mutex_exit(&ep->ep_lock);
502 		ret = rds_open_rc_channel(ep, &pinfo, IBT_BLOCKING, &datachan);
503 		if (ret != IBT_SUCCESS) {
504 			RDS_DPRINTF2(LABEL, "EP(%p): rds_open_rc_channel "
505 			    "failed: %d", ep, ret);
506 			return (-1);
507 		}
508 		sp->session_dataep.ep_chanhdl = datachan;
509 	} else {
510 		RDS_DPRINTF2(LABEL, "SP(%p) Data EP(%p) is in "
511 		    "unexpected state: %d", sp, ep, ep->ep_state);
512 		mutex_exit(&ep->ep_lock);
513 		return (-1);
514 	}
515 
516 	RDS_DPRINTF3(LABEL, "SP(%p) EP(%p): Data channel is connected",
517 	    sp, ep);
518 
519 	ep = &sp->session_ctrlep;
520 	mutex_enter(&ep->ep_lock);
521 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
522 		ep->ep_state = RDS_EP_STATE_ACTIVE_PENDING;
523 		mutex_exit(&ep->ep_lock);
524 		ret = rds_open_rc_channel(ep, &pinfo, IBT_BLOCKING, &ctrlchan);
525 		if (ret != IBT_SUCCESS) {
526 			RDS_DPRINTF2(LABEL, "EP(%p): rds_open_rc_channel "
527 			    "failed: %d", ep, ret);
528 			return (-1);
529 		}
530 		sp->session_ctrlep.ep_chanhdl = ctrlchan;
531 	} else {
532 		RDS_DPRINTF2(LABEL, "SP(%p) Control EP(%p) is in "
533 		    "unexpected state: %d", sp, ep, ep->ep_state);
534 		mutex_exit(&ep->ep_lock);
535 		return (-1);
536 	}
537 
538 	RDS_DPRINTF2(LABEL, "Session (%p) 0x%x <--> 0x%x is CONNECTED",
539 	    sp, sp->session_myip, sp->session_remip);
540 
541 	RDS_DPRINTF2("rds_session_connect", "Return SP(%p)", sp);
542 
543 	return (0);
544 }
545 
546 /*
547  * Can be called with or without session_lock.
548  */
549 void
550 rds_session_close(rds_session_t *sp, ibt_execution_mode_t mode, uint_t wait)
551 {
552 	rds_ep_t		*ep;
553 
554 	RDS_DPRINTF2("rds_session_close", "SP(%p) State: %d", sp,
555 	    sp->session_state);
556 
557 	ep = &sp->session_dataep;
558 	RDS_DPRINTF3(LABEL, "EP(%p) State: %d", ep, ep->ep_state);
559 
560 	/* wait until the SQ is empty before closing */
561 	(void) rds_is_sendq_empty(ep, wait);
562 
563 	mutex_enter(&ep->ep_lock);
564 	while (ep->ep_state == RDS_EP_STATE_CLOSING) {
565 		mutex_exit(&ep->ep_lock);
566 		delay(drv_usectohz(300000));
567 		mutex_enter(&ep->ep_lock);
568 	}
569 
570 	if (ep->ep_state == RDS_EP_STATE_CONNECTED) {
571 		ep->ep_state = RDS_EP_STATE_CLOSING;
572 		mutex_exit(&ep->ep_lock);
573 		(void) rds_close_rc_channel(ep->ep_chanhdl, mode);
574 		mutex_enter(&ep->ep_lock);
575 	}
576 	rds_ep_free_rc_channel(ep);
577 	ep->ep_state = RDS_EP_STATE_UNCONNECTED;
578 	ep->ep_segfbp = NULL;
579 	ep->ep_seglbp = NULL;
580 	mutex_exit(&ep->ep_lock);
581 
582 	ep = &sp->session_ctrlep;
583 	RDS_DPRINTF3(LABEL, "EP(%p) State: %d", ep, ep->ep_state);
584 
585 	/* wait until the SQ is empty before closing */
586 	(void) rds_is_sendq_empty(ep, 1);
587 
588 	mutex_enter(&ep->ep_lock);
589 	while (ep->ep_state == RDS_EP_STATE_CLOSING) {
590 		mutex_exit(&ep->ep_lock);
591 		delay(drv_usectohz(300000));
592 		mutex_enter(&ep->ep_lock);
593 	}
594 
595 	if (ep->ep_state == RDS_EP_STATE_CONNECTED) {
596 		mutex_exit(&ep->ep_lock);
597 		ep->ep_state = RDS_EP_STATE_CLOSING;
598 		(void) rds_close_rc_channel(ep->ep_chanhdl, mode);
599 		mutex_enter(&ep->ep_lock);
600 	}
601 	rds_ep_free_rc_channel(ep);
602 	ep->ep_state = RDS_EP_STATE_UNCONNECTED;
603 	ep->ep_segfbp = NULL;
604 	ep->ep_seglbp = NULL;
605 	mutex_exit(&ep->ep_lock);
606 
607 	RDS_DPRINTF2("rds_session_close", "Return (%p)", sp);
608 }
609 
610 /* Free the session */
611 static void
612 rds_destroy_session(rds_session_t *sp)
613 {
614 	rds_ep_t	*ep;
615 	rds_bufpool_t	*pool;
616 
617 	ASSERT((sp->session_state == RDS_SESSION_STATE_CLOSED) ||
618 	    (sp->session_state == RDS_SESSION_STATE_FAILED) ||
619 	    (sp->session_state == RDS_SESSION_STATE_FINI) ||
620 	    (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING));
621 
622 	rw_enter(&sp->session_lock, RW_READER);
623 	RDS_DPRINTF2("rds_destroy_session", "SP(%p) State: %d", sp,
624 	    sp->session_state);
625 	while (!((sp->session_state == RDS_SESSION_STATE_CLOSED) ||
626 	    (sp->session_state == RDS_SESSION_STATE_FAILED) ||
627 	    (sp->session_state == RDS_SESSION_STATE_FINI))) {
628 		rw_exit(&sp->session_lock);
629 		delay(drv_usectohz(1000000));
630 		rw_enter(&sp->session_lock, RW_READER);
631 		RDS_DPRINTF2("rds_destroy_session", "SP(%p) State: %d WAITING "
632 		    "ON SESSION", sp, sp->session_state);
633 	}
634 	rw_exit(&sp->session_lock);
635 
636 	/* data channel */
637 	ep = &sp->session_dataep;
638 
639 	/* send pool locks */
640 	pool = &ep->ep_sndpool;
641 	cv_destroy(&pool->pool_cv);
642 	mutex_destroy(&pool->pool_lock);
643 
644 	/* recv pool locks */
645 	pool = &ep->ep_rcvpool;
646 	cv_destroy(&pool->pool_cv);
647 	mutex_destroy(&pool->pool_lock);
648 	mutex_destroy(&ep->ep_recvqp.qp_lock);
649 
650 	/* control channel */
651 	ep = &sp->session_ctrlep;
652 
653 	/* send pool locks */
654 	pool = &ep->ep_sndpool;
655 	cv_destroy(&pool->pool_cv);
656 	mutex_destroy(&pool->pool_lock);
657 
658 	/* recv pool locks */
659 	pool = &ep->ep_rcvpool;
660 	cv_destroy(&pool->pool_cv);
661 	mutex_destroy(&pool->pool_lock);
662 	mutex_destroy(&ep->ep_recvqp.qp_lock);
663 
664 	/* session */
665 	rw_destroy(&sp->session_lock);
666 	rw_destroy(&sp->session_portmap_lock);
667 
668 	/* free the session */
669 	kmem_free(sp, sizeof (rds_session_t));
670 
671 	RDS_DPRINTF2("rds_destroy_session", "SP(%p) Return", sp);
672 }
673 
674 /* This is called on the taskq thread */
675 static void
676 rds_failover_session(void *arg)
677 {
678 	rds_session_t	*sp = (rds_session_t *)arg;
679 	ib_gid_t	lgid, rgid;
680 	ipaddr_t	myip, remip;
681 	int		ret, cnt = 0;
682 
683 	RDS_DPRINTF2("rds_failover_session", "Enter: (%p)", sp);
684 
685 	RDS_INCR_FAILOVERS();
686 
687 	rw_enter(&sp->session_lock, RW_WRITER);
688 	if (sp->session_type != RDS_SESSION_ACTIVE) {
689 		/*
690 		 * The remote side must have seen the error and initiated
691 		 * a re-connect.
692 		 */
693 		RDS_DPRINTF2("rds_failover_session",
694 		    "SP(%p) has become passive", sp);
695 		rw_exit(&sp->session_lock);
696 		return;
697 	}
698 	sp->session_failover++;
699 	rw_exit(&sp->session_lock);
700 
701 	/*
702 	 * The session is in ERROR state but close both channels
703 	 * for a clean start.
704 	 */
705 	rds_session_close(sp, IBT_BLOCKING, 1);
706 
707 	/* wait 1 sec before re-connecting */
708 	delay(drv_usectohz(1000000));
709 
710 	do {
711 		/* The ipaddr should be in the network order */
712 		myip = sp->session_myip;
713 		remip = sp->session_remip;
714 		ret = rds_sc_path_lookup(&myip, &remip);
715 		if (ret == 0) {
716 			RDS_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
717 			    myip, remip);
718 		}
719 		/* check if we have (new) path from the source to destination */
720 		ret = rds_get_ibaddr(htonl(myip), htonl(remip), &lgid, &rgid);
721 		if (ret == 0) {
722 			break;
723 		}
724 
725 		RDS_DPRINTF1(LABEL, "rds_get_ibaddr failed: %d", ret);
726 		/* wait 1 sec before re-trying */
727 		delay(drv_usectohz(1000000));
728 		cnt++;
729 	} while (cnt < 3);
730 
731 	if (ret != 0) {
732 		rw_enter(&sp->session_lock, RW_WRITER);
733 		if (sp->session_type == RDS_SESSION_ACTIVE) {
734 			rds_session_fini(sp);
735 			sp->session_state = RDS_SESSION_STATE_FAILED;
736 			RDS_DPRINTF3("rds_failover_session",
737 			    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
738 		} else {
739 			RDS_DPRINTF2("rds_failover_session",
740 			    "SP(%p) has become passive", sp);
741 		}
742 		rw_exit(&sp->session_lock);
743 		return;
744 	}
745 
746 	RDS_DPRINTF2(LABEL, "lgid: %llx:%llx rgid: %llx:%llx",
747 	    lgid.gid_prefix, lgid.gid_guid, rgid.gid_prefix,
748 	    rgid.gid_guid);
749 
750 	rw_enter(&sp->session_lock, RW_WRITER);
751 	if (sp->session_type != RDS_SESSION_ACTIVE) {
752 		/*
753 		 * The remote side must have seen the error and initiated
754 		 * a re-connect.
755 		 */
756 		RDS_DPRINTF2("rds_failover_session",
757 		    "SP(%p) has become passive", sp);
758 		rw_exit(&sp->session_lock);
759 		return;
760 	}
761 
762 	/* move the session to init state */
763 	ret = rds_session_reinit(sp, lgid);
764 	sp->session_lgid = lgid;
765 	sp->session_rgid = rgid;
766 	if (ret != 0) {
767 		rds_session_fini(sp);
768 		sp->session_state = RDS_SESSION_STATE_FAILED;
769 		RDS_DPRINTF3("rds_failover_session",
770 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
771 		rw_exit(&sp->session_lock);
772 		return;
773 	} else {
774 		sp->session_state = RDS_SESSION_STATE_INIT;
775 		RDS_DPRINTF3("rds_failover_session",
776 		    "SP(%p) State RDS_SESSION_STATE_INIT", sp);
777 	}
778 	rw_exit(&sp->session_lock);
779 
780 	rds_session_open(sp);
781 
782 	RDS_DPRINTF2("rds_failover_session", "Return: (%p)", sp);
783 }
784 
785 void
786 rds_handle_send_error(rds_ep_t *ep)
787 {
788 	if (rds_is_sendq_empty(ep, 0)) {
789 		/* Session should already be in ERROR, try to reconnect */
790 		RDS_DPRINTF2("rds_handle_send_error",
791 		    "Dispatching taskq to failover SP(%p)", ep->ep_sp);
792 		(void) ddi_taskq_dispatch(rds_taskq, rds_failover_session,
793 		    (void *)ep->ep_sp, DDI_SLEEP);
794 	}
795 }
796 
797 /*
798  * Called in the CM handler on the passive side
799  * Called on a taskq thread.
800  */
801 void
802 rds_cleanup_passive_session(void *arg)
803 {
804 	rds_session_t	*sp = arg;
805 
806 	RDS_DPRINTF2("rds_cleanup_passive_session", "SP(%p) State: %d", sp,
807 	    sp->session_state);
808 	ASSERT((sp->session_state == RDS_SESSION_STATE_CLOSED) ||
809 	    (sp->session_state == RDS_SESSION_STATE_ERROR));
810 
811 	rds_session_close(sp, IBT_BLOCKING, 1);
812 
813 	rw_enter(&sp->session_lock, RW_WRITER);
814 	if (sp->session_state == RDS_SESSION_STATE_CLOSED) {
815 		rds_session_fini(sp);
816 		sp->session_state = RDS_SESSION_STATE_FINI;
817 		RDS_DPRINTF3("rds_cleanup_passive_session",
818 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
819 	} else if (sp->session_state == RDS_SESSION_STATE_ERROR) {
820 		rds_session_fini(sp);
821 		sp->session_state = RDS_SESSION_STATE_FAILED;
822 		RDS_DPRINTF3("rds_cleanup_passive_session",
823 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
824 	}
825 	rw_exit(&sp->session_lock);
826 
827 	RDS_DPRINTF2("rds_cleanup_passive_session", "Return: SP (%p)", sp);
828 }
829 
830 /*
831  * Called by the CM handler on the passive side
832  * Called with WRITE lock on the session
833  */
834 void
835 rds_passive_session_fini(rds_session_t *sp)
836 {
837 	rds_ep_t	*ep;
838 
839 	RDS_DPRINTF2("rds_passive_session_fini", "SP(%p) State: %d", sp,
840 	    sp->session_state);
841 	ASSERT((sp->session_state == RDS_SESSION_STATE_CLOSED) ||
842 	    (sp->session_state == RDS_SESSION_STATE_ERROR));
843 
844 	/* clean the data channel */
845 	ep = &sp->session_dataep;
846 	(void) rds_is_sendq_empty(ep, 1);
847 	mutex_enter(&ep->ep_lock);
848 	RDS_DPRINTF2("rds_passive_session_fini", "EP(%p) State: %d", ep,
849 	    ep->ep_state);
850 	rds_ep_free_rc_channel(ep);
851 	mutex_exit(&ep->ep_lock);
852 
853 	/* clean the control channel */
854 	ep = &sp->session_ctrlep;
855 	(void) rds_is_sendq_empty(ep, 1);
856 	mutex_enter(&ep->ep_lock);
857 	RDS_DPRINTF2("rds_passive_session_fini", "EP(%p) State: %d", ep,
858 	    ep->ep_state);
859 	rds_ep_free_rc_channel(ep);
860 	mutex_exit(&ep->ep_lock);
861 
862 	rds_session_fini(sp);
863 
864 	RDS_DPRINTF2("rds_passive_session_fini", "Return: SP (%p)", sp);
865 }
866 
867 /*
868  * Can be called:
869  * 1. on driver detach
870  * 2. on taskq thread
871  * arg is always NULL
872  */
873 /* ARGSUSED */
874 void
875 rds_close_sessions(void *arg)
876 {
877 	rds_session_t *sp, *spnextp;
878 
879 	RDS_DPRINTF2("rds_close_sessions", "Enter");
880 
881 	/* wait until all the buffers are freed by the sockets */
882 	while (RDS_GET_RXPKTS_PEND() != 0) {
883 		/* wait one second and try again */
884 		RDS_DPRINTF2("rds_close_sessions", "waiting on "
885 		    "pending packets", RDS_GET_RXPKTS_PEND());
886 		delay(drv_usectohz(1000000));
887 	}
888 	RDS_DPRINTF2("rds_close_sessions", "No more RX packets pending");
889 
890 	/* close all the sessions */
891 	rw_enter(&rdsib_statep->rds_sessionlock, RW_WRITER);
892 	sp = rdsib_statep->rds_sessionlistp;
893 	while (sp) {
894 		rw_enter(&sp->session_lock, RW_WRITER);
895 		RDS_DPRINTF2("rds_close_sessions", "SP(%p) State: %d", sp,
896 		    sp->session_state);
897 
898 		switch (sp->session_state) {
899 		case RDS_SESSION_STATE_CONNECTED:
900 			sp->session_state = RDS_SESSION_STATE_ACTIVE_CLOSING;
901 			rw_exit(&sp->session_lock);
902 
903 			rds_session_close(sp, IBT_BLOCKING, 2);
904 
905 			rw_enter(&sp->session_lock, RW_WRITER);
906 			sp->session_state = RDS_SESSION_STATE_CLOSED;
907 			RDS_DPRINTF3("rds_close_sessions",
908 			    "SP(%p) State RDS_SESSION_STATE_CLOSED", sp);
909 			rds_session_fini(sp);
910 			sp->session_state = RDS_SESSION_STATE_FINI;
911 			RDS_DPRINTF3("rds_close_sessions",
912 			    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
913 			break;
914 
915 		case RDS_SESSION_STATE_ERROR:
916 		case RDS_SESSION_STATE_PASSIVE_CLOSING:
917 		case RDS_SESSION_STATE_INIT:
918 			sp->session_state = RDS_SESSION_STATE_ACTIVE_CLOSING;
919 			rw_exit(&sp->session_lock);
920 
921 			rds_session_close(sp, IBT_BLOCKING, 1);
922 
923 			rw_enter(&sp->session_lock, RW_WRITER);
924 			sp->session_state = RDS_SESSION_STATE_CLOSED;
925 			RDS_DPRINTF3("rds_close_sessions",
926 			    "SP(%p) State RDS_SESSION_STATE_CLOSED", sp);
927 			/* FALLTHRU */
928 		case RDS_SESSION_STATE_CLOSED:
929 			rds_session_fini(sp);
930 			sp->session_state = RDS_SESSION_STATE_FINI;
931 			RDS_DPRINTF3("rds_close_sessions",
932 			    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
933 			break;
934 		}
935 
936 		rw_exit(&sp->session_lock);
937 		sp = sp->session_nextp;
938 	}
939 
940 	sp = rdsib_statep->rds_sessionlistp;
941 	rdsib_statep->rds_sessionlistp = NULL;
942 	rdsib_statep->rds_nsessions = 0;
943 	rw_exit(&rdsib_statep->rds_sessionlock);
944 
945 	while (sp) {
946 		spnextp = sp->session_nextp;
947 		rds_destroy_session(sp);
948 		RDS_DECR_SESS();
949 		sp = spnextp;
950 	}
951 
952 	/* free the global pool */
953 	rds_free_recv_caches(rdsib_statep);
954 
955 	RDS_DPRINTF2("rds_close_sessions", "Return");
956 }
957 
958 void
959 rds_session_open(rds_session_t *sp)
960 {
961 	int		ret;
962 
963 	RDS_DPRINTF2("rds_session_open", "Enter SP(%p)", sp);
964 
965 	ret = rds_session_connect(sp);
966 	if (ret == -1) {
967 		/*
968 		 * may be the session has become passive due to
969 		 * hitting peer-to-peer case
970 		 */
971 		rw_enter(&sp->session_lock, RW_READER);
972 		if (sp->session_type == RDS_SESSION_PASSIVE) {
973 			RDS_DPRINTF2("rds_session_open", "SP(%p) "
974 			    "has become passive from active", sp);
975 			rw_exit(&sp->session_lock);
976 			return;
977 		}
978 
979 		/* get the lock for writing */
980 		rw_exit(&sp->session_lock);
981 		rw_enter(&sp->session_lock, RW_WRITER);
982 		sp->session_state = RDS_SESSION_STATE_ERROR;
983 		RDS_DPRINTF3("rds_session_open",
984 		    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
985 		rw_exit(&sp->session_lock);
986 
987 		/* Connect request failed */
988 		rds_session_close(sp, IBT_BLOCKING, 1);
989 
990 		rw_enter(&sp->session_lock, RW_WRITER);
991 		rds_session_fini(sp);
992 		sp->session_state = RDS_SESSION_STATE_FAILED;
993 		RDS_DPRINTF3("rds_session_open",
994 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
995 		rw_exit(&sp->session_lock);
996 
997 		return;
998 	}
999 
1000 	RDS_DPRINTF2("rds_session_open", "Return: SP(%p)", sp);
1001 }
1002 
1003 /*
1004  * Creates a session and inserts it into the list of sessions. The session
1005  * state would be CREATED.
1006  * Return Values:
1007  *	EWOULDBLOCK
1008  */
1009 rds_session_t *
1010 rds_session_create(rds_state_t *statep, ipaddr_t localip, ipaddr_t remip,
1011     ibt_cm_req_rcv_t *reqp, uint8_t type)
1012 {
1013 	ib_gid_t	lgid, rgid;
1014 	rds_session_t	*newp, *oldp;
1015 	rds_ep_t	*dataep, *ctrlep;
1016 	rds_bufpool_t	*pool;
1017 	rds_hca_t	*hcap;
1018 	int		ret;
1019 
1020 	RDS_DPRINTF2("rds_session_create", "Enter: 0x%p 0x%x 0x%x",
1021 	    statep, localip, remip);
1022 
1023 	/* Allocate and initialize global buffer pool */
1024 	ret = rds_init_recv_caches(statep);
1025 	if (ret != 0) {
1026 		RDS_DPRINTF2(LABEL, "Buffer Cache Initialization failed");
1027 		return (NULL);
1028 	}
1029 
1030 	/* enough memory for session (includes 2 endpoints) */
1031 	newp = kmem_zalloc(sizeof (rds_session_t), KM_SLEEP);
1032 
1033 	newp->session_remip = remip;
1034 	newp->session_myip = localip;
1035 	newp->session_type = type;
1036 	newp->session_state = RDS_SESSION_STATE_CREATED;
1037 	RDS_DPRINTF3("rds_session_create",
1038 	    "SP(%p) State RDS_SESSION_STATE_CREATED", newp);
1039 	rw_init(&newp->session_lock, NULL, RW_DRIVER, NULL);
1040 	rw_init(&newp->session_portmap_lock, NULL, RW_DRIVER, NULL);
1041 
1042 	/* Initialize data endpoint */
1043 	dataep = &newp->session_dataep;
1044 	dataep->ep_remip = newp->session_remip;
1045 	dataep->ep_myip = newp->session_myip;
1046 	dataep->ep_state = RDS_EP_STATE_UNCONNECTED;
1047 	dataep->ep_sp = newp;
1048 	dataep->ep_type = RDS_EP_TYPE_DATA;
1049 	mutex_init(&dataep->ep_lock, NULL, MUTEX_DRIVER, NULL);
1050 
1051 	/* Initialize send pool locks */
1052 	pool = &dataep->ep_sndpool;
1053 	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
1054 	cv_init(&pool->pool_cv, NULL, CV_DRIVER, NULL);
1055 
1056 	/* Initialize recv pool locks */
1057 	pool = &dataep->ep_rcvpool;
1058 	mutex_init(&dataep->ep_recvqp.qp_lock, NULL, MUTEX_DRIVER, NULL);
1059 	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
1060 	cv_init(&pool->pool_cv, NULL, CV_DRIVER, NULL);
1061 
1062 	/* Initialize control endpoint */
1063 	ctrlep = &newp->session_ctrlep;
1064 	ctrlep->ep_remip = newp->session_remip;
1065 	ctrlep->ep_myip = newp->session_myip;
1066 	ctrlep->ep_state = RDS_EP_STATE_UNCONNECTED;
1067 	ctrlep->ep_sp = newp;
1068 	ctrlep->ep_type = RDS_EP_TYPE_CTRL;
1069 	mutex_init(&ctrlep->ep_lock, NULL, MUTEX_DRIVER, NULL);
1070 
1071 	/* Initialize send pool locks */
1072 	pool = &ctrlep->ep_sndpool;
1073 	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
1074 	cv_init(&pool->pool_cv, NULL, CV_DRIVER, NULL);
1075 
1076 	/* Initialize recv pool locks */
1077 	pool = &ctrlep->ep_rcvpool;
1078 	mutex_init(&ctrlep->ep_recvqp.qp_lock, NULL, MUTEX_DRIVER, NULL);
1079 	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
1080 	cv_init(&pool->pool_cv, NULL, CV_DRIVER, NULL);
1081 
1082 	/* lkup if there is already a session */
1083 	rw_enter(&statep->rds_sessionlock, RW_WRITER);
1084 	oldp = rds_session_lkup(statep, remip, 0);
1085 	if (oldp != NULL) {
1086 		/* A session to this destination exists */
1087 		rw_exit(&statep->rds_sessionlock);
1088 		rw_destroy(&newp->session_lock);
1089 		rw_destroy(&newp->session_portmap_lock);
1090 		mutex_destroy(&dataep->ep_lock);
1091 		mutex_destroy(&ctrlep->ep_lock);
1092 		kmem_free(newp, sizeof (rds_session_t));
1093 		return (NULL);
1094 	}
1095 
1096 	/* Insert this session into the list */
1097 	rds_add_session(newp, B_TRUE);
1098 
1099 	/* unlock the session list */
1100 	rw_exit(&statep->rds_sessionlock);
1101 
1102 	if (type == RDS_SESSION_ACTIVE) {
1103 		ipaddr_t localip1, remip1;
1104 
1105 		/* The ipaddr should be in the network order */
1106 		localip1 = localip;
1107 		remip1 = remip;
1108 		ret = rds_sc_path_lookup(&localip1, &remip1);
1109 		if (ret == 0) {
1110 			RDS_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
1111 			    localip, remip);
1112 		}
1113 
1114 		/* Get the gids for the source and destination ip addrs */
1115 		ret = rds_get_ibaddr(ntohl(localip1), ntohl(remip1),
1116 		    &lgid, &rgid);
1117 		if (ret != 0) {
1118 			RDS_DPRINTF1(LABEL, "rds_get_ibaddr failed: %d", ret);
1119 			RDS_SESSION_TRANSITION(newp, RDS_SESSION_STATE_FAILED);
1120 			return (NULL);
1121 		}
1122 
1123 		RDS_DPRINTF2(LABEL, "lgid: %llx:%llx rgid: %llx:%llx",
1124 		    lgid.gid_prefix, lgid.gid_guid, rgid.gid_prefix,
1125 		    rgid.gid_guid);
1126 	}
1127 
1128 	rw_enter(&newp->session_lock, RW_WRITER);
1129 	/* check for peer-to-peer case */
1130 	if (type == newp->session_type) {
1131 		/* no peer-to-peer case */
1132 		if (type == RDS_SESSION_ACTIVE) {
1133 			newp->session_lgid = lgid;
1134 			newp->session_rgid = rgid;
1135 		} else {
1136 			/* rgid is requester gid & lgid is receiver gid */
1137 			newp->session_rgid = reqp->req_prim_addr.av_dgid;
1138 			newp->session_lgid = reqp->req_prim_addr.av_sgid;
1139 		}
1140 
1141 		hcap = rds_gid_to_hcap(statep, newp->session_lgid);
1142 		if (hcap == NULL) {
1143 			RDS_DPRINTF1(LABEL, "SGID is on an uninitialized "
1144 			    "HCA: %llx", newp->session_lgid.gid_guid);
1145 			newp->session_state = RDS_SESSION_STATE_FAILED;
1146 			RDS_DPRINTF3("rds_session_create",
1147 			    "SP(%p) State RDS_SESSION_STATE_FAILED", newp);
1148 			rw_exit(&newp->session_lock);
1149 			return (NULL);
1150 		}
1151 		dataep->ep_hca_guid = hcap->hca_guid;
1152 		ctrlep->ep_hca_guid = hcap->hca_guid;
1153 	}
1154 	rw_exit(&newp->session_lock);
1155 
1156 	RDS_DPRINTF2("rds_session_create", "Return SP(%p)", newp);
1157 
1158 	return (newp);
1159 }
1160 
1161 void
1162 rds_handle_control_message(rds_session_t *sp, rds_ctrl_pkt_t *cpkt)
1163 {
1164 	cpkt->rcp_port = cpkt->rcp_port;
1165 	RDS_DPRINTF4("rds_handle_control_message", "Enter: SP(%p) code: %d "
1166 	    "port: %d", sp, cpkt->rcp_code, cpkt->rcp_port);
1167 
1168 	switch (cpkt->rcp_code) {
1169 	case RDS_CTRL_CODE_STALL:
1170 		RDS_INCR_STALLS_RCVD();
1171 		(void) rds_check_n_mark_port(sp, cpkt->rcp_port);
1172 		break;
1173 	case RDS_CTRL_CODE_UNSTALL:
1174 		RDS_INCR_UNSTALLS_RCVD();
1175 		(void) rds_check_n_unmark_port(sp, cpkt->rcp_port);
1176 		break;
1177 	case RDS_CTRL_CODE_STALL_PORTS:
1178 		rds_mark_all_ports(sp);
1179 		break;
1180 	case RDS_CTRL_CODE_UNSTALL_PORTS:
1181 		rds_unmark_all_ports(sp);
1182 		break;
1183 	case RDS_CTRL_CODE_HEARTBEAT:
1184 		break;
1185 	default:
1186 		RDS_DPRINTF2(LABEL, "ERROR: Invalid Control code: %d",
1187 		    cpkt->rcp_code);
1188 		break;
1189 	}
1190 
1191 	RDS_DPRINTF4("rds_handle_control_message", "Return");
1192 }
1193 
1194 void
1195 rds_post_control_message(rds_session_t *sp, rds_ctrl_pkt_t *cpkt)
1196 {
1197 	ibt_send_wr_t	wr;
1198 	rds_ep_t	*ep;
1199 	rds_buf_t	*bp;
1200 	rds_ctrl_pkt_t	*cp;
1201 	int		ret;
1202 
1203 	RDS_DPRINTF4("rds_post_control_message", "Enter: SP(%p) Code: %d "
1204 	    "Port: %d", sp, cpkt->rcp_code, cpkt->rcp_port);
1205 
1206 	ep = &sp->session_ctrlep;
1207 
1208 	bp = rds_get_send_buf(ep, 1);
1209 	if (bp == NULL) {
1210 		RDS_DPRINTF2(LABEL, "No buffers available to send control "
1211 		    "message: SP(%p) Code: %d Port: %d", sp, cpkt->rcp_code,
1212 		    cpkt->rcp_port);
1213 		return;
1214 	}
1215 
1216 	cp = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
1217 	cp->rcp_code = cpkt->rcp_code;
1218 	cp->rcp_port = cpkt->rcp_port;
1219 	bp->buf_ds.ds_len = RDS_CTRLPKT_SIZE;
1220 
1221 	wr.wr_id = (uintptr_t)bp;
1222 	wr.wr_flags = IBT_WR_SEND_SOLICIT;
1223 	wr.wr_trans = IBT_RC_SRV;
1224 	wr.wr_opcode = IBT_WRC_SEND;
1225 	wr.wr_nds = 1;
1226 	wr.wr_sgl = &bp->buf_ds;
1227 	RDS_DPRINTF5(LABEL, "ds_va %p ds_len %d ds_lkey 0x%llx",
1228 	    bp->buf_ds.ds_va, bp->buf_ds.ds_len, bp->buf_ds.ds_key);
1229 	ret = ibt_post_send(ep->ep_chanhdl, &wr, 1, NULL);
1230 	if (ret != IBT_SUCCESS) {
1231 		RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send failed: "
1232 		    "%d", ep, ret);
1233 		bp->buf_state = RDS_SNDBUF_FREE;
1234 		rds_free_send_buf(ep, bp, NULL, 1, B_FALSE);
1235 		return;
1236 	}
1237 
1238 	RDS_DPRINTF4("rds_post_control_message", "Return SP(%p) Code: %d "
1239 	    "Port: %d", sp, cpkt->rcp_code, cpkt->rcp_port);
1240 }
1241 
1242 void
1243 rds_send_control_message(void *arg)
1244 {
1245 	rds_buf_t	*bp;
1246 	rds_ctrl_pkt_t	*cp;
1247 	rds_session_t	*sp;
1248 	uint_t		ix;
1249 
1250 	RDS_DPRINTF4("rds_send_control_message", "Enter");
1251 
1252 	bp = (rds_buf_t *)arg;
1253 	cp = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
1254 
1255 	/* send the stall message on all sessions */
1256 	rw_enter(&rdsib_statep->rds_sessionlock, RW_READER);
1257 
1258 	sp = rdsib_statep->rds_sessionlistp;
1259 	for (ix = 0; ix < rdsib_statep->rds_nsessions; ix++) {
1260 		ASSERT(sp != NULL);
1261 		if (sp->session_state == RDS_SESSION_STATE_CONNECTED) {
1262 			rds_post_control_message(sp, cp);
1263 		}
1264 
1265 		sp = sp->session_nextp;
1266 	}
1267 
1268 	rw_exit(&rdsib_statep->rds_sessionlock);
1269 
1270 	/* free the arg */
1271 	rds_free_buf(&rds_cpool, bp, 1);
1272 
1273 	RDS_DPRINTF4("rds_send_control_message", "Return");
1274 }
1275 
1276 void
1277 rds_stall_port(in_port_t port)
1278 {
1279 	rds_ctrl_pkt_t	*cpkt;
1280 	rds_buf_t	*bp;
1281 	uint_t		ix;
1282 
1283 	RDS_DPRINTF4("rds_stall_port", "Enter: Port %d", port);
1284 
1285 	RDS_INCR_STALLS_TRIGGERED();
1286 	if (!rds_check_n_mark_port(NULL, port)) {
1287 
1288 		bp = rds_get_buf(&rds_cpool, 1, &ix);
1289 		if (bp == NULL) {
1290 			RDS_DPRINTF2(LABEL, "No buffers available "
1291 			    "to send control message: Code: %d "
1292 			    "Local Port: %d", RDS_CTRL_CODE_STALL, port);
1293 			(void) rds_check_n_unmark_port(NULL, port);
1294 			return;
1295 		}
1296 
1297 		cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
1298 		cpkt->rcp_code = RDS_CTRL_CODE_STALL;
1299 		cpkt->rcp_port = port;
1300 #if 0
1301 		/*
1302 		 * Taskq runs at some later point in time and the port may
1303 		 * not be in stall state anymore at that time.
1304 		 */
1305 		(void) ddi_taskq_dispatch(rds_taskq,
1306 		    rds_send_control_message, (void *)bp, DDI_SLEEP);
1307 #else
1308 		rds_send_control_message((void *)bp);
1309 #endif
1310 		RDS_INCR_STALLS_SENT();
1311 	} else {
1312 		RDS_DPRINTF3(LABEL,
1313 		    "Port %d is already in stall state", port);
1314 	}
1315 
1316 	RDS_DPRINTF4("rds_stall_port", "Return: Port %d", port);
1317 }
1318 
1319 void
1320 rds_resume_port(in_port_t port)
1321 {
1322 	rds_ctrl_pkt_t	*cpkt;
1323 	rds_buf_t	*bp;
1324 	uint_t		ix;
1325 
1326 	RDS_DPRINTF4("rds_resume_port", "Enter: Port %d", port);
1327 
1328 	RDS_INCR_UNSTALLS_TRIGGERED();
1329 	if (rds_check_n_unmark_port(NULL, port)) {
1330 
1331 		bp = rds_get_buf(&rds_cpool, 1, &ix);
1332 		if (bp == NULL) {
1333 			RDS_DPRINTF2(LABEL, "No buffers available "
1334 			    "to send control message: Code: %d "
1335 			    "Local Port: %d", RDS_CTRL_CODE_UNSTALL, port);
1336 			(void) rds_check_n_mark_port(NULL, port);
1337 			return;
1338 		}
1339 
1340 		/* send control message to resume the port for remote traffic */
1341 		cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
1342 		cpkt->rcp_code = RDS_CTRL_CODE_UNSTALL;
1343 		cpkt->rcp_port = port;
1344 		(void) ddi_taskq_dispatch(rds_taskq,
1345 		    rds_send_control_message, (void *)bp, DDI_SLEEP);
1346 		RDS_INCR_UNSTALLS_SENT();
1347 	} else {
1348 		RDS_DPRINTF5(LABEL,
1349 		    "Port %d is not stalled anymore", port);
1350 	}
1351 
1352 	RDS_DPRINTF4("rds_resume_port", "Return: Port %d", port);
1353 }
1354 
1355 static int
1356 rds_build_n_post_msg(rds_ep_t *ep, uio_t *uiop, in_port_t sendport,
1357     in_port_t recvport)
1358 {
1359 	ibt_send_wr_t	*wrp, wr;
1360 	rds_buf_t	*bp, *bp1;
1361 	rds_data_hdr_t	*pktp;
1362 	uint32_t	msgsize, npkts, residual, pktno, ix;
1363 	int		ret;
1364 
1365 	RDS_DPRINTF4("rds_build_n_post_msg", "Enter: EP(%p) UIOP(%p)",
1366 	    ep, uiop);
1367 
1368 	/* how many pkts are needed to carry this msg */
1369 	msgsize = uiop->uio_resid;
1370 	npkts = ((msgsize - 1) / UserBufferSize) + 1;
1371 	residual = ((msgsize - 1) % UserBufferSize) + 1;
1372 
1373 	RDS_DPRINTF5(LABEL, "EP(%p) UIOP(%p) msg size: %d npkts: %d", ep, uiop,
1374 	    msgsize, npkts);
1375 
1376 	/* Get the buffers needed to post this message */
1377 	bp = rds_get_send_buf(ep, npkts);
1378 	if (bp == NULL) {
1379 		RDS_INCR_ENOBUFS();
1380 		return (ENOBUFS);
1381 	}
1382 
1383 	if (npkts > 1) {
1384 		/*
1385 		 * multi-pkt messages are posted at the same time as a list
1386 		 * of WRs
1387 		 */
1388 		wrp = (ibt_send_wr_t *)kmem_zalloc(sizeof (ibt_send_wr_t) *
1389 		    npkts, KM_SLEEP);
1390 	}
1391 
1392 
1393 	pktno = 0;
1394 	bp1 = bp;
1395 	do {
1396 		/* prepare the header */
1397 		pktp = (rds_data_hdr_t *)(uintptr_t)bp1->buf_ds.ds_va;
1398 		pktp->dh_datalen = UserBufferSize;
1399 		pktp->dh_npkts = npkts - pktno;
1400 		pktp->dh_psn = pktno;
1401 		pktp->dh_sendport = sendport;
1402 		pktp->dh_recvport = recvport;
1403 		bp1->buf_ds.ds_len = RdsPktSize;
1404 
1405 		/* copy the data */
1406 		ret = uiomove((uint8_t *)pktp + RDS_DATA_HDR_SZ,
1407 		    UserBufferSize, UIO_WRITE, uiop);
1408 		if (ret != 0) {
1409 			break;
1410 		}
1411 
1412 		if (uiop->uio_resid == 0) {
1413 			pktp->dh_datalen = residual;
1414 			bp1->buf_ds.ds_len = residual + RDS_DATA_HDR_SZ;
1415 			break;
1416 		}
1417 		pktno++;
1418 		bp1 = bp1->buf_nextp;
1419 	} while (uiop->uio_resid);
1420 
1421 	if (ret) {
1422 		/* uiomove failed */
1423 		RDS_DPRINTF2("rds_build_n_post_msg", "UIO(%p) Move FAILED: %d",
1424 		    uiop, ret);
1425 		if (npkts > 1) {
1426 			kmem_free(wrp, npkts * sizeof (ibt_send_wr_t));
1427 		}
1428 		rds_free_send_buf(ep, bp, NULL, npkts, B_FALSE);
1429 		return (ret);
1430 	}
1431 
1432 	if (npkts > 1) {
1433 		/* multi-pkt message */
1434 		RDS_DPRINTF5(LABEL, "EP(%p) Sending Multiple Packets", ep);
1435 
1436 		bp1 = bp;
1437 		for (ix = 0; ix < npkts; ix++) {
1438 			wrp[ix].wr_id = (uintptr_t)bp1;
1439 			wrp[ix].wr_flags = IBT_WR_NO_FLAGS;
1440 			wrp[ix].wr_trans = IBT_RC_SRV;
1441 			wrp[ix].wr_opcode = IBT_WRC_SEND;
1442 			wrp[ix].wr_nds = 1;
1443 			wrp[ix].wr_sgl = &bp1->buf_ds;
1444 			bp1 = bp1->buf_nextp;
1445 		}
1446 		wrp[npkts - 1].wr_flags = IBT_WR_SEND_SOLICIT;
1447 
1448 		ret = ibt_post_send(ep->ep_chanhdl, wrp, npkts, &ix);
1449 		if (ret != IBT_SUCCESS) {
1450 			RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send failed: "
1451 			    "%d for %d pkts", ep, ret, npkts);
1452 			rds_free_send_buf(ep, bp, NULL, npkts, B_FALSE);
1453 			kmem_free(wrp, npkts * sizeof (ibt_send_wr_t));
1454 			return (ret);
1455 		}
1456 
1457 		kmem_free(wrp, npkts * sizeof (ibt_send_wr_t));
1458 	} else {
1459 		/* single pkt */
1460 		RDS_DPRINTF5(LABEL, "EP(%p) Sending Single Packet", ep);
1461 		wr.wr_id = (uintptr_t)bp;
1462 		wr.wr_flags = IBT_WR_SEND_SOLICIT;
1463 		wr.wr_trans = IBT_RC_SRV;
1464 		wr.wr_opcode = IBT_WRC_SEND;
1465 		wr.wr_nds = 1;
1466 		wr.wr_sgl = &bp->buf_ds;
1467 		RDS_DPRINTF5(LABEL, "ds_va %p ds_key 0x%llx ds_len %d ",
1468 		    bp->buf_ds.ds_va, bp->buf_ds.ds_key, bp->buf_ds.ds_len);
1469 		ret = ibt_post_send(ep->ep_chanhdl, &wr, 1, NULL);
1470 		if (ret != IBT_SUCCESS) {
1471 			RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send failed: "
1472 			    "%d", ep, ret);
1473 			rds_free_send_buf(ep, bp, NULL, 1, B_FALSE);
1474 			return (ret);
1475 		}
1476 	}
1477 
1478 	RDS_INCR_TXPKTS(npkts);
1479 	RDS_INCR_TXBYTES(msgsize);
1480 
1481 	RDS_DPRINTF4("rds_build_n_post_msg", "Return: EP(%p) UIOP(%p)",
1482 	    ep, uiop);
1483 
1484 	return (0);
1485 }
1486 
1487 static int
1488 rds_deliver_loopback_msg(uio_t *uiop, ipaddr_t recvip, ipaddr_t sendip,
1489     in_port_t recvport, in_port_t sendport, zoneid_t zoneid)
1490 {
1491 	mblk_t		*mp;
1492 	int		ret;
1493 
1494 	RDS_DPRINTF4("rds_deliver_loopback_msg", "Enter");
1495 
1496 	RDS_DPRINTF3(LABEL, "Loopback message: sendport: "
1497 	    "%d to recvport: %d", sendport, recvport);
1498 
1499 	mp = allocb(uiop->uio_resid, BPRI_MED);
1500 	if (mp == NULL) {
1501 		RDS_DPRINTF2(LABEL, "allocb failed, size: %d\n",
1502 		    uiop->uio_resid);
1503 		return (ENOSPC);
1504 	}
1505 	mp->b_wptr = mp->b_rptr + uiop->uio_resid;
1506 
1507 	ret = uiomove(mp->b_rptr, uiop->uio_resid, UIO_WRITE, uiop);
1508 	if (ret) {
1509 		RDS_DPRINTF2(LABEL, "ERROR: uiomove returned: %d", ret);
1510 		freeb(mp);
1511 		return (ret);
1512 	}
1513 
1514 	ret = rds_deliver_new_msg(mp, recvip, sendip, recvport, sendport,
1515 	    zoneid);
1516 	if (ret != 0) {
1517 		if (ret == ENOSPC) {
1518 			/*
1519 			 * The message is delivered but cannot take more,
1520 			 * stall the port, if it is not already stalled
1521 			 */
1522 			RDS_DPRINTF2(LABEL, "Port %d NO SPACE", recvport);
1523 			rds_stall_port(recvport);
1524 		} else {
1525 			RDS_DPRINTF2(LABEL, "Loopback message: port %d -> "
1526 			    "port %d failed: %d", sendport, recvport, ret);
1527 			return (ret);
1528 		}
1529 	}
1530 
1531 	RDS_DPRINTF4("rds_deliver_loopback_msg", "Return");
1532 	return (0);
1533 }
1534 
1535 static void
1536 rds_resend_messages(void *arg)
1537 {
1538 	rds_session_t	*sp = (rds_session_t *)arg;
1539 	rds_ep_t	*ep;
1540 	rds_bufpool_t	*spool;
1541 	rds_buf_t	*bp, *endp, *tmp;
1542 	ibt_send_wr_t	*wrp;
1543 	uint_t		nwr = 0, ix, jx;
1544 	int		ret;
1545 
1546 	RDS_DPRINTF2("rds_resend_messages", "Enter: SP(%p)", sp);
1547 
1548 	ep = &sp->session_dataep;
1549 
1550 	spool = &ep->ep_sndpool;
1551 	mutex_enter(&spool->pool_lock);
1552 
1553 	ASSERT(spool->pool_nfree == spool->pool_nbuffers);
1554 
1555 	if (ep->ep_lbufid == NULL) {
1556 		RDS_DPRINTF2("rds_resend_messages",
1557 		    "SP(%p) Remote session is cleaned up ", sp);
1558 		/*
1559 		 * The remote end cleaned up its session. There may be loss
1560 		 * of messages. Mark all buffers as acknowledged.
1561 		 */
1562 		tmp = spool->pool_tailp;
1563 	} else {
1564 		tmp = (rds_buf_t *)ep->ep_lbufid;
1565 		RDS_DPRINTF2("rds_resend_messages",
1566 		    "SP(%p) Last successful BP(%p) ", sp, tmp);
1567 	}
1568 
1569 	endp = spool->pool_tailp;
1570 	bp = spool->pool_headp;
1571 	jx = 0;
1572 	while ((bp != NULL) && (bp != tmp)) {
1573 		bp->buf_state = RDS_SNDBUF_FREE;
1574 		jx++;
1575 		bp = bp->buf_nextp;
1576 	}
1577 
1578 	if (bp == NULL) {
1579 		mutex_exit(&spool->pool_lock);
1580 		RDS_DPRINTF2("rds_resend_messages", "Alert: lbufid(%p) is not "
1581 		    "found in the list", tmp);
1582 
1583 		rw_enter(&sp->session_lock, RW_WRITER);
1584 		if (sp->session_state == RDS_SESSION_STATE_INIT) {
1585 			sp->session_state = RDS_SESSION_STATE_CONNECTED;
1586 		} else {
1587 			RDS_DPRINTF2("rds_resend_messages", "SP(%p) State: %d "
1588 			    "Expected State: %d", sp, sp->session_state,
1589 			    RDS_SESSION_STATE_CONNECTED);
1590 		}
1591 		sp->session_failover--;
1592 		rw_exit(&sp->session_lock);
1593 		return;
1594 	}
1595 
1596 	/* Found the match */
1597 	bp->buf_state = RDS_SNDBUF_FREE;
1598 	jx++;
1599 
1600 	spool->pool_tailp = bp;
1601 	bp = bp->buf_nextp;
1602 	spool->pool_tailp->buf_nextp = NULL;
1603 	nwr = spool->pool_nfree - jx;
1604 	spool->pool_nfree = jx;
1605 	mutex_exit(&spool->pool_lock);
1606 
1607 	RDS_DPRINTF2("rds_resend_messages", "SP(%p): Number of "
1608 	    "bufs (BP %p) to re-send: %d", sp, bp, nwr);
1609 
1610 	if (bp) {
1611 		wrp = (ibt_send_wr_t *)kmem_zalloc(sizeof (ibt_send_wr_t) * 100,
1612 		    KM_SLEEP);
1613 
1614 		while (nwr) {
1615 			jx = (nwr > 100) ? 100 : nwr;
1616 
1617 			tmp = bp;
1618 			for (ix = 0; ix < jx; ix++) {
1619 				bp->buf_state = RDS_SNDBUF_PENDING;
1620 				wrp[ix].wr_id = (uintptr_t)bp;
1621 				wrp[ix].wr_flags = IBT_WR_SEND_SOLICIT;
1622 				wrp[ix].wr_trans = IBT_RC_SRV;
1623 				wrp[ix].wr_opcode = IBT_WRC_SEND;
1624 				wrp[ix].wr_nds = 1;
1625 				wrp[ix].wr_sgl = &bp->buf_ds;
1626 				bp = bp->buf_nextp;
1627 			}
1628 
1629 			ret = ibt_post_send(ep->ep_chanhdl, wrp, jx, &ix);
1630 			if (ret != IBT_SUCCESS) {
1631 				RDS_DPRINTF2(LABEL, "EP(%p): ibt_post_send "
1632 				    "failed: %d for % pkts", ep, ret, jx);
1633 				break;
1634 			}
1635 
1636 			mutex_enter(&spool->pool_lock);
1637 			spool->pool_nbusy += jx;
1638 			mutex_exit(&spool->pool_lock);
1639 
1640 			nwr -= jx;
1641 		}
1642 
1643 		kmem_free(wrp, sizeof (ibt_send_wr_t) * 100);
1644 
1645 		if (nwr != 0) {
1646 
1647 			/*
1648 			 * An error while failover is in progress. Some WRs are
1649 			 * posted while other remain. If any of the posted WRs
1650 			 * complete in error then they would dispatch a taskq to
1651 			 * do a failover. Getting the session lock will prevent
1652 			 * the taskq to wait until we are done here.
1653 			 */
1654 			rw_enter(&sp->session_lock, RW_READER);
1655 
1656 			/*
1657 			 * Wait until all the previous WRs are completed and
1658 			 * then queue the remaining, otherwise the order of
1659 			 * the messages may change.
1660 			 */
1661 			(void) rds_is_sendq_empty(ep, 1);
1662 
1663 			/* free the remaining buffers */
1664 			rds_free_send_buf(ep, tmp, endp, nwr, B_FALSE);
1665 
1666 			rw_exit(&sp->session_lock);
1667 			return;
1668 		}
1669 	}
1670 
1671 	rw_enter(&sp->session_lock, RW_WRITER);
1672 	if (sp->session_state == RDS_SESSION_STATE_INIT) {
1673 		sp->session_state = RDS_SESSION_STATE_CONNECTED;
1674 	} else {
1675 		RDS_DPRINTF2("rds_resend_messages", "SP(%p) State: %d "
1676 		    "Expected State: %d", sp, sp->session_state,
1677 		    RDS_SESSION_STATE_CONNECTED);
1678 	}
1679 	sp->session_failover--;
1680 	rw_exit(&sp->session_lock);
1681 
1682 	RDS_DPRINTF2("rds_resend_messages", "Return: SP(%p)", sp);
1683 }
1684 
1685 /*
1686  * This is called when a channel is connected. Transition the session to
1687  * CONNECTED state iff both channels are connected.
1688  */
1689 void
1690 rds_session_active(rds_session_t *sp)
1691 {
1692 	rds_ep_t	*ep;
1693 	uint_t		failover;
1694 
1695 	RDS_DPRINTF2("rds_session_active", "Enter: 0x%p", sp);
1696 
1697 	rw_enter(&sp->session_lock, RW_READER);
1698 
1699 	failover = sp->session_failover;
1700 
1701 	/*
1702 	 * we establish the data channel first, so check the control channel
1703 	 * first but make sure it is initialized.
1704 	 */
1705 	ep = &sp->session_ctrlep;
1706 	mutex_enter(&ep->ep_lock);
1707 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
1708 		/* the session is not ready yet */
1709 		mutex_exit(&ep->ep_lock);
1710 		rw_exit(&sp->session_lock);
1711 		return;
1712 	}
1713 	mutex_exit(&ep->ep_lock);
1714 
1715 	/* control channel is connected, check the data channel */
1716 	ep = &sp->session_dataep;
1717 	mutex_enter(&ep->ep_lock);
1718 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
1719 		/* data channel is not yet connected */
1720 		mutex_exit(&ep->ep_lock);
1721 		rw_exit(&sp->session_lock);
1722 		return;
1723 	}
1724 	mutex_exit(&ep->ep_lock);
1725 
1726 	if (failover) {
1727 		rw_exit(&sp->session_lock);
1728 
1729 		/*
1730 		 * The session has failed over. Previous msgs have to be
1731 		 * re-sent before the session is moved to the connected
1732 		 * state.
1733 		 */
1734 		RDS_DPRINTF2("rds_session_active", "SP(%p) Dispatching taskq "
1735 		    "to re-send messages", sp);
1736 		(void) ddi_taskq_dispatch(rds_taskq,
1737 		    rds_resend_messages, (void *)sp, DDI_SLEEP);
1738 		return;
1739 	}
1740 
1741 	/* the session is ready */
1742 	sp->session_state = RDS_SESSION_STATE_CONNECTED;
1743 	RDS_DPRINTF3("rds_session_active",
1744 	    "SP(%p) State RDS_SESSION_STATE_CONNECTED", sp);
1745 
1746 	rw_exit(&sp->session_lock);
1747 
1748 	RDS_DPRINTF2("rds_session_active", "Return: SP(%p) is CONNECTED", sp);
1749 }
1750 
1751 static int
1752 rds_ep_sendmsg(rds_ep_t *ep, uio_t *uiop, in_port_t sendport,
1753     in_port_t recvport)
1754 {
1755 	int	ret;
1756 
1757 	RDS_DPRINTF4("rds_ep_sendmsg", "Enter: EP(%p) sendport: %d recvport: "
1758 	    "%d", ep, sendport, recvport);
1759 
1760 	/* make sure the port is not stalled */
1761 	if (rds_is_port_marked(ep->ep_sp, recvport)) {
1762 		RDS_DPRINTF2(LABEL, "SP(%p) Port:%d is in stall state",
1763 		    ep->ep_sp, recvport);
1764 		RDS_INCR_EWOULDBLOCK();
1765 		ret = ENOMEM;
1766 	} else {
1767 		ret = rds_build_n_post_msg(ep, uiop, sendport, recvport);
1768 	}
1769 
1770 	RDS_DPRINTF4("rds_ep_sendmsg", "Return: EP(%p)", ep);
1771 
1772 	return (ret);
1773 }
1774 
1775 /* Send a message to a destination socket */
1776 int
1777 rds_sendmsg(uio_t *uiop, ipaddr_t sendip, ipaddr_t recvip, in_port_t sendport,
1778     in_port_t recvport, zoneid_t zoneid)
1779 {
1780 	rds_session_t	*sp;
1781 	ib_gid_t	lgid, rgid;
1782 	rds_hca_t	*hcap;
1783 	int		ret;
1784 
1785 	RDS_DPRINTF4("rds_sendmsg", "Enter: uiop: 0x%p, srcIP: 0x%x destIP: "
1786 	    "0x%x sndport: %d recvport: %d", uiop, sendip, recvip,
1787 	    sendport, recvport);
1788 
1789 	/* If msg length is 0, just return success */
1790 	if (uiop->uio_resid == 0) {
1791 		RDS_DPRINTF2("rds_sendmsg", "Zero sized message");
1792 		return (0);
1793 	}
1794 
1795 	/* Is there a session to the destination? */
1796 	rw_enter(&rdsib_statep->rds_sessionlock, RW_READER);
1797 	sp = rds_session_lkup(rdsib_statep, recvip, 0);
1798 	rw_exit(&rdsib_statep->rds_sessionlock);
1799 
1800 	/* Is this a loopback message? */
1801 	if ((sp == NULL) && (rds_islocal(recvip))) {
1802 		/* make sure the port is not stalled */
1803 		if (rds_is_port_marked(NULL, recvport)) {
1804 			RDS_DPRINTF2(LABEL, "Local Port:%d is in stall state",
1805 			    recvport);
1806 			RDS_INCR_EWOULDBLOCK();
1807 			return (ENOMEM);
1808 		}
1809 		ret = rds_deliver_loopback_msg(uiop, recvip, sendip, recvport,
1810 		    sendport, zoneid);
1811 		return (ret);
1812 	}
1813 
1814 	/* Not a loopback message */
1815 	if (sp == NULL) {
1816 		/* There is no session to the destination, create one. */
1817 		RDS_DPRINTF3(LABEL, "There is no session to the destination "
1818 		    "IP: 0x%x", recvip);
1819 		sp = rds_session_create(rdsib_statep, sendip, recvip, NULL,
1820 		    RDS_SESSION_ACTIVE);
1821 		if (sp != NULL) {
1822 			rw_enter(&sp->session_lock, RW_WRITER);
1823 			if (sp->session_type == RDS_SESSION_ACTIVE) {
1824 				ret = rds_session_init(sp);
1825 				if (ret != 0) {
1826 					RDS_DPRINTF2("rds_sendmsg",
1827 					    "SP(%p): rds_session_init failed",
1828 					    sp);
1829 					sp->session_state =
1830 					    RDS_SESSION_STATE_FAILED;
1831 					RDS_DPRINTF3("rds_sendmsg",
1832 					    "SP(%p) State "
1833 					    "RDS_SESSION_STATE_FAILED", sp);
1834 					rw_exit(&sp->session_lock);
1835 					return (EFAULT);
1836 				}
1837 				sp->session_state = RDS_SESSION_STATE_INIT;
1838 				RDS_DPRINTF3("rds_sendmsg",
1839 				    "SP(%p) State "
1840 				    "RDS_SESSION_STATE_INIT", sp);
1841 				rw_exit(&sp->session_lock);
1842 				rds_session_open(sp);
1843 			} else {
1844 				rw_exit(&sp->session_lock);
1845 			}
1846 		} else {
1847 			/* Is a session created for this destination */
1848 			rw_enter(&rdsib_statep->rds_sessionlock, RW_READER);
1849 			sp = rds_session_lkup(rdsib_statep, recvip, 0);
1850 			rw_exit(&rdsib_statep->rds_sessionlock);
1851 			if (sp == NULL) {
1852 				return (EFAULT);
1853 			}
1854 		}
1855 	}
1856 
1857 	/* There is a session to the destination */
1858 	rw_enter(&sp->session_lock, RW_READER);
1859 	if (sp->session_state == RDS_SESSION_STATE_CONNECTED) {
1860 		rw_exit(&sp->session_lock);
1861 
1862 		ret = rds_ep_sendmsg(&sp->session_dataep, uiop, sendport,
1863 		    recvport);
1864 		return (ret);
1865 	} else if ((sp->session_state == RDS_SESSION_STATE_FAILED) ||
1866 	    (sp->session_state == RDS_SESSION_STATE_FINI)) {
1867 		ipaddr_t sendip1, recvip1;
1868 
1869 		RDS_DPRINTF3("rds_sendmsg", "SP(%p) is not connected, State: "
1870 		    "%d", sp);
1871 		rw_exit(&sp->session_lock);
1872 		rw_enter(&sp->session_lock, RW_WRITER);
1873 		if ((sp->session_state == RDS_SESSION_STATE_FAILED) ||
1874 		    (sp->session_state == RDS_SESSION_STATE_FINI)) {
1875 			sp->session_state = RDS_SESSION_STATE_CREATED;
1876 			sp->session_type = RDS_SESSION_ACTIVE;
1877 			RDS_DPRINTF3("rds_sendmsg", "SP(%p) State "
1878 			    "RDS_SESSION_STATE_CREATED", sp);
1879 			rw_exit(&sp->session_lock);
1880 
1881 
1882 			/* The ipaddr should be in the network order */
1883 			sendip1 = sendip;
1884 			recvip1 = recvip;
1885 			ret = rds_sc_path_lookup(&sendip1, &recvip1);
1886 			if (ret == 0) {
1887 				RDS_DPRINTF2(LABEL, "Path not found "
1888 				    "(0x%x 0x%x)", sendip1, recvip1);
1889 			}
1890 
1891 			/* Resolve the IP addresses */
1892 			ret = rds_get_ibaddr(htonl(sendip1), htonl(recvip1),
1893 			    &lgid, &rgid);
1894 			if (ret != 0) {
1895 				RDS_DPRINTF1(LABEL, "rds_get_ibaddr failed: %d",
1896 				    ret);
1897 				rw_enter(&sp->session_lock, RW_WRITER);
1898 				if (sp->session_type == RDS_SESSION_ACTIVE) {
1899 					sp->session_state =
1900 					    RDS_SESSION_STATE_FAILED;
1901 					RDS_DPRINTF3("rds_sendmsg",
1902 					    "SP(%p) State "
1903 					    "RDS_SESSION_STATE_FAILED", sp);
1904 					rw_exit(&sp->session_lock);
1905 					return (EFAULT);
1906 				} else {
1907 					rw_exit(&sp->session_lock);
1908 					return (ENOMEM);
1909 				}
1910 			}
1911 
1912 			RDS_DPRINTF2(LABEL, "lgid: %llx:%llx rgid: %llx:%llx",
1913 			    lgid.gid_prefix, lgid.gid_guid, rgid.gid_prefix,
1914 			    rgid.gid_guid);
1915 
1916 			rw_enter(&sp->session_lock, RW_WRITER);
1917 			if (sp->session_type == RDS_SESSION_ACTIVE) {
1918 				sp->session_lgid = lgid;
1919 				sp->session_rgid = rgid;
1920 				hcap = rds_gid_to_hcap(rdsib_statep, lgid);
1921 				if (hcap == NULL) {
1922 					RDS_DPRINTF1(LABEL, "REQ received on "
1923 					    "an uninitialized HCA: %llx",
1924 					    sp->session_lgid.gid_guid);
1925 					sp->session_state =
1926 					    RDS_SESSION_STATE_FAILED;
1927 					RDS_DPRINTF3("rds_sendmsg",
1928 					    "SP(%p) State "
1929 					    "RDS_SESSION_STATE_FAILED", sp);
1930 					rw_exit(&sp->session_lock);
1931 					return (ENOMEM);
1932 				}
1933 
1934 				ret = rds_session_init(sp);
1935 				if (ret != 0) {
1936 					RDS_DPRINTF2("rds_sendmsg",
1937 					    "SP(%p): rds_session_init failed",
1938 					    sp);
1939 					sp->session_state =
1940 					    RDS_SESSION_STATE_FAILED;
1941 					RDS_DPRINTF3("rds_sendmsg",
1942 					    "SP(%p) State "
1943 					    "RDS_SESSION_STATE_FAILED", sp);
1944 					rw_exit(&sp->session_lock);
1945 					return (EFAULT);
1946 				}
1947 				sp->session_state = RDS_SESSION_STATE_INIT;
1948 				rw_exit(&sp->session_lock);
1949 
1950 				rds_session_open(sp);
1951 
1952 			} else {
1953 				RDS_DPRINTF2(LABEL, "SP(%p): state changed "
1954 				    "to %d", sp, sp->session_state);
1955 				rw_exit(&sp->session_lock);
1956 				return (ENOMEM);
1957 			}
1958 		} else {
1959 			RDS_DPRINTF2(LABEL, "SP(%p): Session state %d changed",
1960 			    sp, sp->session_state);
1961 			rw_exit(&sp->session_lock);
1962 			return (ENOMEM);
1963 		}
1964 	} else {
1965 		RDS_DPRINTF2(LABEL, "SP(%p): Session is in %d state",
1966 		    sp, sp->session_state);
1967 		rw_exit(&sp->session_lock);
1968 		return (ENOMEM);
1969 	}
1970 
1971 	rw_enter(&sp->session_lock, RW_READER);
1972 	if (sp->session_state == RDS_SESSION_STATE_CONNECTED) {
1973 		rw_exit(&sp->session_lock);
1974 
1975 		ret = rds_ep_sendmsg(&sp->session_dataep, uiop, sendport,
1976 		    recvport);
1977 	} else {
1978 		RDS_DPRINTF2(LABEL, "SP(%p): state(%d) not connected",
1979 		    sp, sp->session_state);
1980 		rw_exit(&sp->session_lock);
1981 	}
1982 
1983 	RDS_DPRINTF4("rds_sendmsg", "Return: SP(%p) ret: %d", sp, ret);
1984 
1985 	return (ret);
1986 }
1987 
1988 /* Note: This is called on the CQ handler thread */
1989 void
1990 rds_received_msg(rds_ep_t *ep, rds_buf_t *bp)
1991 {
1992 	mblk_t		*mp, *mp1;
1993 	rds_data_hdr_t	*pktp, *pktp1;
1994 	uint8_t		*datap;
1995 	rds_buf_t	*bp1;
1996 	rds_bufpool_t	*rpool;
1997 	uint_t		npkts, ix;
1998 	int		ret;
1999 
2000 	RDS_DPRINTF4("rds_received_msg", "Enter: EP(%p)", ep);
2001 
2002 	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
2003 	datap = ((uint8_t *)(uintptr_t)bp->buf_ds.ds_va) + RDS_DATA_HDR_SZ;
2004 	npkts = pktp->dh_npkts;
2005 
2006 	/* increment rx pending here */
2007 	rpool = &ep->ep_rcvpool;
2008 	mutex_enter(&rpool->pool_lock);
2009 	rpool->pool_nbusy += npkts;
2010 	mutex_exit(&rpool->pool_lock);
2011 
2012 	/* this will get freed by sockfs */
2013 	mp = esballoc(datap, pktp->dh_datalen, BPRI_HI, &bp->buf_frtn);
2014 	if (mp == NULL) {
2015 		RDS_DPRINTF2(LABEL, "EP(%p) BP(%p): allocb failed",
2016 		    ep, bp);
2017 		rds_free_recv_buf(bp, npkts);
2018 		return;
2019 	}
2020 	mp->b_wptr = datap + pktp->dh_datalen;
2021 	mp->b_datap->db_type = M_DATA;
2022 
2023 	mp1 = mp;
2024 	bp1 = bp->buf_nextp;
2025 	while (bp1 != NULL) {
2026 		pktp1 = (rds_data_hdr_t *)(uintptr_t)bp1->buf_ds.ds_va;
2027 		datap = ((uint8_t *)(uintptr_t)bp1->buf_ds.ds_va) +
2028 		    RDS_DATA_HDR_SZ;
2029 
2030 		mp1->b_cont = esballoc(datap, pktp1->dh_datalen,
2031 		    BPRI_HI, &bp1->buf_frtn);
2032 		if (mp1->b_cont == NULL) {
2033 			RDS_DPRINTF2(LABEL, "EP(%p) BP(%p): allocb failed",
2034 			    ep, bp1);
2035 			freemsg(mp);
2036 			rds_free_recv_buf(bp1, pktp1->dh_npkts);
2037 			return;
2038 		}
2039 		mp1 = mp1->b_cont;
2040 		mp1->b_wptr = datap + pktp1->dh_datalen;
2041 		mp1->b_datap->db_type = M_DATA;
2042 
2043 		bp1 = bp1->buf_nextp;
2044 	}
2045 
2046 	RDS_INCR_RXPKTS_PEND(npkts);
2047 	RDS_INCR_RXPKTS(npkts);
2048 	RDS_INCR_RXBYTES(msgdsize(mp));
2049 
2050 	RDS_DPRINTF5(LABEL, "Deliver Message: sendIP: 0x%x recvIP: 0x%x "
2051 	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
2052 	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
2053 	    npkts, pktp->dh_psn);
2054 
2055 	/* store the last buffer id, no lock needed */
2056 	if (npkts > 1) {
2057 		ep->ep_rbufid = pktp1->dh_bufid;
2058 	} else {
2059 		ep->ep_rbufid = pktp->dh_bufid;
2060 	}
2061 
2062 	ret = rds_deliver_new_msg(mp, ep->ep_myip, ep->ep_remip,
2063 	    pktp->dh_recvport, pktp->dh_sendport, ALL_ZONES);
2064 	if (ret != 0) {
2065 		if (ret == ENOSPC) {
2066 			/*
2067 			 * The message is delivered but cannot take more,
2068 			 * stall the port
2069 			 */
2070 			RDS_DPRINTF2(LABEL, "Port %d NO SPACE",
2071 			    pktp->dh_recvport);
2072 			rds_stall_port(pktp->dh_recvport);
2073 		} else {
2074 			RDS_DPRINTF1(LABEL, "rds_deliver_new_msg returned: %d",
2075 			    ret);
2076 		}
2077 	}
2078 
2079 	mutex_enter(&ep->ep_lock);
2080 	if (ep->ep_rdmacnt == 0) {
2081 		ep->ep_rdmacnt++;
2082 		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
2083 		mutex_exit(&ep->ep_lock);
2084 
2085 		/* send acknowledgement */
2086 		RDS_INCR_TXACKS();
2087 		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
2088 		if (ret != IBT_SUCCESS) {
2089 			RDS_DPRINTF1(LABEL, "EP(%p): ibt_post_send for "
2090 			    "acknowledgement failed: %d, SQ depth: %d",
2091 			    ep, ret, ep->ep_sndpool.pool_nbusy);
2092 			mutex_enter(&ep->ep_lock);
2093 			ep->ep_rdmacnt--;
2094 			mutex_exit(&ep->ep_lock);
2095 		}
2096 	} else {
2097 		/* no room to send acknowledgement */
2098 		mutex_exit(&ep->ep_lock);
2099 	}
2100 
2101 	RDS_DPRINTF4("rds_received_msg", "Return: EP(%p)", ep);
2102 }
2103