/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/rds.h>

#include <sys/ib/clients/of/ofed_kernel.h>
#include <sys/ib/clients/of/rdma/ib_addr.h>
#include <sys/ib/clients/of/rdma/rdma_cm.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

extern ddi_taskq_t *rdsv3_taskq;

/*
 * Set the selected protocol version
 */
static void
rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version)
{
	RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d",
	    conn, version);
	conn->c_version = version;
}

/*
 * Set up flow control
 */
static void
rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
	    "Enter: conn: %p credits: %d", conn, credits);

	if (rdsv3_ib_sysctl_flow_control && credits != 0) {
		/* We're doing flow control */
		ic->i_flowctl = 1;
		rdsv3_ib_send_add_credits(conn, credits);
	} else {
		ic->i_flowctl = 0;
	}

	RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
	    "Return: conn: %p credits: %d",
	    conn, credits);
}

/*
 * Tune RNR behavior. Without flow control, we use a rather
 * low timeout, but not the absolute minimum - this should
 * be tunable.
 *
 * We already set the RNR retry count to 7 (which is the
 * smallest infinite number :-) above.
 * If flow control is off, we want to change this back to 0
 * so that we learn quickly when our credit accounting is
 * buggy.
 *
 * Caller passes in a qp_attr pointer - don't waste stack space
 * by allocating this twice.
 */
static void
rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr)
{
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p",
	    ic, attr);

	attr->min_rnr_timer = IB_RNR_TIMER_000_32;
	ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
	if (ret)
		RDSV3_DPRINTF2("rdsv3_ib_tune_rnr",
		    "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret);
}

/*
 * Connection established.
 * We get here for both outgoing and incoming connections.
 */
void
rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
    struct rdma_cm_event *event)
{
	const struct rdsv3_ib_connect_private *dp = NULL;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_device *rds_ibdev;
	struct ib_qp_attr qp_attr;
	int err;

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "Enter conn: %p event: %p", conn, event);

	if (event->param.conn.private_data_len >= sizeof (*dp)) {
		dp = event->param.conn.private_data;

		/* make sure it isn't empty data */
		if (dp->dp_protocol_major) {
			rdsv3_ib_set_protocol(conn,
			    RDS_PROTOCOL(dp->dp_protocol_major,
			    dp->dp_protocol_minor));
			rdsv3_ib_set_flow_control(conn,
			    ntohl(dp->dp_credit));
		}
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s",
	    NIPQUAD(conn->c_faddr),
	    RDS_PROTOCOL_MAJOR(conn->c_version),
	    RDS_PROTOCOL_MINOR(conn->c_version),
	    ic->i_flowctl ? ", flow control" : "");

	/*
	 * Init rings and fill recv.  This needs to wait until protocol
	 * negotiation is complete, since ring layout is different
	 * from 3.0 to 3.1.
	 */
	rdsv3_ib_send_init_ring(ic);
	rdsv3_ib_recv_init_ring(ic);
	/*
	 * Post receive buffers - as a side effect, this will update
	 * the posted credit count.
	 */
	(void) rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 0, 1);

	/* Tune RNR behavior */
	rdsv3_ib_tune_rnr(ic, &qp_attr);

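	/*
	 * Move the QP to RTS; a QP will not process send work requests
	 * until it has reached the RTS state.
	 */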
	qp_attr.qp_state = IB_QPS_RTS;
	err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
	if (err)
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err);

	/* update ib_device with this local ipaddr & conn */
	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
	err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
	if (err)
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "rdsv3_ib_update_ipaddr failed (%d)", err);
	rdsv3_ib_add_conn(rds_ibdev, conn);

	/*
	 * If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK.
	 */
	if (dp && dp->dp_ack_seq)
		rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);

	rdsv3_connect_complete(conn);

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "Return conn: %p event: %p",
	    conn, event);
}

static void
rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn,
    struct rdma_conn_param *conn_param,
    struct rdsv3_ib_connect_private *dp,
    uint32_t protocol_version)
{
	RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
	    "Enter conn: %p conn_param: %p private: %p version: %d",
	    conn, conn_param, dp, protocol_version);

	(void) memset(conn_param, 0, sizeof (struct rdma_conn_param));
	/* XXX tune these? */
	conn_param->responder_resources = 1;
	conn_param->initiator_depth = 1;
	conn_param->retry_count = min(rdsv3_ib_retry_count, 7);
	conn_param->rnr_retry_count = 7;

	if (dp) {
		struct rdsv3_ib_connection *ic = conn->c_transport_data;

		(void) memset(dp, 0, sizeof (*dp));
		dp->dp_saddr = conn->c_laddr;
		dp->dp_daddr = conn->c_faddr;
		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
		dp->dp_protocol_minor_mask =
		    htons(RDSV3_IB_SUPPORTED_PROTOCOLS);
		dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic);

		/* Advertise flow control */
		if (ic->i_flowctl) {
			unsigned int credits;

			credits = IB_GET_POST_CREDITS(
			    atomic_get(&ic->i_credits));
			dp->dp_credit = htonl(credits);
			atomic_add_32(&ic->i_credits,
			    -IB_SET_POST_CREDITS(credits));
		}

		conn_param->private_data = dp;
		conn_param->private_data_len = sizeof (*dp);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
	    "Return conn: %p conn_param: %p private: %p version: %d",
	    conn, conn_param, dp, protocol_version);
}

static void
rdsv3_ib_cq_event_handler(struct ib_event *event, void *data)
{
	RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p",
	    event->event, data);
}

static void
rdsv3_ib_qp_event_handler(struct ib_event *event, void *data)
{
	struct rdsv3_connection *conn = data;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u",
	    conn, ic, event->event);

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		(void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
		break;
	default:
		if (conn) {
			RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
			    "RDS/IB: Fatal QP Event %u - "
			    "connection %u.%u.%u.%u ->%u.%u.%u.%u "
			    "...reconnecting",
			    event->event, NIPQUAD(conn->c_laddr),
			    NIPQUAD(conn->c_faddr));
			rdsv3_conn_drop(conn);
		} else {
			RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
			    "RDS/IB: Fatal QP Event %u - connection"
			    "...reconnecting", event->event);
		}
		break;
	}

	RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p",
	    conn, event);
}

extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev,
    struct rdsv3_ib_connection *ic);
extern void rdsv3_ib_free_hdrs(ib_device_t *dev,
    struct rdsv3_ib_connection *ic);

/*
 * This needs to be very careful to not leave IS_ERR pointers around for
 * cleanup to trip over.
 */
static int
rdsv3_ib_setup_qp(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct ib_device *dev = ic->i_cm_id->device;
	struct ib_qp_init_attr attr;
	struct rdsv3_ib_device *rds_ibdev;
	ibt_send_wr_t *wrp;
	ibt_wr_ds_t *sgl;
	int ret, i;

	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn);

	/*
	 * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device,
	 * and allocates a protection domain, memory range and FMR pool
	 * for each.  If that fails for any reason, it will not register
	 * the rds_ibdev at all.
	 */
	rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client);
	if (rds_ibdev == NULL) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "RDS/IB: No client_data for device %s", dev->name);
		return (-EOPNOTSUPP);
	}

	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
		rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
	if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
		rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);

	/* Protection domain and memory range */
	ic->i_pd = rds_ibdev->pd;

	/*
	 * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is
	 * not implemented in Hermon yet, but we can pass it to ib_create_cq()
	 * anyway.
	 */
	ic->i_send_cq = ib_create_cq(dev, rdsv3_ib_send_cq_comp_handler,
	    rdsv3_ib_cq_event_handler, conn,
	    ic->i_send_ring.w_nr + 1,
	    IB_CQ_VECTOR_LEAST_ATTACHED);
	if (IS_ERR(ic->i_send_cq)) {
		ret = PTR_ERR(ic->i_send_cq);
		ic->i_send_cq = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_create_cq send failed: %d", ret);
		goto out;
	}

	/*
	 * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is
	 * not implemented in Hermon yet, but we can pass it to ib_create_cq()
	 * anyway.
	 */
	ic->i_recv_cq = ib_create_cq(dev, rdsv3_ib_recv_cq_comp_handler,
	    rdsv3_ib_cq_event_handler, conn,
	    ic->i_recv_ring.w_nr,
	    IB_CQ_VECTOR_LEAST_ATTACHED);
	if (IS_ERR(ic->i_recv_cq)) {
		ret = PTR_ERR(ic->i_recv_cq);
		ic->i_recv_cq = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_create_cq recv failed: %d", ret);
		goto out;
	}

	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_req_notify_cq send failed: %d", ret);
		goto out;
	}

	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_req_notify_cq recv failed: %d", ret);
		goto out;
	}

	/* XXX negotiate max send/recv with remote? */
	(void) memset(&attr, 0, sizeof (attr));
	attr.event_handler = rdsv3_ib_qp_event_handler;
	attr.qp_context = conn;
	/* + 1 to allow for the single ack message */
	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
	attr.cap.max_send_sge = rds_ibdev->max_sge;
	attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE;
	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	attr.qp_type = IB_QPT_RC;
	attr.send_cq = ic->i_send_cq;
	attr.recv_cq = ic->i_recv_cq;

	/*
	 * XXX this can fail if max_*_wr is too large? Are we supposed
	 * to back off until we get a value that the hardware can support?
	 */
	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "rdma_create_qp failed: %d", ret);
		goto out;
	}

	ret = rdsv3_ib_alloc_hdrs(dev, ic);
	if (ret != 0) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "rdsv3_ib_alloc_hdrs failed: %d", ret);
		goto out;
	}

	ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr *
	    sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP);
	if (ic->i_sends == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "send allocation failed: %d", ret);
		goto out;
	}
	(void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr *
	    sizeof (struct rdsv3_ib_send_work));

	ic->i_send_wrs =
	    kmem_alloc(RDSV3_IB_SEND_WRS * (sizeof (ibt_send_wr_t) +
	    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP);
	if (ic->i_send_wrs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "WR allocation failed: %d", ret);
		goto out;
	}
	sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs +
	    (RDSV3_IB_SEND_WRS * sizeof (ibt_send_wr_t)));
	RDSV3_DPRINTF4("rdsv3_ib_setup_qp", "i_send_wrs: %p sgl: %p",
	    ic->i_send_wrs, sgl);
	for (i = 0; i < RDSV3_IB_SEND_WRS; i++) {
		wrp = &ic->i_send_wrs[i];
		wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE];
	}

	ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr *
	    sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP);
	if (ic->i_recvs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "recv allocation failed: %d", ret);
		goto out;
	}
	(void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr *
	    sizeof (struct rdsv3_ib_recv_work));

	rdsv3_ib_recv_init_ack(ic);

	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p %p",
	    conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq);

out:
	return (ret);
}

static uint32_t
rdsv3_ib_protocol_compatible(struct rdma_cm_event *event)
{
	const struct rdsv3_ib_connect_private *dp =
	    event->param.conn.private_data;
	uint16_t common;
	uint32_t version = 0;

	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p",
	    event);

	/*
	 * rdma_cm private data is odd - when there is any private data in
	 * the request, we will be given a pretty large buffer without
	 * telling us the original size.  The only way to tell the
	 * difference is by looking at the contents, which are initialized
	 * to zero.
	 * If the protocol version fields aren't set, this is a connection
	 * attempt from an older version.  This could be 3.0 or 2.0 -
	 * we can't tell.
	 * We really should have changed this for OFED 1.3 :-(
	 */

	/* Be paranoid. RDS always has privdata */
	if (!event->param.conn.private_data_len) {
		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
		    "RDS incoming connection has no private data, rejecting");
		return (0);
	}

	/* Even if len is crap *now* I still want to check it. -ASG */
	if (event->param.conn.private_data_len < sizeof (*dp) ||
	    dp->dp_protocol_major == 0)
		return (RDS_PROTOCOL_3_0);

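	/*
	 * Each side advertises a bitmask of the minor versions it supports;
	 * the negotiated version is 3.x, where x is the highest minor
	 * version bit that both masks have set.
	 */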
	common = ntohs(dp->dp_protocol_minor_mask) &
	    RDSV3_IB_SUPPORTED_PROTOCOLS;
	if (dp->dp_protocol_major == 3 && common) {
		version = RDS_PROTOCOL_3_0;
		while ((common >>= 1) != 0)
			version++;
	} else {
		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
		    "RDS: Connection from %u.%u.%u.%u using "
		    "incompatible protocol version %u.%u\n",
		    NIPQUAD(dp->dp_saddr),
		    dp->dp_protocol_major,
		    dp->dp_protocol_minor);
	}

	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p",
	    event);

	return (version);
}

int
rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
    struct rdma_cm_event *event)
{
	uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id;
	uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id;
	const struct rdsv3_ib_connect_private *dp =
	    event->param.conn.private_data;
	struct rdsv3_ib_connect_private dp_rep;
	struct rdsv3_connection *conn = NULL;
	struct rdsv3_ib_connection *ic = NULL;
	struct rdma_conn_param conn_param;
	uint32_t version;
	int err, destroy = 1;
	boolean_t conn_created = B_FALSE;

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "Enter cm_id: %p event: %p", cm_id, event);

	/* Check whether the remote protocol version matches ours. */
	version = rdsv3_ib_protocol_compatible(event);
	if (!version) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "version mismatch");
		goto out;
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid "
	    "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr),
	    RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
	    (unsigned long long)ntohll(lguid),
	    (unsigned long long)ntohll(fguid));

	conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr,
	    &rdsv3_ib_transport, KM_NOSLEEP);
	if (IS_ERR(conn)) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdsv3_conn_create failed (%ld)", PTR_ERR(conn));
		conn = NULL;
		goto out;
	}

	/*
	 * The connection request may occur while the
	 * previous connection exists, e.g. in case of failover.
	 * But as connections may be initiated simultaneously
	 * by both hosts, we have a random backoff mechanism -
	 * see the comment above rdsv3_queue_reconnect()
	 */
	mutex_enter(&conn->c_cm_lock);
	if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
	    RDSV3_CONN_CONNECTING)) {
		if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
			RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
			    "incoming connect when connected: %p",
			    conn);
			rdsv3_conn_drop(conn);
			rdsv3_ib_stats_inc(s_ib_listen_closed_stale);
			mutex_exit(&conn->c_cm_lock);
			goto out;
		} else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) {
			/* Wait and see - our connect may still be succeeding */
			RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
			    "peer-to-peer connection request: %p, "
			    "lguid: 0x%llx fguid: 0x%llx",
			    conn, lguid, fguid);
			rdsv3_ib_stats_inc(s_ib_connect_raced);
		}
		mutex_exit(&conn->c_cm_lock);
		goto out;
	}

	ic = conn->c_transport_data;

	rdsv3_ib_set_protocol(conn, version);
	rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit));

	/*
	 * If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK.
	 */
	if (dp->dp_ack_seq)
		rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);

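	/*
	 * Cross-link the CM id and the connection: the rdma_cm callbacks
	 * find the conn through cm_id->context, and the conn reaches its
	 * CM id through ic->i_cm_id.
	 */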
	ASSERT(!cm_id->context);
	ASSERT(!ic->i_cm_id);

	if (ic->i_cm_id != NULL)
		RDSV3_PANIC();

	ic->i_cm_id = cm_id;
	cm_id->context = conn;

	/*
	 * We got halfway through setting up the ib_connection, if we
	 * fail now, we have to take the long route out of this mess.
	 */
	destroy = 0;

	err = rdsv3_ib_setup_qp(conn);
	if (err) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdsv3_ib_setup_qp failed (%d)", err);
		mutex_exit(&conn->c_cm_lock);
		rdsv3_conn_drop(conn);
		goto out;
	}

	rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);

	/* rdma_accept() calls rdma_reject() internally if it fails */
	err = rdma_accept(cm_id, &conn_param);
	mutex_exit(&conn->c_cm_lock);
	if (err) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdma_accept failed (%d)", err);
		rdsv3_conn_drop(conn);
		goto out;
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "Return cm_id: %p event: %p", cm_id, event);

	return (0);

out:
	(void) rdma_reject(cm_id, NULL, 0);
	return (destroy);
}


int
rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
{
	struct rdsv3_connection *conn = cm_id->context;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdma_conn_param conn_param;
	struct rdsv3_ib_connect_private dp;
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p",
	    cm_id);

	/*
	 * If the peer doesn't do protocol negotiation, we must
	 * default to RDSv3.0
	 */
	rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
	ic->i_flowctl =
	    rdsv3_ib_sysctl_flow_control;	/* advertise flow control */

	ret = rdsv3_ib_setup_qp(conn);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
		    "rdsv3_ib_setup_qp failed (%d)", ret);
		rdsv3_conn_drop(conn);
		goto out;
	}

	(void) rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp,
	    RDS_PROTOCOL_VERSION);

	ret = rdma_connect(cm_id, &conn_param);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
		    "rdma_connect failed (%d)", ret);
		rdsv3_conn_drop(conn);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
	    "Return: cm_id: %p", cm_id);

out:
	/*
	 * Beware - returning non-zero tells the rdma_cm to destroy
	 * the cm_id.  We should certainly not do it as long as we still
	 * "own" the cm_id.
	 */
	if (ret) {
		if (ic->i_cm_id == cm_id)
			ret = 0;
	}
	return (ret);
}

int
rdsv3_ib_conn_connect(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct sockaddr_in src, dest;
	ipaddr_t laddr, faddr;
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn);

	/*
	 * XXX I wonder what effect the port space has
	 */
	/* delegate cm event handler to rdma_transport */
	ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn,
	    RDMA_PS_TCP);
	if (IS_ERR(ic->i_cm_id)) {
		ret = PTR_ERR(ic->i_cm_id);
		ic->i_cm_id = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
		    "rdma_create_id() failed: %d", ret);
		goto out;
	}

	RDSV3_DPRINTF3("rdsv3_ib_conn_connect",
	    "created cm id %p for conn %p", ic->i_cm_id, conn);

	/* The ipaddr should be in network byte order */
	laddr = conn->c_laddr;
	faddr = conn->c_faddr;
	ret = rdsv3_sc_path_lookup(&laddr, &faddr);
	if (ret == 0) {
		RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
		    ntohl(laddr), ntohl(faddr));
	}

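	/*
	 * Resolve the local and remote addresses for this CM id.  The
	 * local port is left unspecified; the peer is addressed on the
	 * RDS service port (RDSV3_PORT).
	 */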
	src.sin_family = AF_INET;
	src.sin_addr.s_addr = (uint32_t)laddr;
	src.sin_port = (uint16_t)htons(0);

	dest.sin_family = AF_INET;
	dest.sin_addr.s_addr = (uint32_t)faddr;
	dest.sin_port = (uint16_t)htons(RDSV3_PORT);

	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
	    (struct sockaddr *)&dest,
	    RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
		    "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret);
		rdma_destroy_id(ic->i_cm_id);
		ic->i_cm_id = NULL;
	}

	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn);

out:
	return (ret);
}

/*
 * This is careful to only clean up resources that were actually built up,
 * so that it can be called at any point during startup.  In fact it
 * can be called multiple times for a given connection.
 */
void
rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	int err = 0;

	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
	    "cm %p pd %p cq %p %p qp %p", ic->i_cm_id,
	    ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
	    ic->i_cm_id ? ic->i_cm_id->qp : NULL);

	if (ic->i_cm_id) {
		struct ib_device *dev = ic->i_cm_id->device;

		RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
		    "disconnecting cm %p", ic->i_cm_id);
		err = rdma_disconnect(ic->i_cm_id);
		if (err) {
			/*
			 * Actually this may happen quite frequently, when
			 * an outgoing connect raced with an incoming connect.
			 */
			RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
			    "failed to disconnect, cm: %p err %d",
			    ic->i_cm_id, err);
		}

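		/*
		 * Flush the QP so that any outstanding work requests
		 * complete, and wait for the send and receive rings to
		 * drain before the QP is destroyed.
		 */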
		if (ic->i_cm_id->qp) {
			(void) ibt_flush_qp(
			    ib_get_ibt_channel_hdl(ic->i_cm_id));

			/* wait until all WRs are flushed */
			rdsv3_wait_event(&rdsv3_ib_ring_empty_wait,
			    rdsv3_ib_ring_empty(&ic->i_send_ring) &&
			    rdsv3_ib_ring_empty(&ic->i_recv_ring));

			rdma_destroy_qp(ic->i_cm_id);
		}


		if (ic->i_mr)
			rdsv3_ib_free_hdrs(dev, ic);

		if (ic->i_sends)
			rdsv3_ib_send_clear_ring(ic);
		if (ic->i_recvs)
			rdsv3_ib_recv_clear_ring(ic);

		if (ic->i_send_cq)
			(void) ib_destroy_cq(ic->i_send_cq);
		if (ic->i_recv_cq)
			(void) ib_destroy_cq(ic->i_recv_cq);
		rdma_destroy_id(ic->i_cm_id);

		/*
		 * Move connection back to the nodev list.
		 */
		if (ic->rds_ibdev)
			rdsv3_ib_remove_conn(ic->rds_ibdev, conn);

		ic->i_cm_id = NULL;
		ic->i_pd = NULL;
		ic->i_mr = NULL;
		ic->i_send_cq = NULL;
		ic->i_recv_cq = NULL;
		ic->i_send_hdrs = NULL;
		ic->i_recv_hdrs = NULL;
		ic->i_ack = NULL;
	}
	ASSERT(!ic->rds_ibdev);

	/* Clear pending transmit */
	if (ic->i_rm) {
		rdsv3_message_put(ic->i_rm);
		ic->i_rm = NULL;
	}

	/* Clear the ACK state */
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	ic->i_ack_next = 0;
	ic->i_ack_recv = 0;

	/* Clear flow control state */
	ic->i_flowctl = 0;
	ic->i_credits = 0;

	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);

	if (ic->i_ibinc) {
		rdsv3_inc_put(&ic->i_ibinc->ii_inc);
		ic->i_ibinc = NULL;
	}

	if (ic->i_sends) {
		kmem_free(ic->i_sends,
		    ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work));
		ic->i_sends = NULL;
	}
	if (ic->i_send_wrs) {
		kmem_free(ic->i_send_wrs, RDSV3_IB_SEND_WRS *
		    (sizeof (ibt_send_wr_t) +
		    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)));
		ic->i_send_wrs = NULL;
	}
	if (ic->i_recvs) {
		kmem_free(ic->i_recvs,
		    ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work));
		ic->i_recvs = NULL;
	}

	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn);
}

/*
 * The connection can be allocated from either rdsv3_conn_create_outgoing()
 * or rdsv3_conn_create(), so ddi_taskq_create() can be called with the
 * same string.  This can cause kstat to print a warning on the console.
 * To prevent it, this counter value is used.
 * Note that requests from rdsv3_conn_create_outgoing() refer to the cached
 * value, with the mutex held, before allocating the connection, so the
 * warning cannot be produced in that case (only between rdsv3_conn_create()
 * and rdsv3_conn_create_outgoing()).
 */
static int conn_cnt;

/* ARGSUSED */
int
rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp)
{
	struct rdsv3_ib_connection *ic;
	char tq_name[TASKQ_NAMELEN];

	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn);

	/* XXX too lazy? */
	ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp);
	if (ic == NULL)
		return (-ENOMEM);

	list_link_init(&ic->ib_node);
	(void) snprintf(tq_name, TASKQ_NAMELEN, "RDSV3_CONN_to_%x:%u",
	    htonl(conn->c_faddr), conn_cnt++ % 100);
	ic->i_recv_tasklet =
	    ddi_taskq_create(NULL, tq_name, 1, TASKQ_DEFAULTPRI, 0);


	mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL);

	/*
	 * rdsv3_ib_conn_shutdown() waits for these to be emptied so they
	 * must be initialized before it can be called.
	 */
	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);

	ic->conn = conn;
	conn->c_transport_data = ic;

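	/*
	 * The connection is not yet associated with an IB device, so put
	 * it on the global nodev list; rdsv3_ib_add_conn() moves it to the
	 * device's list once the connection completes.
	 */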
	mutex_enter(&ib_nodev_conns_lock);
	list_insert_tail(&ib_nodev_conns, ic);
	mutex_exit(&ib_nodev_conns_lock);


	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p",
	    conn, conn->c_transport_data);
	return (0);
}

/*
 * Free a connection. Connection must be shut down and not set for reconnect.
 */
void
rdsv3_ib_conn_free(void *arg)
{
	struct rdsv3_ib_connection *ic = arg;
	kmutex_t *lock_ptr;

	RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic);

#ifndef __lock_lint
	/*
	 * Conn is either on a dev's list or on the nodev list.
	 * A race with shutdown() or connect() would cause problems
	 * (since rds_ibdev would change) but that should never happen.
	 */
	lock_ptr = ic->rds_ibdev ?
	    &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;

	mutex_enter(lock_ptr);
	list_remove_node(&ic->ib_node);
	mutex_exit(lock_ptr);
#endif

	ddi_taskq_destroy(ic->i_recv_tasklet);
	kmem_free(ic, sizeof (*ic));
}

/*
 * An error occurred on the connection
 */
void
__rdsv3_ib_conn_error(struct rdsv3_connection *conn)
{
	rdsv3_conn_drop(conn);
}