112198SEiji.Ota@Sun.COM /*
212198SEiji.Ota@Sun.COM * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
312198SEiji.Ota@Sun.COM */
412198SEiji.Ota@Sun.COM
512198SEiji.Ota@Sun.COM /*
612763SGiri.Adari@Sun.COM * This file contains code imported from the OFED rds source file ib_cm.c
712763SGiri.Adari@Sun.COM * Oracle elects to have and use the contents of ib_cm.c under and governed
812763SGiri.Adari@Sun.COM * by the OpenIB.org BSD license (see below for full license text). However,
912763SGiri.Adari@Sun.COM * the following notice accompanied the original version of this file:
1012763SGiri.Adari@Sun.COM */
1112763SGiri.Adari@Sun.COM
1212763SGiri.Adari@Sun.COM /*
1312198SEiji.Ota@Sun.COM * Copyright (c) 2006 Oracle. All rights reserved.
1412198SEiji.Ota@Sun.COM *
1512198SEiji.Ota@Sun.COM * This software is available to you under a choice of one of two
1612198SEiji.Ota@Sun.COM * licenses. You may choose to be licensed under the terms of the GNU
1712198SEiji.Ota@Sun.COM * General Public License (GPL) Version 2, available from the file
1812198SEiji.Ota@Sun.COM * COPYING in the main directory of this source tree, or the
1912198SEiji.Ota@Sun.COM * OpenIB.org BSD license below:
2012198SEiji.Ota@Sun.COM *
2112198SEiji.Ota@Sun.COM * Redistribution and use in source and binary forms, with or
2212198SEiji.Ota@Sun.COM * without modification, are permitted provided that the following
2312198SEiji.Ota@Sun.COM * conditions are met:
2412198SEiji.Ota@Sun.COM *
2512198SEiji.Ota@Sun.COM * - Redistributions of source code must retain the above
2612198SEiji.Ota@Sun.COM * copyright notice, this list of conditions and the following
2712198SEiji.Ota@Sun.COM * disclaimer.
2812198SEiji.Ota@Sun.COM *
2912198SEiji.Ota@Sun.COM * - Redistributions in binary form must reproduce the above
3012198SEiji.Ota@Sun.COM * copyright notice, this list of conditions and the following
3112198SEiji.Ota@Sun.COM * disclaimer in the documentation and/or other materials
3212198SEiji.Ota@Sun.COM * provided with the distribution.
3312198SEiji.Ota@Sun.COM *
3412198SEiji.Ota@Sun.COM * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
3512198SEiji.Ota@Sun.COM * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
3612198SEiji.Ota@Sun.COM * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
3712198SEiji.Ota@Sun.COM * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
3812198SEiji.Ota@Sun.COM * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
3912198SEiji.Ota@Sun.COM * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
4012198SEiji.Ota@Sun.COM * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4112198SEiji.Ota@Sun.COM * SOFTWARE.
4212198SEiji.Ota@Sun.COM *
4312198SEiji.Ota@Sun.COM */
4412198SEiji.Ota@Sun.COM #include <sys/rds.h>
4512198SEiji.Ota@Sun.COM
4612198SEiji.Ota@Sun.COM #include <sys/ib/clients/of/ofed_kernel.h>
4712198SEiji.Ota@Sun.COM #include <sys/ib/clients/of/rdma/ib_addr.h>
4812198SEiji.Ota@Sun.COM #include <sys/ib/clients/of/rdma/rdma_cm.h>
4912198SEiji.Ota@Sun.COM
5012198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3.h>
5112198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/ib.h>
5212198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
5312198SEiji.Ota@Sun.COM
5412676SEiji.Ota@Sun.COM extern int rdsv3_enable_snd_cq;
5512198SEiji.Ota@Sun.COM
5612198SEiji.Ota@Sun.COM /*
5712198SEiji.Ota@Sun.COM * Set the selected protocol version
5812198SEiji.Ota@Sun.COM */
5912198SEiji.Ota@Sun.COM static void
rdsv3_ib_set_protocol(struct rdsv3_connection * conn,unsigned int version)6012198SEiji.Ota@Sun.COM rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version)
6112198SEiji.Ota@Sun.COM {
6212198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d",
6312198SEiji.Ota@Sun.COM conn, version);
6412198SEiji.Ota@Sun.COM conn->c_version = version;
6512198SEiji.Ota@Sun.COM }
6612198SEiji.Ota@Sun.COM
6712198SEiji.Ota@Sun.COM /*
6812198SEiji.Ota@Sun.COM * Set up flow control
6912198SEiji.Ota@Sun.COM */
7012198SEiji.Ota@Sun.COM static void
rdsv3_ib_set_flow_control(struct rdsv3_connection * conn,uint32_t credits)7112198SEiji.Ota@Sun.COM rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits)
7212198SEiji.Ota@Sun.COM {
7312198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
7412198SEiji.Ota@Sun.COM
7512198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
7612198SEiji.Ota@Sun.COM "Enter: conn: %p credits: %d", conn, credits);
7712198SEiji.Ota@Sun.COM
7812198SEiji.Ota@Sun.COM if (rdsv3_ib_sysctl_flow_control && credits != 0) {
7912198SEiji.Ota@Sun.COM /* We're doing flow control */
8012198SEiji.Ota@Sun.COM ic->i_flowctl = 1;
8112198SEiji.Ota@Sun.COM rdsv3_ib_send_add_credits(conn, credits);
8212198SEiji.Ota@Sun.COM } else {
8312198SEiji.Ota@Sun.COM ic->i_flowctl = 0;
8412198SEiji.Ota@Sun.COM }
8512198SEiji.Ota@Sun.COM
8612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
8712198SEiji.Ota@Sun.COM "Return: conn: %p credits: %d",
8812198SEiji.Ota@Sun.COM conn, credits);
8912198SEiji.Ota@Sun.COM }
9012198SEiji.Ota@Sun.COM
9112198SEiji.Ota@Sun.COM /*
9212198SEiji.Ota@Sun.COM * Tune RNR behavior. Without flow control, we use a rather
9312198SEiji.Ota@Sun.COM * low timeout, but not the absolute minimum - this should
9412198SEiji.Ota@Sun.COM * be tunable.
9512198SEiji.Ota@Sun.COM *
9612198SEiji.Ota@Sun.COM * We already set the RNR retry count to 7 (which is the
9712198SEiji.Ota@Sun.COM * smallest infinite number :-) above.
9812198SEiji.Ota@Sun.COM * If flow control is off, we want to change this back to 0
9912198SEiji.Ota@Sun.COM * so that we learn quickly when our credit accounting is
10012198SEiji.Ota@Sun.COM * buggy.
10112198SEiji.Ota@Sun.COM *
10212198SEiji.Ota@Sun.COM * Caller passes in a qp_attr pointer - don't waste stack spacv
10312198SEiji.Ota@Sun.COM * by allocation this twice.
10412198SEiji.Ota@Sun.COM */
10512198SEiji.Ota@Sun.COM static void
rdsv3_ib_tune_rnr(struct rdsv3_ib_connection * ic,struct ib_qp_attr * attr)10612198SEiji.Ota@Sun.COM rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr)
10712198SEiji.Ota@Sun.COM {
10812198SEiji.Ota@Sun.COM int ret;
10912198SEiji.Ota@Sun.COM
11012198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p",
11112198SEiji.Ota@Sun.COM ic, attr);
11212198SEiji.Ota@Sun.COM
11312198SEiji.Ota@Sun.COM attr->min_rnr_timer = IB_RNR_TIMER_000_32;
11412198SEiji.Ota@Sun.COM ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
11512198SEiji.Ota@Sun.COM if (ret)
11612320SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_tune_rnr",
11712198SEiji.Ota@Sun.COM "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret);
11812198SEiji.Ota@Sun.COM }
11912198SEiji.Ota@Sun.COM
12012198SEiji.Ota@Sun.COM /*
12112198SEiji.Ota@Sun.COM * Connection established.
12212198SEiji.Ota@Sun.COM * We get here for both outgoing and incoming connection.
12312198SEiji.Ota@Sun.COM */
12412198SEiji.Ota@Sun.COM void
rdsv3_ib_cm_connect_complete(struct rdsv3_connection * conn,struct rdma_cm_event * event)12512198SEiji.Ota@Sun.COM rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
12612198SEiji.Ota@Sun.COM struct rdma_cm_event *event)
12712198SEiji.Ota@Sun.COM {
12812198SEiji.Ota@Sun.COM const struct rdsv3_ib_connect_private *dp = NULL;
12912198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
13012676SEiji.Ota@Sun.COM struct rdsv3_ib_device *rds_ibdev =
13112676SEiji.Ota@Sun.COM ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
13212198SEiji.Ota@Sun.COM struct ib_qp_attr qp_attr;
13312198SEiji.Ota@Sun.COM int err;
13412198SEiji.Ota@Sun.COM
13512198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
13612198SEiji.Ota@Sun.COM "Enter conn: %p event: %p", conn, event);
13712198SEiji.Ota@Sun.COM
13812198SEiji.Ota@Sun.COM if (event->param.conn.private_data_len >= sizeof (*dp)) {
13912198SEiji.Ota@Sun.COM dp = event->param.conn.private_data;
14012198SEiji.Ota@Sun.COM
14112198SEiji.Ota@Sun.COM /* make sure it isn't empty data */
14212198SEiji.Ota@Sun.COM if (dp->dp_protocol_major) {
14312198SEiji.Ota@Sun.COM rdsv3_ib_set_protocol(conn,
14412198SEiji.Ota@Sun.COM RDS_PROTOCOL(dp->dp_protocol_major,
14512198SEiji.Ota@Sun.COM dp->dp_protocol_minor));
14612198SEiji.Ota@Sun.COM rdsv3_ib_set_flow_control(conn,
14712198SEiji.Ota@Sun.COM ntohl(dp->dp_credit));
14812198SEiji.Ota@Sun.COM }
14912198SEiji.Ota@Sun.COM }
15012198SEiji.Ota@Sun.COM
15112676SEiji.Ota@Sun.COM if (conn->c_version < RDS_PROTOCOL(3, 1)) {
15212676SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
15312676SEiji.Ota@Sun.COM "RDS/IB: Connection to %u.%u.%u.%u version %u.%u failed",
15412676SEiji.Ota@Sun.COM NIPQUAD(conn->c_faddr),
15512676SEiji.Ota@Sun.COM RDS_PROTOCOL_MAJOR(conn->c_version),
15612676SEiji.Ota@Sun.COM RDS_PROTOCOL_MINOR(conn->c_version));
15712676SEiji.Ota@Sun.COM rdsv3_conn_destroy(conn);
15812676SEiji.Ota@Sun.COM return;
15912676SEiji.Ota@Sun.COM } else {
16012676SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
16112676SEiji.Ota@Sun.COM "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s",
16212676SEiji.Ota@Sun.COM NIPQUAD(conn->c_faddr),
16312676SEiji.Ota@Sun.COM RDS_PROTOCOL_MAJOR(conn->c_version),
16412676SEiji.Ota@Sun.COM RDS_PROTOCOL_MINOR(conn->c_version),
16512676SEiji.Ota@Sun.COM ic->i_flowctl ? ", flow control" : "");
16612676SEiji.Ota@Sun.COM }
16712676SEiji.Ota@Sun.COM
16812676SEiji.Ota@Sun.COM ASSERT(ic->i_soft_cq == NULL);
16912676SEiji.Ota@Sun.COM ic->i_soft_cq = rdsv3_af_intr_thr_create(rdsv3_ib_tasklet_fn,
17012676SEiji.Ota@Sun.COM (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp,
17112676SEiji.Ota@Sun.COM ic->i_cq->ibt_cq);
17212676SEiji.Ota@Sun.COM if (rdsv3_enable_snd_cq) {
17312676SEiji.Ota@Sun.COM ic->i_snd_soft_cq = rdsv3_af_intr_thr_create(
17412676SEiji.Ota@Sun.COM rdsv3_ib_snd_tasklet_fn,
17512676SEiji.Ota@Sun.COM (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp,
17612676SEiji.Ota@Sun.COM ic->i_snd_cq->ibt_cq);
17712676SEiji.Ota@Sun.COM }
178*13118SEiji.Ota@Sun.COM /* rdsv3_ib_refill_fn is expecting i_max_recv_alloc set */
179*13118SEiji.Ota@Sun.COM ic->i_max_recv_alloc = rdsv3_ib_sysctl_max_recv_allocation;
18012676SEiji.Ota@Sun.COM ic->i_refill_rq = rdsv3_af_thr_create(rdsv3_ib_refill_fn, (void *)conn,
18112676SEiji.Ota@Sun.COM SCQ_WRK_BIND_CPU, rds_ibdev->aft_hcagp);
18212676SEiji.Ota@Sun.COM rdsv3_af_grp_draw(rds_ibdev->aft_hcagp);
18312676SEiji.Ota@Sun.COM
18412676SEiji.Ota@Sun.COM (void) ib_req_notify_cq(ic->i_cq, IB_CQ_SOLICITED);
18512676SEiji.Ota@Sun.COM if (rdsv3_enable_snd_cq) {
18612676SEiji.Ota@Sun.COM (void) ib_req_notify_cq(ic->i_snd_cq, IB_CQ_NEXT_COMP);
18712676SEiji.Ota@Sun.COM }
18812198SEiji.Ota@Sun.COM
18912198SEiji.Ota@Sun.COM /*
19012198SEiji.Ota@Sun.COM * Init rings and fill recv. this needs to wait until protocol
19112198SEiji.Ota@Sun.COM * negotiation
19212198SEiji.Ota@Sun.COM * is complete, since ring layout is different from 3.0 to 3.1.
19312198SEiji.Ota@Sun.COM */
19412198SEiji.Ota@Sun.COM rdsv3_ib_send_init_ring(ic);
19512198SEiji.Ota@Sun.COM rdsv3_ib_recv_init_ring(ic);
19612198SEiji.Ota@Sun.COM /*
19712198SEiji.Ota@Sun.COM * Post receive buffers - as a side effect, this will update
19812198SEiji.Ota@Sun.COM * the posted credit count.
19912198SEiji.Ota@Sun.COM */
20012676SEiji.Ota@Sun.COM (void) rdsv3_ib_recv_refill(conn, 1);
20112198SEiji.Ota@Sun.COM
20212198SEiji.Ota@Sun.COM /* Tune RNR behavior */
20312198SEiji.Ota@Sun.COM rdsv3_ib_tune_rnr(ic, &qp_attr);
20412198SEiji.Ota@Sun.COM
20512198SEiji.Ota@Sun.COM qp_attr.qp_state = IB_QPS_RTS;
20612198SEiji.Ota@Sun.COM err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
20712198SEiji.Ota@Sun.COM if (err)
20812320SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
20912198SEiji.Ota@Sun.COM "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err);
21012198SEiji.Ota@Sun.COM
21112198SEiji.Ota@Sun.COM /* update ib_device with this local ipaddr & conn */
21212198SEiji.Ota@Sun.COM err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
21312198SEiji.Ota@Sun.COM if (err)
21412320SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
21512198SEiji.Ota@Sun.COM "rdsv3_ib_update_ipaddr failed (%d)", err);
21612198SEiji.Ota@Sun.COM rdsv3_ib_add_conn(rds_ibdev, conn);
21712198SEiji.Ota@Sun.COM
21812198SEiji.Ota@Sun.COM /*
21912198SEiji.Ota@Sun.COM * If the peer gave us the last packet it saw, process this as if
22012198SEiji.Ota@Sun.COM * we had received a regular ACK.
22112198SEiji.Ota@Sun.COM */
22212198SEiji.Ota@Sun.COM if (dp && dp->dp_ack_seq)
22312198SEiji.Ota@Sun.COM rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);
22412198SEiji.Ota@Sun.COM
22512198SEiji.Ota@Sun.COM rdsv3_connect_complete(conn);
22612198SEiji.Ota@Sun.COM
22712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
22812198SEiji.Ota@Sun.COM "Return conn: %p event: %p",
22912198SEiji.Ota@Sun.COM conn, event);
23012198SEiji.Ota@Sun.COM }
23112198SEiji.Ota@Sun.COM
23212198SEiji.Ota@Sun.COM static void
rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection * conn,struct rdma_conn_param * conn_param,struct rdsv3_ib_connect_private * dp,uint32_t protocol_version,uint32_t max_responder_resources,uint32_t max_initiator_depth)23312198SEiji.Ota@Sun.COM rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn,
23412198SEiji.Ota@Sun.COM struct rdma_conn_param *conn_param,
23512198SEiji.Ota@Sun.COM struct rdsv3_ib_connect_private *dp,
23612676SEiji.Ota@Sun.COM uint32_t protocol_version,
23712676SEiji.Ota@Sun.COM uint32_t max_responder_resources,
23812676SEiji.Ota@Sun.COM uint32_t max_initiator_depth)
23912198SEiji.Ota@Sun.COM {
24012676SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
24112676SEiji.Ota@Sun.COM struct rdsv3_ib_device *rds_ibdev;
24212676SEiji.Ota@Sun.COM
24312198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
24412198SEiji.Ota@Sun.COM "Enter conn: %p conn_param: %p private: %p version: %d",
24512198SEiji.Ota@Sun.COM conn, conn_param, dp, protocol_version);
24612198SEiji.Ota@Sun.COM
24712198SEiji.Ota@Sun.COM (void) memset(conn_param, 0, sizeof (struct rdma_conn_param));
24812676SEiji.Ota@Sun.COM
24912676SEiji.Ota@Sun.COM rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
25012676SEiji.Ota@Sun.COM
25112676SEiji.Ota@Sun.COM conn_param->responder_resources =
25212676SEiji.Ota@Sun.COM MIN(rds_ibdev->max_responder_resources, max_responder_resources);
25312676SEiji.Ota@Sun.COM conn_param->initiator_depth =
25412676SEiji.Ota@Sun.COM MIN(rds_ibdev->max_initiator_depth, max_initiator_depth);
25512198SEiji.Ota@Sun.COM conn_param->retry_count = min(rdsv3_ib_retry_count, 7);
25612198SEiji.Ota@Sun.COM conn_param->rnr_retry_count = 7;
25712198SEiji.Ota@Sun.COM
25812198SEiji.Ota@Sun.COM if (dp) {
25912198SEiji.Ota@Sun.COM (void) memset(dp, 0, sizeof (*dp));
26012198SEiji.Ota@Sun.COM dp->dp_saddr = conn->c_laddr;
26112198SEiji.Ota@Sun.COM dp->dp_daddr = conn->c_faddr;
26212198SEiji.Ota@Sun.COM dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
26312198SEiji.Ota@Sun.COM dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
26412198SEiji.Ota@Sun.COM dp->dp_protocol_minor_mask =
26512198SEiji.Ota@Sun.COM htons(RDSV3_IB_SUPPORTED_PROTOCOLS);
26612198SEiji.Ota@Sun.COM dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic);
26712198SEiji.Ota@Sun.COM
26812198SEiji.Ota@Sun.COM /* Advertise flow control */
26912198SEiji.Ota@Sun.COM if (ic->i_flowctl) {
27012198SEiji.Ota@Sun.COM unsigned int credits;
27112198SEiji.Ota@Sun.COM
27212198SEiji.Ota@Sun.COM credits = IB_GET_POST_CREDITS(
27312198SEiji.Ota@Sun.COM atomic_get(&ic->i_credits));
27412198SEiji.Ota@Sun.COM dp->dp_credit = htonl(credits);
27512198SEiji.Ota@Sun.COM atomic_add_32(&ic->i_credits,
27612198SEiji.Ota@Sun.COM -IB_SET_POST_CREDITS(credits));
27712198SEiji.Ota@Sun.COM }
27812198SEiji.Ota@Sun.COM
27912198SEiji.Ota@Sun.COM conn_param->private_data = dp;
28012198SEiji.Ota@Sun.COM conn_param->private_data_len = sizeof (*dp);
28112198SEiji.Ota@Sun.COM }
28212198SEiji.Ota@Sun.COM
28312198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
28412198SEiji.Ota@Sun.COM "Return conn: %p conn_param: %p private: %p version: %d",
28512198SEiji.Ota@Sun.COM conn, conn_param, dp, protocol_version);
28612198SEiji.Ota@Sun.COM }
28712198SEiji.Ota@Sun.COM
28812198SEiji.Ota@Sun.COM static void
rdsv3_ib_cq_event_handler(struct ib_event * event,void * data)28912198SEiji.Ota@Sun.COM rdsv3_ib_cq_event_handler(struct ib_event *event, void *data)
29012198SEiji.Ota@Sun.COM {
29112198SEiji.Ota@Sun.COM RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p",
29212198SEiji.Ota@Sun.COM event->event, data);
29312198SEiji.Ota@Sun.COM }
29412198SEiji.Ota@Sun.COM
29512198SEiji.Ota@Sun.COM static void
rdsv3_ib_snd_cq_comp_handler(struct ib_cq * cq,void * context)29612676SEiji.Ota@Sun.COM rdsv3_ib_snd_cq_comp_handler(struct ib_cq *cq, void *context)
29712676SEiji.Ota@Sun.COM {
29812676SEiji.Ota@Sun.COM struct rdsv3_connection *conn = context;
29912676SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
30012676SEiji.Ota@Sun.COM
30112676SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_snd_cq_comp_handler",
30212676SEiji.Ota@Sun.COM "Enter(conn: %p ic: %p cq: %p)", conn, ic, cq);
30312676SEiji.Ota@Sun.COM
30412676SEiji.Ota@Sun.COM rdsv3_af_thr_fire(ic->i_snd_soft_cq);
30512676SEiji.Ota@Sun.COM }
30612676SEiji.Ota@Sun.COM
30712676SEiji.Ota@Sun.COM void
rdsv3_ib_snd_tasklet_fn(void * data)30812676SEiji.Ota@Sun.COM rdsv3_ib_snd_tasklet_fn(void *data)
30912676SEiji.Ota@Sun.COM {
31012676SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
31112676SEiji.Ota@Sun.COM struct rdsv3_connection *conn = ic->conn;
31212676SEiji.Ota@Sun.COM struct rdsv3_ib_ack_state ack_state = { 0, };
31312676SEiji.Ota@Sun.COM ibt_wc_t wc;
31412676SEiji.Ota@Sun.COM uint_t polled;
31512676SEiji.Ota@Sun.COM
31612676SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
31712676SEiji.Ota@Sun.COM "Enter(conn: %p ic: %p)", conn, ic);
31812676SEiji.Ota@Sun.COM
31912676SEiji.Ota@Sun.COM /*
32012676SEiji.Ota@Sun.COM * Poll in a loop before and after enabling the next event
32112676SEiji.Ota@Sun.COM */
32212676SEiji.Ota@Sun.COM while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) ==
32312676SEiji.Ota@Sun.COM IBT_SUCCESS) {
32412794SGiri.Adari@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
32512676SEiji.Ota@Sun.COM "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n",
32612676SEiji.Ota@Sun.COM (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status,
32712676SEiji.Ota@Sun.COM wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));
32812676SEiji.Ota@Sun.COM
32912676SEiji.Ota@Sun.COM ASSERT(wc.wc_id & RDSV3_IB_SEND_OP);
33012676SEiji.Ota@Sun.COM rdsv3_ib_send_cqe_handler(ic, &wc);
33112676SEiji.Ota@Sun.COM }
33212676SEiji.Ota@Sun.COM (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_snd_cq),
33312676SEiji.Ota@Sun.COM IBT_NEXT_COMPLETION);
33412794SGiri.Adari@Sun.COM while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) ==
33512676SEiji.Ota@Sun.COM IBT_SUCCESS) {
33612794SGiri.Adari@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
33712794SGiri.Adari@Sun.COM "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n",
33812794SGiri.Adari@Sun.COM (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status,
33912794SGiri.Adari@Sun.COM wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));
34012794SGiri.Adari@Sun.COM
34112676SEiji.Ota@Sun.COM ASSERT(wc.wc_id & RDSV3_IB_SEND_OP);
34212676SEiji.Ota@Sun.COM rdsv3_ib_send_cqe_handler(ic, &wc);
34312676SEiji.Ota@Sun.COM }
34412676SEiji.Ota@Sun.COM }
34512676SEiji.Ota@Sun.COM
34612676SEiji.Ota@Sun.COM static void
rdsv3_ib_cq_comp_handler(struct ib_cq * cq,void * context)34712676SEiji.Ota@Sun.COM rdsv3_ib_cq_comp_handler(struct ib_cq *cq, void *context)
34812676SEiji.Ota@Sun.COM {
34912676SEiji.Ota@Sun.COM struct rdsv3_connection *conn = context;
35012676SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
35112676SEiji.Ota@Sun.COM
35212676SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_cq_comp_handler",
35312676SEiji.Ota@Sun.COM "Enter(conn: %p cq: %p)", conn, cq);
35412676SEiji.Ota@Sun.COM
35512676SEiji.Ota@Sun.COM rdsv3_ib_stats_inc(s_ib_evt_handler_call);
35612676SEiji.Ota@Sun.COM
35712676SEiji.Ota@Sun.COM rdsv3_af_thr_fire(ic->i_soft_cq);
35812676SEiji.Ota@Sun.COM }
35912676SEiji.Ota@Sun.COM
/*
 * Refill thread body: calls rdsv3_ib_recv_refill() on the connection
 * with a second argument of 0 and discards the result.
 *
 * data - the struct rdsv3_connection to refill
 */
void
rdsv3_ib_refill_fn(void *data)
{
	struct rdsv3_connection *conn;

	conn = (struct rdsv3_connection *)data;
	(void) rdsv3_ib_recv_refill(conn, 0);
}
36712676SEiji.Ota@Sun.COM
36812676SEiji.Ota@Sun.COM void
rdsv3_ib_tasklet_fn(void * data)36912676SEiji.Ota@Sun.COM rdsv3_ib_tasklet_fn(void *data)
37012676SEiji.Ota@Sun.COM {
37112676SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
37212676SEiji.Ota@Sun.COM struct rdsv3_connection *conn = ic->conn;
37312676SEiji.Ota@Sun.COM struct rdsv3_ib_ack_state ack_state = { 0, };
374*13118SEiji.Ota@Sun.COM ibt_wc_t wc[RDSV3_IB_WC_POLL_SIZE];
37512676SEiji.Ota@Sun.COM uint_t polled;
376*13118SEiji.Ota@Sun.COM int i;
37712676SEiji.Ota@Sun.COM
37812676SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
37912676SEiji.Ota@Sun.COM "Enter(conn: %p ic: %p)", conn, ic);
38012676SEiji.Ota@Sun.COM
38112676SEiji.Ota@Sun.COM rdsv3_ib_stats_inc(s_ib_tasklet_call);
38212676SEiji.Ota@Sun.COM
38312676SEiji.Ota@Sun.COM /*
38412676SEiji.Ota@Sun.COM * Poll in a loop before and after enabling the next event
38512676SEiji.Ota@Sun.COM */
386*13118SEiji.Ota@Sun.COM while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc[0],
387*13118SEiji.Ota@Sun.COM RDSV3_IB_WC_POLL_SIZE, &polled) == IBT_SUCCESS) {
388*13118SEiji.Ota@Sun.COM for (i = 0; i < polled; i++) {
389*13118SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
390*13118SEiji.Ota@Sun.COM "wc_id 0x%llx type %d status %u byte_len %u \
391*13118SEiji.Ota@Sun.COM imm_data %u\n",
392*13118SEiji.Ota@Sun.COM (unsigned long long)wc[i].wc_id, wc[i].wc_type,
393*13118SEiji.Ota@Sun.COM wc[i].wc_status, wc[i].wc_bytes_xfer,
394*13118SEiji.Ota@Sun.COM ntohl(wc[i].wc_immed_data));
39512676SEiji.Ota@Sun.COM
396*13118SEiji.Ota@Sun.COM if (wc[i].wc_id & RDSV3_IB_SEND_OP) {
397*13118SEiji.Ota@Sun.COM rdsv3_ib_send_cqe_handler(ic, &wc[i]);
398*13118SEiji.Ota@Sun.COM } else {
399*13118SEiji.Ota@Sun.COM rdsv3_ib_recv_cqe_handler(ic, &wc[i],
400*13118SEiji.Ota@Sun.COM &ack_state);
401*13118SEiji.Ota@Sun.COM }
40212676SEiji.Ota@Sun.COM }
40312676SEiji.Ota@Sun.COM }
40412676SEiji.Ota@Sun.COM (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_cq),
40512676SEiji.Ota@Sun.COM IBT_NEXT_SOLICITED);
406*13118SEiji.Ota@Sun.COM while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc[0],
407*13118SEiji.Ota@Sun.COM RDSV3_IB_WC_POLL_SIZE, &polled) == IBT_SUCCESS) {
408*13118SEiji.Ota@Sun.COM for (i = 0; i < polled; i++) {
409*13118SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
410*13118SEiji.Ota@Sun.COM "wc_id 0x%llx type %d status %u byte_len %u \
411*13118SEiji.Ota@Sun.COM imm_data %u\n",
412*13118SEiji.Ota@Sun.COM (unsigned long long)wc[i].wc_id, wc[i].wc_type,
413*13118SEiji.Ota@Sun.COM wc[i].wc_status, wc[i].wc_bytes_xfer,
414*13118SEiji.Ota@Sun.COM ntohl(wc[i].wc_immed_data));
415*13118SEiji.Ota@Sun.COM
416*13118SEiji.Ota@Sun.COM if (wc[i].wc_id & RDSV3_IB_SEND_OP) {
417*13118SEiji.Ota@Sun.COM rdsv3_ib_send_cqe_handler(ic, &wc[i]);
418*13118SEiji.Ota@Sun.COM } else {
419*13118SEiji.Ota@Sun.COM rdsv3_ib_recv_cqe_handler(ic, &wc[i],
420*13118SEiji.Ota@Sun.COM &ack_state);
421*13118SEiji.Ota@Sun.COM }
422*13118SEiji.Ota@Sun.COM }
423*13118SEiji.Ota@Sun.COM }
42412676SEiji.Ota@Sun.COM
42512676SEiji.Ota@Sun.COM if (ack_state.ack_next_valid) {
42612676SEiji.Ota@Sun.COM rdsv3_ib_set_ack(ic, ack_state.ack_next,
42712676SEiji.Ota@Sun.COM ack_state.ack_required);
42812676SEiji.Ota@Sun.COM }
42912676SEiji.Ota@Sun.COM if (ack_state.ack_recv_valid && ack_state.ack_recv > ic->i_ack_recv) {
43012676SEiji.Ota@Sun.COM rdsv3_send_drop_acked(conn, ack_state.ack_recv, NULL);
43112676SEiji.Ota@Sun.COM ic->i_ack_recv = ack_state.ack_recv;
43212676SEiji.Ota@Sun.COM }
43312676SEiji.Ota@Sun.COM if (rdsv3_conn_up(conn)) {
43412676SEiji.Ota@Sun.COM if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags))
43512676SEiji.Ota@Sun.COM (void) rdsv3_send_xmit(ic->conn);
43612676SEiji.Ota@Sun.COM rdsv3_ib_attempt_ack(ic);
43712676SEiji.Ota@Sun.COM }
43812676SEiji.Ota@Sun.COM }
43912676SEiji.Ota@Sun.COM
44012676SEiji.Ota@Sun.COM static void
rdsv3_ib_qp_event_handler(struct ib_event * event,void * data)44112198SEiji.Ota@Sun.COM rdsv3_ib_qp_event_handler(struct ib_event *event, void *data)
44212198SEiji.Ota@Sun.COM {
44312198SEiji.Ota@Sun.COM struct rdsv3_connection *conn = data;
44412198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
44512198SEiji.Ota@Sun.COM
44612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u",
44712198SEiji.Ota@Sun.COM conn, ic, event->event);
44812198SEiji.Ota@Sun.COM
44912198SEiji.Ota@Sun.COM switch (event->event) {
45012198SEiji.Ota@Sun.COM case IB_EVENT_COMM_EST:
45112198SEiji.Ota@Sun.COM (void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
45212198SEiji.Ota@Sun.COM break;
45312198SEiji.Ota@Sun.COM default:
45412198SEiji.Ota@Sun.COM if (conn) {
45512198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
45612198SEiji.Ota@Sun.COM "RDS/IB: Fatal QP Event %u - "
45712198SEiji.Ota@Sun.COM "connection %u.%u.%u.%u ->%u.%u.%u.%u "
45812198SEiji.Ota@Sun.COM "...reconnecting",
45912198SEiji.Ota@Sun.COM event->event, NIPQUAD(conn->c_laddr),
46012198SEiji.Ota@Sun.COM NIPQUAD(conn->c_faddr));
46112198SEiji.Ota@Sun.COM rdsv3_conn_drop(conn);
46212198SEiji.Ota@Sun.COM } else {
46312198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
46412198SEiji.Ota@Sun.COM "RDS/IB: Fatal QP Event %u - connection"
46512198SEiji.Ota@Sun.COM "...reconnecting", event->event);
46612198SEiji.Ota@Sun.COM }
46712198SEiji.Ota@Sun.COM break;
46812198SEiji.Ota@Sun.COM }
46912198SEiji.Ota@Sun.COM
47012198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p",
47112198SEiji.Ota@Sun.COM conn, event);
47212198SEiji.Ota@Sun.COM }
47312198SEiji.Ota@Sun.COM
47412198SEiji.Ota@Sun.COM extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev,
47512198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic);
47612198SEiji.Ota@Sun.COM extern void rdsv3_ib_free_hdrs(ib_device_t *dev,
47712198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic);
47812198SEiji.Ota@Sun.COM
47912198SEiji.Ota@Sun.COM /*
48012198SEiji.Ota@Sun.COM * This needs to be very careful to not leave IS_ERR pointers around for
48112198SEiji.Ota@Sun.COM * cleanup to trip over.
48212198SEiji.Ota@Sun.COM */
48312198SEiji.Ota@Sun.COM static int
rdsv3_ib_setup_qp(struct rdsv3_connection * conn)48412198SEiji.Ota@Sun.COM rdsv3_ib_setup_qp(struct rdsv3_connection *conn)
48512198SEiji.Ota@Sun.COM {
48612198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
48712198SEiji.Ota@Sun.COM struct ib_device *dev = ic->i_cm_id->device;
48812198SEiji.Ota@Sun.COM struct ib_qp_init_attr attr;
48912198SEiji.Ota@Sun.COM struct rdsv3_ib_device *rds_ibdev;
49012198SEiji.Ota@Sun.COM ibt_send_wr_t *wrp;
49112198SEiji.Ota@Sun.COM ibt_wr_ds_t *sgl;
49212198SEiji.Ota@Sun.COM int ret, i;
49312198SEiji.Ota@Sun.COM
49412198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn);
49512198SEiji.Ota@Sun.COM
49612198SEiji.Ota@Sun.COM /*
49712198SEiji.Ota@Sun.COM * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device,
49812198SEiji.Ota@Sun.COM * and allocates a protection domain, memory range and FMR pool
49912198SEiji.Ota@Sun.COM * for each. If that fails for any reason, it will not register
50012198SEiji.Ota@Sun.COM * the rds_ibdev at all.
50112198SEiji.Ota@Sun.COM */
50212198SEiji.Ota@Sun.COM rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client);
50312676SEiji.Ota@Sun.COM if (!rds_ibdev) {
50412320SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
50512198SEiji.Ota@Sun.COM "RDS/IB: No client_data for device %s", dev->name);
50612198SEiji.Ota@Sun.COM return (-EOPNOTSUPP);
50712198SEiji.Ota@Sun.COM }
50812444SGiri.Adari@Sun.COM ic->rds_ibdev = rds_ibdev;
50912198SEiji.Ota@Sun.COM
51012198SEiji.Ota@Sun.COM if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
51112198SEiji.Ota@Sun.COM rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
51212198SEiji.Ota@Sun.COM if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
51312198SEiji.Ota@Sun.COM rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
51412198SEiji.Ota@Sun.COM
51512198SEiji.Ota@Sun.COM /* Protection domain and memory range */
51612198SEiji.Ota@Sun.COM ic->i_pd = rds_ibdev->pd;
51712198SEiji.Ota@Sun.COM
51812414SEiji.Ota@Sun.COM /*
51912414SEiji.Ota@Sun.COM * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is
52012414SEiji.Ota@Sun.COM * not implmeneted in Hermon yet, but we can pass it to ib_create_cq()
52112414SEiji.Ota@Sun.COM * anyway.
52212414SEiji.Ota@Sun.COM */
52312676SEiji.Ota@Sun.COM ic->i_cq = ib_create_cq(dev, rdsv3_ib_cq_comp_handler,
52412198SEiji.Ota@Sun.COM rdsv3_ib_cq_event_handler, conn,
52512676SEiji.Ota@Sun.COM ic->i_recv_ring.w_nr + ic->i_send_ring.w_nr + 1,
52612965SWilliam.Taylor@Oracle.COM rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp));
52712676SEiji.Ota@Sun.COM if (IS_ERR(ic->i_cq)) {
52812676SEiji.Ota@Sun.COM ret = PTR_ERR(ic->i_cq);
52912676SEiji.Ota@Sun.COM ic->i_cq = NULL;
53012198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
53112676SEiji.Ota@Sun.COM "ib_create_cq failed: %d", ret);
53212198SEiji.Ota@Sun.COM goto out;
53312198SEiji.Ota@Sun.COM }
53412676SEiji.Ota@Sun.COM if (rdsv3_enable_snd_cq) {
53512676SEiji.Ota@Sun.COM ic->i_snd_cq = ib_create_cq(dev, rdsv3_ib_snd_cq_comp_handler,
53612676SEiji.Ota@Sun.COM rdsv3_ib_cq_event_handler, conn, ic->i_send_ring.w_nr + 1,
53712965SWilliam.Taylor@Oracle.COM rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp));
53812676SEiji.Ota@Sun.COM if (IS_ERR(ic->i_snd_cq)) {
53912676SEiji.Ota@Sun.COM ret = PTR_ERR(ic->i_snd_cq);
54012676SEiji.Ota@Sun.COM (void) ib_destroy_cq(ic->i_cq);
54112676SEiji.Ota@Sun.COM ic->i_cq = NULL;
54212676SEiji.Ota@Sun.COM ic->i_snd_cq = NULL;
54312676SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
54412676SEiji.Ota@Sun.COM "ib_create_cq send cq failed: %d", ret);
54512676SEiji.Ota@Sun.COM goto out;
54612676SEiji.Ota@Sun.COM }
54712198SEiji.Ota@Sun.COM }
54812198SEiji.Ota@Sun.COM
54912198SEiji.Ota@Sun.COM /* XXX negotiate max send/recv with remote? */
55012198SEiji.Ota@Sun.COM (void) memset(&attr, 0, sizeof (attr));
55112198SEiji.Ota@Sun.COM attr.event_handler = rdsv3_ib_qp_event_handler;
55212198SEiji.Ota@Sun.COM attr.qp_context = conn;
55312198SEiji.Ota@Sun.COM /* + 1 to allow for the single ack message */
55412198SEiji.Ota@Sun.COM attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
55512198SEiji.Ota@Sun.COM attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
55612198SEiji.Ota@Sun.COM attr.cap.max_send_sge = rds_ibdev->max_sge;
55712198SEiji.Ota@Sun.COM attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE;
55812198SEiji.Ota@Sun.COM attr.sq_sig_type = IB_SIGNAL_REQ_WR;
55912198SEiji.Ota@Sun.COM attr.qp_type = IB_QPT_RC;
56012676SEiji.Ota@Sun.COM if (rdsv3_enable_snd_cq) {
56112676SEiji.Ota@Sun.COM attr.send_cq = ic->i_snd_cq;
56212676SEiji.Ota@Sun.COM } else {
56312676SEiji.Ota@Sun.COM attr.send_cq = ic->i_cq;
56412676SEiji.Ota@Sun.COM }
56512676SEiji.Ota@Sun.COM attr.recv_cq = ic->i_cq;
56612198SEiji.Ota@Sun.COM
56712198SEiji.Ota@Sun.COM /*
56812198SEiji.Ota@Sun.COM * XXX this can fail if max_*_wr is too large? Are we supposed
56912198SEiji.Ota@Sun.COM * to back off until we get a value that the hardware can support?
57012198SEiji.Ota@Sun.COM */
57112198SEiji.Ota@Sun.COM ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
57212198SEiji.Ota@Sun.COM if (ret) {
57312198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
57412198SEiji.Ota@Sun.COM "rdma_create_qp failed: %d", ret);
57512198SEiji.Ota@Sun.COM goto out;
57612198SEiji.Ota@Sun.COM }
57712198SEiji.Ota@Sun.COM
57812198SEiji.Ota@Sun.COM ret = rdsv3_ib_alloc_hdrs(dev, ic);
57912198SEiji.Ota@Sun.COM if (ret != 0) {
58012198SEiji.Ota@Sun.COM ret = -ENOMEM;
58112198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
58212198SEiji.Ota@Sun.COM "rdsv3_ib_alloc_hdrs failed: %d", ret);
58312198SEiji.Ota@Sun.COM goto out;
58412198SEiji.Ota@Sun.COM }
58512198SEiji.Ota@Sun.COM
58612198SEiji.Ota@Sun.COM ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr *
58712198SEiji.Ota@Sun.COM sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP);
58812198SEiji.Ota@Sun.COM if (ic->i_sends == NULL) {
58912198SEiji.Ota@Sun.COM ret = -ENOMEM;
59012198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
59112198SEiji.Ota@Sun.COM "send allocation failed: %d", ret);
59212198SEiji.Ota@Sun.COM goto out;
59312198SEiji.Ota@Sun.COM }
59412198SEiji.Ota@Sun.COM (void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr *
59512198SEiji.Ota@Sun.COM sizeof (struct rdsv3_ib_send_work));
59612198SEiji.Ota@Sun.COM
59712198SEiji.Ota@Sun.COM ic->i_send_wrs =
59812794SGiri.Adari@Sun.COM kmem_alloc(ic->i_send_ring.w_nr * (sizeof (ibt_send_wr_t) +
59912198SEiji.Ota@Sun.COM RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP);
60012198SEiji.Ota@Sun.COM if (ic->i_send_wrs == NULL) {
60112198SEiji.Ota@Sun.COM ret = -ENOMEM;
60212198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
60312444SGiri.Adari@Sun.COM "Send WR allocation failed: %d", ret);
60412198SEiji.Ota@Sun.COM goto out;
60512198SEiji.Ota@Sun.COM }
60612198SEiji.Ota@Sun.COM sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs +
60712794SGiri.Adari@Sun.COM (ic->i_send_ring.w_nr * sizeof (ibt_send_wr_t)));
60812794SGiri.Adari@Sun.COM for (i = 0; i < ic->i_send_ring.w_nr; i++) {
60912198SEiji.Ota@Sun.COM wrp = &ic->i_send_wrs[i];
61012198SEiji.Ota@Sun.COM wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE];
61112198SEiji.Ota@Sun.COM }
61212198SEiji.Ota@Sun.COM
61312198SEiji.Ota@Sun.COM ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr *
61412198SEiji.Ota@Sun.COM sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP);
61512198SEiji.Ota@Sun.COM if (ic->i_recvs == NULL) {
61612198SEiji.Ota@Sun.COM ret = -ENOMEM;
61712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
61812198SEiji.Ota@Sun.COM "recv allocation failed: %d", ret);
61912198SEiji.Ota@Sun.COM goto out;
62012198SEiji.Ota@Sun.COM }
62112198SEiji.Ota@Sun.COM (void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr *
62212198SEiji.Ota@Sun.COM sizeof (struct rdsv3_ib_recv_work));
62312198SEiji.Ota@Sun.COM
62412444SGiri.Adari@Sun.COM ic->i_recv_wrs =
62512444SGiri.Adari@Sun.COM kmem_alloc(ic->i_recv_ring.w_nr * sizeof (ibt_recv_wr_t),
62612444SGiri.Adari@Sun.COM KM_NOSLEEP);
62712444SGiri.Adari@Sun.COM if (ic->i_recv_wrs == NULL) {
62812444SGiri.Adari@Sun.COM ret = -ENOMEM;
62912444SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
63012444SGiri.Adari@Sun.COM "Recv WR allocation failed: %d", ret);
63112444SGiri.Adari@Sun.COM goto out;
63212444SGiri.Adari@Sun.COM }
63312444SGiri.Adari@Sun.COM
63412198SEiji.Ota@Sun.COM rdsv3_ib_recv_init_ack(ic);
63512198SEiji.Ota@Sun.COM
63612676SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p",
63712676SEiji.Ota@Sun.COM conn, ic->i_pd, ic->i_mr, ic->i_cq);
63812198SEiji.Ota@Sun.COM
63912198SEiji.Ota@Sun.COM out:
64012198SEiji.Ota@Sun.COM return (ret);
64112198SEiji.Ota@Sun.COM }
64212198SEiji.Ota@Sun.COM
64312198SEiji.Ota@Sun.COM static uint32_t
rdsv3_ib_protocol_compatible(struct rdma_cm_event * event)64412198SEiji.Ota@Sun.COM rdsv3_ib_protocol_compatible(struct rdma_cm_event *event)
64512198SEiji.Ota@Sun.COM {
64612198SEiji.Ota@Sun.COM const struct rdsv3_ib_connect_private *dp =
64712198SEiji.Ota@Sun.COM event->param.conn.private_data;
64812198SEiji.Ota@Sun.COM uint16_t common;
64912198SEiji.Ota@Sun.COM uint32_t version = 0;
65012198SEiji.Ota@Sun.COM
65112198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p",
65212198SEiji.Ota@Sun.COM event);
65312198SEiji.Ota@Sun.COM
65412198SEiji.Ota@Sun.COM /*
65512198SEiji.Ota@Sun.COM * rdma_cm private data is odd - when there is any private data in the
65612198SEiji.Ota@Sun.COM * request, we will be given a pretty large buffer without telling us
65712198SEiji.Ota@Sun.COM * the
65812198SEiji.Ota@Sun.COM * original size. The only way to tell the difference is by looking at
65912198SEiji.Ota@Sun.COM * the contents, which are initialized to zero.
66012198SEiji.Ota@Sun.COM * If the protocol version fields aren't set,
66112198SEiji.Ota@Sun.COM * this is a connection attempt
66212198SEiji.Ota@Sun.COM * from an older version. This could could be 3.0 or 2.0 -
66312198SEiji.Ota@Sun.COM * we can't tell.
66412198SEiji.Ota@Sun.COM * We really should have changed this for OFED 1.3 :-(
66512198SEiji.Ota@Sun.COM */
66612198SEiji.Ota@Sun.COM
66712198SEiji.Ota@Sun.COM /* Be paranoid. RDS always has privdata */
66812198SEiji.Ota@Sun.COM if (!event->param.conn.private_data_len) {
66912198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
67012198SEiji.Ota@Sun.COM "RDS incoming connection has no private data, rejecting");
67112198SEiji.Ota@Sun.COM return (0);
67212198SEiji.Ota@Sun.COM }
67312198SEiji.Ota@Sun.COM
67412198SEiji.Ota@Sun.COM /* Even if len is crap *now* I still want to check it. -ASG */
67512198SEiji.Ota@Sun.COM if (event->param.conn.private_data_len < sizeof (*dp) ||
67612198SEiji.Ota@Sun.COM dp->dp_protocol_major == 0)
67712198SEiji.Ota@Sun.COM return (RDS_PROTOCOL_3_0);
67812198SEiji.Ota@Sun.COM
67912198SEiji.Ota@Sun.COM common = ntohs(dp->dp_protocol_minor_mask) &
68012198SEiji.Ota@Sun.COM RDSV3_IB_SUPPORTED_PROTOCOLS;
68112198SEiji.Ota@Sun.COM if (dp->dp_protocol_major == 3 && common) {
68212198SEiji.Ota@Sun.COM version = RDS_PROTOCOL_3_0;
68312198SEiji.Ota@Sun.COM while ((common >>= 1) != 0)
68412198SEiji.Ota@Sun.COM version++;
68512198SEiji.Ota@Sun.COM } else {
68612320SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
68712198SEiji.Ota@Sun.COM "RDS: Connection from %u.%u.%u.%u using "
68812198SEiji.Ota@Sun.COM "incompatible protocol version %u.%u\n",
68912198SEiji.Ota@Sun.COM NIPQUAD(dp->dp_saddr),
69012198SEiji.Ota@Sun.COM dp->dp_protocol_major,
69112198SEiji.Ota@Sun.COM dp->dp_protocol_minor);
69212198SEiji.Ota@Sun.COM }
69312198SEiji.Ota@Sun.COM
69412198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p",
69512198SEiji.Ota@Sun.COM event);
69612198SEiji.Ota@Sun.COM
69712198SEiji.Ota@Sun.COM return (version);
69812198SEiji.Ota@Sun.COM }
69912198SEiji.Ota@Sun.COM
70012198SEiji.Ota@Sun.COM int
rdsv3_ib_cm_handle_connect(struct rdma_cm_id * cm_id,struct rdma_cm_event * event)70112198SEiji.Ota@Sun.COM rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
70212198SEiji.Ota@Sun.COM struct rdma_cm_event *event)
70312198SEiji.Ota@Sun.COM {
70412198SEiji.Ota@Sun.COM uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id;
70512198SEiji.Ota@Sun.COM uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id;
70612198SEiji.Ota@Sun.COM const struct rdsv3_ib_connect_private *dp =
70712198SEiji.Ota@Sun.COM event->param.conn.private_data;
70812198SEiji.Ota@Sun.COM struct rdsv3_ib_connect_private dp_rep;
70912198SEiji.Ota@Sun.COM struct rdsv3_connection *conn = NULL;
71012198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = NULL;
71112198SEiji.Ota@Sun.COM struct rdma_conn_param conn_param;
71212198SEiji.Ota@Sun.COM uint32_t version;
71312198SEiji.Ota@Sun.COM int err, destroy = 1;
71412198SEiji.Ota@Sun.COM boolean_t conn_created = B_FALSE;
71512198SEiji.Ota@Sun.COM
71612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
71712198SEiji.Ota@Sun.COM "Enter cm_id: %p event: %p", cm_id, event);
71812198SEiji.Ota@Sun.COM
71912198SEiji.Ota@Sun.COM /* Check whether the remote protocol version matches ours. */
72012198SEiji.Ota@Sun.COM version = rdsv3_ib_protocol_compatible(event);
72112198SEiji.Ota@Sun.COM if (!version) {
72212198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
72312198SEiji.Ota@Sun.COM "version mismatch");
72412198SEiji.Ota@Sun.COM goto out;
72512198SEiji.Ota@Sun.COM }
72612198SEiji.Ota@Sun.COM
72712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
72812198SEiji.Ota@Sun.COM "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid "
72912198SEiji.Ota@Sun.COM "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr),
73012198SEiji.Ota@Sun.COM RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
73112198SEiji.Ota@Sun.COM (unsigned long long)ntohll(lguid),
73212198SEiji.Ota@Sun.COM (unsigned long long)ntohll(fguid));
73312198SEiji.Ota@Sun.COM
73412198SEiji.Ota@Sun.COM conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr,
73512198SEiji.Ota@Sun.COM &rdsv3_ib_transport, KM_NOSLEEP);
73612198SEiji.Ota@Sun.COM if (IS_ERR(conn)) {
73712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
73812198SEiji.Ota@Sun.COM "rdsv3_conn_create failed (%ld)", PTR_ERR(conn));
73912198SEiji.Ota@Sun.COM conn = NULL;
74012198SEiji.Ota@Sun.COM goto out;
74112198SEiji.Ota@Sun.COM }
74212198SEiji.Ota@Sun.COM
74312198SEiji.Ota@Sun.COM /*
74412198SEiji.Ota@Sun.COM * The connection request may occur while the
74512198SEiji.Ota@Sun.COM * previous connection exist, e.g. in case of failover.
74612198SEiji.Ota@Sun.COM * But as connections may be initiated simultaneously
74712198SEiji.Ota@Sun.COM * by both hosts, we have a random backoff mechanism -
74812198SEiji.Ota@Sun.COM * see the comment above rdsv3_queue_reconnect()
74912198SEiji.Ota@Sun.COM */
75012198SEiji.Ota@Sun.COM mutex_enter(&conn->c_cm_lock);
75112198SEiji.Ota@Sun.COM if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
75212198SEiji.Ota@Sun.COM RDSV3_CONN_CONNECTING)) {
75312198SEiji.Ota@Sun.COM if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
75412198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
75512198SEiji.Ota@Sun.COM "incoming connect when connected: %p",
75612198SEiji.Ota@Sun.COM conn);
75712198SEiji.Ota@Sun.COM rdsv3_conn_drop(conn);
75812198SEiji.Ota@Sun.COM rdsv3_ib_stats_inc(s_ib_listen_closed_stale);
75912198SEiji.Ota@Sun.COM mutex_exit(&conn->c_cm_lock);
76012198SEiji.Ota@Sun.COM goto out;
76112198SEiji.Ota@Sun.COM } else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) {
76212198SEiji.Ota@Sun.COM /* Wait and see - our connect may still be succeeding */
76312198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
76412198SEiji.Ota@Sun.COM "peer-to-peer connection request: %p, "
76512198SEiji.Ota@Sun.COM "lguid: 0x%llx fguid: 0x%llx",
76612198SEiji.Ota@Sun.COM conn, lguid, fguid);
76712198SEiji.Ota@Sun.COM rdsv3_ib_stats_inc(s_ib_connect_raced);
76812198SEiji.Ota@Sun.COM }
76912198SEiji.Ota@Sun.COM mutex_exit(&conn->c_cm_lock);
77012198SEiji.Ota@Sun.COM goto out;
77112198SEiji.Ota@Sun.COM }
77212198SEiji.Ota@Sun.COM
77312198SEiji.Ota@Sun.COM ic = conn->c_transport_data;
77412198SEiji.Ota@Sun.COM
77512198SEiji.Ota@Sun.COM rdsv3_ib_set_protocol(conn, version);
77612198SEiji.Ota@Sun.COM rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit));
77712198SEiji.Ota@Sun.COM
77812198SEiji.Ota@Sun.COM /*
77912198SEiji.Ota@Sun.COM * If the peer gave us the last packet it saw, process this as if
78012198SEiji.Ota@Sun.COM * we had received a regular ACK.
78112198SEiji.Ota@Sun.COM */
78212198SEiji.Ota@Sun.COM if (dp->dp_ack_seq)
78312198SEiji.Ota@Sun.COM rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);
78412198SEiji.Ota@Sun.COM
78512198SEiji.Ota@Sun.COM ASSERT(!cm_id->context);
78612198SEiji.Ota@Sun.COM ASSERT(!ic->i_cm_id);
78712198SEiji.Ota@Sun.COM
78812198SEiji.Ota@Sun.COM if (ic->i_cm_id != NULL)
78912198SEiji.Ota@Sun.COM RDSV3_PANIC();
79012198SEiji.Ota@Sun.COM
79112198SEiji.Ota@Sun.COM ic->i_cm_id = cm_id;
79212198SEiji.Ota@Sun.COM cm_id->context = conn;
79312198SEiji.Ota@Sun.COM
79412198SEiji.Ota@Sun.COM /*
79512198SEiji.Ota@Sun.COM * We got halfway through setting up the ib_connection, if we
79612198SEiji.Ota@Sun.COM * fail now, we have to take the long route out of this mess.
79712198SEiji.Ota@Sun.COM */
79812198SEiji.Ota@Sun.COM destroy = 0;
79912198SEiji.Ota@Sun.COM
80012198SEiji.Ota@Sun.COM err = rdsv3_ib_setup_qp(conn);
80112198SEiji.Ota@Sun.COM if (err) {
80212198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
80312198SEiji.Ota@Sun.COM "rdsv3_ib_setup_qp failed (%d)", err);
80412320SGiri.Adari@Sun.COM mutex_exit(&conn->c_cm_lock);
80512198SEiji.Ota@Sun.COM rdsv3_conn_drop(conn);
80612198SEiji.Ota@Sun.COM goto out;
80712198SEiji.Ota@Sun.COM }
80812198SEiji.Ota@Sun.COM
80912676SEiji.Ota@Sun.COM rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
81012676SEiji.Ota@Sun.COM event->param.conn.responder_resources,
81112676SEiji.Ota@Sun.COM event->param.conn.initiator_depth);
81212198SEiji.Ota@Sun.COM
81312198SEiji.Ota@Sun.COM /* rdma_accept() calls rdma_reject() internally if it fails */
81412198SEiji.Ota@Sun.COM err = rdma_accept(cm_id, &conn_param);
81512198SEiji.Ota@Sun.COM mutex_exit(&conn->c_cm_lock);
81612198SEiji.Ota@Sun.COM if (err) {
81712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
81812198SEiji.Ota@Sun.COM "rdma_accept failed (%d)", err);
81912198SEiji.Ota@Sun.COM rdsv3_conn_drop(conn);
82012198SEiji.Ota@Sun.COM goto out;
82112198SEiji.Ota@Sun.COM }
82212198SEiji.Ota@Sun.COM
82312198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
82412198SEiji.Ota@Sun.COM "Return cm_id: %p event: %p", cm_id, event);
82512198SEiji.Ota@Sun.COM
82612198SEiji.Ota@Sun.COM return (0);
82712198SEiji.Ota@Sun.COM
82812198SEiji.Ota@Sun.COM out:
82912198SEiji.Ota@Sun.COM (void) rdma_reject(cm_id, NULL, 0);
83012198SEiji.Ota@Sun.COM return (destroy);
83112198SEiji.Ota@Sun.COM }
83212198SEiji.Ota@Sun.COM
83312198SEiji.Ota@Sun.COM
83412198SEiji.Ota@Sun.COM int
rdsv3_ib_cm_initiate_connect(struct rdma_cm_id * cm_id)83512198SEiji.Ota@Sun.COM rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
83612198SEiji.Ota@Sun.COM {
83712198SEiji.Ota@Sun.COM struct rdsv3_connection *conn = cm_id->context;
83812198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
83912198SEiji.Ota@Sun.COM struct rdma_conn_param conn_param;
84012198SEiji.Ota@Sun.COM struct rdsv3_ib_connect_private dp;
84112198SEiji.Ota@Sun.COM int ret;
84212198SEiji.Ota@Sun.COM
84312198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p",
84412198SEiji.Ota@Sun.COM cm_id);
84512198SEiji.Ota@Sun.COM
84612198SEiji.Ota@Sun.COM /*
84712198SEiji.Ota@Sun.COM * If the peer doesn't do protocol negotiation, we must
84812198SEiji.Ota@Sun.COM * default to RDSv3.0
84912198SEiji.Ota@Sun.COM */
85012198SEiji.Ota@Sun.COM rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
85112198SEiji.Ota@Sun.COM ic->i_flowctl =
85212198SEiji.Ota@Sun.COM rdsv3_ib_sysctl_flow_control; /* advertise flow control */
85312198SEiji.Ota@Sun.COM
85412198SEiji.Ota@Sun.COM ret = rdsv3_ib_setup_qp(conn);
85512198SEiji.Ota@Sun.COM if (ret) {
85612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
85712198SEiji.Ota@Sun.COM "rdsv3_ib_setup_qp failed (%d)", ret);
85812198SEiji.Ota@Sun.COM rdsv3_conn_drop(conn);
85912198SEiji.Ota@Sun.COM goto out;
86012198SEiji.Ota@Sun.COM }
86112198SEiji.Ota@Sun.COM
86212676SEiji.Ota@Sun.COM rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp,
86312676SEiji.Ota@Sun.COM RDS_PROTOCOL_VERSION, UINT_MAX, UINT_MAX);
86412198SEiji.Ota@Sun.COM
86512198SEiji.Ota@Sun.COM ret = rdma_connect(cm_id, &conn_param);
86612198SEiji.Ota@Sun.COM if (ret) {
86712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
86812198SEiji.Ota@Sun.COM "rdma_connect failed (%d)", ret);
86912198SEiji.Ota@Sun.COM rdsv3_conn_drop(conn);
87012198SEiji.Ota@Sun.COM }
87112198SEiji.Ota@Sun.COM
87212198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
87312198SEiji.Ota@Sun.COM "Return: cm_id: %p", cm_id);
87412198SEiji.Ota@Sun.COM
87512198SEiji.Ota@Sun.COM out:
87612198SEiji.Ota@Sun.COM /*
87712198SEiji.Ota@Sun.COM * Beware - returning non-zero tells the rdma_cm to destroy
87812198SEiji.Ota@Sun.COM * the cm_id. We should certainly not do it as long as we still
87912198SEiji.Ota@Sun.COM * "own" the cm_id.
88012198SEiji.Ota@Sun.COM */
88112198SEiji.Ota@Sun.COM if (ret) {
88212198SEiji.Ota@Sun.COM if (ic->i_cm_id == cm_id)
88312198SEiji.Ota@Sun.COM ret = 0;
88412198SEiji.Ota@Sun.COM }
88512198SEiji.Ota@Sun.COM return (ret);
88612198SEiji.Ota@Sun.COM }
88712198SEiji.Ota@Sun.COM
88812198SEiji.Ota@Sun.COM int
rdsv3_ib_conn_connect(struct rdsv3_connection * conn)88912198SEiji.Ota@Sun.COM rdsv3_ib_conn_connect(struct rdsv3_connection *conn)
89012198SEiji.Ota@Sun.COM {
89112198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
89212198SEiji.Ota@Sun.COM struct sockaddr_in src, dest;
89312198SEiji.Ota@Sun.COM ipaddr_t laddr, faddr;
89412198SEiji.Ota@Sun.COM int ret;
89512198SEiji.Ota@Sun.COM
89612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn);
89712198SEiji.Ota@Sun.COM
89812198SEiji.Ota@Sun.COM /*
89912198SEiji.Ota@Sun.COM * XXX I wonder what affect the port space has
90012198SEiji.Ota@Sun.COM */
90112198SEiji.Ota@Sun.COM /* delegate cm event handler to rdma_transport */
90212198SEiji.Ota@Sun.COM ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn,
90312198SEiji.Ota@Sun.COM RDMA_PS_TCP);
90412198SEiji.Ota@Sun.COM if (IS_ERR(ic->i_cm_id)) {
90512198SEiji.Ota@Sun.COM ret = PTR_ERR(ic->i_cm_id);
90612198SEiji.Ota@Sun.COM ic->i_cm_id = NULL;
90712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
90812198SEiji.Ota@Sun.COM "rdma_create_id() failed: %d", ret);
90912198SEiji.Ota@Sun.COM goto out;
91012198SEiji.Ota@Sun.COM }
91112198SEiji.Ota@Sun.COM
91212198SEiji.Ota@Sun.COM RDSV3_DPRINTF3("rdsv3_ib_conn_connect",
91312198SEiji.Ota@Sun.COM "created cm id %p for conn %p", ic->i_cm_id, conn);
91412198SEiji.Ota@Sun.COM
91512198SEiji.Ota@Sun.COM /* The ipaddr should be in the network order */
91612198SEiji.Ota@Sun.COM laddr = conn->c_laddr;
91712198SEiji.Ota@Sun.COM faddr = conn->c_faddr;
91812198SEiji.Ota@Sun.COM ret = rdsv3_sc_path_lookup(&laddr, &faddr);
91912198SEiji.Ota@Sun.COM if (ret == 0) {
92012198SEiji.Ota@Sun.COM RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
92112198SEiji.Ota@Sun.COM ntohl(laddr), ntohl(faddr));
92212198SEiji.Ota@Sun.COM }
92312198SEiji.Ota@Sun.COM
92412198SEiji.Ota@Sun.COM src.sin_family = AF_INET;
92512198SEiji.Ota@Sun.COM src.sin_addr.s_addr = (uint32_t)laddr;
92612198SEiji.Ota@Sun.COM src.sin_port = (uint16_t)htons(0);
92712198SEiji.Ota@Sun.COM
92812198SEiji.Ota@Sun.COM dest.sin_family = AF_INET;
92912198SEiji.Ota@Sun.COM dest.sin_addr.s_addr = (uint32_t)faddr;
93012198SEiji.Ota@Sun.COM dest.sin_port = (uint16_t)htons(RDSV3_PORT);
93112198SEiji.Ota@Sun.COM
93212198SEiji.Ota@Sun.COM ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
93312198SEiji.Ota@Sun.COM (struct sockaddr *)&dest,
93412198SEiji.Ota@Sun.COM RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
93512198SEiji.Ota@Sun.COM if (ret) {
93612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
93712198SEiji.Ota@Sun.COM "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret);
93812198SEiji.Ota@Sun.COM rdma_destroy_id(ic->i_cm_id);
93912198SEiji.Ota@Sun.COM ic->i_cm_id = NULL;
94012198SEiji.Ota@Sun.COM }
94112198SEiji.Ota@Sun.COM
94212198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn);
94312198SEiji.Ota@Sun.COM
94412198SEiji.Ota@Sun.COM out:
94512198SEiji.Ota@Sun.COM return (ret);
94612198SEiji.Ota@Sun.COM }
94712198SEiji.Ota@Sun.COM
94812198SEiji.Ota@Sun.COM /*
94912198SEiji.Ota@Sun.COM * This is so careful about only cleaning up resources that were built up
95012198SEiji.Ota@Sun.COM * so that it can be called at any point during startup. In fact it
95112198SEiji.Ota@Sun.COM * can be called multiple times for a given connection.
95212198SEiji.Ota@Sun.COM */
95312198SEiji.Ota@Sun.COM void
rdsv3_ib_conn_shutdown(struct rdsv3_connection * conn)95412198SEiji.Ota@Sun.COM rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn)
95512198SEiji.Ota@Sun.COM {
95612198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = conn->c_transport_data;
95712198SEiji.Ota@Sun.COM int err = 0;
95812198SEiji.Ota@Sun.COM
95912198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
96012676SEiji.Ota@Sun.COM "cm %p pd %p cq %p qp %p", ic->i_cm_id,
96112676SEiji.Ota@Sun.COM ic->i_pd, ic->i_cq, ic->i_cm_id ? ic->i_cm_id->qp : NULL);
96212198SEiji.Ota@Sun.COM
96312198SEiji.Ota@Sun.COM if (ic->i_cm_id) {
96412198SEiji.Ota@Sun.COM struct ib_device *dev = ic->i_cm_id->device;
96512198SEiji.Ota@Sun.COM
96612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
96712198SEiji.Ota@Sun.COM "disconnecting cm %p", ic->i_cm_id);
96812198SEiji.Ota@Sun.COM err = rdma_disconnect(ic->i_cm_id);
96912198SEiji.Ota@Sun.COM if (err) {
97012198SEiji.Ota@Sun.COM /*
97112198SEiji.Ota@Sun.COM * Actually this may happen quite frequently, when
97212198SEiji.Ota@Sun.COM * an outgoing connect raced with an incoming connect.
97312198SEiji.Ota@Sun.COM */
97412198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
97512198SEiji.Ota@Sun.COM "failed to disconnect, cm: %p err %d",
97612198SEiji.Ota@Sun.COM ic->i_cm_id, err);
97712198SEiji.Ota@Sun.COM }
97812198SEiji.Ota@Sun.COM
97912198SEiji.Ota@Sun.COM if (ic->i_cm_id->qp) {
98012198SEiji.Ota@Sun.COM (void) ibt_flush_qp(
98112198SEiji.Ota@Sun.COM ib_get_ibt_channel_hdl(ic->i_cm_id));
98212676SEiji.Ota@Sun.COM /*
98312676SEiji.Ota@Sun.COM * Don't wait for the send ring to be empty -- there
98412676SEiji.Ota@Sun.COM * may be completed non-signaled entries sitting on
98512676SEiji.Ota@Sun.COM * there. We unmap these below.
98612676SEiji.Ota@Sun.COM */
98712676SEiji.Ota@Sun.COM rdsv3_wait_event(&ic->i_recv_ring.w_empty_wait,
98812198SEiji.Ota@Sun.COM rdsv3_ib_ring_empty(&ic->i_recv_ring));
98912676SEiji.Ota@Sun.COM /*
99012676SEiji.Ota@Sun.COM * Note that Linux original code calls
99112676SEiji.Ota@Sun.COM * rdma_destroy_qp() after rdsv3_ib_recv_clear_ring(ic).
99212676SEiji.Ota@Sun.COM */
99312198SEiji.Ota@Sun.COM rdma_destroy_qp(ic->i_cm_id);
99412198SEiji.Ota@Sun.COM }
99512198SEiji.Ota@Sun.COM
99612676SEiji.Ota@Sun.COM if (rdsv3_enable_snd_cq) {
99712676SEiji.Ota@Sun.COM if (ic->i_snd_soft_cq) {
99812676SEiji.Ota@Sun.COM rdsv3_af_thr_destroy(ic->i_snd_soft_cq);
99912676SEiji.Ota@Sun.COM ic->i_snd_soft_cq = NULL;
100012676SEiji.Ota@Sun.COM }
100112676SEiji.Ota@Sun.COM if (ic->i_snd_cq)
100212676SEiji.Ota@Sun.COM (void) ib_destroy_cq(ic->i_snd_cq);
100312676SEiji.Ota@Sun.COM }
100412676SEiji.Ota@Sun.COM if (ic->i_soft_cq) {
100512676SEiji.Ota@Sun.COM rdsv3_af_thr_destroy(ic->i_soft_cq);
100612676SEiji.Ota@Sun.COM ic->i_soft_cq = NULL;
100712676SEiji.Ota@Sun.COM }
100812676SEiji.Ota@Sun.COM if (ic->i_refill_rq) {
100912676SEiji.Ota@Sun.COM rdsv3_af_thr_destroy(ic->i_refill_rq);
101012676SEiji.Ota@Sun.COM ic->i_refill_rq = NULL;
101112676SEiji.Ota@Sun.COM }
101212676SEiji.Ota@Sun.COM if (ic->i_cq)
101312676SEiji.Ota@Sun.COM (void) ib_destroy_cq(ic->i_cq);
101412198SEiji.Ota@Sun.COM
101512198SEiji.Ota@Sun.COM if (ic->i_mr)
101612198SEiji.Ota@Sun.COM rdsv3_ib_free_hdrs(dev, ic);
101712198SEiji.Ota@Sun.COM
101812198SEiji.Ota@Sun.COM if (ic->i_sends)
101912198SEiji.Ota@Sun.COM rdsv3_ib_send_clear_ring(ic);
102012198SEiji.Ota@Sun.COM if (ic->i_recvs)
102112198SEiji.Ota@Sun.COM rdsv3_ib_recv_clear_ring(ic);
102212198SEiji.Ota@Sun.COM
102312198SEiji.Ota@Sun.COM rdma_destroy_id(ic->i_cm_id);
102412198SEiji.Ota@Sun.COM
102512198SEiji.Ota@Sun.COM /*
102612198SEiji.Ota@Sun.COM * Move connection back to the nodev list.
102712198SEiji.Ota@Sun.COM */
102812444SGiri.Adari@Sun.COM if (ic->i_on_dev_list)
102912198SEiji.Ota@Sun.COM rdsv3_ib_remove_conn(ic->rds_ibdev, conn);
103012198SEiji.Ota@Sun.COM
103112198SEiji.Ota@Sun.COM ic->i_cm_id = NULL;
103212198SEiji.Ota@Sun.COM ic->i_pd = NULL;
103312198SEiji.Ota@Sun.COM ic->i_mr = NULL;
103412676SEiji.Ota@Sun.COM ic->i_cq = NULL;
103512676SEiji.Ota@Sun.COM ic->i_snd_cq = NULL;
103612198SEiji.Ota@Sun.COM ic->i_send_hdrs = NULL;
103712198SEiji.Ota@Sun.COM ic->i_recv_hdrs = NULL;
103812198SEiji.Ota@Sun.COM ic->i_ack = NULL;
103912198SEiji.Ota@Sun.COM }
104012444SGiri.Adari@Sun.COM ASSERT(!ic->i_on_dev_list);
104112198SEiji.Ota@Sun.COM
104212198SEiji.Ota@Sun.COM /* Clear pending transmit */
104312198SEiji.Ota@Sun.COM if (ic->i_rm) {
104412198SEiji.Ota@Sun.COM rdsv3_message_put(ic->i_rm);
104512198SEiji.Ota@Sun.COM ic->i_rm = NULL;
104612198SEiji.Ota@Sun.COM }
104712198SEiji.Ota@Sun.COM
104812198SEiji.Ota@Sun.COM /* Clear the ACK state */
104912198SEiji.Ota@Sun.COM clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
105012198SEiji.Ota@Sun.COM ic->i_ack_next = 0;
105112198SEiji.Ota@Sun.COM ic->i_ack_recv = 0;
105212198SEiji.Ota@Sun.COM
105312198SEiji.Ota@Sun.COM /* Clear flow control state */
105412198SEiji.Ota@Sun.COM ic->i_flowctl = 0;
105512198SEiji.Ota@Sun.COM ic->i_credits = 0;
105612198SEiji.Ota@Sun.COM
105712198SEiji.Ota@Sun.COM rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
105812198SEiji.Ota@Sun.COM rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
105912198SEiji.Ota@Sun.COM
106012198SEiji.Ota@Sun.COM if (ic->i_ibinc) {
106112198SEiji.Ota@Sun.COM rdsv3_inc_put(&ic->i_ibinc->ii_inc);
106212198SEiji.Ota@Sun.COM ic->i_ibinc = NULL;
106312198SEiji.Ota@Sun.COM }
106412198SEiji.Ota@Sun.COM
106512198SEiji.Ota@Sun.COM if (ic->i_sends) {
106612198SEiji.Ota@Sun.COM kmem_free(ic->i_sends,
106712198SEiji.Ota@Sun.COM ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work));
106812198SEiji.Ota@Sun.COM ic->i_sends = NULL;
106912198SEiji.Ota@Sun.COM }
107012198SEiji.Ota@Sun.COM if (ic->i_send_wrs) {
107112794SGiri.Adari@Sun.COM kmem_free(ic->i_send_wrs, ic->i_send_ring.w_nr *
107212198SEiji.Ota@Sun.COM (sizeof (ibt_send_wr_t) +
107312198SEiji.Ota@Sun.COM RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)));
107412198SEiji.Ota@Sun.COM ic->i_send_wrs = NULL;
107512198SEiji.Ota@Sun.COM }
107612198SEiji.Ota@Sun.COM if (ic->i_recvs) {
107712198SEiji.Ota@Sun.COM kmem_free(ic->i_recvs,
107812198SEiji.Ota@Sun.COM ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work));
107912198SEiji.Ota@Sun.COM ic->i_recvs = NULL;
108012198SEiji.Ota@Sun.COM }
108112676SEiji.Ota@Sun.COM if (ic->i_recv_wrs) {
108212676SEiji.Ota@Sun.COM kmem_free(ic->i_recv_wrs, ic->i_recv_ring.w_nr *
108312676SEiji.Ota@Sun.COM (sizeof (ibt_recv_wr_t)));
108412676SEiji.Ota@Sun.COM ic->i_recv_wrs = NULL;
108512676SEiji.Ota@Sun.COM }
108612198SEiji.Ota@Sun.COM
108712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn);
108812198SEiji.Ota@Sun.COM }
108912198SEiji.Ota@Sun.COM
109012198SEiji.Ota@Sun.COM /* ARGSUSED */
109112198SEiji.Ota@Sun.COM int
rdsv3_ib_conn_alloc(struct rdsv3_connection * conn,int gfp)109212198SEiji.Ota@Sun.COM rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp)
109312198SEiji.Ota@Sun.COM {
109412198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic;
109512198SEiji.Ota@Sun.COM
109612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn);
109712198SEiji.Ota@Sun.COM
109812198SEiji.Ota@Sun.COM /* XXX too lazy? */
109912198SEiji.Ota@Sun.COM ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp);
110012676SEiji.Ota@Sun.COM if (!ic)
110112198SEiji.Ota@Sun.COM return (-ENOMEM);
110212198SEiji.Ota@Sun.COM
110312198SEiji.Ota@Sun.COM list_link_init(&ic->ib_node);
110412198SEiji.Ota@Sun.COM
110512198SEiji.Ota@Sun.COM mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL);
110612198SEiji.Ota@Sun.COM mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL);
110712198SEiji.Ota@Sun.COM
110812198SEiji.Ota@Sun.COM /*
110912198SEiji.Ota@Sun.COM * rdsv3_ib_conn_shutdown() waits for these to be emptied so they
111012198SEiji.Ota@Sun.COM * must be initialized before it can be called.
111112198SEiji.Ota@Sun.COM */
111212198SEiji.Ota@Sun.COM rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
111312198SEiji.Ota@Sun.COM rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
111412198SEiji.Ota@Sun.COM
111512198SEiji.Ota@Sun.COM ic->conn = conn;
111612198SEiji.Ota@Sun.COM conn->c_transport_data = ic;
111712198SEiji.Ota@Sun.COM
111812198SEiji.Ota@Sun.COM mutex_enter(&ib_nodev_conns_lock);
111912198SEiji.Ota@Sun.COM list_insert_tail(&ib_nodev_conns, ic);
112012198SEiji.Ota@Sun.COM mutex_exit(&ib_nodev_conns_lock);
112112198SEiji.Ota@Sun.COM
112212198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p",
112312198SEiji.Ota@Sun.COM conn, conn->c_transport_data);
112412198SEiji.Ota@Sun.COM return (0);
112512198SEiji.Ota@Sun.COM }
112612198SEiji.Ota@Sun.COM
112712198SEiji.Ota@Sun.COM /*
112812198SEiji.Ota@Sun.COM * Free a connection. Connection must be shut down and not set for reconnect.
112912198SEiji.Ota@Sun.COM */
113012198SEiji.Ota@Sun.COM void
rdsv3_ib_conn_free(void * arg)113112198SEiji.Ota@Sun.COM rdsv3_ib_conn_free(void *arg)
113212198SEiji.Ota@Sun.COM {
113312198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic = arg;
113412198SEiji.Ota@Sun.COM kmutex_t *lock_ptr;
113512198SEiji.Ota@Sun.COM
113612198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic);
113712198SEiji.Ota@Sun.COM
113812198SEiji.Ota@Sun.COM #ifndef __lock_lint
113912198SEiji.Ota@Sun.COM /*
114012198SEiji.Ota@Sun.COM * Conn is either on a dev's list or on the nodev list.
114112198SEiji.Ota@Sun.COM * A race with shutdown() or connect() would cause problems
114212198SEiji.Ota@Sun.COM * (since rds_ibdev would change) but that should never happen.
114312198SEiji.Ota@Sun.COM */
114412444SGiri.Adari@Sun.COM lock_ptr = ic->i_on_dev_list ?
114512198SEiji.Ota@Sun.COM &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
114612198SEiji.Ota@Sun.COM
114712198SEiji.Ota@Sun.COM mutex_enter(lock_ptr);
114812198SEiji.Ota@Sun.COM list_remove_node(&ic->ib_node);
114912198SEiji.Ota@Sun.COM mutex_exit(lock_ptr);
115012198SEiji.Ota@Sun.COM #endif
115112198SEiji.Ota@Sun.COM kmem_free(ic, sizeof (*ic));
115212198SEiji.Ota@Sun.COM }
115312198SEiji.Ota@Sun.COM
/*
 * Report an error on an IB connection.  The only action taken is to drop
 * the connection via rdsv3_conn_drop().
 */
void
__rdsv3_ib_conn_error(struct rdsv3_connection *conn)
{
	rdsv3_conn_drop(conn);
}
1162