xref: /onnv-gate/usr/src/uts/common/io/ib/clients/rdsv3/ib.c (revision 12922:c58988eacbb2)
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file ib.c
 * Oracle elects to have and use the contents of ib.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
4412198SEiji.Ota@Sun.COM #include <sys/sysmacros.h>
4512198SEiji.Ota@Sun.COM #include <sys/rds.h>
4612198SEiji.Ota@Sun.COM 
4712198SEiji.Ota@Sun.COM #include <sys/ib/ibtl/ibti.h>
4812198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3.h>
4912198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/ib.h>
5012198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
5112198SEiji.Ota@Sun.COM 
5212198SEiji.Ota@Sun.COM unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
5312198SEiji.Ota@Sun.COM 
5412198SEiji.Ota@Sun.COM struct list	rdsv3_ib_devices;
5512198SEiji.Ota@Sun.COM 
5612198SEiji.Ota@Sun.COM /* NOTE: if also grabbing ibdev lock, grab this first */
5712198SEiji.Ota@Sun.COM kmutex_t ib_nodev_conns_lock;
5812198SEiji.Ota@Sun.COM list_t ib_nodev_conns;
5912198SEiji.Ota@Sun.COM 
6012444SGiri.Adari@Sun.COM extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
6112444SGiri.Adari@Sun.COM extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
6212444SGiri.Adari@Sun.COM 
6312198SEiji.Ota@Sun.COM void
rdsv3_ib_add_one(ib_device_t * device)6412198SEiji.Ota@Sun.COM rdsv3_ib_add_one(ib_device_t *device)
6512198SEiji.Ota@Sun.COM {
6612198SEiji.Ota@Sun.COM 	struct rdsv3_ib_device *rds_ibdev;
6712198SEiji.Ota@Sun.COM 	ibt_hca_attr_t *dev_attr;
6812444SGiri.Adari@Sun.COM 	char name[64];
6912198SEiji.Ota@Sun.COM 
7012580SGiri.Adari@Sun.COM 	RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
7112198SEiji.Ota@Sun.COM 
7212198SEiji.Ota@Sun.COM 	/* Only handle IB (no iWARP) devices */
7312198SEiji.Ota@Sun.COM 	if (device->node_type != RDMA_NODE_IB_CA)
7412198SEiji.Ota@Sun.COM 		return;
7512198SEiji.Ota@Sun.COM 
7612198SEiji.Ota@Sun.COM 	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
7712198SEiji.Ota@Sun.COM 	    KM_NOSLEEP);
7812198SEiji.Ota@Sun.COM 	if (!dev_attr)
7912198SEiji.Ota@Sun.COM 		return;
8012198SEiji.Ota@Sun.COM 
8112198SEiji.Ota@Sun.COM 	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
8212444SGiri.Adari@Sun.COM 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
8312198SEiji.Ota@Sun.COM 		    "Query device failed for %s", device->name);
8412198SEiji.Ota@Sun.COM 		goto free_attr;
8512198SEiji.Ota@Sun.COM 	}
8612198SEiji.Ota@Sun.COM 
8712198SEiji.Ota@Sun.COM 	/* We depend on Reserved Lkey */
8812198SEiji.Ota@Sun.COM 	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
8912444SGiri.Adari@Sun.COM 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
9012198SEiji.Ota@Sun.COM 		    "Reserved Lkey support is required: %s",
9112198SEiji.Ota@Sun.COM 		    device->name);
9212198SEiji.Ota@Sun.COM 		goto free_attr;
9312198SEiji.Ota@Sun.COM 	}
9412198SEiji.Ota@Sun.COM 
9512198SEiji.Ota@Sun.COM 	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
9612198SEiji.Ota@Sun.COM 	if (!rds_ibdev)
9712198SEiji.Ota@Sun.COM 		goto free_attr;
9812198SEiji.Ota@Sun.COM 
9912676SEiji.Ota@Sun.COM 	rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
10012676SEiji.Ota@Sun.COM 	rds_ibdev->hca_attr =  *dev_attr;
10112676SEiji.Ota@Sun.COM 
10212676SEiji.Ota@Sun.COM 	rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
10312198SEiji.Ota@Sun.COM 	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
10412198SEiji.Ota@Sun.COM 
10512198SEiji.Ota@Sun.COM 	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
10612198SEiji.Ota@Sun.COM 	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
10712198SEiji.Ota@Sun.COM 
10812676SEiji.Ota@Sun.COM 	rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
10912676SEiji.Ota@Sun.COM 	rds_ibdev->max_responder_resources =
11012676SEiji.Ota@Sun.COM 	    (uint_t)dev_attr->hca_max_rdma_in_qp;
11112676SEiji.Ota@Sun.COM 
11212198SEiji.Ota@Sun.COM 	rds_ibdev->dev = device;
11312198SEiji.Ota@Sun.COM 	rds_ibdev->pd = ib_alloc_pd(device);
11412198SEiji.Ota@Sun.COM 	if (IS_ERR(rds_ibdev->pd))
11512198SEiji.Ota@Sun.COM 		goto free_dev;
11612198SEiji.Ota@Sun.COM 
11712198SEiji.Ota@Sun.COM 	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
11812198SEiji.Ota@Sun.COM 		goto free_dev;
11912198SEiji.Ota@Sun.COM 	}
12012198SEiji.Ota@Sun.COM 
12112676SEiji.Ota@Sun.COM 	if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
12212676SEiji.Ota@Sun.COM 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
12312676SEiji.Ota@Sun.COM 		goto free_dev;
12412676SEiji.Ota@Sun.COM 	}
12512676SEiji.Ota@Sun.COM 
12612444SGiri.Adari@Sun.COM 	(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
12712444SGiri.Adari@Sun.COM 	    (longlong_t)htonll(dev_attr->hca_node_guid));
12812444SGiri.Adari@Sun.COM 	rds_ibdev->ib_frag_slab = kmem_cache_create(name,
12912444SGiri.Adari@Sun.COM 	    sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
13012444SGiri.Adari@Sun.COM 	    rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
13112444SGiri.Adari@Sun.COM 	if (rds_ibdev->ib_frag_slab == NULL) {
13212444SGiri.Adari@Sun.COM 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
13312444SGiri.Adari@Sun.COM 		    "kmem_cache_create for ib_frag_slab failed for device: %s",
13412444SGiri.Adari@Sun.COM 		    device->name);
13512444SGiri.Adari@Sun.COM 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
13612676SEiji.Ota@Sun.COM 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
13712444SGiri.Adari@Sun.COM 		goto free_dev;
13812444SGiri.Adari@Sun.COM 	}
13912444SGiri.Adari@Sun.COM 
14012676SEiji.Ota@Sun.COM 	rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
14112676SEiji.Ota@Sun.COM 	    (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
14212676SEiji.Ota@Sun.COM 	if (rds_ibdev->aft_hcagp == NULL) {
14312676SEiji.Ota@Sun.COM 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
14412676SEiji.Ota@Sun.COM 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
14512676SEiji.Ota@Sun.COM 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
14612676SEiji.Ota@Sun.COM 		goto free_dev;
14712676SEiji.Ota@Sun.COM 	}
14812676SEiji.Ota@Sun.COM 	rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
14912676SEiji.Ota@Sun.COM 	    (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
15012676SEiji.Ota@Sun.COM 	    rds_ibdev->aft_hcagp);
15112676SEiji.Ota@Sun.COM 	if (rds_ibdev->fmr_soft_cq == NULL) {
15212676SEiji.Ota@Sun.COM 		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
15312676SEiji.Ota@Sun.COM 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
15412676SEiji.Ota@Sun.COM 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
15512676SEiji.Ota@Sun.COM 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
15612676SEiji.Ota@Sun.COM 		goto free_dev;
15712676SEiji.Ota@Sun.COM 	}
15812676SEiji.Ota@Sun.COM 
15912676SEiji.Ota@Sun.COM 	rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
16012676SEiji.Ota@Sun.COM 	    (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
16112676SEiji.Ota@Sun.COM 	    rds_ibdev->aft_hcagp);
16212676SEiji.Ota@Sun.COM 	if (rds_ibdev->inc_soft_cq == NULL) {
16312676SEiji.Ota@Sun.COM 		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
16412676SEiji.Ota@Sun.COM 		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
16512676SEiji.Ota@Sun.COM 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
16612676SEiji.Ota@Sun.COM 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
16712676SEiji.Ota@Sun.COM 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
16812676SEiji.Ota@Sun.COM 		goto free_dev;
16912676SEiji.Ota@Sun.COM 	}
17012444SGiri.Adari@Sun.COM 
17112198SEiji.Ota@Sun.COM 	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
17212198SEiji.Ota@Sun.COM 	    offsetof(struct rdsv3_ib_ipaddr, list));
17312198SEiji.Ota@Sun.COM 	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
17412198SEiji.Ota@Sun.COM 	    offsetof(struct rdsv3_ib_connection, ib_node));
17512198SEiji.Ota@Sun.COM 
17612198SEiji.Ota@Sun.COM 	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
17712198SEiji.Ota@Sun.COM 
17812198SEiji.Ota@Sun.COM 	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
17912198SEiji.Ota@Sun.COM 
18012580SGiri.Adari@Sun.COM 	RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
18112198SEiji.Ota@Sun.COM 
18212198SEiji.Ota@Sun.COM 	goto free_attr;
18312198SEiji.Ota@Sun.COM 
18412198SEiji.Ota@Sun.COM err_pd:
18512198SEiji.Ota@Sun.COM 	(void) ib_dealloc_pd(rds_ibdev->pd);
18612198SEiji.Ota@Sun.COM free_dev:
18712676SEiji.Ota@Sun.COM 	mutex_destroy(&rds_ibdev->spinlock);
18812676SEiji.Ota@Sun.COM 	rw_destroy(&rds_ibdev->rwlock);
18912198SEiji.Ota@Sun.COM 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
19012198SEiji.Ota@Sun.COM free_attr:
19112198SEiji.Ota@Sun.COM 	kmem_free(dev_attr, sizeof (*dev_attr));
19212198SEiji.Ota@Sun.COM }
19312198SEiji.Ota@Sun.COM 
19412198SEiji.Ota@Sun.COM void
rdsv3_ib_remove_one(struct ib_device * device)19512198SEiji.Ota@Sun.COM rdsv3_ib_remove_one(struct ib_device *device)
19612198SEiji.Ota@Sun.COM {
19712198SEiji.Ota@Sun.COM 	struct rdsv3_ib_device *rds_ibdev;
19812198SEiji.Ota@Sun.COM 	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
19912198SEiji.Ota@Sun.COM 
20012580SGiri.Adari@Sun.COM 	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
20112198SEiji.Ota@Sun.COM 
20212198SEiji.Ota@Sun.COM 	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
20312198SEiji.Ota@Sun.COM 	if (!rds_ibdev)
20412198SEiji.Ota@Sun.COM 		return;
20512198SEiji.Ota@Sun.COM 
20612198SEiji.Ota@Sun.COM 	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
20712198SEiji.Ota@Sun.COM 	    list) {
20812198SEiji.Ota@Sun.COM 		list_remove_node(&i_ipaddr->list);
20912198SEiji.Ota@Sun.COM 		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
21012198SEiji.Ota@Sun.COM 	}
21112198SEiji.Ota@Sun.COM 
21212198SEiji.Ota@Sun.COM 	rdsv3_ib_destroy_conns(rds_ibdev);
21312198SEiji.Ota@Sun.COM 
21412676SEiji.Ota@Sun.COM 	if (rds_ibdev->fmr_soft_cq)
21512676SEiji.Ota@Sun.COM 		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
21612676SEiji.Ota@Sun.COM 	if (rds_ibdev->inc_soft_cq)
21712676SEiji.Ota@Sun.COM 		rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
21812676SEiji.Ota@Sun.COM 
21912198SEiji.Ota@Sun.COM 	rdsv3_ib_destroy_mr_pool(rds_ibdev);
22012676SEiji.Ota@Sun.COM 	rdsv3_ib_destroy_inc_pool(rds_ibdev);
22112198SEiji.Ota@Sun.COM 
22212444SGiri.Adari@Sun.COM 	kmem_cache_destroy(rds_ibdev->ib_frag_slab);
22312444SGiri.Adari@Sun.COM 
22412676SEiji.Ota@Sun.COM 	rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
22512676SEiji.Ota@Sun.COM 
22612198SEiji.Ota@Sun.COM #if 0
22712198SEiji.Ota@Sun.COM 	while (ib_dealloc_pd(rds_ibdev->pd)) {
22812198SEiji.Ota@Sun.COM #ifndef __lock_lint
22912198SEiji.Ota@Sun.COM 		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
23012198SEiji.Ota@Sun.COM 		    "%s-%d Failed to dealloc pd %p",
23112198SEiji.Ota@Sun.COM 		    __func__, __LINE__, rds_ibdev->pd);
23212198SEiji.Ota@Sun.COM #endif
23312198SEiji.Ota@Sun.COM 		delay(drv_usectohz(1000));
23412198SEiji.Ota@Sun.COM 	}
23512198SEiji.Ota@Sun.COM #else
23612198SEiji.Ota@Sun.COM 	if (ib_dealloc_pd(rds_ibdev->pd)) {
23712198SEiji.Ota@Sun.COM #ifndef __lock_lint
23812198SEiji.Ota@Sun.COM 		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
23912414SEiji.Ota@Sun.COM 		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
24012198SEiji.Ota@Sun.COM #endif
24112198SEiji.Ota@Sun.COM 	}
24212198SEiji.Ota@Sun.COM #endif
24312198SEiji.Ota@Sun.COM 
24412198SEiji.Ota@Sun.COM 	list_destroy(&rds_ibdev->ipaddr_list);
24512198SEiji.Ota@Sun.COM 	list_destroy(&rds_ibdev->conn_list);
24612198SEiji.Ota@Sun.COM 	list_remove_node(&rds_ibdev->list);
24712676SEiji.Ota@Sun.COM 	mutex_destroy(&rds_ibdev->spinlock);
24812676SEiji.Ota@Sun.COM 	rw_destroy(&rds_ibdev->rwlock);
24912198SEiji.Ota@Sun.COM 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
25012198SEiji.Ota@Sun.COM 
25112580SGiri.Adari@Sun.COM 	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
25212198SEiji.Ota@Sun.COM }
25312198SEiji.Ota@Sun.COM 
25412198SEiji.Ota@Sun.COM #ifndef __lock_lint
25512198SEiji.Ota@Sun.COM struct ib_client rdsv3_ib_client = {
25612198SEiji.Ota@Sun.COM 	.name		= "rdsv3_ib",
25712198SEiji.Ota@Sun.COM 	.add		= rdsv3_ib_add_one,
25812198SEiji.Ota@Sun.COM 	.remove		= rdsv3_ib_remove_one,
25912198SEiji.Ota@Sun.COM 	.clnt_hdl	= NULL,
26012198SEiji.Ota@Sun.COM 	.state		= IB_CLNT_UNINITIALIZED
26112198SEiji.Ota@Sun.COM };
26212198SEiji.Ota@Sun.COM #else
26312198SEiji.Ota@Sun.COM struct ib_client rdsv3_ib_client = {
26412198SEiji.Ota@Sun.COM 	"rdsv3_ib",
26512198SEiji.Ota@Sun.COM 	rdsv3_ib_add_one,
26612198SEiji.Ota@Sun.COM 	rdsv3_ib_remove_one,
26712198SEiji.Ota@Sun.COM 	NULL,
26812198SEiji.Ota@Sun.COM 	NULL,
26912198SEiji.Ota@Sun.COM 	IB_CLNT_UNINITIALIZED
27012198SEiji.Ota@Sun.COM };
27112198SEiji.Ota@Sun.COM #endif
27212198SEiji.Ota@Sun.COM 
27312198SEiji.Ota@Sun.COM static int
rds_ib_conn_info_visitor(struct rdsv3_connection * conn,void * buffer)27412198SEiji.Ota@Sun.COM rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
27512198SEiji.Ota@Sun.COM     void *buffer)
27612198SEiji.Ota@Sun.COM {
27712863SEiji.Ota@Sun.COM 	struct rds_info_rdma_connection *iinfo = buffer;
27812198SEiji.Ota@Sun.COM 	struct rdsv3_ib_connection *ic;
27912198SEiji.Ota@Sun.COM 
28012198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
28112198SEiji.Ota@Sun.COM 	    conn, buffer);
28212198SEiji.Ota@Sun.COM 
28312198SEiji.Ota@Sun.COM 	/* We will only ever look at IB transports */
28412198SEiji.Ota@Sun.COM 	if (conn->c_trans != &rdsv3_ib_transport)
28512198SEiji.Ota@Sun.COM 		return (0);
28612198SEiji.Ota@Sun.COM 
28712198SEiji.Ota@Sun.COM 	iinfo->src_addr = conn->c_laddr;
28812198SEiji.Ota@Sun.COM 	iinfo->dst_addr = conn->c_faddr;
28912198SEiji.Ota@Sun.COM 
29012198SEiji.Ota@Sun.COM 	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
29112198SEiji.Ota@Sun.COM 	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
29212198SEiji.Ota@Sun.COM 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
29312198SEiji.Ota@Sun.COM 		struct rdsv3_ib_device *rds_ibdev;
29412198SEiji.Ota@Sun.COM 		struct rdma_dev_addr *dev_addr;
29512198SEiji.Ota@Sun.COM 
29612198SEiji.Ota@Sun.COM 		ic = conn->c_transport_data;
29712198SEiji.Ota@Sun.COM 		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
29812198SEiji.Ota@Sun.COM 
29912198SEiji.Ota@Sun.COM 		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
30012198SEiji.Ota@Sun.COM 		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
30112198SEiji.Ota@Sun.COM 
30212198SEiji.Ota@Sun.COM 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
30312198SEiji.Ota@Sun.COM 		    &rdsv3_ib_client);
30412198SEiji.Ota@Sun.COM 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
30512198SEiji.Ota@Sun.COM 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
30612198SEiji.Ota@Sun.COM 		iinfo->max_send_sge = rds_ibdev->max_sge;
30712198SEiji.Ota@Sun.COM 	}
30812198SEiji.Ota@Sun.COM 
30912198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
31012198SEiji.Ota@Sun.COM 	    conn, buffer);
31112198SEiji.Ota@Sun.COM 	return (1);
31212198SEiji.Ota@Sun.COM }
31312198SEiji.Ota@Sun.COM 
31412198SEiji.Ota@Sun.COM static void
rds_ib_ic_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens)31512198SEiji.Ota@Sun.COM rds_ib_ic_info(struct rsock *sock, unsigned int len,
31612198SEiji.Ota@Sun.COM     struct rdsv3_info_iterator *iter,
31712198SEiji.Ota@Sun.COM     struct rdsv3_info_lengths *lens)
31812198SEiji.Ota@Sun.COM {
31912198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
32012198SEiji.Ota@Sun.COM 	    sock, iter, lens, len);
32112198SEiji.Ota@Sun.COM 
32212198SEiji.Ota@Sun.COM 	rdsv3_for_each_conn_info(sock, len, iter, lens,
32312198SEiji.Ota@Sun.COM 	    rds_ib_conn_info_visitor,
32412863SEiji.Ota@Sun.COM 	    sizeof (struct rds_info_rdma_connection));
32512198SEiji.Ota@Sun.COM }
32612198SEiji.Ota@Sun.COM 
32712198SEiji.Ota@Sun.COM /*
32812198SEiji.Ota@Sun.COM  * Early RDS/IB was built to only bind to an address if there is an IPoIB
32912198SEiji.Ota@Sun.COM  * device with that address set.
33012198SEiji.Ota@Sun.COM  *
33112198SEiji.Ota@Sun.COM  * If it were me, I'd advocate for something more flexible.  Sending and
33212198SEiji.Ota@Sun.COM  * receiving should be device-agnostic.  Transports would try and maintain
33312198SEiji.Ota@Sun.COM  * connections between peers who have messages queued.  Userspace would be
33412198SEiji.Ota@Sun.COM  * allowed to influence which paths have priority.  We could call userspace
33512198SEiji.Ota@Sun.COM  * asserting this policy "routing".
33612198SEiji.Ota@Sun.COM  */
33712198SEiji.Ota@Sun.COM static int
rds_ib_laddr_check(uint32_be_t addr)33812198SEiji.Ota@Sun.COM rds_ib_laddr_check(uint32_be_t addr)
33912198SEiji.Ota@Sun.COM {
34012198SEiji.Ota@Sun.COM 	int ret;
34112198SEiji.Ota@Sun.COM 	struct rdma_cm_id *cm_id;
34212198SEiji.Ota@Sun.COM 	struct sockaddr_in sin;
34312198SEiji.Ota@Sun.COM 
34412198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
34512198SEiji.Ota@Sun.COM 
34612198SEiji.Ota@Sun.COM 	/*
34712198SEiji.Ota@Sun.COM 	 * Create a CMA ID and try to bind it. This catches both
34812198SEiji.Ota@Sun.COM 	 * IB and iWARP capable NICs.
34912198SEiji.Ota@Sun.COM 	 */
35012198SEiji.Ota@Sun.COM 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
35112414SEiji.Ota@Sun.COM 	if (!cm_id)
35212414SEiji.Ota@Sun.COM 		return (-EADDRNOTAVAIL);
35312198SEiji.Ota@Sun.COM 
35412198SEiji.Ota@Sun.COM 	(void) memset(&sin, 0, sizeof (sin));
35512198SEiji.Ota@Sun.COM 	sin.sin_family = AF_INET;
35612198SEiji.Ota@Sun.COM 	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
35712198SEiji.Ota@Sun.COM 
35812198SEiji.Ota@Sun.COM 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
35912198SEiji.Ota@Sun.COM 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
36012198SEiji.Ota@Sun.COM 	/*
36112198SEiji.Ota@Sun.COM 	 * due to this, we will claim to support iWARP devices unless we
36212198SEiji.Ota@Sun.COM 	 * check node_type.
36312198SEiji.Ota@Sun.COM 	 */
36412198SEiji.Ota@Sun.COM 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
36512198SEiji.Ota@Sun.COM 		ret = -EADDRNOTAVAIL;
36612198SEiji.Ota@Sun.COM 
36712198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF5("rds_ib_laddr_check",
36812198SEiji.Ota@Sun.COM 	    "addr %u.%u.%u.%u ret %d node type %d",
36912198SEiji.Ota@Sun.COM 	    NIPQUAD(addr), ret,
37012198SEiji.Ota@Sun.COM 	    cm_id->device ? cm_id->device->node_type : -1);
37112198SEiji.Ota@Sun.COM 
37212198SEiji.Ota@Sun.COM 	rdma_destroy_id(cm_id);
37312198SEiji.Ota@Sun.COM 
37412198SEiji.Ota@Sun.COM 	return (ret);
37512198SEiji.Ota@Sun.COM }
37612198SEiji.Ota@Sun.COM 
37712198SEiji.Ota@Sun.COM void
rdsv3_ib_exit(void)37812198SEiji.Ota@Sun.COM rdsv3_ib_exit(void)
37912198SEiji.Ota@Sun.COM {
38012198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rds_ib_exit", "Enter");
38112198SEiji.Ota@Sun.COM 
38212863SEiji.Ota@Sun.COM 	rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
38312198SEiji.Ota@Sun.COM 	rdsv3_ib_destroy_nodev_conns();
38412198SEiji.Ota@Sun.COM 	ib_unregister_client(&rdsv3_ib_client);
38512198SEiji.Ota@Sun.COM 	rdsv3_ib_sysctl_exit();
38612198SEiji.Ota@Sun.COM 	rdsv3_ib_recv_exit();
38712198SEiji.Ota@Sun.COM 	rdsv3_trans_unregister(&rdsv3_ib_transport);
388*12922SGiri.Adari@Sun.COM 	kmem_free(rdsv3_ib_stats,
389*12922SGiri.Adari@Sun.COM 	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
39012198SEiji.Ota@Sun.COM 	mutex_destroy(&ib_nodev_conns_lock);
39112198SEiji.Ota@Sun.COM 	list_destroy(&ib_nodev_conns);
39212198SEiji.Ota@Sun.COM 	list_destroy(&rdsv3_ib_devices);
39312198SEiji.Ota@Sun.COM 
39412198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rds_ib_exit", "Return");
39512198SEiji.Ota@Sun.COM }
39612198SEiji.Ota@Sun.COM 
39712198SEiji.Ota@Sun.COM #ifndef __lock_lint
39812198SEiji.Ota@Sun.COM struct rdsv3_transport rdsv3_ib_transport = {
39912198SEiji.Ota@Sun.COM 	.laddr_check		= rds_ib_laddr_check,
40012198SEiji.Ota@Sun.COM 	.xmit_complete		= rdsv3_ib_xmit_complete,
40112198SEiji.Ota@Sun.COM 	.xmit			= rdsv3_ib_xmit,
40212198SEiji.Ota@Sun.COM 	.xmit_cong_map		= NULL,
40312198SEiji.Ota@Sun.COM 	.xmit_rdma		= rdsv3_ib_xmit_rdma,
40412198SEiji.Ota@Sun.COM 	.recv			= rdsv3_ib_recv,
40512198SEiji.Ota@Sun.COM 	.conn_alloc		= rdsv3_ib_conn_alloc,
40612198SEiji.Ota@Sun.COM 	.conn_free		= rdsv3_ib_conn_free,
40712198SEiji.Ota@Sun.COM 	.conn_connect		= rdsv3_ib_conn_connect,
40812198SEiji.Ota@Sun.COM 	.conn_shutdown		= rdsv3_ib_conn_shutdown,
40912198SEiji.Ota@Sun.COM 	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
41012198SEiji.Ota@Sun.COM 	.inc_free		= rdsv3_ib_inc_free,
41112198SEiji.Ota@Sun.COM 	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
41212198SEiji.Ota@Sun.COM 	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
41312198SEiji.Ota@Sun.COM 	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
41412198SEiji.Ota@Sun.COM 	.stats_info_copy	= rdsv3_ib_stats_info_copy,
41512198SEiji.Ota@Sun.COM 	.exit			= rdsv3_ib_exit,
41612198SEiji.Ota@Sun.COM 	.get_mr			= rdsv3_ib_get_mr,
41712198SEiji.Ota@Sun.COM 	.sync_mr		= rdsv3_ib_sync_mr,
41812198SEiji.Ota@Sun.COM 	.free_mr		= rdsv3_ib_free_mr,
41912198SEiji.Ota@Sun.COM 	.flush_mrs		= rdsv3_ib_flush_mrs,
42012198SEiji.Ota@Sun.COM 	.t_name			= "infiniband",
42112414SEiji.Ota@Sun.COM 	.t_type			= RDS_TRANS_IB
42212198SEiji.Ota@Sun.COM };
42312198SEiji.Ota@Sun.COM #else
42412198SEiji.Ota@Sun.COM struct rdsv3_transport rdsv3_ib_transport;
42512198SEiji.Ota@Sun.COM #endif
42612198SEiji.Ota@Sun.COM 
42712198SEiji.Ota@Sun.COM int
rdsv3_ib_init(void)42812198SEiji.Ota@Sun.COM rdsv3_ib_init(void)
42912198SEiji.Ota@Sun.COM {
43012198SEiji.Ota@Sun.COM 	int ret;
43112198SEiji.Ota@Sun.COM 
43212198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rds_ib_init", "Enter");
43312198SEiji.Ota@Sun.COM 
43412198SEiji.Ota@Sun.COM 	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
43512198SEiji.Ota@Sun.COM 	    offsetof(struct rdsv3_ib_device, list));
43612198SEiji.Ota@Sun.COM 	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
43712198SEiji.Ota@Sun.COM 	    offsetof(struct rdsv3_ib_connection, ib_node));
43812198SEiji.Ota@Sun.COM 	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
43912198SEiji.Ota@Sun.COM 
440*12922SGiri.Adari@Sun.COM 	/* allocate space for ib statistics */
441*12922SGiri.Adari@Sun.COM 	ASSERT(rdsv3_ib_stats == NULL);
442*12922SGiri.Adari@Sun.COM 	rdsv3_ib_stats = kmem_zalloc(nr_cpus *
443*12922SGiri.Adari@Sun.COM 	    sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
444*12922SGiri.Adari@Sun.COM 
44512198SEiji.Ota@Sun.COM 	rdsv3_ib_client.dip = rdsv3_dev_info;
44612198SEiji.Ota@Sun.COM 	ret = ib_register_client(&rdsv3_ib_client);
44712198SEiji.Ota@Sun.COM 	if (ret)
44812198SEiji.Ota@Sun.COM 		goto out;
44912198SEiji.Ota@Sun.COM 
45012198SEiji.Ota@Sun.COM 	ret = rdsv3_ib_sysctl_init();
45112198SEiji.Ota@Sun.COM 	if (ret)
45212198SEiji.Ota@Sun.COM 		goto out_ibreg;
45312198SEiji.Ota@Sun.COM 
45412198SEiji.Ota@Sun.COM 	ret = rdsv3_ib_recv_init();
45512198SEiji.Ota@Sun.COM 	if (ret)
45612198SEiji.Ota@Sun.COM 		goto out_sysctl;
45712198SEiji.Ota@Sun.COM 
45812198SEiji.Ota@Sun.COM 	ret = rdsv3_trans_register(&rdsv3_ib_transport);
45912198SEiji.Ota@Sun.COM 	if (ret)
46012198SEiji.Ota@Sun.COM 		goto out_recv;
46112198SEiji.Ota@Sun.COM 
46212863SEiji.Ota@Sun.COM 	rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
46312198SEiji.Ota@Sun.COM 
46412198SEiji.Ota@Sun.COM 	RDSV3_DPRINTF4("rds_ib_init", "Return");
46512198SEiji.Ota@Sun.COM 
46612198SEiji.Ota@Sun.COM 	return (0);
46712198SEiji.Ota@Sun.COM 
46812198SEiji.Ota@Sun.COM out_recv:
46912198SEiji.Ota@Sun.COM 	rdsv3_ib_recv_exit();
47012198SEiji.Ota@Sun.COM out_sysctl:
47112198SEiji.Ota@Sun.COM 	rdsv3_ib_sysctl_exit();
47212198SEiji.Ota@Sun.COM out_ibreg:
47312198SEiji.Ota@Sun.COM 	ib_unregister_client(&rdsv3_ib_client);
47412198SEiji.Ota@Sun.COM out:
475*12922SGiri.Adari@Sun.COM 	kmem_free(rdsv3_ib_stats,
476*12922SGiri.Adari@Sun.COM 	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
47712198SEiji.Ota@Sun.COM 	mutex_destroy(&ib_nodev_conns_lock);
47812198SEiji.Ota@Sun.COM 	list_destroy(&ib_nodev_conns);
47912198SEiji.Ota@Sun.COM 	list_destroy(&rdsv3_ib_devices);
48012198SEiji.Ota@Sun.COM 	return (ret);
48112198SEiji.Ota@Sun.COM }
482