xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/ib.c (revision 6a634c9dca3093f3922e4b7ab826d7bdf17bf78e)
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
4c0dd49bdSEiji Ota 
/*
 * This file contains code imported from the OFED rds source file ib.c
 * Oracle elects to have and use the contents of ib.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */
1116e76cddSagiri 
/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
44c0dd49bdSEiji Ota #include <sys/sysmacros.h>
45c0dd49bdSEiji Ota #include <sys/rds.h>
46c0dd49bdSEiji Ota 
47c0dd49bdSEiji Ota #include <sys/ib/ibtl/ibti.h>
48c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h>
49c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/ib.h>
50c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
51c0dd49bdSEiji Ota 
52c0dd49bdSEiji Ota unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
53c0dd49bdSEiji Ota 
54c0dd49bdSEiji Ota struct list	rdsv3_ib_devices;
55c0dd49bdSEiji Ota 
56c0dd49bdSEiji Ota /* NOTE: if also grabbing ibdev lock, grab this first */
57c0dd49bdSEiji Ota kmutex_t ib_nodev_conns_lock;
58c0dd49bdSEiji Ota list_t ib_nodev_conns;
59c0dd49bdSEiji Ota 
60d2b539e7Sagiri extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
61d2b539e7Sagiri extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
62d2b539e7Sagiri 
63c0dd49bdSEiji Ota void
rdsv3_ib_add_one(ib_device_t * device)64c0dd49bdSEiji Ota rdsv3_ib_add_one(ib_device_t *device)
65c0dd49bdSEiji Ota {
66c0dd49bdSEiji Ota 	struct rdsv3_ib_device *rds_ibdev;
67c0dd49bdSEiji Ota 	ibt_hca_attr_t *dev_attr;
68d2b539e7Sagiri 	char name[64];
69c0dd49bdSEiji Ota 
70b27516f5Sagiri 	RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
71c0dd49bdSEiji Ota 
72c0dd49bdSEiji Ota 	/* Only handle IB (no iWARP) devices */
73c0dd49bdSEiji Ota 	if (device->node_type != RDMA_NODE_IB_CA)
74c0dd49bdSEiji Ota 		return;
75c0dd49bdSEiji Ota 
76c0dd49bdSEiji Ota 	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
77c0dd49bdSEiji Ota 	    KM_NOSLEEP);
78c0dd49bdSEiji Ota 	if (!dev_attr)
79c0dd49bdSEiji Ota 		return;
80c0dd49bdSEiji Ota 
81c0dd49bdSEiji Ota 	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
82d2b539e7Sagiri 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
83c0dd49bdSEiji Ota 		    "Query device failed for %s", device->name);
84c0dd49bdSEiji Ota 		goto free_attr;
85c0dd49bdSEiji Ota 	}
86c0dd49bdSEiji Ota 
87c0dd49bdSEiji Ota 	/* We depend on Reserved Lkey */
88c0dd49bdSEiji Ota 	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
89d2b539e7Sagiri 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
90c0dd49bdSEiji Ota 		    "Reserved Lkey support is required: %s",
91c0dd49bdSEiji Ota 		    device->name);
92c0dd49bdSEiji Ota 		goto free_attr;
93c0dd49bdSEiji Ota 	}
94c0dd49bdSEiji Ota 
95c0dd49bdSEiji Ota 	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
96c0dd49bdSEiji Ota 	if (!rds_ibdev)
97c0dd49bdSEiji Ota 		goto free_attr;
98c0dd49bdSEiji Ota 
995d5562f5SEiji Ota 	rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
1005d5562f5SEiji Ota 	rds_ibdev->hca_attr =  *dev_attr;
1015d5562f5SEiji Ota 
1025d5562f5SEiji Ota 	rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
103c0dd49bdSEiji Ota 	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
104c0dd49bdSEiji Ota 
105c0dd49bdSEiji Ota 	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
106c0dd49bdSEiji Ota 	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
107c0dd49bdSEiji Ota 
1085d5562f5SEiji Ota 	rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
1095d5562f5SEiji Ota 	rds_ibdev->max_responder_resources =
1105d5562f5SEiji Ota 	    (uint_t)dev_attr->hca_max_rdma_in_qp;
1115d5562f5SEiji Ota 
112c0dd49bdSEiji Ota 	rds_ibdev->dev = device;
113c0dd49bdSEiji Ota 	rds_ibdev->pd = ib_alloc_pd(device);
114c0dd49bdSEiji Ota 	if (IS_ERR(rds_ibdev->pd))
115c0dd49bdSEiji Ota 		goto free_dev;
116c0dd49bdSEiji Ota 
117c0dd49bdSEiji Ota 	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
118c0dd49bdSEiji Ota 		goto free_dev;
119c0dd49bdSEiji Ota 	}
120c0dd49bdSEiji Ota 
1215d5562f5SEiji Ota 	if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
1225d5562f5SEiji Ota 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
1235d5562f5SEiji Ota 		goto free_dev;
1245d5562f5SEiji Ota 	}
1255d5562f5SEiji Ota 
126d2b539e7Sagiri 	(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
127d2b539e7Sagiri 	    (longlong_t)htonll(dev_attr->hca_node_guid));
128d2b539e7Sagiri 	rds_ibdev->ib_frag_slab = kmem_cache_create(name,
129d2b539e7Sagiri 	    sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
130d2b539e7Sagiri 	    rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
131d2b539e7Sagiri 	if (rds_ibdev->ib_frag_slab == NULL) {
132d2b539e7Sagiri 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
133d2b539e7Sagiri 		    "kmem_cache_create for ib_frag_slab failed for device: %s",
134d2b539e7Sagiri 		    device->name);
135d2b539e7Sagiri 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
1365d5562f5SEiji Ota 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
137d2b539e7Sagiri 		goto free_dev;
138d2b539e7Sagiri 	}
139d2b539e7Sagiri 
1405d5562f5SEiji Ota 	rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
1415d5562f5SEiji Ota 	    (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
1425d5562f5SEiji Ota 	if (rds_ibdev->aft_hcagp == NULL) {
1435d5562f5SEiji Ota 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
1445d5562f5SEiji Ota 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
1455d5562f5SEiji Ota 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
1465d5562f5SEiji Ota 		goto free_dev;
1475d5562f5SEiji Ota 	}
1485d5562f5SEiji Ota 	rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
1495d5562f5SEiji Ota 	    (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
1505d5562f5SEiji Ota 	    rds_ibdev->aft_hcagp);
1515d5562f5SEiji Ota 	if (rds_ibdev->fmr_soft_cq == NULL) {
1525d5562f5SEiji Ota 		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
1535d5562f5SEiji Ota 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
1545d5562f5SEiji Ota 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
1555d5562f5SEiji Ota 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
1565d5562f5SEiji Ota 		goto free_dev;
1575d5562f5SEiji Ota 	}
1585d5562f5SEiji Ota 
1595d5562f5SEiji Ota 	rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
1605d5562f5SEiji Ota 	    (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
1615d5562f5SEiji Ota 	    rds_ibdev->aft_hcagp);
1625d5562f5SEiji Ota 	if (rds_ibdev->inc_soft_cq == NULL) {
1635d5562f5SEiji Ota 		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
1645d5562f5SEiji Ota 		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
1655d5562f5SEiji Ota 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
1665d5562f5SEiji Ota 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
1675d5562f5SEiji Ota 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
1685d5562f5SEiji Ota 		goto free_dev;
1695d5562f5SEiji Ota 	}
170d2b539e7Sagiri 
171c0dd49bdSEiji Ota 	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
172c0dd49bdSEiji Ota 	    offsetof(struct rdsv3_ib_ipaddr, list));
173c0dd49bdSEiji Ota 	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
174c0dd49bdSEiji Ota 	    offsetof(struct rdsv3_ib_connection, ib_node));
175c0dd49bdSEiji Ota 
176c0dd49bdSEiji Ota 	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
177c0dd49bdSEiji Ota 
178c0dd49bdSEiji Ota 	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
179c0dd49bdSEiji Ota 
180b27516f5Sagiri 	RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
181c0dd49bdSEiji Ota 
182c0dd49bdSEiji Ota 	goto free_attr;
183c0dd49bdSEiji Ota 
184c0dd49bdSEiji Ota err_pd:
185c0dd49bdSEiji Ota 	(void) ib_dealloc_pd(rds_ibdev->pd);
186c0dd49bdSEiji Ota free_dev:
1875d5562f5SEiji Ota 	mutex_destroy(&rds_ibdev->spinlock);
1885d5562f5SEiji Ota 	rw_destroy(&rds_ibdev->rwlock);
189c0dd49bdSEiji Ota 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
190c0dd49bdSEiji Ota free_attr:
191c0dd49bdSEiji Ota 	kmem_free(dev_attr, sizeof (*dev_attr));
192c0dd49bdSEiji Ota }
193c0dd49bdSEiji Ota 
194c0dd49bdSEiji Ota void
rdsv3_ib_remove_one(struct ib_device * device)195c0dd49bdSEiji Ota rdsv3_ib_remove_one(struct ib_device *device)
196c0dd49bdSEiji Ota {
197c0dd49bdSEiji Ota 	struct rdsv3_ib_device *rds_ibdev;
198c0dd49bdSEiji Ota 	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
199c0dd49bdSEiji Ota 
200b27516f5Sagiri 	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
201c0dd49bdSEiji Ota 
202c0dd49bdSEiji Ota 	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
203c0dd49bdSEiji Ota 	if (!rds_ibdev)
204c0dd49bdSEiji Ota 		return;
205c0dd49bdSEiji Ota 
206c0dd49bdSEiji Ota 	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
207c0dd49bdSEiji Ota 	    list) {
208c0dd49bdSEiji Ota 		list_remove_node(&i_ipaddr->list);
209c0dd49bdSEiji Ota 		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
210c0dd49bdSEiji Ota 	}
211c0dd49bdSEiji Ota 
212c0dd49bdSEiji Ota 	rdsv3_ib_destroy_conns(rds_ibdev);
213c0dd49bdSEiji Ota 
2145d5562f5SEiji Ota 	if (rds_ibdev->fmr_soft_cq)
2155d5562f5SEiji Ota 		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
2165d5562f5SEiji Ota 	if (rds_ibdev->inc_soft_cq)
2175d5562f5SEiji Ota 		rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
2185d5562f5SEiji Ota 
219c0dd49bdSEiji Ota 	rdsv3_ib_destroy_mr_pool(rds_ibdev);
2205d5562f5SEiji Ota 	rdsv3_ib_destroy_inc_pool(rds_ibdev);
221c0dd49bdSEiji Ota 
222d2b539e7Sagiri 	kmem_cache_destroy(rds_ibdev->ib_frag_slab);
223d2b539e7Sagiri 
2245d5562f5SEiji Ota 	rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
2255d5562f5SEiji Ota 
226c0dd49bdSEiji Ota #if 0
227c0dd49bdSEiji Ota 	while (ib_dealloc_pd(rds_ibdev->pd)) {
228c0dd49bdSEiji Ota #ifndef __lock_lint
229c0dd49bdSEiji Ota 		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
230c0dd49bdSEiji Ota 		    "%s-%d Failed to dealloc pd %p",
231c0dd49bdSEiji Ota 		    __func__, __LINE__, rds_ibdev->pd);
232c0dd49bdSEiji Ota #endif
233c0dd49bdSEiji Ota 		delay(drv_usectohz(1000));
234c0dd49bdSEiji Ota 	}
235c0dd49bdSEiji Ota #else
236c0dd49bdSEiji Ota 	if (ib_dealloc_pd(rds_ibdev->pd)) {
237c0dd49bdSEiji Ota #ifndef __lock_lint
238c0dd49bdSEiji Ota 		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
239cadbfdc3SEiji Ota 		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
240c0dd49bdSEiji Ota #endif
241c0dd49bdSEiji Ota 	}
242c0dd49bdSEiji Ota #endif
243c0dd49bdSEiji Ota 
244c0dd49bdSEiji Ota 	list_destroy(&rds_ibdev->ipaddr_list);
245c0dd49bdSEiji Ota 	list_destroy(&rds_ibdev->conn_list);
246c0dd49bdSEiji Ota 	list_remove_node(&rds_ibdev->list);
2475d5562f5SEiji Ota 	mutex_destroy(&rds_ibdev->spinlock);
2485d5562f5SEiji Ota 	rw_destroy(&rds_ibdev->rwlock);
249c0dd49bdSEiji Ota 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
250c0dd49bdSEiji Ota 
251b27516f5Sagiri 	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
252c0dd49bdSEiji Ota }
253c0dd49bdSEiji Ota 
254c0dd49bdSEiji Ota #ifndef __lock_lint
255c0dd49bdSEiji Ota struct ib_client rdsv3_ib_client = {
256c0dd49bdSEiji Ota 	.name		= "rdsv3_ib",
257c0dd49bdSEiji Ota 	.add		= rdsv3_ib_add_one,
258c0dd49bdSEiji Ota 	.remove		= rdsv3_ib_remove_one,
259c0dd49bdSEiji Ota 	.clnt_hdl	= NULL,
260c0dd49bdSEiji Ota 	.state		= IB_CLNT_UNINITIALIZED
261c0dd49bdSEiji Ota };
262c0dd49bdSEiji Ota #else
263c0dd49bdSEiji Ota struct ib_client rdsv3_ib_client = {
264c0dd49bdSEiji Ota 	"rdsv3_ib",
265c0dd49bdSEiji Ota 	rdsv3_ib_add_one,
266c0dd49bdSEiji Ota 	rdsv3_ib_remove_one,
267c0dd49bdSEiji Ota 	NULL,
268c0dd49bdSEiji Ota 	NULL,
269c0dd49bdSEiji Ota 	IB_CLNT_UNINITIALIZED
270c0dd49bdSEiji Ota };
271c0dd49bdSEiji Ota #endif
272c0dd49bdSEiji Ota 
273c0dd49bdSEiji Ota static int
rds_ib_conn_info_visitor(struct rdsv3_connection * conn,void * buffer)274c0dd49bdSEiji Ota rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
275c0dd49bdSEiji Ota     void *buffer)
276c0dd49bdSEiji Ota {
277fe817b60SEiji Ota 	struct rds_info_rdma_connection *iinfo = buffer;
278c0dd49bdSEiji Ota 	struct rdsv3_ib_connection *ic;
279c0dd49bdSEiji Ota 
280c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
281c0dd49bdSEiji Ota 	    conn, buffer);
282c0dd49bdSEiji Ota 
283c0dd49bdSEiji Ota 	/* We will only ever look at IB transports */
284c0dd49bdSEiji Ota 	if (conn->c_trans != &rdsv3_ib_transport)
285c0dd49bdSEiji Ota 		return (0);
286c0dd49bdSEiji Ota 
287c0dd49bdSEiji Ota 	iinfo->src_addr = conn->c_laddr;
288c0dd49bdSEiji Ota 	iinfo->dst_addr = conn->c_faddr;
289c0dd49bdSEiji Ota 
290c0dd49bdSEiji Ota 	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
291c0dd49bdSEiji Ota 	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
292c0dd49bdSEiji Ota 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
293c0dd49bdSEiji Ota 		struct rdsv3_ib_device *rds_ibdev;
294c0dd49bdSEiji Ota 		struct rdma_dev_addr *dev_addr;
295c0dd49bdSEiji Ota 
296c0dd49bdSEiji Ota 		ic = conn->c_transport_data;
297c0dd49bdSEiji Ota 		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
298c0dd49bdSEiji Ota 
299c0dd49bdSEiji Ota 		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
300c0dd49bdSEiji Ota 		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
301c0dd49bdSEiji Ota 
302c0dd49bdSEiji Ota 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
303c0dd49bdSEiji Ota 		    &rdsv3_ib_client);
304c0dd49bdSEiji Ota 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
305c0dd49bdSEiji Ota 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
306c0dd49bdSEiji Ota 		iinfo->max_send_sge = rds_ibdev->max_sge;
307c0dd49bdSEiji Ota 	}
308c0dd49bdSEiji Ota 
309c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
310c0dd49bdSEiji Ota 	    conn, buffer);
311c0dd49bdSEiji Ota 	return (1);
312c0dd49bdSEiji Ota }
313c0dd49bdSEiji Ota 
314c0dd49bdSEiji Ota static void
rds_ib_ic_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens)315c0dd49bdSEiji Ota rds_ib_ic_info(struct rsock *sock, unsigned int len,
316c0dd49bdSEiji Ota     struct rdsv3_info_iterator *iter,
317c0dd49bdSEiji Ota     struct rdsv3_info_lengths *lens)
318c0dd49bdSEiji Ota {
319c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
320c0dd49bdSEiji Ota 	    sock, iter, lens, len);
321c0dd49bdSEiji Ota 
322c0dd49bdSEiji Ota 	rdsv3_for_each_conn_info(sock, len, iter, lens,
323c0dd49bdSEiji Ota 	    rds_ib_conn_info_visitor,
324fe817b60SEiji Ota 	    sizeof (struct rds_info_rdma_connection));
325c0dd49bdSEiji Ota }
326c0dd49bdSEiji Ota 
327c0dd49bdSEiji Ota /*
328c0dd49bdSEiji Ota  * Early RDS/IB was built to only bind to an address if there is an IPoIB
329c0dd49bdSEiji Ota  * device with that address set.
330c0dd49bdSEiji Ota  *
331c0dd49bdSEiji Ota  * If it were me, I'd advocate for something more flexible.  Sending and
332c0dd49bdSEiji Ota  * receiving should be device-agnostic.  Transports would try and maintain
333c0dd49bdSEiji Ota  * connections between peers who have messages queued.  Userspace would be
334c0dd49bdSEiji Ota  * allowed to influence which paths have priority.  We could call userspace
335c0dd49bdSEiji Ota  * asserting this policy "routing".
336c0dd49bdSEiji Ota  */
337c0dd49bdSEiji Ota static int
rds_ib_laddr_check(uint32_be_t addr)338c0dd49bdSEiji Ota rds_ib_laddr_check(uint32_be_t addr)
339c0dd49bdSEiji Ota {
340c0dd49bdSEiji Ota 	int ret;
341c0dd49bdSEiji Ota 	struct rdma_cm_id *cm_id;
342c0dd49bdSEiji Ota 	struct sockaddr_in sin;
343c0dd49bdSEiji Ota 
344c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
345c0dd49bdSEiji Ota 
346c0dd49bdSEiji Ota 	/*
347c0dd49bdSEiji Ota 	 * Create a CMA ID and try to bind it. This catches both
348c0dd49bdSEiji Ota 	 * IB and iWARP capable NICs.
349c0dd49bdSEiji Ota 	 */
350c0dd49bdSEiji Ota 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
351cadbfdc3SEiji Ota 	if (!cm_id)
352cadbfdc3SEiji Ota 		return (-EADDRNOTAVAIL);
353c0dd49bdSEiji Ota 
354c0dd49bdSEiji Ota 	(void) memset(&sin, 0, sizeof (sin));
355c0dd49bdSEiji Ota 	sin.sin_family = AF_INET;
356c0dd49bdSEiji Ota 	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
357c0dd49bdSEiji Ota 
358c0dd49bdSEiji Ota 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
359c0dd49bdSEiji Ota 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
360c0dd49bdSEiji Ota 	/*
361c0dd49bdSEiji Ota 	 * due to this, we will claim to support iWARP devices unless we
362c0dd49bdSEiji Ota 	 * check node_type.
363c0dd49bdSEiji Ota 	 */
364c0dd49bdSEiji Ota 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
365c0dd49bdSEiji Ota 		ret = -EADDRNOTAVAIL;
366c0dd49bdSEiji Ota 
367c0dd49bdSEiji Ota 	RDSV3_DPRINTF5("rds_ib_laddr_check",
368c0dd49bdSEiji Ota 	    "addr %u.%u.%u.%u ret %d node type %d",
369c0dd49bdSEiji Ota 	    NIPQUAD(addr), ret,
370c0dd49bdSEiji Ota 	    cm_id->device ? cm_id->device->node_type : -1);
371c0dd49bdSEiji Ota 
372c0dd49bdSEiji Ota 	rdma_destroy_id(cm_id);
373c0dd49bdSEiji Ota 
374c0dd49bdSEiji Ota 	return (ret);
375c0dd49bdSEiji Ota }
376c0dd49bdSEiji Ota 
377c0dd49bdSEiji Ota void
rdsv3_ib_exit(void)378c0dd49bdSEiji Ota rdsv3_ib_exit(void)
379c0dd49bdSEiji Ota {
380c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rds_ib_exit", "Enter");
381c0dd49bdSEiji Ota 
382fe817b60SEiji Ota 	rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
383c0dd49bdSEiji Ota 	rdsv3_ib_destroy_nodev_conns();
384c0dd49bdSEiji Ota 	ib_unregister_client(&rdsv3_ib_client);
385c0dd49bdSEiji Ota 	rdsv3_ib_sysctl_exit();
386c0dd49bdSEiji Ota 	rdsv3_ib_recv_exit();
387c0dd49bdSEiji Ota 	rdsv3_trans_unregister(&rdsv3_ib_transport);
388*a530e0a9Sagiri 	kmem_free(rdsv3_ib_stats,
389*a530e0a9Sagiri 	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
390c0dd49bdSEiji Ota 	mutex_destroy(&ib_nodev_conns_lock);
391c0dd49bdSEiji Ota 	list_destroy(&ib_nodev_conns);
392c0dd49bdSEiji Ota 	list_destroy(&rdsv3_ib_devices);
393c0dd49bdSEiji Ota 
394c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rds_ib_exit", "Return");
395c0dd49bdSEiji Ota }
396c0dd49bdSEiji Ota 
397c0dd49bdSEiji Ota #ifndef __lock_lint
398c0dd49bdSEiji Ota struct rdsv3_transport rdsv3_ib_transport = {
399c0dd49bdSEiji Ota 	.laddr_check		= rds_ib_laddr_check,
400c0dd49bdSEiji Ota 	.xmit_complete		= rdsv3_ib_xmit_complete,
401c0dd49bdSEiji Ota 	.xmit			= rdsv3_ib_xmit,
402c0dd49bdSEiji Ota 	.xmit_cong_map		= NULL,
403c0dd49bdSEiji Ota 	.xmit_rdma		= rdsv3_ib_xmit_rdma,
404c0dd49bdSEiji Ota 	.recv			= rdsv3_ib_recv,
405c0dd49bdSEiji Ota 	.conn_alloc		= rdsv3_ib_conn_alloc,
406c0dd49bdSEiji Ota 	.conn_free		= rdsv3_ib_conn_free,
407c0dd49bdSEiji Ota 	.conn_connect		= rdsv3_ib_conn_connect,
408c0dd49bdSEiji Ota 	.conn_shutdown		= rdsv3_ib_conn_shutdown,
409c0dd49bdSEiji Ota 	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
410c0dd49bdSEiji Ota 	.inc_free		= rdsv3_ib_inc_free,
411c0dd49bdSEiji Ota 	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
412c0dd49bdSEiji Ota 	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
413c0dd49bdSEiji Ota 	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
414c0dd49bdSEiji Ota 	.stats_info_copy	= rdsv3_ib_stats_info_copy,
415c0dd49bdSEiji Ota 	.exit			= rdsv3_ib_exit,
416c0dd49bdSEiji Ota 	.get_mr			= rdsv3_ib_get_mr,
417c0dd49bdSEiji Ota 	.sync_mr		= rdsv3_ib_sync_mr,
418c0dd49bdSEiji Ota 	.free_mr		= rdsv3_ib_free_mr,
419c0dd49bdSEiji Ota 	.flush_mrs		= rdsv3_ib_flush_mrs,
420c0dd49bdSEiji Ota 	.t_name			= "infiniband",
421cadbfdc3SEiji Ota 	.t_type			= RDS_TRANS_IB
422c0dd49bdSEiji Ota };
423c0dd49bdSEiji Ota #else
424c0dd49bdSEiji Ota struct rdsv3_transport rdsv3_ib_transport;
425c0dd49bdSEiji Ota #endif
426c0dd49bdSEiji Ota 
427c0dd49bdSEiji Ota int
rdsv3_ib_init(void)428c0dd49bdSEiji Ota rdsv3_ib_init(void)
429c0dd49bdSEiji Ota {
430c0dd49bdSEiji Ota 	int ret;
431c0dd49bdSEiji Ota 
432c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rds_ib_init", "Enter");
433c0dd49bdSEiji Ota 
434c0dd49bdSEiji Ota 	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
435c0dd49bdSEiji Ota 	    offsetof(struct rdsv3_ib_device, list));
436c0dd49bdSEiji Ota 	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
437c0dd49bdSEiji Ota 	    offsetof(struct rdsv3_ib_connection, ib_node));
438c0dd49bdSEiji Ota 	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
439c0dd49bdSEiji Ota 
440*a530e0a9Sagiri 	/* allocate space for ib statistics */
441*a530e0a9Sagiri 	ASSERT(rdsv3_ib_stats == NULL);
442*a530e0a9Sagiri 	rdsv3_ib_stats = kmem_zalloc(nr_cpus *
443*a530e0a9Sagiri 	    sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
444*a530e0a9Sagiri 
445c0dd49bdSEiji Ota 	rdsv3_ib_client.dip = rdsv3_dev_info;
446c0dd49bdSEiji Ota 	ret = ib_register_client(&rdsv3_ib_client);
447c0dd49bdSEiji Ota 	if (ret)
448c0dd49bdSEiji Ota 		goto out;
449c0dd49bdSEiji Ota 
450c0dd49bdSEiji Ota 	ret = rdsv3_ib_sysctl_init();
451c0dd49bdSEiji Ota 	if (ret)
452c0dd49bdSEiji Ota 		goto out_ibreg;
453c0dd49bdSEiji Ota 
454c0dd49bdSEiji Ota 	ret = rdsv3_ib_recv_init();
455c0dd49bdSEiji Ota 	if (ret)
456c0dd49bdSEiji Ota 		goto out_sysctl;
457c0dd49bdSEiji Ota 
458c0dd49bdSEiji Ota 	ret = rdsv3_trans_register(&rdsv3_ib_transport);
459c0dd49bdSEiji Ota 	if (ret)
460c0dd49bdSEiji Ota 		goto out_recv;
461c0dd49bdSEiji Ota 
462fe817b60SEiji Ota 	rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
463c0dd49bdSEiji Ota 
464c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rds_ib_init", "Return");
465c0dd49bdSEiji Ota 
466c0dd49bdSEiji Ota 	return (0);
467c0dd49bdSEiji Ota 
468c0dd49bdSEiji Ota out_recv:
469c0dd49bdSEiji Ota 	rdsv3_ib_recv_exit();
470c0dd49bdSEiji Ota out_sysctl:
471c0dd49bdSEiji Ota 	rdsv3_ib_sysctl_exit();
472c0dd49bdSEiji Ota out_ibreg:
473c0dd49bdSEiji Ota 	ib_unregister_client(&rdsv3_ib_client);
474c0dd49bdSEiji Ota out:
475*a530e0a9Sagiri 	kmem_free(rdsv3_ib_stats,
476*a530e0a9Sagiri 	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
477c0dd49bdSEiji Ota 	mutex_destroy(&ib_nodev_conns_lock);
478c0dd49bdSEiji Ota 	list_destroy(&ib_nodev_conns);
479c0dd49bdSEiji Ota 	list_destroy(&rdsv3_ib_devices);
480c0dd49bdSEiji Ota 	return (ret);
481c0dd49bdSEiji Ota }
482