/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
412198SEiji.Ota@Sun.COM
/*
 * This file contains code imported from the OFED rds source file ib.c
 * Oracle elects to have and use the contents of ib.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */
1112763SGiri.Adari@Sun.COM
/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
4412198SEiji.Ota@Sun.COM #include <sys/sysmacros.h>
4512198SEiji.Ota@Sun.COM #include <sys/rds.h>
4612198SEiji.Ota@Sun.COM
4712198SEiji.Ota@Sun.COM #include <sys/ib/ibtl/ibti.h>
4812198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3.h>
4912198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/ib.h>
5012198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
5112198SEiji.Ota@Sun.COM
5212198SEiji.Ota@Sun.COM unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
5312198SEiji.Ota@Sun.COM
5412198SEiji.Ota@Sun.COM struct list rdsv3_ib_devices;
5512198SEiji.Ota@Sun.COM
5612198SEiji.Ota@Sun.COM /* NOTE: if also grabbing ibdev lock, grab this first */
5712198SEiji.Ota@Sun.COM kmutex_t ib_nodev_conns_lock;
5812198SEiji.Ota@Sun.COM list_t ib_nodev_conns;
5912198SEiji.Ota@Sun.COM
6012444SGiri.Adari@Sun.COM extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
6112444SGiri.Adari@Sun.COM extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
6212444SGiri.Adari@Sun.COM
6312198SEiji.Ota@Sun.COM void
rdsv3_ib_add_one(ib_device_t * device)6412198SEiji.Ota@Sun.COM rdsv3_ib_add_one(ib_device_t *device)
6512198SEiji.Ota@Sun.COM {
6612198SEiji.Ota@Sun.COM struct rdsv3_ib_device *rds_ibdev;
6712198SEiji.Ota@Sun.COM ibt_hca_attr_t *dev_attr;
6812444SGiri.Adari@Sun.COM char name[64];
6912198SEiji.Ota@Sun.COM
7012580SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
7112198SEiji.Ota@Sun.COM
7212198SEiji.Ota@Sun.COM /* Only handle IB (no iWARP) devices */
7312198SEiji.Ota@Sun.COM if (device->node_type != RDMA_NODE_IB_CA)
7412198SEiji.Ota@Sun.COM return;
7512198SEiji.Ota@Sun.COM
7612198SEiji.Ota@Sun.COM dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
7712198SEiji.Ota@Sun.COM KM_NOSLEEP);
7812198SEiji.Ota@Sun.COM if (!dev_attr)
7912198SEiji.Ota@Sun.COM return;
8012198SEiji.Ota@Sun.COM
8112198SEiji.Ota@Sun.COM if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
8212444SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_add_one",
8312198SEiji.Ota@Sun.COM "Query device failed for %s", device->name);
8412198SEiji.Ota@Sun.COM goto free_attr;
8512198SEiji.Ota@Sun.COM }
8612198SEiji.Ota@Sun.COM
8712198SEiji.Ota@Sun.COM /* We depend on Reserved Lkey */
8812198SEiji.Ota@Sun.COM if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
8912444SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_add_one",
9012198SEiji.Ota@Sun.COM "Reserved Lkey support is required: %s",
9112198SEiji.Ota@Sun.COM device->name);
9212198SEiji.Ota@Sun.COM goto free_attr;
9312198SEiji.Ota@Sun.COM }
9412198SEiji.Ota@Sun.COM
9512198SEiji.Ota@Sun.COM rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
9612198SEiji.Ota@Sun.COM if (!rds_ibdev)
9712198SEiji.Ota@Sun.COM goto free_attr;
9812198SEiji.Ota@Sun.COM
9912676SEiji.Ota@Sun.COM rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
10012676SEiji.Ota@Sun.COM rds_ibdev->hca_attr = *dev_attr;
10112676SEiji.Ota@Sun.COM
10212676SEiji.Ota@Sun.COM rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
10312198SEiji.Ota@Sun.COM mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
10412198SEiji.Ota@Sun.COM
10512198SEiji.Ota@Sun.COM rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
10612198SEiji.Ota@Sun.COM rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
10712198SEiji.Ota@Sun.COM
10812676SEiji.Ota@Sun.COM rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
10912676SEiji.Ota@Sun.COM rds_ibdev->max_responder_resources =
11012676SEiji.Ota@Sun.COM (uint_t)dev_attr->hca_max_rdma_in_qp;
11112676SEiji.Ota@Sun.COM
11212198SEiji.Ota@Sun.COM rds_ibdev->dev = device;
11312198SEiji.Ota@Sun.COM rds_ibdev->pd = ib_alloc_pd(device);
11412198SEiji.Ota@Sun.COM if (IS_ERR(rds_ibdev->pd))
11512198SEiji.Ota@Sun.COM goto free_dev;
11612198SEiji.Ota@Sun.COM
11712198SEiji.Ota@Sun.COM if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
11812198SEiji.Ota@Sun.COM goto free_dev;
11912198SEiji.Ota@Sun.COM }
12012198SEiji.Ota@Sun.COM
12112676SEiji.Ota@Sun.COM if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
12212676SEiji.Ota@Sun.COM rdsv3_ib_destroy_mr_pool(rds_ibdev);
12312676SEiji.Ota@Sun.COM goto free_dev;
12412676SEiji.Ota@Sun.COM }
12512676SEiji.Ota@Sun.COM
12612444SGiri.Adari@Sun.COM (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
12712444SGiri.Adari@Sun.COM (longlong_t)htonll(dev_attr->hca_node_guid));
12812444SGiri.Adari@Sun.COM rds_ibdev->ib_frag_slab = kmem_cache_create(name,
12912444SGiri.Adari@Sun.COM sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
13012444SGiri.Adari@Sun.COM rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
13112444SGiri.Adari@Sun.COM if (rds_ibdev->ib_frag_slab == NULL) {
13212444SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_add_one",
13312444SGiri.Adari@Sun.COM "kmem_cache_create for ib_frag_slab failed for device: %s",
13412444SGiri.Adari@Sun.COM device->name);
13512444SGiri.Adari@Sun.COM rdsv3_ib_destroy_mr_pool(rds_ibdev);
13612676SEiji.Ota@Sun.COM rdsv3_ib_destroy_inc_pool(rds_ibdev);
13712444SGiri.Adari@Sun.COM goto free_dev;
13812444SGiri.Adari@Sun.COM }
13912444SGiri.Adari@Sun.COM
14012676SEiji.Ota@Sun.COM rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
14112676SEiji.Ota@Sun.COM (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
14212676SEiji.Ota@Sun.COM if (rds_ibdev->aft_hcagp == NULL) {
14312676SEiji.Ota@Sun.COM rdsv3_ib_destroy_mr_pool(rds_ibdev);
14412676SEiji.Ota@Sun.COM rdsv3_ib_destroy_inc_pool(rds_ibdev);
14512676SEiji.Ota@Sun.COM kmem_cache_destroy(rds_ibdev->ib_frag_slab);
14612676SEiji.Ota@Sun.COM goto free_dev;
14712676SEiji.Ota@Sun.COM }
14812676SEiji.Ota@Sun.COM rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
14912676SEiji.Ota@Sun.COM (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
15012676SEiji.Ota@Sun.COM rds_ibdev->aft_hcagp);
15112676SEiji.Ota@Sun.COM if (rds_ibdev->fmr_soft_cq == NULL) {
15212676SEiji.Ota@Sun.COM rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
15312676SEiji.Ota@Sun.COM rdsv3_ib_destroy_mr_pool(rds_ibdev);
15412676SEiji.Ota@Sun.COM rdsv3_ib_destroy_inc_pool(rds_ibdev);
15512676SEiji.Ota@Sun.COM kmem_cache_destroy(rds_ibdev->ib_frag_slab);
15612676SEiji.Ota@Sun.COM goto free_dev;
15712676SEiji.Ota@Sun.COM }
15812676SEiji.Ota@Sun.COM
15912676SEiji.Ota@Sun.COM rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
16012676SEiji.Ota@Sun.COM (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
16112676SEiji.Ota@Sun.COM rds_ibdev->aft_hcagp);
16212676SEiji.Ota@Sun.COM if (rds_ibdev->inc_soft_cq == NULL) {
16312676SEiji.Ota@Sun.COM rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
16412676SEiji.Ota@Sun.COM rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
16512676SEiji.Ota@Sun.COM rdsv3_ib_destroy_mr_pool(rds_ibdev);
16612676SEiji.Ota@Sun.COM rdsv3_ib_destroy_inc_pool(rds_ibdev);
16712676SEiji.Ota@Sun.COM kmem_cache_destroy(rds_ibdev->ib_frag_slab);
16812676SEiji.Ota@Sun.COM goto free_dev;
16912676SEiji.Ota@Sun.COM }
17012444SGiri.Adari@Sun.COM
17112198SEiji.Ota@Sun.COM list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
17212198SEiji.Ota@Sun.COM offsetof(struct rdsv3_ib_ipaddr, list));
17312198SEiji.Ota@Sun.COM list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
17412198SEiji.Ota@Sun.COM offsetof(struct rdsv3_ib_connection, ib_node));
17512198SEiji.Ota@Sun.COM
17612198SEiji.Ota@Sun.COM list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
17712198SEiji.Ota@Sun.COM
17812198SEiji.Ota@Sun.COM ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
17912198SEiji.Ota@Sun.COM
18012580SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
18112198SEiji.Ota@Sun.COM
18212198SEiji.Ota@Sun.COM goto free_attr;
18312198SEiji.Ota@Sun.COM
18412198SEiji.Ota@Sun.COM err_pd:
18512198SEiji.Ota@Sun.COM (void) ib_dealloc_pd(rds_ibdev->pd);
18612198SEiji.Ota@Sun.COM free_dev:
18712676SEiji.Ota@Sun.COM mutex_destroy(&rds_ibdev->spinlock);
18812676SEiji.Ota@Sun.COM rw_destroy(&rds_ibdev->rwlock);
18912198SEiji.Ota@Sun.COM kmem_free(rds_ibdev, sizeof (*rds_ibdev));
19012198SEiji.Ota@Sun.COM free_attr:
19112198SEiji.Ota@Sun.COM kmem_free(dev_attr, sizeof (*dev_attr));
19212198SEiji.Ota@Sun.COM }
19312198SEiji.Ota@Sun.COM
19412198SEiji.Ota@Sun.COM void
rdsv3_ib_remove_one(struct ib_device * device)19512198SEiji.Ota@Sun.COM rdsv3_ib_remove_one(struct ib_device *device)
19612198SEiji.Ota@Sun.COM {
19712198SEiji.Ota@Sun.COM struct rdsv3_ib_device *rds_ibdev;
19812198SEiji.Ota@Sun.COM struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
19912198SEiji.Ota@Sun.COM
20012580SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
20112198SEiji.Ota@Sun.COM
20212198SEiji.Ota@Sun.COM rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
20312198SEiji.Ota@Sun.COM if (!rds_ibdev)
20412198SEiji.Ota@Sun.COM return;
20512198SEiji.Ota@Sun.COM
20612198SEiji.Ota@Sun.COM RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
20712198SEiji.Ota@Sun.COM list) {
20812198SEiji.Ota@Sun.COM list_remove_node(&i_ipaddr->list);
20912198SEiji.Ota@Sun.COM kmem_free(i_ipaddr, sizeof (*i_ipaddr));
21012198SEiji.Ota@Sun.COM }
21112198SEiji.Ota@Sun.COM
21212198SEiji.Ota@Sun.COM rdsv3_ib_destroy_conns(rds_ibdev);
21312198SEiji.Ota@Sun.COM
21412676SEiji.Ota@Sun.COM if (rds_ibdev->fmr_soft_cq)
21512676SEiji.Ota@Sun.COM rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
21612676SEiji.Ota@Sun.COM if (rds_ibdev->inc_soft_cq)
21712676SEiji.Ota@Sun.COM rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
21812676SEiji.Ota@Sun.COM
21912198SEiji.Ota@Sun.COM rdsv3_ib_destroy_mr_pool(rds_ibdev);
22012676SEiji.Ota@Sun.COM rdsv3_ib_destroy_inc_pool(rds_ibdev);
22112198SEiji.Ota@Sun.COM
22212444SGiri.Adari@Sun.COM kmem_cache_destroy(rds_ibdev->ib_frag_slab);
22312444SGiri.Adari@Sun.COM
22412676SEiji.Ota@Sun.COM rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
22512676SEiji.Ota@Sun.COM
22612198SEiji.Ota@Sun.COM #if 0
22712198SEiji.Ota@Sun.COM while (ib_dealloc_pd(rds_ibdev->pd)) {
22812198SEiji.Ota@Sun.COM #ifndef __lock_lint
22912198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rdsv3_ib_remove_one",
23012198SEiji.Ota@Sun.COM "%s-%d Failed to dealloc pd %p",
23112198SEiji.Ota@Sun.COM __func__, __LINE__, rds_ibdev->pd);
23212198SEiji.Ota@Sun.COM #endif
23312198SEiji.Ota@Sun.COM delay(drv_usectohz(1000));
23412198SEiji.Ota@Sun.COM }
23512198SEiji.Ota@Sun.COM #else
23612198SEiji.Ota@Sun.COM if (ib_dealloc_pd(rds_ibdev->pd)) {
23712198SEiji.Ota@Sun.COM #ifndef __lock_lint
23812198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_remove_one",
23912414SEiji.Ota@Sun.COM "Failed to dealloc pd %p\n", rds_ibdev->pd);
24012198SEiji.Ota@Sun.COM #endif
24112198SEiji.Ota@Sun.COM }
24212198SEiji.Ota@Sun.COM #endif
24312198SEiji.Ota@Sun.COM
24412198SEiji.Ota@Sun.COM list_destroy(&rds_ibdev->ipaddr_list);
24512198SEiji.Ota@Sun.COM list_destroy(&rds_ibdev->conn_list);
24612198SEiji.Ota@Sun.COM list_remove_node(&rds_ibdev->list);
24712676SEiji.Ota@Sun.COM mutex_destroy(&rds_ibdev->spinlock);
24812676SEiji.Ota@Sun.COM rw_destroy(&rds_ibdev->rwlock);
24912198SEiji.Ota@Sun.COM kmem_free(rds_ibdev, sizeof (*rds_ibdev));
25012198SEiji.Ota@Sun.COM
25112580SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
25212198SEiji.Ota@Sun.COM }
25312198SEiji.Ota@Sun.COM
25412198SEiji.Ota@Sun.COM #ifndef __lock_lint
25512198SEiji.Ota@Sun.COM struct ib_client rdsv3_ib_client = {
25612198SEiji.Ota@Sun.COM .name = "rdsv3_ib",
25712198SEiji.Ota@Sun.COM .add = rdsv3_ib_add_one,
25812198SEiji.Ota@Sun.COM .remove = rdsv3_ib_remove_one,
25912198SEiji.Ota@Sun.COM .clnt_hdl = NULL,
26012198SEiji.Ota@Sun.COM .state = IB_CLNT_UNINITIALIZED
26112198SEiji.Ota@Sun.COM };
26212198SEiji.Ota@Sun.COM #else
26312198SEiji.Ota@Sun.COM struct ib_client rdsv3_ib_client = {
26412198SEiji.Ota@Sun.COM "rdsv3_ib",
26512198SEiji.Ota@Sun.COM rdsv3_ib_add_one,
26612198SEiji.Ota@Sun.COM rdsv3_ib_remove_one,
26712198SEiji.Ota@Sun.COM NULL,
26812198SEiji.Ota@Sun.COM NULL,
26912198SEiji.Ota@Sun.COM IB_CLNT_UNINITIALIZED
27012198SEiji.Ota@Sun.COM };
27112198SEiji.Ota@Sun.COM #endif
27212198SEiji.Ota@Sun.COM
27312198SEiji.Ota@Sun.COM static int
rds_ib_conn_info_visitor(struct rdsv3_connection * conn,void * buffer)27412198SEiji.Ota@Sun.COM rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
27512198SEiji.Ota@Sun.COM void *buffer)
27612198SEiji.Ota@Sun.COM {
27712863SEiji.Ota@Sun.COM struct rds_info_rdma_connection *iinfo = buffer;
27812198SEiji.Ota@Sun.COM struct rdsv3_ib_connection *ic;
27912198SEiji.Ota@Sun.COM
28012198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
28112198SEiji.Ota@Sun.COM conn, buffer);
28212198SEiji.Ota@Sun.COM
28312198SEiji.Ota@Sun.COM /* We will only ever look at IB transports */
28412198SEiji.Ota@Sun.COM if (conn->c_trans != &rdsv3_ib_transport)
28512198SEiji.Ota@Sun.COM return (0);
28612198SEiji.Ota@Sun.COM
28712198SEiji.Ota@Sun.COM iinfo->src_addr = conn->c_laddr;
28812198SEiji.Ota@Sun.COM iinfo->dst_addr = conn->c_faddr;
28912198SEiji.Ota@Sun.COM
29012198SEiji.Ota@Sun.COM (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
29112198SEiji.Ota@Sun.COM (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
29212198SEiji.Ota@Sun.COM if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
29312198SEiji.Ota@Sun.COM struct rdsv3_ib_device *rds_ibdev;
29412198SEiji.Ota@Sun.COM struct rdma_dev_addr *dev_addr;
29512198SEiji.Ota@Sun.COM
29612198SEiji.Ota@Sun.COM ic = conn->c_transport_data;
29712198SEiji.Ota@Sun.COM dev_addr = &ic->i_cm_id->route.addr.dev_addr;
29812198SEiji.Ota@Sun.COM
29912198SEiji.Ota@Sun.COM ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
30012198SEiji.Ota@Sun.COM ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
30112198SEiji.Ota@Sun.COM
30212198SEiji.Ota@Sun.COM rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
30312198SEiji.Ota@Sun.COM &rdsv3_ib_client);
30412198SEiji.Ota@Sun.COM iinfo->max_send_wr = ic->i_send_ring.w_nr;
30512198SEiji.Ota@Sun.COM iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
30612198SEiji.Ota@Sun.COM iinfo->max_send_sge = rds_ibdev->max_sge;
30712198SEiji.Ota@Sun.COM }
30812198SEiji.Ota@Sun.COM
30912198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
31012198SEiji.Ota@Sun.COM conn, buffer);
31112198SEiji.Ota@Sun.COM return (1);
31212198SEiji.Ota@Sun.COM }
31312198SEiji.Ota@Sun.COM
31412198SEiji.Ota@Sun.COM static void
rds_ib_ic_info(struct rsock * sock,unsigned int len,struct rdsv3_info_iterator * iter,struct rdsv3_info_lengths * lens)31512198SEiji.Ota@Sun.COM rds_ib_ic_info(struct rsock *sock, unsigned int len,
31612198SEiji.Ota@Sun.COM struct rdsv3_info_iterator *iter,
31712198SEiji.Ota@Sun.COM struct rdsv3_info_lengths *lens)
31812198SEiji.Ota@Sun.COM {
31912198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
32012198SEiji.Ota@Sun.COM sock, iter, lens, len);
32112198SEiji.Ota@Sun.COM
32212198SEiji.Ota@Sun.COM rdsv3_for_each_conn_info(sock, len, iter, lens,
32312198SEiji.Ota@Sun.COM rds_ib_conn_info_visitor,
32412863SEiji.Ota@Sun.COM sizeof (struct rds_info_rdma_connection));
32512198SEiji.Ota@Sun.COM }
32612198SEiji.Ota@Sun.COM
32712198SEiji.Ota@Sun.COM /*
32812198SEiji.Ota@Sun.COM * Early RDS/IB was built to only bind to an address if there is an IPoIB
32912198SEiji.Ota@Sun.COM * device with that address set.
33012198SEiji.Ota@Sun.COM *
33112198SEiji.Ota@Sun.COM * If it were me, I'd advocate for something more flexible. Sending and
33212198SEiji.Ota@Sun.COM * receiving should be device-agnostic. Transports would try and maintain
33312198SEiji.Ota@Sun.COM * connections between peers who have messages queued. Userspace would be
33412198SEiji.Ota@Sun.COM * allowed to influence which paths have priority. We could call userspace
33512198SEiji.Ota@Sun.COM * asserting this policy "routing".
33612198SEiji.Ota@Sun.COM */
33712198SEiji.Ota@Sun.COM static int
rds_ib_laddr_check(uint32_be_t addr)33812198SEiji.Ota@Sun.COM rds_ib_laddr_check(uint32_be_t addr)
33912198SEiji.Ota@Sun.COM {
34012198SEiji.Ota@Sun.COM int ret;
34112198SEiji.Ota@Sun.COM struct rdma_cm_id *cm_id;
34212198SEiji.Ota@Sun.COM struct sockaddr_in sin;
34312198SEiji.Ota@Sun.COM
34412198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
34512198SEiji.Ota@Sun.COM
34612198SEiji.Ota@Sun.COM /*
34712198SEiji.Ota@Sun.COM * Create a CMA ID and try to bind it. This catches both
34812198SEiji.Ota@Sun.COM * IB and iWARP capable NICs.
34912198SEiji.Ota@Sun.COM */
35012198SEiji.Ota@Sun.COM cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
35112414SEiji.Ota@Sun.COM if (!cm_id)
35212414SEiji.Ota@Sun.COM return (-EADDRNOTAVAIL);
35312198SEiji.Ota@Sun.COM
35412198SEiji.Ota@Sun.COM (void) memset(&sin, 0, sizeof (sin));
35512198SEiji.Ota@Sun.COM sin.sin_family = AF_INET;
35612198SEiji.Ota@Sun.COM sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
35712198SEiji.Ota@Sun.COM
35812198SEiji.Ota@Sun.COM /* rdma_bind_addr will only succeed for IB & iWARP devices */
35912198SEiji.Ota@Sun.COM ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
36012198SEiji.Ota@Sun.COM /*
36112198SEiji.Ota@Sun.COM * due to this, we will claim to support iWARP devices unless we
36212198SEiji.Ota@Sun.COM * check node_type.
36312198SEiji.Ota@Sun.COM */
36412198SEiji.Ota@Sun.COM if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
36512198SEiji.Ota@Sun.COM ret = -EADDRNOTAVAIL;
36612198SEiji.Ota@Sun.COM
36712198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rds_ib_laddr_check",
36812198SEiji.Ota@Sun.COM "addr %u.%u.%u.%u ret %d node type %d",
36912198SEiji.Ota@Sun.COM NIPQUAD(addr), ret,
37012198SEiji.Ota@Sun.COM cm_id->device ? cm_id->device->node_type : -1);
37112198SEiji.Ota@Sun.COM
37212198SEiji.Ota@Sun.COM rdma_destroy_id(cm_id);
37312198SEiji.Ota@Sun.COM
37412198SEiji.Ota@Sun.COM return (ret);
37512198SEiji.Ota@Sun.COM }
37612198SEiji.Ota@Sun.COM
37712198SEiji.Ota@Sun.COM void
rdsv3_ib_exit(void)37812198SEiji.Ota@Sun.COM rdsv3_ib_exit(void)
37912198SEiji.Ota@Sun.COM {
38012198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rds_ib_exit", "Enter");
38112198SEiji.Ota@Sun.COM
38212863SEiji.Ota@Sun.COM rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
38312198SEiji.Ota@Sun.COM rdsv3_ib_destroy_nodev_conns();
38412198SEiji.Ota@Sun.COM ib_unregister_client(&rdsv3_ib_client);
38512198SEiji.Ota@Sun.COM rdsv3_ib_sysctl_exit();
38612198SEiji.Ota@Sun.COM rdsv3_ib_recv_exit();
38712198SEiji.Ota@Sun.COM rdsv3_trans_unregister(&rdsv3_ib_transport);
388*12922SGiri.Adari@Sun.COM kmem_free(rdsv3_ib_stats,
389*12922SGiri.Adari@Sun.COM nr_cpus * sizeof (struct rdsv3_ib_statistics));
39012198SEiji.Ota@Sun.COM mutex_destroy(&ib_nodev_conns_lock);
39112198SEiji.Ota@Sun.COM list_destroy(&ib_nodev_conns);
39212198SEiji.Ota@Sun.COM list_destroy(&rdsv3_ib_devices);
39312198SEiji.Ota@Sun.COM
39412198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rds_ib_exit", "Return");
39512198SEiji.Ota@Sun.COM }
39612198SEiji.Ota@Sun.COM
39712198SEiji.Ota@Sun.COM #ifndef __lock_lint
39812198SEiji.Ota@Sun.COM struct rdsv3_transport rdsv3_ib_transport = {
39912198SEiji.Ota@Sun.COM .laddr_check = rds_ib_laddr_check,
40012198SEiji.Ota@Sun.COM .xmit_complete = rdsv3_ib_xmit_complete,
40112198SEiji.Ota@Sun.COM .xmit = rdsv3_ib_xmit,
40212198SEiji.Ota@Sun.COM .xmit_cong_map = NULL,
40312198SEiji.Ota@Sun.COM .xmit_rdma = rdsv3_ib_xmit_rdma,
40412198SEiji.Ota@Sun.COM .recv = rdsv3_ib_recv,
40512198SEiji.Ota@Sun.COM .conn_alloc = rdsv3_ib_conn_alloc,
40612198SEiji.Ota@Sun.COM .conn_free = rdsv3_ib_conn_free,
40712198SEiji.Ota@Sun.COM .conn_connect = rdsv3_ib_conn_connect,
40812198SEiji.Ota@Sun.COM .conn_shutdown = rdsv3_ib_conn_shutdown,
40912198SEiji.Ota@Sun.COM .inc_copy_to_user = rdsv3_ib_inc_copy_to_user,
41012198SEiji.Ota@Sun.COM .inc_free = rdsv3_ib_inc_free,
41112198SEiji.Ota@Sun.COM .cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
41212198SEiji.Ota@Sun.COM .cm_handle_connect = rdsv3_ib_cm_handle_connect,
41312198SEiji.Ota@Sun.COM .cm_connect_complete = rdsv3_ib_cm_connect_complete,
41412198SEiji.Ota@Sun.COM .stats_info_copy = rdsv3_ib_stats_info_copy,
41512198SEiji.Ota@Sun.COM .exit = rdsv3_ib_exit,
41612198SEiji.Ota@Sun.COM .get_mr = rdsv3_ib_get_mr,
41712198SEiji.Ota@Sun.COM .sync_mr = rdsv3_ib_sync_mr,
41812198SEiji.Ota@Sun.COM .free_mr = rdsv3_ib_free_mr,
41912198SEiji.Ota@Sun.COM .flush_mrs = rdsv3_ib_flush_mrs,
42012198SEiji.Ota@Sun.COM .t_name = "infiniband",
42112414SEiji.Ota@Sun.COM .t_type = RDS_TRANS_IB
42212198SEiji.Ota@Sun.COM };
42312198SEiji.Ota@Sun.COM #else
42412198SEiji.Ota@Sun.COM struct rdsv3_transport rdsv3_ib_transport;
42512198SEiji.Ota@Sun.COM #endif
42612198SEiji.Ota@Sun.COM
42712198SEiji.Ota@Sun.COM int
rdsv3_ib_init(void)42812198SEiji.Ota@Sun.COM rdsv3_ib_init(void)
42912198SEiji.Ota@Sun.COM {
43012198SEiji.Ota@Sun.COM int ret;
43112198SEiji.Ota@Sun.COM
43212198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rds_ib_init", "Enter");
43312198SEiji.Ota@Sun.COM
43412198SEiji.Ota@Sun.COM list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
43512198SEiji.Ota@Sun.COM offsetof(struct rdsv3_ib_device, list));
43612198SEiji.Ota@Sun.COM list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
43712198SEiji.Ota@Sun.COM offsetof(struct rdsv3_ib_connection, ib_node));
43812198SEiji.Ota@Sun.COM mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
43912198SEiji.Ota@Sun.COM
440*12922SGiri.Adari@Sun.COM /* allocate space for ib statistics */
441*12922SGiri.Adari@Sun.COM ASSERT(rdsv3_ib_stats == NULL);
442*12922SGiri.Adari@Sun.COM rdsv3_ib_stats = kmem_zalloc(nr_cpus *
443*12922SGiri.Adari@Sun.COM sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
444*12922SGiri.Adari@Sun.COM
44512198SEiji.Ota@Sun.COM rdsv3_ib_client.dip = rdsv3_dev_info;
44612198SEiji.Ota@Sun.COM ret = ib_register_client(&rdsv3_ib_client);
44712198SEiji.Ota@Sun.COM if (ret)
44812198SEiji.Ota@Sun.COM goto out;
44912198SEiji.Ota@Sun.COM
45012198SEiji.Ota@Sun.COM ret = rdsv3_ib_sysctl_init();
45112198SEiji.Ota@Sun.COM if (ret)
45212198SEiji.Ota@Sun.COM goto out_ibreg;
45312198SEiji.Ota@Sun.COM
45412198SEiji.Ota@Sun.COM ret = rdsv3_ib_recv_init();
45512198SEiji.Ota@Sun.COM if (ret)
45612198SEiji.Ota@Sun.COM goto out_sysctl;
45712198SEiji.Ota@Sun.COM
45812198SEiji.Ota@Sun.COM ret = rdsv3_trans_register(&rdsv3_ib_transport);
45912198SEiji.Ota@Sun.COM if (ret)
46012198SEiji.Ota@Sun.COM goto out_recv;
46112198SEiji.Ota@Sun.COM
46212863SEiji.Ota@Sun.COM rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
46312198SEiji.Ota@Sun.COM
46412198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rds_ib_init", "Return");
46512198SEiji.Ota@Sun.COM
46612198SEiji.Ota@Sun.COM return (0);
46712198SEiji.Ota@Sun.COM
46812198SEiji.Ota@Sun.COM out_recv:
46912198SEiji.Ota@Sun.COM rdsv3_ib_recv_exit();
47012198SEiji.Ota@Sun.COM out_sysctl:
47112198SEiji.Ota@Sun.COM rdsv3_ib_sysctl_exit();
47212198SEiji.Ota@Sun.COM out_ibreg:
47312198SEiji.Ota@Sun.COM ib_unregister_client(&rdsv3_ib_client);
47412198SEiji.Ota@Sun.COM out:
475*12922SGiri.Adari@Sun.COM kmem_free(rdsv3_ib_stats,
476*12922SGiri.Adari@Sun.COM nr_cpus * sizeof (struct rdsv3_ib_statistics));
47712198SEiji.Ota@Sun.COM mutex_destroy(&ib_nodev_conns_lock);
47812198SEiji.Ota@Sun.COM list_destroy(&ib_nodev_conns);
47912198SEiji.Ota@Sun.COM list_destroy(&rdsv3_ib_devices);
48012198SEiji.Ota@Sun.COM return (ret);
48112198SEiji.Ota@Sun.COM }
482