/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file rdma.c
 * Oracle elects to have and use the contents of rdma.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2007 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/ib/clients/of/rdma/ib_verbs.h>
#include <sys/ib/clients/of/rdma/ib_addr.h>
#include <sys/ib/clients/of/rdma/rdma_cm.h>

#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdma.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

#define	DMA_TO_DEVICE 0
#define	DMA_FROM_DEVICE 1
#define	RB_CLEAR_NODE(nodep) AVL_SETPARENT(nodep, nodep)

/*
 * XXX
 *  - build with sparse
 *  - should we limit the size of a mr region?  let transport return failure?
 *  - should we detect duplicate keys on a socket?  hmm.
 *  - an rdma is an mlock, apply rlimit?
 */

/*
 * get the number of pages by looking at the page indices that the start and
 * end addresses fall in.
 *
 * Returns 0 if the vec is invalid.  It is invalid if the number of bytes
 * causes the address to wrap or overflows an unsigned int.  This comes
 * from being stored in the 'length' member of 'struct rdsv3_scatterlist'.
 */
static unsigned int
rdsv3_pages_in_vec(struct rds_iovec *vec)
{
	if ((vec->addr + vec->bytes <= vec->addr) ||
	    (vec->bytes > (uint64_t)UINT_MAX)) {
		return (0);
	}

	return (((vec->addr + vec->bytes + PAGESIZE - 1) >>
	    PAGESHIFT) - (vec->addr >> PAGESHIFT));
}

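/*
 * Editorial worked example, not part of the original source: assuming, for
 * illustration only, 4 KB pages (PAGESIZE = 0x1000, PAGESHIFT = 12), a vec
 * with addr = 0x1ff0 and bytes = 0x40 touches page 1 (0x1000-0x1fff) and
 * page 2 (0x2000-0x2fff), and the expression above yields
 *
 *	((0x1ff0 + 0x40 + 0xfff) >> 12) - (0x1ff0 >> 12)
 *	    = (0x302f >> 12) - (0x1ff0 >> 12) = 3 - 1 = 2 pages
 *
 * i.e. one past the last touched page index minus the first page index.
 */
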
static struct rdsv3_mr *
rdsv3_mr_tree_walk(struct avl_tree *root, uint32_t key,
	struct rdsv3_mr *insert)
{
	struct rdsv3_mr *mr;
	avl_index_t where;

	mr = avl_find(root, &key, &where);
	if ((mr == NULL) && (insert != NULL)) {
		avl_insert(root, (void *)insert, where);
		atomic_add_32(&insert->r_refcount, 1);
		return (NULL);
	}

	return (mr);
}

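/*
 * Editorial note, not in the original source: rdsv3_mr_tree_walk() folds
 * "lookup" and "insert if absent" into one call.  A sketch of the two ways
 * the callers below use it, under the rs_rdma_lock they already hold:
 *
 *	found = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, key, NULL);  lookup only
 *	found = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, key, mr);    insert mr
 *							            if key free
 *
 * In the insert form, a NULL return means the MR was added to the tree (and
 * its refcount bumped); a non-NULL return is the entry already present under
 * that key, and nothing was inserted.
 */
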
/*
 * Destroy the transport-specific part of an MR.
 */
static void
rdsv3_destroy_mr(struct rdsv3_mr *mr)
{
	struct rdsv3_sock *rs = mr->r_sock;
	void *trans_private = NULL;
	avl_node_t *np;

	RDSV3_DPRINTF5("rdsv3_destroy_mr",
	    "RDS: destroy mr key is %x refcnt %u",
	    mr->r_key, atomic_get(&mr->r_refcount));

	if (test_and_set_bit(RDSV3_MR_DEAD, &mr->r_state))
		return;

	mutex_enter(&rs->rs_rdma_lock);
	np = &mr->r_rb_node;
	if (AVL_XPARENT(np) != np)
		avl_remove(&rs->rs_rdma_keys, mr);
	trans_private = mr->r_trans_private;
	mr->r_trans_private = NULL;
	mutex_exit(&rs->rs_rdma_lock);

	if (trans_private)
		mr->r_trans->free_mr(trans_private, mr->r_invalidate);
}

void
__rdsv3_put_mr_final(struct rdsv3_mr *mr)
{
	rdsv3_destroy_mr(mr);
	kmem_free(mr, sizeof (*mr));
}

/*
 * By the time this is called we can't have any more ioctls called on
 * the socket so we don't need to worry about racing with others.
 */
void
rdsv3_rdma_drop_keys(struct rdsv3_sock *rs)
{
	struct rdsv3_mr *mr;
	struct avl_node *node;

	/* Release any MRs associated with this socket */
	mutex_enter(&rs->rs_rdma_lock);
	while ((node = avl_first(&rs->rs_rdma_keys))) {
		mr = container_of(node, struct rdsv3_mr, r_rb_node);
		if (mr->r_trans == rs->rs_transport)
			mr->r_invalidate = 0;
		avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
		RB_CLEAR_NODE(&mr->r_rb_node);
		mutex_exit(&rs->rs_rdma_lock);
		rdsv3_destroy_mr(mr);
		rdsv3_mr_put(mr);
		mutex_enter(&rs->rs_rdma_lock);
	}
	mutex_exit(&rs->rs_rdma_lock);

	if (rs->rs_transport && rs->rs_transport->flush_mrs)
		rs->rs_transport->flush_mrs();
}

static int
__rdsv3_rdma_map(struct rdsv3_sock *rs, struct rds_get_mr_args *args,
	uint64_t *cookie_ret, struct rdsv3_mr **mr_ret)
{
	struct rdsv3_mr *mr = NULL, *found;
	void *trans_private;
	rds_rdma_cookie_t cookie;
	unsigned int nents = 0;
	int ret;

	if (rs->rs_bound_addr == 0) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	if (!rs->rs_transport->get_mr) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	mr = kmem_zalloc(sizeof (struct rdsv3_mr), KM_NOSLEEP);
	if (!mr) {
		ret = -ENOMEM;
		goto out;
	}

	mr->r_refcount = 1;
	RB_CLEAR_NODE(&mr->r_rb_node);
	mr->r_trans = rs->rs_transport;
	mr->r_sock = rs;

	if (args->flags & RDS_RDMA_USE_ONCE)
		mr->r_use_once = 1;
	if (args->flags & RDS_RDMA_INVALIDATE)
		mr->r_invalidate = 1;
	if (args->flags & RDS_RDMA_READWRITE)
		mr->r_write = 1;

	/*
	 * Obtain a transport specific MR. If this succeeds, the
	 * s/g list is now owned by the MR.
	 * Note that dma_map() implies that pending writes are
	 * flushed to RAM, so no dma_sync is needed here.
	 */
	trans_private = rs->rs_transport->get_mr(&args->vec, nents, rs,
	    &mr->r_key);

	if (IS_ERR(trans_private)) {
		ret = PTR_ERR(trans_private);
		goto out;
	}

	mr->r_trans_private = trans_private;

	/*
	 * The user may pass us an unaligned address, but we can only
	 * map page aligned regions. So we keep the offset, and build
	 * a 64bit cookie containing <R_Key, offset> and pass that
	 * around.
	 */
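	/*
	 * Editorial sketch, not part of the original file: the cookie
	 * helpers (rdsv3_rdma_make_cookie, rdsv3_rdma_cookie_key,
	 * rdsv3_rdma_cookie_offset) are defined elsewhere (rdma.h).  A
	 * plausible packing, assuming the R_Key occupies the low 32 bits
	 * and the page offset the high 32 bits, would be:
	 *
	 *	cookie = (uint64_t)offset << 32 | r_key;
	 *	r_key  = (uint32_t)cookie;
	 *	offset = (uint32_t)(cookie >> 32);
	 *
	 * Only the helpers' round-trip behavior matters here: the peer
	 * hands the cookie back unchanged, and this code recovers the
	 * <R_Key, offset> pair from it.
	 */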
	cookie = rdsv3_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGEMASK);
	if (cookie_ret)
		*cookie_ret = cookie;

	/*
	 * copy value of cookie to user address at args->cookie_addr
	 */
	if (args->cookie_addr) {
		ret = ddi_copyout((void *)&cookie,
		    (void *)((intptr_t)args->cookie_addr),
		    sizeof (rds_rdma_cookie_t), 0);
		if (ret != 0) {
			ret = -EFAULT;
			goto out;
		}
	}

	RDSV3_DPRINTF5("__rdsv3_rdma_map",
	    "RDS: get_mr mr 0x%p addr 0x%llx key 0x%x",
	    mr, args->vec.addr, mr->r_key);
	/*
	 * Inserting the new MR into the rbtree bumps its
	 * reference count.
	 */
	mutex_enter(&rs->rs_rdma_lock);
	found = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
	mutex_exit(&rs->rs_rdma_lock);

	ASSERT(!(found && found != mr));

	if (mr_ret) {
		atomic_add_32(&mr->r_refcount, 1);
		*mr_ret = mr;
	}

	ret = 0;
out:
	if (mr)
		rdsv3_mr_put(mr);
	return (ret);
}

int
rdsv3_get_mr(struct rdsv3_sock *rs, const void *optval, int optlen)
{
	struct rds_get_mr_args args;

	if (optlen != sizeof (struct rds_get_mr_args))
		return (-EINVAL);

#if 1
	bcopy((struct rds_get_mr_args *)optval, &args,
	    sizeof (struct rds_get_mr_args));
#else
	if (ddi_copyin(optval, &args, optlen, 0))
		return (-EFAULT);
#endif

	return (__rdsv3_rdma_map(rs, &args, NULL, NULL));
}

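/*
 * Editorial sketch, not part of the original file: rdsv3_get_mr(),
 * rdsv3_get_mr_for_dest() and rdsv3_free_mr() are reached through
 * setsockopt() on an RDS socket.  A minimal user-level caller might look
 * roughly like the following, assuming the Linux-compatible option names
 * (SOL_RDS, RDS_GET_MR, RDS_FREE_MR) and structure layouts from the RDS
 * header are available on this platform:
 *
 *	struct rds_get_mr_args gma;
 *	struct rds_free_mr_args fma;
 *	rds_rdma_cookie_t cookie;
 *
 *	gma.vec.addr = (uint64_t)(uintptr_t)buf;
 *	gma.vec.bytes = buflen;
 *	gma.cookie_addr = (uint64_t)(uintptr_t)&cookie;
 *	gma.flags = RDS_RDMA_USE_ONCE;
 *	(void) setsockopt(fd, SOL_RDS, RDS_GET_MR, &gma, sizeof (gma));
 *
 *	fma.cookie = cookie;	(or 0 to flush all unused MRs)
 *	fma.flags = RDS_RDMA_INVALIDATE;
 *	(void) setsockopt(fd, SOL_RDS, RDS_FREE_MR, &fma, sizeof (fma));
 */
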
int
rdsv3_get_mr_for_dest(struct rdsv3_sock *rs, const void *optval,
    int optlen)
{
	struct rds_get_mr_for_dest_args args;
	struct rds_get_mr_args new_args;

	if (optlen != sizeof (struct rds_get_mr_for_dest_args))
		return (-EINVAL);

#if 1
	bcopy((struct rds_get_mr_for_dest_args *)optval, &args,
	    sizeof (struct rds_get_mr_for_dest_args));
#else
	if (ddi_copyin(optval, &args, optlen, 0))
		return (-EFAULT);
#endif

	/*
	 * Initially, just behave like get_mr().
	 * TODO: Implement get_mr as wrapper around this
	 *	 and deprecate it.
	 */
	new_args.vec = args.vec;
	new_args.cookie_addr = args.cookie_addr;
	new_args.flags = args.flags;

	return (__rdsv3_rdma_map(rs, &new_args, NULL, NULL));
}

/*
 * Free the MR indicated by the given R_Key
 */
int
rdsv3_free_mr(struct rdsv3_sock *rs, const void *optval, int optlen)
{
	struct rds_free_mr_args args;
	struct rdsv3_mr *mr;

	if (optlen != sizeof (struct rds_free_mr_args))
		return (-EINVAL);

#if 1
	bcopy((struct rds_free_mr_args *)optval, &args,
	    sizeof (struct rds_free_mr_args));
#else
	if (ddi_copyin((struct rds_free_mr_args *)optval, &args,
	    sizeof (struct rds_free_mr_args), 0))
		return (-EFAULT);
#endif

	/* Special case - a null cookie means flush all unused MRs */
	if (args.cookie == 0) {
		if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
			return (-EINVAL);
		rs->rs_transport->flush_mrs();
		return (0);
	}

	/*
	 * Look up the MR given its R_key and remove it from the rbtree
	 * so nobody else finds it.
	 * This should also prevent races with rdsv3_rdma_unuse.
	 */
	mutex_enter(&rs->rs_rdma_lock);
	mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys,
	    rdsv3_rdma_cookie_key(args.cookie), NULL);
	if (mr) {
		avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
		RB_CLEAR_NODE(&mr->r_rb_node);
		if (args.flags & RDS_RDMA_INVALIDATE)
			mr->r_invalidate = 1;
	}
	mutex_exit(&rs->rs_rdma_lock);

	if (!mr)
		return (-EINVAL);

	/*
	 * Call rdsv3_destroy_mr() ourselves so that we're sure it's done
	 * by the time we return.  If we let rdsv3_mr_put() do it, it might
	 * not happen until someone else drops their ref.
	 */
	rdsv3_destroy_mr(mr);
	rdsv3_mr_put(mr);
	return (0);
}

/*
 * This is called when we receive an extension header that
 * tells us this MR was used. It allows us to implement
 * use_once semantics.
 */
void
rdsv3_rdma_unuse(struct rdsv3_sock *rs, uint32_t r_key, int force)
{
	struct rdsv3_mr *mr;
	int zot_me = 0;

	RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Enter rkey: 0x%x", r_key);

	mutex_enter(&rs->rs_rdma_lock);
	mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
	if (!mr) {
		RDSV3_DPRINTF4("rdsv3_rdma_unuse",
		    "rdsv3: trying to unuse MR with unknown r_key %u!", r_key);
		mutex_exit(&rs->rs_rdma_lock);
		return;
	}

	if (mr->r_use_once || force) {
		avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node);
		RB_CLEAR_NODE(&mr->r_rb_node);
		zot_me = 1;
	} else {
		atomic_add_32(&mr->r_refcount, 1);
	}
	mutex_exit(&rs->rs_rdma_lock);

	/*
	 * May have to issue a dma_sync on this memory region.
	 * Note we could avoid this if the operation was an RDMA READ,
	 * but at this point we can't tell.
	 */
	if (mr->r_trans->sync_mr)
		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);

	/*
	 * If the MR was marked as invalidate, this will
	 * trigger an async flush.
	 */
	if (zot_me)
		rdsv3_destroy_mr(mr);
	rdsv3_mr_put(mr);
	RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Return");
}

void
rdsv3_rdma_free_op(struct rdsv3_rdma_op *ro)
{
	unsigned int i;

	/* deallocate RDMA resources on rdsv3_message */
	for (i = 0; i < ro->r_nents; i++) {
		ddi_umem_unlock(ro->r_rdma_sg[i].umem_cookie);
	}

	if (ro->r_notifier)
		kmem_free(ro->r_notifier, sizeof (*ro->r_notifier));
	kmem_free(ro, sizeof (*ro));
}

/*
 * args is a pointer to an in-kernel copy in the sendmsg cmsg.
 */
static struct rdsv3_rdma_op *
rdsv3_rdma_prepare(struct rdsv3_sock *rs, struct rds_rdma_args *args)
{
	struct rds_iovec vec;
	struct rdsv3_rdma_op *op = NULL;
	unsigned int nr_bytes;
	struct rds_iovec *local_vec;
	unsigned int nr;
	unsigned int i;
	ddi_umem_cookie_t umem_cookie;
	size_t umem_len;
	caddr_t umem_addr;
	int ret;

	if (rs->rs_bound_addr == 0) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	if (args->nr_local > (uint64_t)UINT_MAX) {
		ret = -EMSGSIZE;
		goto out;
	}

	op = kmem_zalloc(offsetof(struct rdsv3_rdma_op,
	    r_rdma_sg[args->nr_local]), KM_NOSLEEP);
	if (op == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
	op->r_recverr = rs->rs_recverr;

	if (op->r_notify || op->r_recverr) {
		/*
		 * We allocate an uninitialized notifier here, because
		 * we don't want to do that in the completion handler. We
		 * would have to use GFP_ATOMIC there, and don't want to deal
		 * with failed allocations.
		 */
		op->r_notifier = kmem_alloc(sizeof (struct rdsv3_notifier),
		    KM_NOSLEEP);
		if (!op->r_notifier) {
			ret = -ENOMEM;
			goto out;
		}
		op->r_notifier->n_user_token = args->user_token;
		op->r_notifier->n_status = RDS_RDMA_SUCCESS;
	}

	/*
	 * The cookie contains the R_Key of the remote memory region, and
	 * optionally an offset into it. This is how we implement RDMA into
	 * unaligned memory.
	 * When setting up the RDMA, we need to add that offset to the
	 * destination address (which is really an offset into the MR)
	 * FIXME: We may want to move this into ib_rdma.c
	 */
	op->r_key = rdsv3_rdma_cookie_key(args->cookie);
	op->r_remote_addr = args->remote_vec.addr +
	    rdsv3_rdma_cookie_offset(args->cookie);

	nr_bytes = 0;

	RDSV3_DPRINTF5("rdsv3_rdma_prepare",
	    "RDS: rdma prepare nr_local %llu rva %llx rkey %x",
	    (unsigned long long)args->nr_local,
	    (unsigned long long)args->remote_vec.addr,
	    op->r_key);

	local_vec = (struct rds_iovec *)(unsigned long)args->local_vec_addr;

	/* pin the scatter list of user buffers */
	for (i = 0; i < args->nr_local; i++) {
		if (ddi_copyin(&local_vec[i], &vec,
		    sizeof (struct rds_iovec), 0)) {
			ret = -EFAULT;
			goto out;
		}

		nr = rdsv3_pages_in_vec(&vec);
		if (nr == 0) {
			RDSV3_DPRINTF2("rdsv3_rdma_prepare",
			    "rdsv3_pages_in_vec returned 0");
			ret = -EINVAL;
			goto out;
		}

		rs->rs_user_addr = vec.addr;
		rs->rs_user_bytes = vec.bytes;

		/* pin user memory pages */
		umem_len = ptob(btopr(vec.bytes +
		    ((uintptr_t)vec.addr & PAGEOFFSET)));
		umem_addr = (caddr_t)((uintptr_t)vec.addr & ~PAGEOFFSET);
		ret = umem_lockmemory(umem_addr, umem_len,
		    DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ,
		    &umem_cookie, NULL, NULL);
		if (ret != 0) {
			RDSV3_DPRINTF2("rdsv3_rdma_prepare",
			    "umem_lockmemory() returned %d", ret);
			ret = -EFAULT;
			goto out;
		}
		op->r_rdma_sg[i].umem_cookie = umem_cookie;
		op->r_rdma_sg[i].iovec = vec;
		nr_bytes += vec.bytes;

		RDSV3_DPRINTF5("rdsv3_rdma_prepare",
		    "RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx",
		    nr_bytes, nr, vec.bytes, vec.addr);
	}
	op->r_nents = i;

	if (nr_bytes > args->remote_vec.bytes) {
		RDSV3_DPRINTF2("rdsv3_rdma_prepare",
		    "RDS nr_bytes %u remote_bytes %u do not match",
		    nr_bytes, (unsigned int)args->remote_vec.bytes);
		ret = -EINVAL;
		goto out;
	}
	op->r_bytes = nr_bytes;

	ret = 0;
out:
	if (ret) {
		if (op)
			rdsv3_rdma_free_op(op);
		op = ERR_PTR(ret);
	}
	return (op);
}

#define	CEIL(x, y)	(((x) + (y) - 1) / (y))

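/*
 * Editorial note, not in the original source: CEIL() is used below to size
 * on-stack cmsg staging buffers in whole uint64_t slots, so the buffer is
 * naturally 8-byte aligned for the structure copied into it.  For example,
 * if CMSG_LEN(sizeof (struct rds_rdma_args)) worked out to 92 bytes (a
 * made-up value for illustration), then
 *
 *	uint64_t buf[CEIL(92, sizeof (uint64_t))];
 *
 * declares buf[12], i.e. 96 bytes, which covers the 92 bytes while keeping
 * uint64_t alignment.
 */
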
/*
 * The application asks for an RDMA transfer.
 * Extract all arguments and set up the rdma_op
 */
int
rdsv3_cmsg_rdma_args(struct rdsv3_sock *rs, struct rdsv3_message *rm,
	struct cmsghdr *cmsg)
{
	struct rdsv3_rdma_op *op;
	/* uint64_t alignment on the buffer */
	uint64_t buf[CEIL(CMSG_LEN(sizeof (struct rds_rdma_args)),
	    sizeof (uint64_t))];

	if (cmsg->cmsg_len != CMSG_LEN(sizeof (struct rds_rdma_args)) ||
	    rm->m_rdma_op != NULL)
		return (-EINVAL);

	ASSERT(sizeof (buf) >= cmsg->cmsg_len && ((uintptr_t)buf & 0x7) == 0);

	bcopy(CMSG_DATA(cmsg), (char *)buf, cmsg->cmsg_len);
	op = rdsv3_rdma_prepare(rs, (struct rds_rdma_args *)buf);

	if (IS_ERR(op))
		return (PTR_ERR(op));
	rdsv3_stats_inc(s_send_rdma);
	rm->m_rdma_op = op;
	return (0);
}

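/*
 * Editorial sketch, not part of the original file: the three cmsg handlers
 * in this file (rdsv3_cmsg_rdma_args, rdsv3_cmsg_rdma_dest and
 * rdsv3_cmsg_rdma_map) are driven by ancillary data on sendmsg().  A
 * minimal user-level sender might build the control message roughly like
 * this, assuming the Linux-compatible cmsg type RDS_CMSG_RDMA_ARGS and the
 * rds_rdma_args layout from the RDS header are available:
 *
 *	struct msghdr msg;
 *	struct cmsghdr *cmsg;
 *	char cbuf[CMSG_SPACE(sizeof (struct rds_rdma_args))];
 *	struct rds_rdma_args rargs;
 *
 *	(fill in rargs: cookie, local_vec_addr, nr_local, remote_vec, flags)
 *
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof (cbuf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_RDS;
 *	cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof (struct rds_rdma_args));
 *	bcopy(&rargs, CMSG_DATA(cmsg), sizeof (rargs));
 *	(void) sendmsg(fd, &msg, 0);
 */
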
/*
 * The application wants us to pass an RDMA destination (aka MR)
 * to the remote
 */
int
rdsv3_cmsg_rdma_dest(struct rdsv3_sock *rs, struct rdsv3_message *rm,
	struct cmsghdr *cmsg)
{
	struct rdsv3_mr *mr;
	uint32_t r_key;
	int err = 0;

	if (cmsg->cmsg_len != CMSG_LEN(sizeof (rds_rdma_cookie_t)) ||
	    rm->m_rdma_cookie != 0)
		return (-EINVAL);

	(void) memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg),
	    sizeof (rm->m_rdma_cookie));

	/*
	 * We are reusing a previously mapped MR here. Most likely, the
	 * application has written to the buffer, so we need to explicitly
	 * flush those writes to RAM. Otherwise the HCA may not see them
	 * when doing a DMA from that buffer.
	 */
	r_key = rdsv3_rdma_cookie_key(rm->m_rdma_cookie);

	mutex_enter(&rs->rs_rdma_lock);
	mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
	if (!mr)
		err = -EINVAL;	/* invalid r_key */
	else
		atomic_add_32(&mr->r_refcount, 1);
	mutex_exit(&rs->rs_rdma_lock);

	if (mr) {
		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
		rm->m_rdma_mr = mr;
	}
	return (err);
}

/*
 * The application passes us an address range it wants to enable RDMA
 * to/from. We map the area, and save the <R_Key,offset> pair
 * in rm->m_rdma_cookie. This causes it to be sent along to the peer
 * in an extension header.
 */
int
rdsv3_cmsg_rdma_map(struct rdsv3_sock *rs, struct rdsv3_message *rm,
	struct cmsghdr *cmsg)
{
	/* uint64_t alignment on the buffer */
	uint64_t buf[CEIL(CMSG_LEN(sizeof (struct rds_get_mr_args)),
	    sizeof (uint64_t))];
	int status;

	if (cmsg->cmsg_len != CMSG_LEN(sizeof (struct rds_get_mr_args)) ||
	    rm->m_rdma_cookie != 0)
		return (-EINVAL);

	ASSERT(sizeof (buf) >= cmsg->cmsg_len && ((uintptr_t)buf & 0x7) == 0);

	bcopy(CMSG_DATA(cmsg), (char *)buf, cmsg->cmsg_len);
	status = __rdsv3_rdma_map(rs, (struct rds_get_mr_args *)buf,
	    &rm->m_rdma_cookie, &rm->m_rdma_mr);

	return (status);
}