112198SEiji.Ota@Sun.COM /*
212198SEiji.Ota@Sun.COM * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
312198SEiji.Ota@Sun.COM */
412198SEiji.Ota@Sun.COM
512763SGiri.Adari@Sun.COM /*
612763SGiri.Adari@Sun.COM * This file contains code imported from the OFED rds source file bind.c
712763SGiri.Adari@Sun.COM * Oracle elects to have and use the contents of bind.c under and governed
812763SGiri.Adari@Sun.COM * by the OpenIB.org BSD license (see below for full license text). However,
912763SGiri.Adari@Sun.COM * the following notice accompanied the original version of this file:
1012763SGiri.Adari@Sun.COM */
1112763SGiri.Adari@Sun.COM
1212763SGiri.Adari@Sun.COM /*
1312763SGiri.Adari@Sun.COM * Copyright (c) 2006 Oracle. All rights reserved.
1412763SGiri.Adari@Sun.COM *
1512763SGiri.Adari@Sun.COM * This software is available to you under a choice of one of two
1612763SGiri.Adari@Sun.COM * licenses. You may choose to be licensed under the terms of the GNU
1712763SGiri.Adari@Sun.COM * General Public License (GPL) Version 2, available from the file
1812763SGiri.Adari@Sun.COM * COPYING in the main directory of this source tree, or the
1912763SGiri.Adari@Sun.COM * OpenIB.org BSD license below:
2012763SGiri.Adari@Sun.COM *
2112763SGiri.Adari@Sun.COM * Redistribution and use in source and binary forms, with or
2212763SGiri.Adari@Sun.COM * without modification, are permitted provided that the following
2312763SGiri.Adari@Sun.COM * conditions are met:
2412763SGiri.Adari@Sun.COM *
2512763SGiri.Adari@Sun.COM * - Redistributions of source code must retain the above
2612763SGiri.Adari@Sun.COM * copyright notice, this list of conditions and the following
2712763SGiri.Adari@Sun.COM * disclaimer.
2812763SGiri.Adari@Sun.COM *
2912763SGiri.Adari@Sun.COM * - Redistributions in binary form must reproduce the above
3012763SGiri.Adari@Sun.COM * copyright notice, this list of conditions and the following
3112763SGiri.Adari@Sun.COM * disclaimer in the documentation and/or other materials
3212763SGiri.Adari@Sun.COM * provided with the distribution.
3312763SGiri.Adari@Sun.COM *
3412763SGiri.Adari@Sun.COM * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
3512763SGiri.Adari@Sun.COM * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
3612763SGiri.Adari@Sun.COM * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
3712763SGiri.Adari@Sun.COM * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
3812763SGiri.Adari@Sun.COM * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
3912763SGiri.Adari@Sun.COM * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
4012763SGiri.Adari@Sun.COM * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4112763SGiri.Adari@Sun.COM * SOFTWARE.
4212763SGiri.Adari@Sun.COM *
4312763SGiri.Adari@Sun.COM */
4412198SEiji.Ota@Sun.COM #include <sys/types.h>
4512198SEiji.Ota@Sun.COM #include <sys/sysmacros.h>
4612198SEiji.Ota@Sun.COM #include <sys/random.h>
4712198SEiji.Ota@Sun.COM #include <sys/rds.h>
4812198SEiji.Ota@Sun.COM
4912198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3.h>
5012198SEiji.Ota@Sun.COM #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
5112198SEiji.Ota@Sun.COM
5212198SEiji.Ota@Sun.COM kmutex_t rdsv3_bind_lock;
5312198SEiji.Ota@Sun.COM avl_tree_t rdsv3_bind_tree;
5412198SEiji.Ota@Sun.COM
55*12895SGiri.Adari@Sun.COM /*
56*12895SGiri.Adari@Sun.COM * Each node in the rdsv3_bind_tree is of this type.
57*12895SGiri.Adari@Sun.COM */
58*12895SGiri.Adari@Sun.COM struct rdsv3_ip_bucket {
59*12895SGiri.Adari@Sun.COM ipaddr_t ip;
60*12895SGiri.Adari@Sun.COM zoneid_t zone;
61*12895SGiri.Adari@Sun.COM avl_node_t ip_avl_node;
62*12895SGiri.Adari@Sun.COM krwlock_t rwlock;
63*12895SGiri.Adari@Sun.COM uint_t nsockets;
64*12895SGiri.Adari@Sun.COM struct rdsv3_sock *port[65536];
65*12895SGiri.Adari@Sun.COM };
66*12895SGiri.Adari@Sun.COM
67*12895SGiri.Adari@Sun.COM static int
rdsv3_bind_node_compare(const void * a,const void * b)68*12895SGiri.Adari@Sun.COM rdsv3_bind_node_compare(const void *a, const void *b)
6912198SEiji.Ota@Sun.COM {
70*12895SGiri.Adari@Sun.COM struct rdsv3_ip_bucket *bp = (struct rdsv3_ip_bucket *)b;
71*12895SGiri.Adari@Sun.COM
72*12895SGiri.Adari@Sun.COM if (*(uint64_t *)a > (((uint64_t)bp->ip << 32) | bp->zone))
73*12895SGiri.Adari@Sun.COM return (+1);
74*12895SGiri.Adari@Sun.COM else if (*(uint64_t *)a < (((uint64_t)bp->ip << 32) | bp->zone))
75*12895SGiri.Adari@Sun.COM return (-1);
76*12895SGiri.Adari@Sun.COM
77*12895SGiri.Adari@Sun.COM return (0);
78*12895SGiri.Adari@Sun.COM }
79*12895SGiri.Adari@Sun.COM
80*12895SGiri.Adari@Sun.COM void
rdsv3_bind_init()81*12895SGiri.Adari@Sun.COM rdsv3_bind_init()
82*12895SGiri.Adari@Sun.COM {
83*12895SGiri.Adari@Sun.COM RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Enter");
8412198SEiji.Ota@Sun.COM
85*12895SGiri.Adari@Sun.COM mutex_init(&rdsv3_bind_lock, NULL, MUTEX_DRIVER, NULL);
86*12895SGiri.Adari@Sun.COM avl_create(&rdsv3_bind_tree, rdsv3_bind_node_compare,
87*12895SGiri.Adari@Sun.COM sizeof (struct rdsv3_ip_bucket),
88*12895SGiri.Adari@Sun.COM offsetof(struct rdsv3_ip_bucket, ip_avl_node));
89*12895SGiri.Adari@Sun.COM
90*12895SGiri.Adari@Sun.COM RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Return");
91*12895SGiri.Adari@Sun.COM }
92*12895SGiri.Adari@Sun.COM
93*12895SGiri.Adari@Sun.COM /* called on detach */
94*12895SGiri.Adari@Sun.COM void
rdsv3_bind_exit()95*12895SGiri.Adari@Sun.COM rdsv3_bind_exit()
96*12895SGiri.Adari@Sun.COM {
97*12895SGiri.Adari@Sun.COM struct rdsv3_ip_bucket *bucketp;
98*12895SGiri.Adari@Sun.COM void *cookie = NULL;
99*12895SGiri.Adari@Sun.COM
100*12895SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Enter");
101*12895SGiri.Adari@Sun.COM
102*12895SGiri.Adari@Sun.COM while ((bucketp =
103*12895SGiri.Adari@Sun.COM avl_destroy_nodes(&rdsv3_bind_tree, &cookie)) != NULL) {
104*12895SGiri.Adari@Sun.COM rw_destroy(&bucketp->rwlock);
105*12895SGiri.Adari@Sun.COM kmem_free(bucketp, sizeof (struct rdsv3_ip_bucket));
10612198SEiji.Ota@Sun.COM }
10712198SEiji.Ota@Sun.COM
108*12895SGiri.Adari@Sun.COM avl_destroy(&rdsv3_bind_tree);
109*12895SGiri.Adari@Sun.COM mutex_destroy(&rdsv3_bind_lock);
110*12895SGiri.Adari@Sun.COM
111*12895SGiri.Adari@Sun.COM RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Return");
112*12895SGiri.Adari@Sun.COM }
113*12895SGiri.Adari@Sun.COM
114*12895SGiri.Adari@Sun.COM struct rdsv3_ip_bucket *
rdsv3_find_ip_bucket(ipaddr_t ipaddr,zoneid_t zoneid)115*12895SGiri.Adari@Sun.COM rdsv3_find_ip_bucket(ipaddr_t ipaddr, zoneid_t zoneid)
116*12895SGiri.Adari@Sun.COM {
117*12895SGiri.Adari@Sun.COM struct rdsv3_ip_bucket *bucketp;
118*12895SGiri.Adari@Sun.COM avl_index_t where;
119*12895SGiri.Adari@Sun.COM uint64_t needle = ((uint64_t)ipaddr << 32) | zoneid;
120*12895SGiri.Adari@Sun.COM
121*12895SGiri.Adari@Sun.COM mutex_enter(&rdsv3_bind_lock);
122*12895SGiri.Adari@Sun.COM bucketp = avl_find(&rdsv3_bind_tree, &needle, &where);
123*12895SGiri.Adari@Sun.COM if (bucketp == NULL) {
124*12895SGiri.Adari@Sun.COM /* allocate a new bucket for this IP & zone */
125*12895SGiri.Adari@Sun.COM bucketp =
126*12895SGiri.Adari@Sun.COM kmem_zalloc(sizeof (struct rdsv3_ip_bucket), KM_SLEEP);
127*12895SGiri.Adari@Sun.COM rw_init(&bucketp->rwlock, NULL, RW_DRIVER, NULL);
128*12895SGiri.Adari@Sun.COM bucketp->ip = ipaddr;
129*12895SGiri.Adari@Sun.COM bucketp->zone = zoneid;
130*12895SGiri.Adari@Sun.COM avl_insert(&rdsv3_bind_tree, bucketp, where);
131*12895SGiri.Adari@Sun.COM }
132*12895SGiri.Adari@Sun.COM mutex_exit(&rdsv3_bind_lock);
133*12895SGiri.Adari@Sun.COM
134*12895SGiri.Adari@Sun.COM return (bucketp);
13512198SEiji.Ota@Sun.COM }
13612198SEiji.Ota@Sun.COM
13712198SEiji.Ota@Sun.COM /*
13812198SEiji.Ota@Sun.COM * Return the rdsv3_sock bound at the given local address.
13912198SEiji.Ota@Sun.COM *
14012198SEiji.Ota@Sun.COM * The rx path can race with rdsv3_release. We notice if rdsv3_release() has
14112198SEiji.Ota@Sun.COM * marked this socket and don't return a rs ref to the rx path.
14212198SEiji.Ota@Sun.COM */
14312198SEiji.Ota@Sun.COM struct rdsv3_sock *
rdsv3_find_bound(struct rdsv3_connection * conn,uint16_be_t port)144*12895SGiri.Adari@Sun.COM rdsv3_find_bound(struct rdsv3_connection *conn, uint16_be_t port)
14512198SEiji.Ota@Sun.COM {
14612198SEiji.Ota@Sun.COM struct rdsv3_sock *rs;
14712198SEiji.Ota@Sun.COM
148*12895SGiri.Adari@Sun.COM RDSV3_DPRINTF4("rdsv3_find_bound", "Enter(ip:port: %u.%u.%u.%u:%d)",
149*12895SGiri.Adari@Sun.COM NIPQUAD(conn->c_laddr), ntohs(port));
15012198SEiji.Ota@Sun.COM
151*12895SGiri.Adari@Sun.COM rw_enter(&conn->c_bucketp->rwlock, RW_READER);
152*12895SGiri.Adari@Sun.COM ASSERT(ntohl(conn->c_laddr) == conn->c_bucketp->ip);
153*12895SGiri.Adari@Sun.COM rs = conn->c_bucketp->port[ntohs(port)];
15412198SEiji.Ota@Sun.COM if (rs && !rdsv3_sk_sock_flag(rdsv3_rs_to_sk(rs), SOCK_DEAD))
155*12895SGiri.Adari@Sun.COM rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs));
15612198SEiji.Ota@Sun.COM else
15712198SEiji.Ota@Sun.COM rs = NULL;
158*12895SGiri.Adari@Sun.COM rw_exit(&conn->c_bucketp->rwlock);
15912198SEiji.Ota@Sun.COM
160*12895SGiri.Adari@Sun.COM RDSV3_DPRINTF5("rdsv3_find_bound", "returning rs %p for %u.%u.%u.%u:%d",
161*12895SGiri.Adari@Sun.COM rs, NIPQUAD(conn->c_laddr), ntohs(port));
16212414SEiji.Ota@Sun.COM
16312198SEiji.Ota@Sun.COM return (rs);
16412198SEiji.Ota@Sun.COM }
16512198SEiji.Ota@Sun.COM
16612198SEiji.Ota@Sun.COM /* returns -ve errno or +ve port */
16712198SEiji.Ota@Sun.COM static int
rdsv3_add_bound(struct rdsv3_sock * rs,uint32_be_t addr,uint16_be_t * port)16812198SEiji.Ota@Sun.COM rdsv3_add_bound(struct rdsv3_sock *rs, uint32_be_t addr, uint16_be_t *port)
16912198SEiji.Ota@Sun.COM {
17012198SEiji.Ota@Sun.COM int ret = -EADDRINUSE;
17112198SEiji.Ota@Sun.COM uint16_t rover, last;
172*12895SGiri.Adari@Sun.COM struct rdsv3_ip_bucket *bucketp;
17312198SEiji.Ota@Sun.COM
174*12895SGiri.Adari@Sun.COM RDSV3_DPRINTF4("rdsv3_add_bound", "Enter(addr:port: %x:%x)",
175*12895SGiri.Adari@Sun.COM ntohl(addr), ntohs(*port));
17612198SEiji.Ota@Sun.COM
17712198SEiji.Ota@Sun.COM if (*port != 0) {
17812198SEiji.Ota@Sun.COM rover = ntohs(*port);
17912198SEiji.Ota@Sun.COM last = rover;
18012198SEiji.Ota@Sun.COM } else {
18112198SEiji.Ota@Sun.COM (void) random_get_pseudo_bytes((uint8_t *)&rover,
18212198SEiji.Ota@Sun.COM sizeof (uint16_t));
18312198SEiji.Ota@Sun.COM rover = MAX(rover, 2);
18412198SEiji.Ota@Sun.COM last = rover - 1;
18512198SEiji.Ota@Sun.COM }
18612198SEiji.Ota@Sun.COM
187*12895SGiri.Adari@Sun.COM bucketp = rdsv3_find_ip_bucket(ntohl(addr), rs->rs_zoneid);
188*12895SGiri.Adari@Sun.COM
189*12895SGiri.Adari@Sun.COM /* leave the bind lock and get the bucket lock */
190*12895SGiri.Adari@Sun.COM rw_enter(&bucketp->rwlock, RW_WRITER);
19112198SEiji.Ota@Sun.COM
19212198SEiji.Ota@Sun.COM do {
19312198SEiji.Ota@Sun.COM if (rover == 0)
19412198SEiji.Ota@Sun.COM rover++;
19512676SEiji.Ota@Sun.COM
196*12895SGiri.Adari@Sun.COM if (bucketp->port[rover] == NULL) {
19712198SEiji.Ota@Sun.COM *port = htons(rover);
19812198SEiji.Ota@Sun.COM ret = 0;
19912198SEiji.Ota@Sun.COM break;
20012198SEiji.Ota@Sun.COM }
20112198SEiji.Ota@Sun.COM } while (rover++ != last);
20212198SEiji.Ota@Sun.COM
20312198SEiji.Ota@Sun.COM if (ret == 0) {
20412198SEiji.Ota@Sun.COM rs->rs_bound_addr = addr;
20512198SEiji.Ota@Sun.COM rs->rs_bound_port = *port;
206*12895SGiri.Adari@Sun.COM bucketp->port[rover] = rs;
207*12895SGiri.Adari@Sun.COM bucketp->nsockets++;
20812198SEiji.Ota@Sun.COM rdsv3_sock_addref(rs);
20912198SEiji.Ota@Sun.COM
21012198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rdsv3_add_bound",
211*12895SGiri.Adari@Sun.COM "rs %p binding to %u.%u.%u.%u:%d",
212*12895SGiri.Adari@Sun.COM rs, NIPQUAD(addr), rover);
21312198SEiji.Ota@Sun.COM }
21412198SEiji.Ota@Sun.COM
215*12895SGiri.Adari@Sun.COM rw_exit(&bucketp->rwlock);
21612198SEiji.Ota@Sun.COM
217*12895SGiri.Adari@Sun.COM RDSV3_DPRINTF4("rdsv3_add_bound", "Return(ret: %d port: %d)",
218*12895SGiri.Adari@Sun.COM ret, rover);
219*12895SGiri.Adari@Sun.COM
22012198SEiji.Ota@Sun.COM
22112198SEiji.Ota@Sun.COM return (ret);
22212198SEiji.Ota@Sun.COM }
22312198SEiji.Ota@Sun.COM
22412198SEiji.Ota@Sun.COM void
rdsv3_remove_bound(struct rdsv3_sock * rs)22512198SEiji.Ota@Sun.COM rdsv3_remove_bound(struct rdsv3_sock *rs)
22612198SEiji.Ota@Sun.COM {
22712198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_remove_bound", "Enter(rs: %p)", rs);
22812198SEiji.Ota@Sun.COM
229*12895SGiri.Adari@Sun.COM if (rs->rs_bound_addr) {
230*12895SGiri.Adari@Sun.COM struct rdsv3_ip_bucket *bucketp;
23112198SEiji.Ota@Sun.COM
23212198SEiji.Ota@Sun.COM RDSV3_DPRINTF5("rdsv3_remove_bound",
23312198SEiji.Ota@Sun.COM "rs %p unbinding from %u.%u.%u.%u:%x",
234*12895SGiri.Adari@Sun.COM rs, NIPQUAD(htonl(rs->rs_bound_addr)), rs->rs_bound_port);
235*12895SGiri.Adari@Sun.COM
236*12895SGiri.Adari@Sun.COM bucketp = rdsv3_find_ip_bucket(ntohl(rs->rs_bound_addr),
237*12895SGiri.Adari@Sun.COM rs->rs_zoneid);
238*12895SGiri.Adari@Sun.COM
239*12895SGiri.Adari@Sun.COM rw_enter(&bucketp->rwlock, RW_WRITER);
240*12895SGiri.Adari@Sun.COM bucketp->port[ntohs(rs->rs_bound_port)] = NULL;
241*12895SGiri.Adari@Sun.COM bucketp->nsockets--;
24212198SEiji.Ota@Sun.COM rs->rs_bound_addr = 0;
243*12895SGiri.Adari@Sun.COM rw_exit(&bucketp->rwlock);
24412198SEiji.Ota@Sun.COM
245*12895SGiri.Adari@Sun.COM rdsv3_sock_put(rs);
246*12895SGiri.Adari@Sun.COM }
24712198SEiji.Ota@Sun.COM
24812198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_remove_bound", "Return(rs: %p)", rs);
24912198SEiji.Ota@Sun.COM }
25012198SEiji.Ota@Sun.COM
25112198SEiji.Ota@Sun.COM /* ARGSUSED */
25212198SEiji.Ota@Sun.COM int
rdsv3_bind(sock_lower_handle_t proto_handle,struct sockaddr * sa,socklen_t len,cred_t * cr)25312198SEiji.Ota@Sun.COM rdsv3_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
25412198SEiji.Ota@Sun.COM socklen_t len, cred_t *cr)
25512198SEiji.Ota@Sun.COM {
25612198SEiji.Ota@Sun.COM struct rsock *sk = (struct rsock *)proto_handle;
25712198SEiji.Ota@Sun.COM sin_t *sin = (sin_t *)sa;
25812198SEiji.Ota@Sun.COM struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
25912198SEiji.Ota@Sun.COM int ret;
26012198SEiji.Ota@Sun.COM
26112198SEiji.Ota@Sun.COM if (len != sizeof (sin_t) || (sin == NULL) ||
26212198SEiji.Ota@Sun.COM !OK_32PTR((char *)sin)) {
26312198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_bind", "address to bind not specified");
26412198SEiji.Ota@Sun.COM return (EINVAL);
26512198SEiji.Ota@Sun.COM }
26612198SEiji.Ota@Sun.COM
26712198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_bind", "Enter(rs: %p, addr: 0x%x, port: %x)",
26812198SEiji.Ota@Sun.COM rs, ntohl(sin->sin_addr.s_addr), htons(sin->sin_port));
26912198SEiji.Ota@Sun.COM
27012198SEiji.Ota@Sun.COM if (sin->sin_addr.s_addr == INADDR_ANY) {
27112198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_bind", "Invalid address");
27212198SEiji.Ota@Sun.COM return (EINVAL);
27312198SEiji.Ota@Sun.COM }
27412198SEiji.Ota@Sun.COM
27512198SEiji.Ota@Sun.COM /* We don't allow multiple binds */
27612198SEiji.Ota@Sun.COM if (rs->rs_bound_addr) {
27712198SEiji.Ota@Sun.COM RDSV3_DPRINTF2("rdsv3_bind", "Multiple binds not allowed");
27812198SEiji.Ota@Sun.COM return (EINVAL);
27912198SEiji.Ota@Sun.COM }
28012198SEiji.Ota@Sun.COM
28112198SEiji.Ota@Sun.COM ret = rdsv3_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
28212198SEiji.Ota@Sun.COM if (ret) {
28312198SEiji.Ota@Sun.COM return (ret);
28412198SEiji.Ota@Sun.COM }
28512198SEiji.Ota@Sun.COM
28612198SEiji.Ota@Sun.COM rs->rs_transport = rdsv3_trans_get_preferred(sin->sin_addr.s_addr);
28712676SEiji.Ota@Sun.COM if (!rs->rs_transport) {
28812198SEiji.Ota@Sun.COM rdsv3_remove_bound(rs);
28912414SEiji.Ota@Sun.COM if (rdsv3_printk_ratelimit()) {
29012414SEiji.Ota@Sun.COM RDSV3_DPRINTF1("rdsv3_bind",
29112414SEiji.Ota@Sun.COM "RDS: rdsv3_bind() could not find a transport.\n");
29212414SEiji.Ota@Sun.COM }
29312198SEiji.Ota@Sun.COM return (EADDRNOTAVAIL);
29412198SEiji.Ota@Sun.COM }
29512198SEiji.Ota@Sun.COM
29612198SEiji.Ota@Sun.COM RDSV3_DPRINTF4("rdsv3_bind", "Return: Assigned port: %x to sock: %p",
29712198SEiji.Ota@Sun.COM sin->sin_port, rs);
29812198SEiji.Ota@Sun.COM
29912198SEiji.Ota@Sun.COM return (0);
30012198SEiji.Ota@Sun.COM }
301