xref: /onnv-gate/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c (revision 12320:4d40a60c2314)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/types.h>
58 #include <sys/stat.h>
59 #include <sys/conf.h>
60 #include <sys/ddi.h>
61 #include <sys/sunddi.h>
62 #include <sys/modctl.h>
63 #include <sys/rds.h>
64 #include <sys/stropts.h>
65 #include <sys/socket.h>
66 #include <sys/socketvar.h>
67 #include <sys/sockio.h>
68 #include <sys/sysmacros.h>
69 
70 #include <inet/ip.h>
71 #include <net/if_types.h>
72 
73 #include <sys/ib/clients/rdsv3/rdsv3.h>
74 #include <sys/ib/clients/rdsv3/rdma.h>
75 #include <sys/ib/clients/rdsv3/rdma_transport.h>
76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
77 
78 extern void rdsv3_remove_bound(struct rdsv3_sock *rds);
79 extern int rdsv3_verify_bind_address(ipaddr_t addr);
80 
81 extern ddi_taskq_t	*rdsv3_taskq;
82 extern struct rdma_cm_id *rdsv3_rdma_listen_id;
83 
/*
 * Global socket bookkeeping.  This is just used for stats gathering.
 *
 * rdsv3_sock_lock is held around every update of rdsv3_sock_count and
 * rdsv3_sock_list in this file: sockets are appended in rdsv3_create()
 * and unlinked in rdsv3_release(), and the info callbacks walk the list
 * with the lock held.
 */
kmutex_t rdsv3_sock_lock;
static unsigned long rdsv3_sock_count;
list_t rdsv3_sock_list;
88 
89 /*
90  * This is called as the final descriptor referencing this socket is closed.
91  * We have to unbind the socket so that another socket can be bound to the
92  * address it was using.
93  *
94  * We have to be careful about racing with the incoming path.  sock_orphan()
95  * sets SOCK_DEAD and we use that as an indicator to the rx path that new
96  * messages shouldn't be queued.
97  */
98 /* ARGSUSED */
99 static int
100 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr)
101 {
102 	struct rsock *sk = (struct rsock *)proto_handle;
103 	struct rdsv3_sock *rs;
104 
105 	if (sk == NULL)
106 		goto out;
107 
108 	rs = rdsv3_sk_to_rs(sk);
109 	RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk);
110 
111 	rdsv3_sk_sock_orphan(sk);
112 	rdsv3_cong_remove_socket(rs);
113 	rdsv3_remove_bound(rs);
114 	/*
115 	 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so
116 	 * that ensures the recv path has completed messing
117 	 * with the socket.
118 	 */
119 	rdsv3_clear_recv_queue(rs);
120 	rdsv3_send_drop_to(rs, NULL);
121 	rdsv3_rdma_drop_keys(rs);
122 	(void) rdsv3_notify_queue_get(rs, NULL);
123 
124 	mutex_enter(&rdsv3_sock_lock);
125 	list_remove_node(&rs->rs_item);
126 	rdsv3_sock_count--;
127 	mutex_exit(&rdsv3_sock_lock);
128 
129 	while (sk->sk_refcount > 1) {
130 		/* wait for 1 sec and try again */
131 		delay(drv_usectohz(1000000));
132 	}
133 
134 	/* this will free the rs and sk */
135 	rdsv3_sk_sock_put(sk);
136 
137 	RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs);
138 out:
139 	return (0);
140 }
141 
142 void
143 __rdsv3_wake_sk_sleep(struct rsock *sk)
144 {
145 	/* wakup anyone waiting in recvmsg */
146 	if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep)
147 		rdsv3_wake_up(sk->sk_sleep);
148 }
149 
150 /*
151  * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep.
152  * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
153  * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
154  * this seems more conservative.
155  * NB - normally, one would use sk_callback_lock for this, but we can
156  * get here from interrupts, whereas the network code grabs sk_callback_lock
157  * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
158  */
159 void
160 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs)
161 {
162 	RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs);
163 
164 	rw_enter(&rs->rs_recv_lock, RW_READER);
165 	__rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs));
166 	rw_exit(&rs->rs_recv_lock);
167 }
168 
169 /*ARGSUSED*/
170 static int
171 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
172     socklen_t *addr_len, cred_t *cr)
173 {
174 	struct rsock *sk = (struct rsock *)proto_handle;
175 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
176 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
177 
178 	RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs,
179 	    rs->rs_bound_port);
180 
181 	sin->sin_port = rs->rs_bound_port;
182 	sin->sin_addr.s_addr = rs->rs_bound_addr;
183 
184 	sin->sin_family = AF_INET_OFFLOAD;
185 
186 	*addr_len = sizeof (*sin);
187 	return (0);
188 }
189 
190 /*
191  * RDS' poll is without a doubt the least intuitive part of the interface,
192  * as POLLIN and POLLOUT do not behave entirely as you would expect from
193  * a network protocol.
194  *
195  * POLLIN is asserted if
196  *  -	there is data on the receive queue.
197  *  -	to signal that a previously congested destination may have become
198  *	uncongested
199  *  -	A notification has been queued to the socket (this can be a congestion
200  *	update, or a RDMA completion).
201  *
202  * POLLOUT is asserted if there is room on the send queue. This does not mean
203  * however, that the next sendmsg() call will succeed. If the application tries
204  * to send to a congested destination, the system call may still fail (and
205  * return ENOBUFS).
206  */
207 /* ARGSUSED */
208 static short
209 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet,
210     cred_t *cr)
211 {
212 	struct rsock	*sk = (struct rsock *)proto_handle;
213 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
214 	unsigned short mask = 0;
215 
216 #if 0
217 	RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet);
218 #endif
219 
220 	rw_enter(&rs->rs_recv_lock, RW_READER);
221 	if (!rs->rs_cong_monitor) {
222 		/*
223 		 * When a congestion map was updated, we signal POLLIN for
224 		 * "historical" reasons. Applications can also poll for
225 		 * WRBAND instead.
226 		 */
227 		if (rdsv3_cong_updated_since(&rs->rs_cong_track))
228 			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
229 	} else {
230 		mutex_enter(&rs->rs_lock);
231 		if (rs->rs_cong_notify)
232 			mask |= (POLLIN | POLLRDNORM);
233 		mutex_exit(&rs->rs_lock);
234 	}
235 	if (!list_is_empty(&rs->rs_recv_queue) ||
236 	    !list_is_empty(&rs->rs_notify_queue))
237 		mask |= (POLLIN | POLLRDNORM);
238 	if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs))
239 		mask |= (POLLOUT | POLLWRNORM);
240 	rw_exit(&rs->rs_recv_lock);
241 
242 #if 0
243 	RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask);
244 #endif
245 
246 	return (mask);
247 }
248 
249 /* ARGSUSED */
250 static int
251 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
252     int mode, int32_t *rvalp, cred_t *cr)
253 {
254 	ksocket_t	so4;
255 	struct lifconf	lifc;
256 	struct lifreq	lifr, *lifrp;
257 	struct ifconf	ifc;
258 	struct ifreq	ifr;
259 	int		rval = 0, rc, len;
260 	int		numifs;
261 	int		bufsize;
262 	void		*buf;
263 
264 	RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd);
265 
266 	/* Only ipv4 for now */
267 	rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP,
268 	    CRED());
269 	if (rval != 0) {
270 		RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d",
271 		    rval);
272 		return (rval);
273 	}
274 
275 	switch (cmd) {
276 	case SIOCGLIFNUM :
277 	case SIOCGIFNUM :
278 		rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
279 		if (rval != 0) break;
280 		if (cmd == SIOCGLIFNUM) {
281 			(void) ddi_copyout(&numifs, (void *)arg,
282 			    sizeof (int), 0);
283 		} else {
284 			len = 0;
285 			for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs;
286 			    rc++, lifrp++) {
287 				if (strlen(lifrp->lifr_name) <= IFNAMSIZ) {
288 					len++;
289 				}
290 			}
291 			(void) ddi_copyout(&len, (void *)arg,
292 			    sizeof (int), 0);
293 		}
294 		kmem_free(buf, bufsize);
295 		break;
296 
297 	case SIOCGLIFCONF :
298 		if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0)
299 		    != 0) {
300 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc");
301 			rval = EFAULT;
302 			break;
303 		}
304 
305 		rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
306 		if (rval != 0) {
307 			RDSV3_DPRINTF2("rdsv3_ioctl",
308 			    "rdsv3_do_ip_ioctl failed: %d", rval);
309 			break;
310 		}
311 
312 		if ((lifc.lifc_len > 0) && (numifs > 0)) {
313 			if (ddi_copyout(buf, (void *)lifc.lifc_req,
314 			    (lifc.lifc_len < bufsize) ? lifc.lifc_len :
315 			    bufsize, 0) != 0) {
316 				RDSV3_DPRINTF2("rdsv3_ioctl",
317 				    "copyout of records failed");
318 				rval = EFAULT;
319 			}
320 
321 		}
322 
323 		lifc.lifc_len = bufsize;
324 		if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf),
325 		    0) != 0) {
326 			RDSV3_DPRINTF2("rdsv3_ioctl",
327 			    "copyout of lifconf failed");
328 			rval = EFAULT;
329 		}
330 
331 		kmem_free(buf, bufsize);
332 		break;
333 
334 	case SIOCGIFCONF :
335 	case O_SIOCGIFCONF :
336 		if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0)
337 		    != 0) {
338 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc");
339 			rval = EFAULT;
340 			break;
341 		}
342 
343 		RDSV3_DPRINTF2("rdsv3_ioctl",
344 		    "O_SIOCGIFCONF: ifc_len: %d, req: %p",
345 		    ifc.ifc_len, ifc.ifc_req);
346 
347 		rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs);
348 		if (rval != 0) {
349 			RDSV3_DPRINTF2("rdsv3_ioctl",
350 			    "rdsv3_do_ip_ioctl_old failed: %d", rval);
351 			break;
352 		}
353 
354 		if ((ifc.ifc_len > 0) && (numifs > 0)) {
355 			if (ddi_copyout(buf, (void *)ifc.ifc_req,
356 			    (ifc.ifc_len < bufsize) ? ifc.ifc_len :
357 			    bufsize, 0) != 0) {
358 				RDSV3_DPRINTF2("rdsv3_ioctl",
359 				    "copyout of records failed");
360 				rval = EFAULT;
361 			}
362 
363 		}
364 
365 		ifc.ifc_len = bufsize;
366 		if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf),
367 		    0) != 0) {
368 			RDSV3_DPRINTF2("rdsv3_ioctl",
369 			    "copyout of ifconf failed");
370 			rval = EFAULT;
371 		}
372 
373 		kmem_free(buf, bufsize);
374 		break;
375 
376 	case SIOCGLIFFLAGS :
377 	case SIOCSLIFFLAGS :
378 	case SIOCGLIFMTU :
379 	case SIOCGLIFNETMASK :
380 	case SIOCGLIFINDEX :
381 		if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0)
382 		    != 0) {
383 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr");
384 			rval = EFAULT;
385 			break;
386 		}
387 
388 		rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED());
389 		if (rc != 0) {
390 			RDSV3_DPRINTF2("rdsv3_ioctl",
391 			    "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
392 			    rc, lifr.lifr_name, cmd);
393 			break;
394 		}
395 
396 		(void) ddi_copyout(&lifr, (void *)arg,
397 		    sizeof (struct lifreq), 0);
398 		break;
399 
400 	case SIOCGIFFLAGS :
401 	case SIOCSIFFLAGS :
402 	case SIOCGIFMTU :
403 	case SIOCGIFNETMASK :
404 	case SIOCGIFINDEX :
405 		if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0)
406 		    != 0) {
407 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr");
408 			rval = EFAULT;
409 			break;
410 		}
411 
412 		RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name);
413 
414 		rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED());
415 		if (rc != 0) {
416 			RDSV3_DPRINTF2("rdsv3_ioctl",
417 			    "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
418 			    rc, ifr.ifr_name, cmd);
419 
420 			break;
421 		}
422 
423 		RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name);
424 
425 		(void) ddi_copyout(&ifr, (void *)arg,
426 		    sizeof (struct ifreq), 0);
427 		break;
428 
429 	default:
430 		cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd);
431 		rval = EOPNOTSUPP;
432 	}
433 
434 	(void) ksocket_close(so4, CRED());
435 
436 	RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd);
437 
438 	*rvalp = rval;
439 	return (rval);
440 }
441 
442 static int
443 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len)
444 {
445 	struct sockaddr_in sin;
446 
447 	/* racing with another thread binding seems ok here */
448 	if (rs->rs_bound_addr == 0)
449 		return (-ENOTCONN); /* XXX not a great errno */
450 
451 	if (len < sizeof (struct sockaddr_in))
452 		return (-EINVAL);
453 
454 	if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in),
455 	    0) != 0) {
456 		RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin");
457 		return (-EFAULT);
458 	}
459 
460 	rdsv3_send_drop_to(rs, &sin);
461 
462 	return (0);
463 }
464 
/*
 * Store a boolean socket option.
 *
 * optvar - destination flag; set to 1 if the supplied int is non-zero,
 *          0 otherwise
 * optval - option value supplied by the caller, interpreted as an int
 * optlen - number of valid bytes at optval
 *
 * Returns 0 on success, or -EINVAL (leaving *optvar untouched) when optval
 * is NULL or optlen is smaller than an int.
 */
static int
rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen)
{
	int value;

	/* Validate before touching the buffer. */
	if (optval == NULL || optlen < (int)sizeof (int))
		return (-EINVAL);

	/*
	 * Read the whole int rather than just its first byte: a one-byte
	 * read misses values such as 0x100 and, on big-endian machines,
	 * sees 0 for every small positive value.  memcpy avoids any
	 * alignment assumption about optval.
	 */
	(void) memcpy(&value, optval, sizeof (value));

	*optvar = (value != 0);
	return (0);
}
475 
476 static int
477 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen)
478 {
479 	int ret;
480 
481 	ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
482 	if (ret == 0) {
483 		if (rs->rs_cong_monitor) {
484 			rdsv3_cong_add_socket(rs);
485 		} else {
486 			rdsv3_cong_remove_socket(rs);
487 			rs->rs_cong_mask = 0;
488 			rs->rs_cong_notify = 0;
489 		}
490 	}
491 	return (ret);
492 }
493 
494 /*ARGSUSED*/
495 static int
496 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level,
497     int optname, const void *optval, socklen_t optlen, cred_t *cr)
498 {
499 	struct rsock *sk = (struct rsock *)proto_handle;
500 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
501 	int	ret = 0;
502 
503 	RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)",
504 	    rs, level, optname);
505 
506 	switch (optname) {
507 	case RDSV3_CANCEL_SENT_TO:
508 		ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen);
509 		break;
510 	case RDSV3_GET_MR:
511 		ret = rdsv3_get_mr(rs, optval, optlen);
512 		break;
513 	case RDSV3_FREE_MR:
514 		ret = rdsv3_free_mr(rs, optval, optlen);
515 		break;
516 	case RDSV3_RECVERR:
517 		ret = rdsv3_set_bool_option(&rs->rs_recverr,
518 		    (char *)optval, optlen);
519 		break;
520 	case RDSV3_CONG_MONITOR:
521 		ret = rdsv3_cong_monitor(rs, (char *)optval, optlen);
522 		break;
523 	case SO_SNDBUF:
524 		sk->sk_sndbuf = *(uint_t *)optval;
525 		return (ret);
526 	case SO_RCVBUF:
527 		sk->sk_rcvbuf = *(uint_t *)optval;
528 		return (ret);
529 	default:
530 #if 1
531 		break;
532 #else
533 		ret = -ENOPROTOOPT;
534 #endif
535 	}
536 out:
537 	return (ret);
538 }
539 
540 /* XXX */
541 /*ARGSUSED*/
542 static int
543 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level,
544     int optname, void *optval, socklen_t *optlen, cred_t *cr)
545 {
546 	struct rsock *sk = (struct rsock *)proto_handle;
547 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
548 	int ret = 0;
549 
550 	RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)",
551 	    rs, optname, *optlen);
552 
553 	switch (optname) {
554 	case SO_SNDBUF:
555 		RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)",
556 		    sk->sk_sndbuf);
557 		if (*optlen != 0) {
558 			*((int *)optval) = sk->sk_sndbuf;
559 			*optlen = sizeof (uint_t);
560 		}
561 		return (ret);
562 	case SO_RCVBUF:
563 		RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)",
564 		    sk->sk_rcvbuf);
565 		if (*optlen != 0) {
566 			*((int *)optval) = sk->sk_rcvbuf;
567 			*optlen = sizeof (uint_t);
568 		}
569 		return (ret);
570 	case RDSV3_RECVERR:
571 		RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)",
572 		    rs->rs_recverr);
573 		if (*optlen < sizeof (int))
574 			return (-EINVAL);
575 		else {
576 			*(int *)optval = rs->rs_recverr;
577 			*optlen = sizeof (int);
578 		}
579 		return (0);
580 	default:
581 		if ((optname >= RDSV3_INFO_FIRST) &&
582 		    (optname <= RDSV3_INFO_LAST)) {
583 			return (rdsv3_info_getsockopt(sk, optname, optval,
584 			    optlen));
585 		}
586 		RDSV3_DPRINTF2("rdsv3_getsockopt",
587 		    "Unknown: level: %d optname: %d", level, optname);
588 		ret = -ENOPROTOOPT;
589 	}
590 
591 	RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)",
592 	    rs, optname, ret);
593 	return (ret);
594 }
595 
596 /*ARGSUSED*/
597 static int rdsv3_connect(sock_lower_handle_t proto_handle,
598     const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn,
599     cred_t *cr)
600 {
601 	struct rsock *sk = (struct rsock *)proto_handle;
602 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
603 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
604 	int ret = 0;
605 
606 	RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs);
607 
608 	mutex_enter(&sk->sk_lock);
609 
610 	if (addr_len != sizeof (struct sockaddr_in)) {
611 		ret = -EINVAL;
612 		goto out;
613 	}
614 
615 	if (sin->sin_family != AF_INET_OFFLOAD) {
616 		ret = -EAFNOSUPPORT;
617 		goto out;
618 	}
619 
620 	if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
621 		ret = -EDESTADDRREQ;
622 		goto out;
623 	}
624 
625 	rs->rs_conn_addr = sin->sin_addr.s_addr;
626 	rs->rs_conn_port = sin->sin_port;
627 
628 	sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1);
629 
630 	RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs);
631 
632 out:
633 	mutex_exit(&sk->sk_lock);
634 	return (ret);
635 }
636 
637 /*ARGSUSED*/
638 static int
639 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
640 {
641 	struct rsock *sk = (struct rsock *)proto_handle;
642 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
643 
644 	RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs);
645 
646 	return (0);
647 }
648 
649 /*ARGSUSED*/
650 void
651 rdsv3_activate(sock_lower_handle_t proto_handle,
652     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
653     int flags, cred_t *cr)
654 {
655 	struct rsock *sk = (struct rsock *)proto_handle;
656 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
657 
658 	RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs);
659 
660 	sk->sk_upcalls = sock_upcalls;
661 	sk->sk_upper_handle = sock_handle;
662 
663 	RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs);
664 }
665 
666 
667 /* ARGSUSED */
668 int
669 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio,
670     struct nmsghdr *msg, cred_t *cr)
671 {
672 	struct rsock *sk = (struct rsock *)proto_handle;
673 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
674 	int ret;
675 
676 	RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs);
677 	ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid);
678 
679 	RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret);
680 	if (ret < 0) {
681 		return (-ret);
682 	}
683 
684 	return (0);
685 }
686 
687 /* ARGSUSED */
688 int
689 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio,
690     struct nmsghdr *msg, cred_t *cr)
691 {
692 	struct rsock *sk = (struct rsock *)proto_handle;
693 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
694 	int ret;
695 
696 	RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs);
697 	ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags);
698 
699 	RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret);
700 
701 	if (ret < 0) {
702 		return (-ret);
703 	}
704 
705 	return (0);
706 }
707 
708 /*ARGSUSED*/
709 int
710 rdsv3_getpeername(sock_lower_handle_t  proto_handle, struct sockaddr *addr,
711     socklen_t *addr_len, cred_t *cr)
712 {
713 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
714 	struct rsock *sk = (struct rsock *)proto_handle;
715 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
716 
717 	RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs);
718 
719 	(void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero));
720 
721 	/* racey, don't care */
722 	if (!rs->rs_conn_addr)
723 		return (-ENOTCONN);
724 
725 	sin->sin_port = rs->rs_conn_port;
726 	sin->sin_addr.s_addr = rs->rs_conn_addr;
727 
728 	sin->sin_family = AF_INET_OFFLOAD;
729 
730 	*addr_len = sizeof (*sin);
731 	return (0);
732 }
733 
734 void
735 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle)
736 {
737 	struct rsock *sk = (struct rsock *)proto_handle;
738 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
739 
740 	RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs);
741 }
742 
/*
 * Downcall vector handed back to the socket layer by rdsv3_create().
 *
 * accept, listen, send and clr_flowctrl are left NULL; note that
 * rdsv3_clrflowctrl() is defined in this file but is not installed here.
 *
 * Two equivalent initializers are provided: a designated-initializer form
 * for normal builds and a positional form used when __lock_lint is
 * defined.
 */
#ifndef __lock_lint
static struct sock_downcalls_s rdsv3_sock_downcalls = {
	.sd_close =		rdsv3_release,
	.sd_bind =		rdsv3_bind,
	.sd_connect =		rdsv3_connect,
	.sd_accept =		NULL,
	.sd_getsockname =	rdsv3_getname,
	.sd_poll =		rdsv3_poll,
	.sd_ioctl =		rdsv3_ioctl,
	.sd_listen =		NULL,
	.sd_shutdown =		rdsv3_shutdown,
	.sd_setsockopt =	rdsv3_setsockopt,
	.sd_getsockopt =	rdsv3_getsockopt,
	.sd_send_uio =		rdsv3_send_uio,
	.sd_recv_uio =		rdsv3_recv_uio,
	.sd_activate =		rdsv3_activate,
	.sd_getpeername =	rdsv3_getpeername,
	.sd_send =		NULL,
	.sd_clr_flowctrl =	NULL
};
#else
/*
 * Positional form; presumably listed in the member order of
 * struct sock_downcalls_s - TODO confirm against the structure
 * declaration whenever it changes.
 */
static struct sock_downcalls_s rdsv3_sock_downcalls = {
	rdsv3_activate,
	NULL,
	rdsv3_bind,
	NULL,
	rdsv3_connect,
	rdsv3_getpeername,
	rdsv3_getname,
	rdsv3_getsockopt,
	rdsv3_setsockopt,
	NULL,
	rdsv3_send_uio,
	rdsv3_recv_uio,
	rdsv3_poll,
	rdsv3_shutdown,
	NULL,
	rdsv3_ioctl,
	rdsv3_release
};
#endif
784 
785 sock_lower_handle_t
786 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
787     uint_t *smodep, int *errorp, int flags, cred_t *credp)
788 {
789 	struct rdsv3_sock	*rs;
790 	struct rsock		*sk;
791 
792 	RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d "
793 	    "flags: %d", family, type, proto, flags);
794 
795 	sk = rdsv3_sk_alloc();
796 	if (sk == NULL)
797 		return (NULL);
798 	rdsv3_sock_init_data(sk);
799 
800 	rs = rdsv3_sk_to_rs(sk);
801 	rs->rs_sk = sk;
802 	mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL);
803 	rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL);
804 	list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message),
805 	    offsetof(struct rdsv3_message, m_sock_item));
806 	list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming),
807 	    offsetof(struct rdsv3_incoming, i_item));
808 	list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier),
809 	    offsetof(struct rdsv3_notifier, n_list));
810 	mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL);
811 	avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare,
812 	    sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node));
813 	mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL);
814 	rs->rs_cred = credp;
815 	rs->rs_zoneid = getzoneid();
816 	crhold(credp);
817 
818 	mutex_enter(&rdsv3_sock_lock);
819 	list_insert_tail(&rdsv3_sock_list, rs);
820 	rdsv3_sock_count++;
821 	/* Initialize RDMA/IB on the 1st socket if not done at attach */
822 	if (rdsv3_sock_count == 1) {
823 		rdsv3_rdma_init();
824 	}
825 	mutex_exit(&rdsv3_sock_lock);
826 
827 	*errorp = 0;
828 	*smodep = SM_ATOMIC;
829 	*sock_downcalls = &rdsv3_sock_downcalls;
830 
831 	RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs);
832 
833 	return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs));
834 }
835 
/*
 * Take a reference on the rsock that backs rs.
 */
void
rdsv3_sock_addref(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs);

	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_hold(sk);
}
842 
/*
 * Drop a reference on the rsock that backs rs.
 */
void
rdsv3_sock_put(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs);

	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_put(sk);
}
849 
850 static void
851 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len,
852     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
853 {
854 	struct rdsv3_sock *rs;
855 	struct rdsv3_incoming *inc;
856 	unsigned int total = 0;
857 
858 	RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)",
859 	    rdsv3_sk_to_rs(sock));
860 
861 	len /= sizeof (struct rdsv3_info_message);
862 
863 	mutex_enter(&rdsv3_sock_lock);
864 
865 	RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
866 		rw_enter(&rs->rs_recv_lock, RW_READER);
867 
868 		/* XXX too lazy to maintain counts.. */
869 		RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) {
870 			total++;
871 			if (total <= len)
872 				rdsv3_inc_info_copy(inc, iter, inc->i_saddr,
873 				    rs->rs_bound_addr, 1);
874 		}
875 
876 		rw_exit(&rs->rs_recv_lock);
877 	}
878 
879 	mutex_exit(&rdsv3_sock_lock);
880 
881 	lens->nr = total;
882 	lens->each = sizeof (struct rdsv3_info_message);
883 
884 	RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)",
885 	    rdsv3_sk_to_rs(sock));
886 }
887 
888 static void
889 rdsv3_sock_info(struct rsock *sock, unsigned int len,
890     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
891 {
892 	struct rdsv3_info_socket sinfo;
893 	struct rdsv3_sock *rs;
894 	unsigned long bytes;
895 
896 	RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)",
897 	    rdsv3_sk_to_rs(sock));
898 
899 	len /= sizeof (struct rdsv3_info_socket);
900 
901 	mutex_enter(&rdsv3_sock_lock);
902 
903 	if ((len < rdsv3_sock_count) || (iter->addr == NULL))
904 		goto out;
905 
906 	bytes = sizeof (struct rdsv3_info_socket);
907 	RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
908 		sinfo.sndbuf = rdsv3_sk_sndbuf(rs);
909 		sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs);
910 		sinfo.bound_addr = rs->rs_bound_addr;
911 		sinfo.connected_addr = rs->rs_conn_addr;
912 		sinfo.bound_port = rs->rs_bound_port;
913 		sinfo.connected_port = rs->rs_conn_port;
914 
915 		rdsv3_info_copy(iter, &sinfo, bytes);
916 	}
917 
918 	RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)",
919 	    rdsv3_sk_to_rs(sock));
920 
921 out:
922 	lens->nr = rdsv3_sock_count;
923 	lens->each = sizeof (struct rdsv3_info_socket);
924 
925 	mutex_exit(&rdsv3_sock_lock);
926 }
927 
/*
 * Delayed work item used to run rdsv3_rdma_init_worker() some time after
 * rdsv3_init(); it is allocated and queued in rdsv3_init() and cancelled
 * and freed in rdsv3_exit().  rdsv3_rdma_init_delay is the delay handed to
 * rdsv3_queue_delayed_work().
 */
rdsv3_delayed_work_t	*rdsv3_rdma_dwp = NULL;
uint_t			rdsv3_rdma_init_delay = 5; /* secs */
extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work);
931 
932 void
933 rdsv3_exit(void)
934 {
935 	RDSV3_DPRINTF4("rdsv3_exit", "Enter");
936 
937 	if (rdsv3_rdma_dwp) {
938 		rdsv3_cancel_delayed_work(rdsv3_rdma_dwp);
939 	}
940 
941 	(void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit,
942 	    NULL, DDI_SLEEP);
943 	while (rdsv3_rdma_listen_id != NULL) {
944 #ifndef __lock_lint
945 		RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit",
946 		    __func__, __LINE__);
947 #endif
948 		delay(drv_usectohz(1000));
949 	}
950 
951 	rdsv3_conn_exit();
952 	rdsv3_cong_exit();
953 	rdsv3_sysctl_exit();
954 	rdsv3_threads_exit();
955 	rdsv3_stats_exit();
956 	rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
957 	rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES,
958 	    rdsv3_sock_inc_info);
959 
960 	if (rdsv3_rdma_dwp) {
961 		kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t));
962 		rdsv3_rdma_dwp = NULL;
963 	}
964 
965 	RDSV3_DPRINTF4("rdsv3_exit", "Return");
966 }
967 
968 /*ARGSUSED*/
969 int
970 rdsv3_init()
971 {
972 	int ret;
973 
974 	RDSV3_DPRINTF4("rdsv3_init", "Enter");
975 
976 	rdsv3_cong_init();
977 	ret = rdsv3_conn_init();
978 	if (ret)
979 		goto out;
980 	ret = rdsv3_threads_init();
981 	if (ret)
982 		goto out_conn;
983 	ret = rdsv3_sysctl_init();
984 	if (ret)
985 		goto out_threads;
986 	ret = rdsv3_stats_init();
987 	if (ret)
988 		goto out_sysctl;
989 
990 	rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
991 	rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info);
992 
993 	/* rdsv3_rdma_init need to be called with a little delay */
994 	rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP);
995 	RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker);
996 	rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp,
997 	    rdsv3_rdma_init_delay);
998 
999 	RDSV3_DPRINTF4("rdsv3_init", "Return");
1000 
1001 	goto out;
1002 
1003 out_stats:
1004 	rdsv3_stats_exit();
1005 out_sysctl:
1006 	rdsv3_sysctl_exit();
1007 out_threads:
1008 	rdsv3_threads_exit();
1009 out_conn:
1010 	rdsv3_conn_exit();
1011 	rdsv3_cong_exit();
1012 out:
1013 	return (ret);
1014 }
1015