1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/types.h> 58 #include <sys/stat.h> 59 #include <sys/conf.h> 60 #include <sys/ddi.h> 61 #include <sys/sunddi.h> 62 #include <sys/modctl.h> 63 #include <sys/rds.h> 64 #include <sys/stropts.h> 65 #include <sys/socket.h> 66 #include <sys/socketvar.h> 67 #include <sys/sockio.h> 68 #include <sys/sysmacros.h> 69 70 #include <inet/ip.h> 71 #include <net/if_types.h> 72 73 #include <sys/ib/clients/rdsv3/rdsv3.h> 74 #include <sys/ib/clients/rdsv3/rdma.h> 75 #include <sys/ib/clients/rdsv3/rdma_transport.h> 76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 77 78 extern void rdsv3_remove_bound(struct rdsv3_sock *rds); 79 extern int rdsv3_verify_bind_address(ipaddr_t addr); 80 81 extern ddi_taskq_t *rdsv3_taskq; 82 extern struct rdma_cm_id *rdsv3_rdma_listen_id; 83 84 /* this is just used for stats gathering :/ */ 85 kmutex_t rdsv3_sock_lock; 86 static unsigned long rdsv3_sock_count; 87 list_t rdsv3_sock_list; 88 89 /* 90 * This is called as the final descriptor referencing this socket is closed. 91 * We have to unbind the socket so that another socket can be bound to the 92 * address it was using. 93 * 94 * We have to be careful about racing with the incoming path. sock_orphan() 95 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 96 * messages shouldn't be queued. 97 */ 98 /* ARGSUSED */ 99 static int 100 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr) 101 { 102 struct rsock *sk = (struct rsock *)proto_handle; 103 struct rdsv3_sock *rs; 104 105 if (sk == NULL) 106 goto out; 107 108 rs = rdsv3_sk_to_rs(sk); 109 RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk); 110 111 rdsv3_sk_sock_orphan(sk); 112 rdsv3_cong_remove_socket(rs); 113 rdsv3_remove_bound(rs); 114 /* 115 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so 116 * that ensures the recv path has completed messing 117 * with the socket. 118 */ 119 rdsv3_clear_recv_queue(rs); 120 rdsv3_send_drop_to(rs, NULL); 121 rdsv3_rdma_drop_keys(rs); 122 (void) rdsv3_notify_queue_get(rs, NULL); 123 124 mutex_enter(&rdsv3_sock_lock); 125 list_remove_node(&rs->rs_item); 126 rdsv3_sock_count--; 127 mutex_exit(&rdsv3_sock_lock); 128 129 while (sk->sk_refcount > 1) { 130 /* wait for 1 sec and try again */ 131 delay(drv_usectohz(1000000)); 132 } 133 134 /* this will free the rs and sk */ 135 rdsv3_sk_sock_put(sk); 136 137 RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs); 138 out: 139 return (0); 140 } 141 142 void 143 __rdsv3_wake_sk_sleep(struct rsock *sk) 144 { 145 /* wakup anyone waiting in recvmsg */ 146 if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep) 147 rdsv3_wake_up(sk->sk_sleep); 148 } 149 150 /* 151 * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep. 152 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 153 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 154 * this seems more conservative. 155 * NB - normally, one would use sk_callback_lock for this, but we can 156 * get here from interrupts, whereas the network code grabs sk_callback_lock 157 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 158 */ 159 void 160 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs) 161 { 162 RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs); 163 164 rw_enter(&rs->rs_recv_lock, RW_READER); 165 __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs)); 166 rw_exit(&rs->rs_recv_lock); 167 } 168 169 /*ARGSUSED*/ 170 static int 171 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 172 socklen_t *addr_len, cred_t *cr) 173 { 174 struct rsock *sk = (struct rsock *)proto_handle; 175 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 176 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 177 178 RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs, 179 rs->rs_bound_port); 180 181 sin->sin_port = rs->rs_bound_port; 182 sin->sin_addr.s_addr = rs->rs_bound_addr; 183 184 sin->sin_family = AF_INET_OFFLOAD; 185 186 *addr_len = sizeof (*sin); 187 return (0); 188 } 189 190 /* 191 * RDS' poll is without a doubt the least intuitive part of the interface, 192 * as POLLIN and POLLOUT do not behave entirely as you would expect from 193 * a network protocol. 194 * 195 * POLLIN is asserted if 196 * - there is data on the receive queue. 197 * - to signal that a previously congested destination may have become 198 * uncongested 199 * - A notification has been queued to the socket (this can be a congestion 200 * update, or a RDMA completion). 201 * 202 * POLLOUT is asserted if there is room on the send queue. This does not mean 203 * however, that the next sendmsg() call will succeed. If the application tries 204 * to send to a congested destination, the system call may still fail (and 205 * return ENOBUFS). 206 */ 207 /* ARGSUSED */ 208 static short 209 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, 210 cred_t *cr) 211 { 212 struct rsock *sk = (struct rsock *)proto_handle; 213 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 214 unsigned short mask = 0; 215 216 #if 0 217 RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet); 218 #endif 219 220 rw_enter(&rs->rs_recv_lock, RW_READER); 221 if (!rs->rs_cong_monitor) { 222 /* 223 * When a congestion map was updated, we signal POLLIN for 224 * "historical" reasons. Applications can also poll for 225 * WRBAND instead. 226 */ 227 if (rdsv3_cong_updated_since(&rs->rs_cong_track)) 228 mask |= (POLLIN | POLLRDNORM | POLLWRBAND); 229 } else { 230 mutex_enter(&rs->rs_lock); 231 if (rs->rs_cong_notify) 232 mask |= (POLLIN | POLLRDNORM); 233 mutex_exit(&rs->rs_lock); 234 } 235 if (!list_is_empty(&rs->rs_recv_queue) || 236 !list_is_empty(&rs->rs_notify_queue)) 237 mask |= (POLLIN | POLLRDNORM); 238 if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) 239 mask |= (POLLOUT | POLLWRNORM); 240 rw_exit(&rs->rs_recv_lock); 241 242 #if 0 243 RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask); 244 #endif 245 246 return (mask); 247 } 248 249 /* ARGSUSED */ 250 static int 251 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 252 int mode, int32_t *rvalp, cred_t *cr) 253 { 254 ksocket_t so4; 255 struct lifconf lifc; 256 struct lifreq lifr, *lifrp; 257 struct ifconf ifc; 258 struct ifreq ifr; 259 int rval = 0, rc, len; 260 int numifs; 261 int bufsize; 262 void *buf; 263 264 RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd); 265 266 /* Only ipv4 for now */ 267 rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP, 268 CRED()); 269 if (rval != 0) { 270 RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d", 271 rval); 272 return (rval); 273 } 274 275 switch (cmd) { 276 case SIOCGLIFNUM : 277 case SIOCGIFNUM : 278 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 279 if (rval != 0) break; 280 if (cmd == SIOCGLIFNUM) { 281 (void) ddi_copyout(&numifs, (void *)arg, 282 sizeof (int), 0); 283 } else { 284 len = 0; 285 for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs; 286 rc++, lifrp++) { 287 if (strlen(lifrp->lifr_name) <= IFNAMSIZ) { 288 len++; 289 } 290 } 291 (void) ddi_copyout(&len, (void *)arg, 292 sizeof (int), 0); 293 } 294 kmem_free(buf, bufsize); 295 break; 296 297 case SIOCGLIFCONF : 298 if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0) 299 != 0) { 300 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc"); 301 rval = EFAULT; 302 break; 303 } 304 305 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 306 if (rval != 0) { 307 RDSV3_DPRINTF2("rdsv3_ioctl", 308 "rdsv3_do_ip_ioctl failed: %d", rval); 309 break; 310 } 311 312 if ((lifc.lifc_len > 0) && (numifs > 0)) { 313 if (ddi_copyout(buf, (void *)lifc.lifc_req, 314 (lifc.lifc_len < bufsize) ? lifc.lifc_len : 315 bufsize, 0) != 0) { 316 RDSV3_DPRINTF2("rdsv3_ioctl", 317 "copyout of records failed"); 318 rval = EFAULT; 319 } 320 321 } 322 323 lifc.lifc_len = bufsize; 324 if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf), 325 0) != 0) { 326 RDSV3_DPRINTF2("rdsv3_ioctl", 327 "copyout of lifconf failed"); 328 rval = EFAULT; 329 } 330 331 kmem_free(buf, bufsize); 332 break; 333 334 case SIOCGIFCONF : 335 case O_SIOCGIFCONF : 336 if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0) 337 != 0) { 338 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc"); 339 rval = EFAULT; 340 break; 341 } 342 343 RDSV3_DPRINTF2("rdsv3_ioctl", 344 "O_SIOCGIFCONF: ifc_len: %d, req: %p", 345 ifc.ifc_len, ifc.ifc_req); 346 347 rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs); 348 if (rval != 0) { 349 RDSV3_DPRINTF2("rdsv3_ioctl", 350 "rdsv3_do_ip_ioctl_old failed: %d", rval); 351 break; 352 } 353 354 if ((ifc.ifc_len > 0) && (numifs > 0)) { 355 if (ddi_copyout(buf, (void *)ifc.ifc_req, 356 (ifc.ifc_len < bufsize) ? ifc.ifc_len : 357 bufsize, 0) != 0) { 358 RDSV3_DPRINTF2("rdsv3_ioctl", 359 "copyout of records failed"); 360 rval = EFAULT; 361 } 362 363 } 364 365 ifc.ifc_len = bufsize; 366 if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf), 367 0) != 0) { 368 RDSV3_DPRINTF2("rdsv3_ioctl", 369 "copyout of ifconf failed"); 370 rval = EFAULT; 371 } 372 373 kmem_free(buf, bufsize); 374 break; 375 376 case SIOCGLIFFLAGS : 377 case SIOCSLIFFLAGS : 378 case SIOCGLIFMTU : 379 case SIOCGLIFNETMASK : 380 case SIOCGLIFINDEX : 381 if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0) 382 != 0) { 383 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr"); 384 rval = EFAULT; 385 break; 386 } 387 388 rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED()); 389 if (rc != 0) { 390 RDSV3_DPRINTF2("rdsv3_ioctl", 391 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 392 rc, lifr.lifr_name, cmd); 393 break; 394 } 395 396 (void) ddi_copyout(&lifr, (void *)arg, 397 sizeof (struct lifreq), 0); 398 break; 399 400 case SIOCGIFFLAGS : 401 case SIOCSIFFLAGS : 402 case SIOCGIFMTU : 403 case SIOCGIFNETMASK : 404 case SIOCGIFINDEX : 405 if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0) 406 != 0) { 407 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr"); 408 rval = EFAULT; 409 break; 410 } 411 412 RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name); 413 414 rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED()); 415 if (rc != 0) { 416 RDSV3_DPRINTF2("rdsv3_ioctl", 417 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 418 rc, ifr.ifr_name, cmd); 419 420 break; 421 } 422 423 RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name); 424 425 (void) ddi_copyout(&ifr, (void *)arg, 426 sizeof (struct ifreq), 0); 427 break; 428 429 default: 430 cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd); 431 rval = EOPNOTSUPP; 432 } 433 434 (void) ksocket_close(so4, CRED()); 435 436 RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd); 437 438 *rvalp = rval; 439 return (rval); 440 } 441 442 static int 443 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len) 444 { 445 struct sockaddr_in sin; 446 447 /* racing with another thread binding seems ok here */ 448 if (rs->rs_bound_addr == 0) 449 return (-ENOTCONN); /* XXX not a great errno */ 450 451 if (len < sizeof (struct sockaddr_in)) 452 return (-EINVAL); 453 454 if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in), 455 0) != 0) { 456 RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin"); 457 return (-EFAULT); 458 } 459 460 rdsv3_send_drop_to(rs, &sin); 461 462 return (0); 463 } 464 465 static int 466 rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen) 467 { 468 int value = *optval; 469 470 if (optlen < sizeof (int)) 471 return (-EINVAL); 472 *optvar = !!value; 473 return (0); 474 } 475 476 static int 477 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen) 478 { 479 int ret; 480 481 ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 482 if (ret == 0) { 483 if (rs->rs_cong_monitor) { 484 rdsv3_cong_add_socket(rs); 485 } else { 486 rdsv3_cong_remove_socket(rs); 487 rs->rs_cong_mask = 0; 488 rs->rs_cong_notify = 0; 489 } 490 } 491 return (ret); 492 } 493 494 /*ARGSUSED*/ 495 static int 496 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level, 497 int optname, const void *optval, socklen_t optlen, cred_t *cr) 498 { 499 struct rsock *sk = (struct rsock *)proto_handle; 500 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 501 int ret = 0; 502 503 RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)", 504 rs, level, optname); 505 506 switch (optname) { 507 case RDSV3_CANCEL_SENT_TO: 508 ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen); 509 break; 510 case RDSV3_GET_MR: 511 ret = rdsv3_get_mr(rs, optval, optlen); 512 break; 513 case RDSV3_FREE_MR: 514 ret = rdsv3_free_mr(rs, optval, optlen); 515 break; 516 case RDSV3_RECVERR: 517 ret = rdsv3_set_bool_option(&rs->rs_recverr, 518 (char *)optval, optlen); 519 break; 520 case RDSV3_CONG_MONITOR: 521 ret = rdsv3_cong_monitor(rs, (char *)optval, optlen); 522 break; 523 case SO_SNDBUF: 524 sk->sk_sndbuf = *(uint_t *)optval; 525 return (ret); 526 case SO_RCVBUF: 527 sk->sk_rcvbuf = *(uint_t *)optval; 528 return (ret); 529 default: 530 #if 1 531 break; 532 #else 533 ret = -ENOPROTOOPT; 534 #endif 535 } 536 out: 537 return (ret); 538 } 539 540 /* XXX */ 541 /*ARGSUSED*/ 542 static int 543 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level, 544 int optname, void *optval, socklen_t *optlen, cred_t *cr) 545 { 546 struct rsock *sk = (struct rsock *)proto_handle; 547 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 548 int ret = 0; 549 550 RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)", 551 rs, optname, *optlen); 552 553 switch (optname) { 554 case SO_SNDBUF: 555 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)", 556 sk->sk_sndbuf); 557 if (*optlen != 0) { 558 *((int *)optval) = sk->sk_sndbuf; 559 *optlen = sizeof (uint_t); 560 } 561 return (ret); 562 case SO_RCVBUF: 563 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)", 564 sk->sk_rcvbuf); 565 if (*optlen != 0) { 566 *((int *)optval) = sk->sk_rcvbuf; 567 *optlen = sizeof (uint_t); 568 } 569 return (ret); 570 case RDSV3_RECVERR: 571 RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)", 572 rs->rs_recverr); 573 if (*optlen < sizeof (int)) 574 return (-EINVAL); 575 else { 576 *(int *)optval = rs->rs_recverr; 577 *optlen = sizeof (int); 578 } 579 return (0); 580 default: 581 if ((optname >= RDSV3_INFO_FIRST) && 582 (optname <= RDSV3_INFO_LAST)) { 583 return (rdsv3_info_getsockopt(sk, optname, optval, 584 optlen)); 585 } 586 RDSV3_DPRINTF2("rdsv3_getsockopt", 587 "Unknown: level: %d optname: %d", level, optname); 588 ret = -ENOPROTOOPT; 589 } 590 591 RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)", 592 rs, optname, ret); 593 return (ret); 594 } 595 596 /*ARGSUSED*/ 597 static int rdsv3_connect(sock_lower_handle_t proto_handle, 598 const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn, 599 cred_t *cr) 600 { 601 struct rsock *sk = (struct rsock *)proto_handle; 602 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 603 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 604 int ret = 0; 605 606 RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs); 607 608 mutex_enter(&sk->sk_lock); 609 610 if (addr_len != sizeof (struct sockaddr_in)) { 611 ret = -EINVAL; 612 goto out; 613 } 614 615 if (sin->sin_family != AF_INET_OFFLOAD) { 616 ret = -EAFNOSUPPORT; 617 goto out; 618 } 619 620 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 621 ret = -EDESTADDRREQ; 622 goto out; 623 } 624 625 rs->rs_conn_addr = sin->sin_addr.s_addr; 626 rs->rs_conn_port = sin->sin_port; 627 628 sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1); 629 630 RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs); 631 632 out: 633 mutex_exit(&sk->sk_lock); 634 return (ret); 635 } 636 637 /*ARGSUSED*/ 638 static int 639 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 640 { 641 struct rsock *sk = (struct rsock *)proto_handle; 642 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 643 644 RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs); 645 646 return (0); 647 } 648 649 /*ARGSUSED*/ 650 void 651 rdsv3_activate(sock_lower_handle_t proto_handle, 652 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, 653 int flags, cred_t *cr) 654 { 655 struct rsock *sk = (struct rsock *)proto_handle; 656 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 657 658 RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs); 659 660 sk->sk_upcalls = sock_upcalls; 661 sk->sk_upper_handle = sock_handle; 662 663 RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs); 664 } 665 666 667 /* ARGSUSED */ 668 int 669 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio, 670 struct nmsghdr *msg, cred_t *cr) 671 { 672 struct rsock *sk = (struct rsock *)proto_handle; 673 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 674 int ret; 675 676 RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs); 677 ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid); 678 679 RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret); 680 if (ret < 0) { 681 return (-ret); 682 } 683 684 return (0); 685 } 686 687 /* ARGSUSED */ 688 int 689 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio, 690 struct nmsghdr *msg, cred_t *cr) 691 { 692 struct rsock *sk = (struct rsock *)proto_handle; 693 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 694 int ret; 695 696 RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs); 697 ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags); 698 699 RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret); 700 701 if (ret < 0) { 702 return (-ret); 703 } 704 705 return (0); 706 } 707 708 /*ARGSUSED*/ 709 int 710 rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 711 socklen_t *addr_len, cred_t *cr) 712 { 713 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 714 struct rsock *sk = (struct rsock *)proto_handle; 715 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 716 717 RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs); 718 719 (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero)); 720 721 /* racey, don't care */ 722 if (!rs->rs_conn_addr) 723 return (-ENOTCONN); 724 725 sin->sin_port = rs->rs_conn_port; 726 sin->sin_addr.s_addr = rs->rs_conn_addr; 727 728 sin->sin_family = AF_INET_OFFLOAD; 729 730 *addr_len = sizeof (*sin); 731 return (0); 732 } 733 734 void 735 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle) 736 { 737 struct rsock *sk = (struct rsock *)proto_handle; 738 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 739 740 RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs); 741 } 742 743 #ifndef __lock_lint 744 static struct sock_downcalls_s rdsv3_sock_downcalls = { 745 .sd_close = rdsv3_release, 746 .sd_bind = rdsv3_bind, 747 .sd_connect = rdsv3_connect, 748 .sd_accept = NULL, 749 .sd_getsockname = rdsv3_getname, 750 .sd_poll = rdsv3_poll, 751 .sd_ioctl = rdsv3_ioctl, 752 .sd_listen = NULL, 753 .sd_shutdown = rdsv3_shutdown, 754 .sd_setsockopt = rdsv3_setsockopt, 755 .sd_getsockopt = rdsv3_getsockopt, 756 .sd_send_uio = rdsv3_send_uio, 757 .sd_recv_uio = rdsv3_recv_uio, 758 .sd_activate = rdsv3_activate, 759 .sd_getpeername = rdsv3_getpeername, 760 .sd_send = NULL, 761 .sd_clr_flowctrl = NULL 762 }; 763 #else 764 static struct sock_downcalls_s rdsv3_sock_downcalls = { 765 rdsv3_activate, 766 NULL, 767 rdsv3_bind, 768 NULL, 769 rdsv3_connect, 770 rdsv3_getpeername, 771 rdsv3_getname, 772 rdsv3_getsockopt, 773 rdsv3_setsockopt, 774 NULL, 775 rdsv3_send_uio, 776 rdsv3_recv_uio, 777 rdsv3_poll, 778 rdsv3_shutdown, 779 NULL, 780 rdsv3_ioctl, 781 rdsv3_release 782 }; 783 #endif 784 785 sock_lower_handle_t 786 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 787 uint_t *smodep, int *errorp, int flags, cred_t *credp) 788 { 789 struct rdsv3_sock *rs; 790 struct rsock *sk; 791 792 RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d " 793 "flags: %d", family, type, proto, flags); 794 795 sk = rdsv3_sk_alloc(); 796 if (sk == NULL) 797 return (NULL); 798 rdsv3_sock_init_data(sk); 799 800 rs = rdsv3_sk_to_rs(sk); 801 rs->rs_sk = sk; 802 mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL); 803 rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL); 804 list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message), 805 offsetof(struct rdsv3_message, m_sock_item)); 806 list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming), 807 offsetof(struct rdsv3_incoming, i_item)); 808 list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier), 809 offsetof(struct rdsv3_notifier, n_list)); 810 mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL); 811 avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare, 812 sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node)); 813 mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL); 814 rs->rs_cred = credp; 815 rs->rs_zoneid = getzoneid(); 816 crhold(credp); 817 818 mutex_enter(&rdsv3_sock_lock); 819 list_insert_tail(&rdsv3_sock_list, rs); 820 rdsv3_sock_count++; 821 /* Initialize RDMA/IB on the 1st socket if not done at attach */ 822 if (rdsv3_sock_count == 1) { 823 rdsv3_rdma_init(); 824 } 825 mutex_exit(&rdsv3_sock_lock); 826 827 *errorp = 0; 828 *smodep = SM_ATOMIC; 829 *sock_downcalls = &rdsv3_sock_downcalls; 830 831 RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs); 832 833 return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs)); 834 } 835 836 void 837 rdsv3_sock_addref(struct rdsv3_sock *rs) 838 { 839 RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs); 840 rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); 841 } 842 843 void 844 rdsv3_sock_put(struct rdsv3_sock *rs) 845 { 846 RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs); 847 rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 848 } 849 850 static void 851 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len, 852 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 853 { 854 struct rdsv3_sock *rs; 855 struct rdsv3_incoming *inc; 856 unsigned int total = 0; 857 858 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)", 859 rdsv3_sk_to_rs(sock)); 860 861 len /= sizeof (struct rdsv3_info_message); 862 863 mutex_enter(&rdsv3_sock_lock); 864 865 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 866 rw_enter(&rs->rs_recv_lock, RW_READER); 867 868 /* XXX too lazy to maintain counts.. */ 869 RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) { 870 total++; 871 if (total <= len) 872 rdsv3_inc_info_copy(inc, iter, inc->i_saddr, 873 rs->rs_bound_addr, 1); 874 } 875 876 rw_exit(&rs->rs_recv_lock); 877 } 878 879 mutex_exit(&rdsv3_sock_lock); 880 881 lens->nr = total; 882 lens->each = sizeof (struct rdsv3_info_message); 883 884 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)", 885 rdsv3_sk_to_rs(sock)); 886 } 887 888 static void 889 rdsv3_sock_info(struct rsock *sock, unsigned int len, 890 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 891 { 892 struct rdsv3_info_socket sinfo; 893 struct rdsv3_sock *rs; 894 unsigned long bytes; 895 896 RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)", 897 rdsv3_sk_to_rs(sock)); 898 899 len /= sizeof (struct rdsv3_info_socket); 900 901 mutex_enter(&rdsv3_sock_lock); 902 903 if ((len < rdsv3_sock_count) || (iter->addr == NULL)) 904 goto out; 905 906 bytes = sizeof (struct rdsv3_info_socket); 907 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 908 sinfo.sndbuf = rdsv3_sk_sndbuf(rs); 909 sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs); 910 sinfo.bound_addr = rs->rs_bound_addr; 911 sinfo.connected_addr = rs->rs_conn_addr; 912 sinfo.bound_port = rs->rs_bound_port; 913 sinfo.connected_port = rs->rs_conn_port; 914 915 rdsv3_info_copy(iter, &sinfo, bytes); 916 } 917 918 RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)", 919 rdsv3_sk_to_rs(sock)); 920 921 out: 922 lens->nr = rdsv3_sock_count; 923 lens->each = sizeof (struct rdsv3_info_socket); 924 925 mutex_exit(&rdsv3_sock_lock); 926 } 927 928 rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL; 929 uint_t rdsv3_rdma_init_delay = 5; /* secs */ 930 extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work); 931 932 void 933 rdsv3_exit(void) 934 { 935 RDSV3_DPRINTF4("rdsv3_exit", "Enter"); 936 937 if (rdsv3_rdma_dwp) { 938 rdsv3_cancel_delayed_work(rdsv3_rdma_dwp); 939 } 940 941 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit, 942 NULL, DDI_SLEEP); 943 while (rdsv3_rdma_listen_id != NULL) { 944 #ifndef __lock_lint 945 RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit", 946 __func__, __LINE__); 947 #endif 948 delay(drv_usectohz(1000)); 949 } 950 951 rdsv3_conn_exit(); 952 rdsv3_cong_exit(); 953 rdsv3_sysctl_exit(); 954 rdsv3_threads_exit(); 955 rdsv3_stats_exit(); 956 rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 957 rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES, 958 rdsv3_sock_inc_info); 959 960 if (rdsv3_rdma_dwp) { 961 kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t)); 962 rdsv3_rdma_dwp = NULL; 963 } 964 965 RDSV3_DPRINTF4("rdsv3_exit", "Return"); 966 } 967 968 /*ARGSUSED*/ 969 int 970 rdsv3_init() 971 { 972 int ret; 973 974 RDSV3_DPRINTF4("rdsv3_init", "Enter"); 975 976 rdsv3_cong_init(); 977 ret = rdsv3_conn_init(); 978 if (ret) 979 goto out; 980 ret = rdsv3_threads_init(); 981 if (ret) 982 goto out_conn; 983 ret = rdsv3_sysctl_init(); 984 if (ret) 985 goto out_threads; 986 ret = rdsv3_stats_init(); 987 if (ret) 988 goto out_sysctl; 989 990 rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 991 rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info); 992 993 /* rdsv3_rdma_init need to be called with a little delay */ 994 rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP); 995 RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker); 996 rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp, 997 rdsv3_rdma_init_delay); 998 999 RDSV3_DPRINTF4("rdsv3_init", "Return"); 1000 1001 goto out; 1002 1003 out_stats: 1004 rdsv3_stats_exit(); 1005 out_sysctl: 1006 rdsv3_sysctl_exit(); 1007 out_threads: 1008 rdsv3_threads_exit(); 1009 out_conn: 1010 rdsv3_conn_exit(); 1011 rdsv3_cong_exit(); 1012 out: 1013 return (ret); 1014 } 1015