1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * Copyright (c) 2007, The Ohio State University. All rights reserved.
27 *
28 * Portions of this source code were developed by the team members of
29 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
30 * headed by Professor Dhabaleswar K. (DK) Panda.
31 *
32 * Acknowledgements to contributions from developers:
33 * Ranjit Noronha: noronha@cse.ohio-state.edu
34 * Lei Chai : chail@cse.ohio-state.edu
35 * Weikuan Yu : yuw@cse.ohio-state.edu
36 *
37 */
38
39 /*
40 * The rpcib plugin. Implements the interface for RDMATF's
41 * interaction with IBTF.
42 */
43
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/user.h>
47 #include <sys/systm.h>
48 #include <sys/sysmacros.h>
49 #include <sys/proc.h>
50 #include <sys/socket.h>
51 #include <sys/file.h>
52 #include <sys/stream.h>
53 #include <sys/strsubr.h>
54 #include <sys/stropts.h>
55 #include <sys/errno.h>
56 #include <sys/kmem.h>
57 #include <sys/debug.h>
58 #include <sys/pathname.h>
59 #include <sys/kstat.h>
60 #include <sys/t_lock.h>
61 #include <sys/ddi.h>
62 #include <sys/cmn_err.h>
63 #include <sys/time.h>
64 #include <sys/isa_defs.h>
65 #include <sys/callb.h>
66 #include <sys/sunddi.h>
67 #include <sys/sunndi.h>
68 #include <sys/sdt.h>
69 #include <sys/ib/ibtl/ibti.h>
70 #include <rpc/rpc.h>
71 #include <rpc/ib.h>
72 #include <sys/modctl.h>
73 #include <sys/kstr.h>
74 #include <sys/sockio.h>
75 #include <sys/vnode.h>
76 #include <sys/tiuser.h>
77 #include <net/if.h>
78 #include <net/if_types.h>
79 #include <sys/cred.h>
80 #include <rpc/rpc_rdma.h>
81 #include <nfs/nfs.h>
82 #include <sys/atomic.h>
83
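/*
 * Default (well-known) port for NFS over RDMA; the nfs_rdma_port tunable
 * below defaults to this value.
 */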
84 #define NFS_RDMA_PORT 20049
85
86
87 /*
88 * Convenience structures for connection management
89 */
90 typedef struct rpcib_ipaddrs {
91 void *ri_list; /* pointer to list of addresses */
92 uint_t ri_count; /* number of addresses in list */
93 uint_t ri_size; /* size of ri_list in bytes */
94 } rpcib_ipaddrs_t;
95
96
97 typedef struct rpcib_ping {
98 rib_hca_t *hca;
99 ibt_path_info_t path;
100 ibt_ip_addr_t srcip;
101 ibt_ip_addr_t dstip;
102 } rpcib_ping_t;
103
104 /*
105 * Prototype declarations for driver ops
106 */
107 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
108 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
109 void *, void **);
110 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
111 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
112 static int rpcib_do_ip_ioctl(int, int, void *);
113 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
114 static int rpcib_cache_kstat_update(kstat_t *, int);
115 static void rib_force_cleanup(void *);
116 static void rib_stop_hca_services(rib_hca_t *);
117 static void rib_attach_hca(void);
118 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
119 struct netbuf *d_svcaddr, CONN **conn);
120
121 struct {
122 kstat_named_t cache_limit;
123 kstat_named_t cache_allocation;
124 kstat_named_t cache_hits;
125 kstat_named_t cache_misses;
126 kstat_named_t cache_misses_above_the_limit;
127 } rpcib_kstat = {
128 {"cache_limit", KSTAT_DATA_UINT64 },
129 {"cache_allocation", KSTAT_DATA_UINT64 },
130 {"cache_hits", KSTAT_DATA_UINT64 },
131 {"cache_misses", KSTAT_DATA_UINT64 },
132 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
133 };
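/*
 * These counters back the "rpcib_cache" kstat created in rpcib_open_hcas()
 * and are refreshed by rpcib_cache_kstat_update().
 */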
134
135 /* rpcib cb_ops */
136 static struct cb_ops rpcib_cbops = {
137 nulldev, /* open */
138 nulldev, /* close */
139 nodev, /* strategy */
140 nodev, /* print */
141 nodev, /* dump */
142 nodev, /* read */
143 nodev, /* write */
144 nodev, /* ioctl */
145 nodev, /* devmap */
146 nodev, /* mmap */
147 nodev, /* segmap */
148 nochpoll, /* poll */
149 ddi_prop_op, /* prop_op */
150 NULL, /* stream */
151 D_MP, /* cb_flag */
152 CB_REV, /* rev */
153 nodev, /* int (*cb_aread)() */
154 nodev /* int (*cb_awrite)() */
155 };
156
157 /*
158 * Device options
159 */
160 static struct dev_ops rpcib_ops = {
161 DEVO_REV, /* devo_rev, */
162 0, /* refcnt */
163 rpcib_getinfo, /* info */
164 nulldev, /* identify */
165 nulldev, /* probe */
166 rpcib_attach, /* attach */
167 rpcib_detach, /* detach */
168 nodev, /* reset */
169 &rpcib_cbops, /* driver ops - devctl interfaces */
170 NULL, /* bus operations */
171 NULL, /* power */
172 ddi_quiesce_not_needed, /* quiesce */
173 };
174
175 /*
176 * Module linkage information.
177 */
178
179 static struct modldrv rib_modldrv = {
180 &mod_driverops, /* Driver module */
181 "RPCIB plugin driver", /* Driver name and version */
182 &rpcib_ops, /* Driver ops */
183 };
184
185 static struct modlinkage rib_modlinkage = {
186 MODREV_1,
187 (void *)&rib_modldrv,
188 NULL
189 };
190
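/*
 * Entry in the per-HCA server-side registered buffer cache; see
 * rib_get_cache_buf() and rib_free_cache_buf().
 */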
191 typedef struct rib_lrc_entry {
192 struct rib_lrc_entry *forw;
193 struct rib_lrc_entry *back;
194 char *lrc_buf;
195
196 uint32_t lrc_len;
197 void *avl_node;
198 bool_t registered;
199
200 struct mrc lrc_mhandle;
201 bool_t lrc_on_freed_list;
202 } rib_lrc_entry_t;
203
204 typedef struct cache_struct {
205 rib_lrc_entry_t r;
206 uint32_t len;
207 uint32_t elements;
208 kmutex_t node_lock;
209 avl_node_t avl_link;
210 } cache_avl_struct_t;
211
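/* Tunables (in bytes) for the server-side buffer cache. */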
212 uint64_t cache_limit = 100 * 1024 * 1024;
213 static uint64_t cache_watermark = 80 * 1024 * 1024;
214 static bool_t stats_enabled = FALSE;
215
216 static uint64_t max_unsignaled_rws = 5;
217 int nfs_rdma_port = NFS_RDMA_PORT;
218
219 #define RIBNETID_TCP "tcp"
220 #define RIBNETID_TCP6 "tcp6"
221
222 /*
223 * rib_stat: private data pointer used when registering
224 * with the IBTF. It is returned to the consumer
225 * in all callbacks.
226 */
227 static rpcib_state_t *rib_stat = NULL;
228
229 #define RNR_RETRIES IBT_RNR_RETRY_1
230 #define MAX_PORTS 2
231 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D
232 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */
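/*
 * Send completions whose work request ID is RDMA_DUMMY_WRID are ignored
 * by the send CQ handlers.
 */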
233
234 int preposted_rbufs = RDMA_BUFS_GRANT;
235 int send_threshold = 1;
236
237 /*
238 * Old cards with the Tavor driver have a limited memory footprint
239 * when booted in 32-bit mode. The rib_max_rbufs tunable can be
240 * increased if more buffers are needed.
241 */
242
243 #if !defined(_ELF64) && !defined(__sparc)
244 int rib_max_rbufs = MAX_BUFS;
245 #else
246 int rib_max_rbufs = 10 * MAX_BUFS;
247 #endif /* !(_ELF64) && !(__sparc) */
248
249 int rib_conn_timeout = 60 * 12; /* 12 minutes */
250
251 /*
252 * State of the plugin.
253 * ACCEPT = accepting new connections and requests.
254 * NO_ACCEPT = not accepting new connections and requests.
255 * This should eventually move to the rpcib_state_t structure, since it
256 * indicates which state the plugin is in for a particular type of service
257 * like NFS, NLM or the v4 callback daemon. The plugin might be in the accept
258 * state for one and in the no_accept state for the other.
259 */
260 int plugin_state;
261 kmutex_t plugin_state_lock;
262
263 ldi_ident_t rpcib_li;
264
265 /*
266 * RPCIB RDMATF operations
267 */
268 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
269 static rdma_stat rib_disconnect(CONN *conn);
270 static void rib_listen(struct rdma_svc_data *rd);
271 static void rib_listen_stop(struct rdma_svc_data *rd);
272 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
273 uint_t buflen, struct mrc *buf_handle);
274 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
275 struct mrc buf_handle);
276 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
277 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
278 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
279 struct mrc buf_handle);
280 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
281 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
282 void *lrc);
283 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
284 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
285 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
286 caddr_t buf, int len, int cpu);
287
288 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
289
290 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
291 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
292
293 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
294
295 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
296 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
299 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
300 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
301 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
302 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
304 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
305 int addr_type, void *, CONN **);
306 static rdma_stat rib_conn_release(CONN *conn);
307 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
308 rpcib_ping_t *, CONN **);
309 static rdma_stat rib_getinfo(rdma_info_t *info);
310
311 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
312 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
313 static void rib_destroy_cache(rib_hca_t *hca);
314 static void rib_server_side_cache_reclaim(void *argp);
315 static int avl_compare(const void *t1, const void *t2);
316
317 static void rib_stop_services(rib_hca_t *);
318 static void rib_close_channels(rib_conn_list_t *);
319 static void rib_conn_close(void *);
320 static void rib_recv_rele(rib_qp_t *);
321 static rdma_stat rib_conn_release_locked(CONN *conn);
322
323 /*
324 * RPCIB addressing operations
325 */
326
327 /*
328 * RDMA operations the RPCIB module exports
329 */
330 static rdmaops_t rib_ops = {
331 rib_reachable,
332 rib_conn_get,
333 rib_conn_release,
334 rib_listen,
335 rib_listen_stop,
336 rib_registermem,
337 rib_deregistermem,
338 rib_registermemsync,
339 rib_deregistermemsync,
340 rib_syncmem,
341 rib_reg_buf_alloc,
342 rib_reg_buf_free,
343 rib_send,
344 rib_send_resp,
345 rib_post_resp,
346 rib_post_resp_remove,
347 rib_post_recv,
348 rib_recv,
349 rib_read,
350 rib_write,
351 rib_getinfo,
352 };
353
354 /*
355 * RDMATF RPCIB plugin details
356 */
357 static rdma_mod_t rib_mod = {
358 "ibtf", /* api name */
359 RDMATF_VERS_1,
360 0,
361 &rib_ops, /* rdma op vector for ibtf */
362 };
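/*
 * rib_mod is registered with the RDMATF framework via rdma_register_mod()
 * in rpcib_attach().
 */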
363
364 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
365 static rdma_stat rib_qp_init(rib_qp_t *, int);
366 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
367 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
368 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
369 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
370 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
371 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
372 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
373 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
374 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
375 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
376 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
377 rib_qp_t **);
378 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
379 rib_qp_t **);
380 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
381 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
382 static int rib_free_sendwait(struct send_wid *);
383 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
384 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
385 static void rdma_done_rem_list(rib_qp_t *);
386 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
387
388 static void rib_async_handler(void *,
389 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
390 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
391 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
392 static int rib_free_svc_recv(struct svc_recv *);
393 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
394 static void rib_free_wid(struct recv_wid *);
395 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
396 static void rib_detach_hca(ibt_hca_hdl_t);
397 static void rib_close_a_channel(CONN *);
398 static void rib_send_hold(rib_qp_t *);
399 static void rib_send_rele(rib_qp_t *);
400
401 /*
402 * Registration with IBTF as a consumer
403 */
404 static struct ibt_clnt_modinfo_s rib_modinfo = {
405 IBTI_V_CURR,
406 IBT_GENERIC,
407 rib_async_handler, /* async event handler */
408 NULL, /* Memory Region Handler */
409 "nfs/ib"
410 };
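/*
 * rib_modinfo is passed to ibt_attach() in rpcib_attach() to register
 * rpcib as an IBTF consumer.
 */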
411
412 /*
413 * Global structure
414 */
415
416 typedef struct rpcib_s {
417 dev_info_t *rpcib_dip;
418 kmutex_t rpcib_mutex;
419 } rpcib_t;
420
421 rpcib_t rpcib;
422
423 /*
424 * /etc/system tunable that controls debugging
425 * in the rpcib kernel module.
426 * Set it to a value greater than 1 to increase
427 * the amount of debugging output.
428 */
429 int rib_debug = 0;
430
431 int
432 _init(void)
433 {
434 int error;
435
436 error = mod_install((struct modlinkage *)&rib_modlinkage);
437 if (error != 0) {
438 /*
439 * Could not load module
440 */
441 return (error);
442 }
443 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
444 return (0);
445 }
446
447 int
448 _fini()
449 {
450 int status;
451
452 /*
453 * Remove module
454 */
455 if ((status = mod_remove(&rib_modlinkage)) != 0) {
456 return (status);
457 }
458 mutex_destroy(&plugin_state_lock);
459 return (0);
460 }
461
462 int
463 _info(struct modinfo *modinfop)
464 {
465 return (mod_info(&rib_modlinkage, modinfop));
466 }
467
468 /*
469 * rpcib_getinfo()
470 * Given the device number, return the devinfo pointer or the
471 * instance number.
472 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
473 */
474
475 /*ARGSUSED*/
476 static int
477 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
478 {
479 int ret = DDI_SUCCESS;
480
481 switch (cmd) {
482 case DDI_INFO_DEVT2DEVINFO:
483 if (rpcib.rpcib_dip != NULL)
484 *result = rpcib.rpcib_dip;
485 else {
486 *result = NULL;
487 ret = DDI_FAILURE;
488 }
489 break;
490
491 case DDI_INFO_DEVT2INSTANCE:
492 *result = NULL;
493 break;
494
495 default:
496 ret = DDI_FAILURE;
497 }
498 return (ret);
499 }
500
501 static void
502 rpcib_free_hca_list()
503 {
504 rib_hca_t *hca, *hcap;
505
506 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
507 hca = rib_stat->hcas_list;
508 rib_stat->hcas_list = NULL;
509 rw_exit(&rib_stat->hcas_list_lock);
510 while (hca != NULL) {
511 rw_enter(&hca->state_lock, RW_WRITER);
512 hcap = hca;
513 hca = hca->next;
514 rib_stat->nhca_inited--;
515 rib_mod.rdma_count--;
516 hcap->state = HCA_DETACHED;
517 rw_exit(&hcap->state_lock);
518 rib_stop_hca_services(hcap);
519
520 kmem_free(hcap, sizeof (*hcap));
521 }
522 }
523
524 static rdma_stat
525 rpcib_free_service_list()
526 {
527 rib_service_t *service;
528 ibt_status_t ret;
529
530 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
531 while (rib_stat->service_list != NULL) {
532 service = rib_stat->service_list;
533 ret = ibt_unbind_all_services(service->srv_hdl);
534 if (ret != IBT_SUCCESS) {
535 rw_exit(&rib_stat->service_list_lock);
536 #ifdef DEBUG
537 cmn_err(CE_NOTE, "rpcib_free_service_list: "
538 "ibt_unbind_all_services failed (%d)\n", (int)ret);
539 #endif
540 return (RDMA_FAILED);
541 }
542 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
543 service->srv_hdl);
544 if (ret != IBT_SUCCESS) {
545 rw_exit(&rib_stat->service_list_lock);
546 #ifdef DEBUG
547 cmn_err(CE_NOTE, "rpcib_free_service_list: "
548 "ibt_deregister_service failed (%d)\n", (int)ret);
549 #endif
550 return (RDMA_FAILED);
551 }
552 rib_stat->service_list = service->next;
553 kmem_free(service, sizeof (rib_service_t));
554 }
555 rw_exit(&rib_stat->service_list_lock);
556
557 return (RDMA_SUCCESS);
558 }
559
560 static int
561 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
562 {
563 ibt_status_t ibt_status;
564 rdma_stat r_status;
565
566 switch (cmd) {
567 case DDI_ATTACH:
568 break;
569 case DDI_RESUME:
570 return (DDI_SUCCESS);
571 default:
572 return (DDI_FAILURE);
573 }
574
575 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
576
577 mutex_enter(&rpcib.rpcib_mutex);
578 if (rpcib.rpcib_dip != NULL) {
579 mutex_exit(&rpcib.rpcib_mutex);
580 return (DDI_FAILURE);
581 }
582 rpcib.rpcib_dip = dip;
583 mutex_exit(&rpcib.rpcib_mutex);
584 /*
585 * Create the "rpcib" minor-node.
586 */
587 if (ddi_create_minor_node(dip,
588 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
589 /* No cmn_err message here, as it would print on the console */
590 return (DDI_FAILURE);
591 }
592
593 if (rib_stat == NULL) {
594 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
595 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
596 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
597 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
598 }
599
600 rib_stat->hca_count = ibt_get_hca_list(NULL);
601 if (rib_stat->hca_count < 1) {
602 mutex_destroy(&rib_stat->listen_lock);
603 rw_destroy(&rib_stat->hcas_list_lock);
604 mutex_destroy(&rib_stat->open_hca_lock);
605 kmem_free(rib_stat, sizeof (*rib_stat));
606 rib_stat = NULL;
607 return (DDI_FAILURE);
608 }
609
610 ibt_status = ibt_attach(&rib_modinfo, dip,
611 (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
612
613 if (ibt_status != IBT_SUCCESS) {
614 mutex_destroy(&rib_stat->listen_lock);
615 rw_destroy(&rib_stat->hcas_list_lock);
616 mutex_destroy(&rib_stat->open_hca_lock);
617 kmem_free(rib_stat, sizeof (*rib_stat));
618 rib_stat = NULL;
619 return (DDI_FAILURE);
620 }
621
622 rib_stat->service_list = NULL;
623 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
624 mutex_enter(&rib_stat->open_hca_lock);
625 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
626 mutex_exit(&rib_stat->open_hca_lock);
627 goto open_fail;
628 }
629 mutex_exit(&rib_stat->open_hca_lock);
630
631 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
632 DDI_PROP_SUCCESS) {
633 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
634 "failed.");
635 goto register_fail;
636 }
637
638 /*
639 * Register with rdmatf
640 */
641 r_status = rdma_register_mod(&rib_mod);
642 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
643 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
644 "status = %d", r_status);
645 goto register_fail;
646 }
647
648 return (DDI_SUCCESS);
649
650 register_fail:
651
652 open_fail:
653 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
654 rpcib_free_hca_list();
655 (void) rpcib_free_service_list();
656 mutex_destroy(&rib_stat->listen_lock);
657 rw_destroy(&rib_stat->hcas_list_lock);
658 mutex_destroy(&rib_stat->open_hca_lock);
659 rw_destroy(&rib_stat->service_list_lock);
660 kmem_free(rib_stat, sizeof (*rib_stat));
661 rib_stat = NULL;
662 return (DDI_FAILURE);
663 }
664
665 /*ARGSUSED*/
666 static int
667 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
668 {
669 switch (cmd) {
670
671 case DDI_DETACH:
672 break;
673
674 case DDI_SUSPEND:
675 default:
676 return (DDI_FAILURE);
677 }
678
679 /*
680 * Detach the hca and free resources
681 */
682 mutex_enter(&plugin_state_lock);
683 plugin_state = NO_ACCEPT;
684 mutex_exit(&plugin_state_lock);
685
686 if (rpcib_free_service_list() != RDMA_SUCCESS)
687 return (DDI_FAILURE);
688 rpcib_free_hca_list();
689
690 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
691 mutex_destroy(&rib_stat->listen_lock);
692 rw_destroy(&rib_stat->hcas_list_lock);
693 mutex_destroy(&rib_stat->open_hca_lock);
694 rw_destroy(&rib_stat->service_list_lock);
695
696 kmem_free(rib_stat, sizeof (*rib_stat));
697 rib_stat = NULL;
698
699 mutex_enter(&rpcib.rpcib_mutex);
700 rpcib.rpcib_dip = NULL;
701 mutex_exit(&rpcib.rpcib_mutex);
702 mutex_destroy(&rpcib.rpcib_mutex);
703 return (DDI_SUCCESS);
704 }
705
706
707 static void rib_rbufpool_free(rib_hca_t *, int);
708 static void rib_rbufpool_deregister(rib_hca_t *, int);
709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
711 static rdma_stat rib_rem_replylist(rib_qp_t *);
712 static int rib_remreply(rib_qp_t *, struct reply *);
713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
715
716
717 /*
718 * One CQ pair per HCA
719 */
720 static rdma_stat
721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
722 rib_cq_t **cqp)
723 {
724 rib_cq_t *cq;
725 ibt_cq_attr_t cq_attr;
726 uint32_t real_size;
727 ibt_status_t status;
728 rdma_stat error = RDMA_SUCCESS;
729
730 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
731 cq->rib_hca = hca;
732 bzero(&cq_attr, sizeof (cq_attr));
733 cq_attr.cq_size = cq_size;
734 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
735 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
736 &real_size);
737 if (status != IBT_SUCCESS) {
738 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
739 " status=%d", status);
740 error = RDMA_FAILED;
741 goto fail;
742 }
743 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
744
745 /*
746 * Enable CQ callbacks. CQ callbacks are single shot
747 * (i.e. ibt_enable_cq_notify() must be called
748 * after each callback to arm the next one).
749 */
750 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
751 if (status != IBT_SUCCESS) {
752 cmn_err(CE_WARN, "rib_create_cq: "
753 "enable_cq_notify failed, status %d", status);
754 error = RDMA_FAILED;
755 goto fail;
756 }
757 *cqp = cq;
758
759 return (error);
760 fail:
761 if (cq->rib_cq_hdl)
762 (void) ibt_free_cq(cq->rib_cq_hdl);
763 if (cq)
764 kmem_free(cq, sizeof (rib_cq_t));
765 return (error);
766 }
767
768 /*
769 * rpcib_find_hca
770 *
771 * Caller should have already locked the hcas_list_lock before calling
772 * this function.
773 */
774 static rib_hca_t *
775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
776 {
777 rib_hca_t *hca = ribstat->hcas_list;
778
779 while (hca && hca->hca_guid != guid)
780 hca = hca->next;
781
782 return (hca);
783 }
784
785 static rdma_stat
786 rpcib_open_hcas(rpcib_state_t *ribstat)
787 {
788 rib_hca_t *hca;
789 ibt_status_t ibt_status;
790 rdma_stat status;
791 ibt_hca_portinfo_t *pinfop;
792 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
793 uint_t size, cq_size;
794 int i;
795 kstat_t *ksp;
796 cache_avl_struct_t example_avl_node;
797 char rssc_name[32];
798 int old_nhca_inited = ribstat->nhca_inited;
799 ib_guid_t *hca_guids;
800
801 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
802
803 ribstat->hca_count = ibt_get_hca_list(&hca_guids);
804 if (ribstat->hca_count == 0)
805 return (RDMA_FAILED);
806
807 rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
808 /*
809 * Open a hca and set it up for RDMA
810 */
811 for (i = 0; i < ribstat->hca_count; i++) {
812 if (rpcib_find_hca(ribstat, hca_guids[i]))
813 continue;
814 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
815
816 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
817 hca_guids[i], &hca->hca_hdl);
818 if (ibt_status != IBT_SUCCESS) {
819 kmem_free(hca, sizeof (rib_hca_t));
820 continue;
821 }
822 hca->hca_guid = hca_guids[i];
823 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
824 hca->state = HCA_INITED;
825
826 /*
827 * query HCA info
828 */
829 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
830 if (ibt_status != IBT_SUCCESS) {
831 goto fail1;
832 }
833
834 /*
835 * One PD (Protection Domain) per HCA.
836 * A qp is allowed to access a memory region
837 * only when it's in the same PD as that of
838 * the memory region.
839 */
840 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
841 if (ibt_status != IBT_SUCCESS) {
842 goto fail1;
843 }
844
845 /*
846 * query HCA ports
847 */
848 ibt_status = ibt_query_hca_ports(hca->hca_hdl,
849 0, &pinfop, &hca->hca_nports, &size);
850 if (ibt_status != IBT_SUCCESS) {
851 goto fail2;
852 }
853 hca->hca_ports = pinfop;
854 hca->hca_pinfosz = size;
855 pinfop = NULL;
856
857 cq_size = DEF_CQ_SIZE; /* default cq size */
858 /*
859 * Create 2 pairs of cq's (1 pair for client
860 * and the other pair for server) on this hca.
861 * If the number of qp's gets too large, several
862 * cq's will be needed.
863 */
864 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
865 &hca->svc_rcq);
866 if (status != RDMA_SUCCESS) {
867 goto fail3;
868 }
869
870 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
871 &hca->svc_scq);
872 if (status != RDMA_SUCCESS) {
873 goto fail3;
874 }
875
876 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
877 &hca->clnt_rcq);
878 if (status != RDMA_SUCCESS) {
879 goto fail3;
880 }
881
882 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
883 &hca->clnt_scq);
884 if (status != RDMA_SUCCESS) {
885 goto fail3;
886 }
887
888 /*
889 * Create buffer pools.
890 * Note rib_rbufpool_create also allocates memory windows.
891 */
892 hca->recv_pool = rib_rbufpool_create(hca,
893 RECV_BUFFER, rib_max_rbufs);
894 if (hca->recv_pool == NULL) {
895 goto fail3;
896 }
897
898 hca->send_pool = rib_rbufpool_create(hca,
899 SEND_BUFFER, rib_max_rbufs);
900 if (hca->send_pool == NULL) {
901 rib_rbufpool_destroy(hca, RECV_BUFFER);
902 goto fail3;
903 }
904
905 if (hca->server_side_cache == NULL) {
906 (void) sprintf(rssc_name,
907 "rib_srvr_cache_%llx",
908 (long long unsigned int) hca->hca_guid);
909 hca->server_side_cache = kmem_cache_create(
910 rssc_name,
911 sizeof (cache_avl_struct_t), 0,
912 NULL,
913 NULL,
914 rib_server_side_cache_reclaim,
915 hca, NULL, 0);
916 }
917
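/*
 * The offset expression passed to avl_create() below is the byte offset
 * of avl_link within cache_avl_struct_t, as avl_create() requires.
 */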
918 avl_create(&hca->avl_tree,
919 avl_compare,
920 sizeof (cache_avl_struct_t),
921 (uint_t)(uintptr_t)&example_avl_node.avl_link-
922 (uint_t)(uintptr_t)&example_avl_node);
923
924 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
925 hca->iblock);
926 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
927 rw_init(&hca->avl_rw_lock,
928 NULL, RW_DRIVER, hca->iblock);
929 mutex_init(&hca->cache_allocation_lock,
930 NULL, MUTEX_DRIVER, NULL);
931 hca->avl_init = TRUE;
932
933 /* Create kstats for the cache */
934 ASSERT(INGLOBALZONE(curproc));
935
936 if (!stats_enabled) {
937 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
938 KSTAT_TYPE_NAMED,
939 sizeof (rpcib_kstat) / sizeof (kstat_named_t),
940 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
941 GLOBAL_ZONEID);
942 if (ksp) {
943 ksp->ks_data = (void *) &rpcib_kstat;
944 ksp->ks_update = rpcib_cache_kstat_update;
945 kstat_install(ksp);
946 stats_enabled = TRUE;
947 }
948 }
949 if (hca->cleanup_helper == NULL) {
950 char tq_name[sizeof (hca->hca_guid) * 2 + 1];
951
952 (void) snprintf(tq_name, sizeof (tq_name), "%llX",
953 (unsigned long long int) hca->hca_guid);
954 hca->cleanup_helper = ddi_taskq_create(NULL,
955 tq_name, 1, TASKQ_DEFAULTPRI, 0);
956 }
957
958 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
959 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
960 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
961 hca->iblock);
962 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
963 hca->iblock);
964 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
965 hca->inuse = TRUE;
966
967 hca->next = ribstat->hcas_list;
968 ribstat->hcas_list = hca;
969 ribstat->nhca_inited++;
970 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
971 continue;
972
973 fail3:
974 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
975 fail2:
976 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
977 fail1:
978 (void) ibt_close_hca(hca->hca_hdl);
979 kmem_free(hca, sizeof (rib_hca_t));
980 }
981 rw_exit(&ribstat->hcas_list_lock);
982 ibt_free_hca_list(hca_guids, ribstat->hca_count);
983 rib_mod.rdma_count = rib_stat->nhca_inited;
984
985 /*
986 * return success if at least one new hca has been configured.
987 */
988 if (ribstat->nhca_inited != old_nhca_inited)
989 return (RDMA_SUCCESS);
990 else
991 return (RDMA_FAILED);
992 }
993
994 /*
995 * Callback routines
996 */
997
998 /*
999 * SCQ handlers
1000 */
1001 /* ARGSUSED */
1002 static void
1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1004 {
1005 ibt_status_t ibt_status;
1006 ibt_wc_t wc;
1007 struct send_wid *wd;
1008 CONN *conn;
1009 rib_qp_t *qp;
1010 int i;
1011
1012 /*
1013 * Re-enable cq notify here to avoid missing any
1014 * completion queue notification.
1015 */
1016 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1017
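/*
 * Drain all completions from the CQ; ibt_poll_cq() returns
 * IBT_CQ_EMPTY when there are no more entries.
 */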
1018 ibt_status = IBT_SUCCESS;
1019 while (ibt_status != IBT_CQ_EMPTY) {
1020 bzero(&wc, sizeof (wc));
1021 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1022 if (ibt_status != IBT_SUCCESS)
1023 return;
1024
1025 /*
1026 * Got a send completion
1027 */
1028 if (wc.wc_id != RDMA_DUMMY_WRID) {
1029 wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1030 qp = wd->qp;
1031 conn = qptoc(qp);
1032
1033 mutex_enter(&wd->sendwait_lock);
1034 switch (wc.wc_status) {
1035 case IBT_WC_SUCCESS:
1036 wd->status = RDMA_SUCCESS;
1037 break;
1038 default:
1039 /*
1040 * RC Send Q Error Code Local state Remote State
1041 * ==================== =========== ============
1042 * IBT_WC_BAD_RESPONSE_ERR ERROR None
1043 * IBT_WC_LOCAL_LEN_ERR ERROR None
1044 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None
1045 * IBT_WC_LOCAL_PROTECT_ERR ERROR None
1046 * IBT_WC_MEM_WIN_BIND_ERR ERROR None
1047 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR
1048 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR
1049 * IBT_WC_REMOTE_OP_ERR ERROR ERROR
1050 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None
1051 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None
1052 * IBT_WC_WR_FLUSHED_ERR ERROR None
1053 */
1054 /*
1055 * Channel in error state. Set connection to
1056 * ERROR and cleanup will happen either from
1057 * conn_release or from rib_conn_get
1058 */
1059 wd->status = RDMA_FAILED;
1060 mutex_enter(&conn->c_lock);
1061 if (conn->c_state != C_DISCONN_PEND)
1062 conn->c_state = C_ERROR_CONN;
1063 mutex_exit(&conn->c_lock);
1064 break;
1065 }
1066
1067 if (wd->cv_sig == 1) {
1068 /*
1069 * Notify poster
1070 */
1071 cv_signal(&wd->wait_cv);
1072 mutex_exit(&wd->sendwait_lock);
1073 } else {
1074 /*
1075 * Poster not waiting for notification.
1076 * Free the send buffers and send_wid
1077 */
1078 for (i = 0; i < wd->nsbufs; i++) {
1079 rib_rbuf_free(qptoc(wd->qp),
1080 SEND_BUFFER,
1081 (void *)(uintptr_t)wd->sbufaddr[i]);
1082 }
1083
1084 /* decrement the send ref count */
1085 rib_send_rele(qp);
1086
1087 mutex_exit(&wd->sendwait_lock);
1088 (void) rib_free_sendwait(wd);
1089 }
1090 }
1091 }
1092 }
1093
1094 /* ARGSUSED */
1095 static void
1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1097 {
1098 ibt_status_t ibt_status;
1099 ibt_wc_t wc;
1100 struct send_wid *wd;
1101 rib_qp_t *qp;
1102 CONN *conn;
1103 int i;
1104
1105 /*
1106 * Re-enable cq notify here to avoid missing any
1107 * completion queue notification.
1108 */
1109 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1110
1111 ibt_status = IBT_SUCCESS;
1112 while (ibt_status != IBT_CQ_EMPTY) {
1113 bzero(&wc, sizeof (wc));
1114 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1115 if (ibt_status != IBT_SUCCESS)
1116 return;
1117
1118 /*
1119 * Got a send completion
1120 */
1121 if (wc.wc_id != RDMA_DUMMY_WRID) {
1122 wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1123 qp = wd->qp;
1124 conn = qptoc(qp);
1125 mutex_enter(&wd->sendwait_lock);
1126
1127 switch (wc.wc_status) {
1128 case IBT_WC_SUCCESS:
1129 wd->status = RDMA_SUCCESS;
1130 break;
1131 default:
1132 /*
1133 * Channel in error state. Set connection to
1134 * ERROR and cleanup will happen either from
1135 * conn_release or conn timeout.
1136 */
1137 wd->status = RDMA_FAILED;
1138 mutex_enter(&conn->c_lock);
1139 if (conn->c_state != C_DISCONN_PEND)
1140 conn->c_state = C_ERROR_CONN;
1141 mutex_exit(&conn->c_lock);
1142 break;
1143 }
1144
1145 if (wd->cv_sig == 1) {
1146 /*
1147 * Update completion status and notify poster
1148 */
1149 cv_signal(&wd->wait_cv);
1150 mutex_exit(&wd->sendwait_lock);
1151 } else {
1152 /*
1153 * Poster not waiting for notification.
1154 * Free the send buffers and send_wid
1155 */
1156 for (i = 0; i < wd->nsbufs; i++) {
1157 rib_rbuf_free(qptoc(wd->qp),
1158 SEND_BUFFER,
1159 (void *)(uintptr_t)wd->sbufaddr[i]);
1160 }
1161
1162 /* decrement the send ref count */
1163 rib_send_rele(qp);
1164
1165 mutex_exit(&wd->sendwait_lock);
1166 (void) rib_free_sendwait(wd);
1167 }
1168 }
1169 }
1170 }
1171
1172 /*
1173 * RCQ handler
1174 */
1175 /* ARGSUSED */
1176 static void
1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1178 {
1179 rib_qp_t *qp;
1180 ibt_status_t ibt_status;
1181 ibt_wc_t wc;
1182 struct recv_wid *rwid;
1183
1184 /*
1185 * Re-enable cq notify here to avoid missing any
1186 * completion queue notification.
1187 */
1188 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1189
1190 ibt_status = IBT_SUCCESS;
1191 while (ibt_status != IBT_CQ_EMPTY) {
1192 bzero(&wc, sizeof (wc));
1193 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1194 if (ibt_status != IBT_SUCCESS)
1195 return;
1196
1197 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1198 qp = rwid->qp;
1199
1200 if (wc.wc_status == IBT_WC_SUCCESS) {
1201 XDR inxdrs, *xdrs;
1202 uint_t xid, vers, op, find_xid = 0;
1203 struct reply *r;
1204 CONN *conn = qptoc(qp);
1205 uint32_t rdma_credit = 0;
1206
1207 xdrs = &inxdrs;
1208 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1209 wc.wc_bytes_xfer, XDR_DECODE);
1210 /*
1211 * Treat xid as opaque (xid is the first entity
1212 * in the rpc rdma message).
1213 */
1214 xid = *(uint32_t *)(uintptr_t)rwid->addr;
1215
1216 /* Skip xid and set the xdr position accordingly. */
1217 XDR_SETPOS(xdrs, sizeof (uint32_t));
1218 (void) xdr_u_int(xdrs, &vers);
1219 (void) xdr_u_int(xdrs, &rdma_credit);
1220 (void) xdr_u_int(xdrs, &op);
1221 XDR_DESTROY(xdrs);
1222
1223 if (vers != RPCRDMA_VERS) {
1224 /*
1225 * Invalid RPC/RDMA version. Cannot
1226 * interoperate. Set connection to
1227 * ERROR state and bail out.
1228 */
1229 mutex_enter(&conn->c_lock);
1230 if (conn->c_state != C_DISCONN_PEND)
1231 conn->c_state = C_ERROR_CONN;
1232 mutex_exit(&conn->c_lock);
1233 rib_rbuf_free(conn, RECV_BUFFER,
1234 (void *)(uintptr_t)rwid->addr);
1235 rib_free_wid(rwid);
1236 rib_recv_rele(qp);
1237 continue;
1238 }
1239
1240 mutex_enter(&qp->replylist_lock);
1241 for (r = qp->replylist; r != NULL; r = r->next) {
1242 if (r->xid == xid) {
1243 find_xid = 1;
1244 switch (op) {
1245 case RDMA_MSG:
1246 case RDMA_NOMSG:
1247 case RDMA_MSGP:
1248 r->status = RDMA_SUCCESS;
1249 r->vaddr_cq = rwid->addr;
1250 r->bytes_xfer =
1251 wc.wc_bytes_xfer;
1252 cv_signal(&r->wait_cv);
1253 break;
1254 default:
1255 rib_rbuf_free(qptoc(qp),
1256 RECV_BUFFER,
1257 (void *)(uintptr_t)
1258 rwid->addr);
1259 break;
1260 }
1261 break;
1262 }
1263 }
1264 mutex_exit(&qp->replylist_lock);
1265 if (find_xid == 0) {
1266 /* RPC caller not waiting for reply */
1267
1268 DTRACE_PROBE1(rpcib__i__nomatchxid1,
1269 int, xid);
1270
1271 rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1272 (void *)(uintptr_t)rwid->addr);
1273 }
1274 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1275 CONN *conn = qptoc(qp);
1276
1277 /*
1278 * Connection being flushed. Just free
1279 * the posted buffer
1280 */
1281 rib_rbuf_free(conn, RECV_BUFFER,
1282 (void *)(uintptr_t)rwid->addr);
1283 } else {
1284 CONN *conn = qptoc(qp);
1285 /*
1286 * RC Recv Q Error Code Local state Remote State
1287 * ==================== =========== ============
1288 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd
1289 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd
1290 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd
1291 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd
1292 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd
1293 * IBT_WC_WR_FLUSHED_ERR None None
1294 */
1295 /*
1296 * Channel in error state. Set connection
1297 * in ERROR state.
1298 */
1299 mutex_enter(&conn->c_lock);
1300 if (conn->c_state != C_DISCONN_PEND)
1301 conn->c_state = C_ERROR_CONN;
1302 mutex_exit(&conn->c_lock);
1303 rib_rbuf_free(conn, RECV_BUFFER,
1304 (void *)(uintptr_t)rwid->addr);
1305 }
1306 rib_free_wid(rwid);
1307 rib_recv_rele(qp);
1308 }
1309 }
1310
1311 /* Server side */
1312 /* ARGSUSED */
1313 static void
1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1315 {
1316 rdma_recv_data_t *rdp;
1317 rib_qp_t *qp;
1318 ibt_status_t ibt_status;
1319 ibt_wc_t wc;
1320 struct svc_recv *s_recvp;
1321 CONN *conn;
1322 mblk_t *mp;
1323
1324 /*
1325 * Re-enable cq notify here to avoid missing any
1326 * completion queue notification.
1327 */
1328 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1329
1330 ibt_status = IBT_SUCCESS;
1331 while (ibt_status != IBT_CQ_EMPTY) {
1332 bzero(&wc, sizeof (wc));
1333 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1334 if (ibt_status != IBT_SUCCESS)
1335 return;
1336
1337 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1338 qp = s_recvp->qp;
1339 conn = qptoc(qp);
1340
1341 if (wc.wc_status == IBT_WC_SUCCESS) {
1342 XDR inxdrs, *xdrs;
1343 uint_t xid, vers, op;
1344 uint32_t rdma_credit;
1345
1346 xdrs = &inxdrs;
1347 /* s_recvp->vaddr stores data */
1348 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1349 wc.wc_bytes_xfer, XDR_DECODE);
1350
1351 /*
1352 * Treat xid as opaque (xid is the first entity
1353 * in the rpc rdma message).
1354 */
1355 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1356 /* Skip xid and set the xdr position accordingly. */
1357 XDR_SETPOS(xdrs, sizeof (uint32_t));
1358 if (!xdr_u_int(xdrs, &vers) ||
1359 !xdr_u_int(xdrs, &rdma_credit) ||
1360 !xdr_u_int(xdrs, &op)) {
1361 rib_rbuf_free(conn, RECV_BUFFER,
1362 (void *)(uintptr_t)s_recvp->vaddr);
1363 XDR_DESTROY(xdrs);
1364 rib_recv_rele(qp);
1365 (void) rib_free_svc_recv(s_recvp);
1366 continue;
1367 }
1368 XDR_DESTROY(xdrs);
1369
1370 if (vers != RPCRDMA_VERS) {
1371 /*
1372 * Invalid RPC/RDMA version.
1373 * Drop rpc rdma message.
1374 */
1375 rib_rbuf_free(conn, RECV_BUFFER,
1376 (void *)(uintptr_t)s_recvp->vaddr);
1377 rib_recv_rele(qp);
1378 (void) rib_free_svc_recv(s_recvp);
1379 continue;
1380 }
1381 /*
1382 * Is this for RDMA_DONE?
1383 */
1384 if (op == RDMA_DONE) {
1385 rib_rbuf_free(conn, RECV_BUFFER,
1386 (void *)(uintptr_t)s_recvp->vaddr);
1387 /*
1388 * Wake up the thread waiting on
1389 * a RDMA_DONE for xid
1390 */
1391 mutex_enter(&qp->rdlist_lock);
1392 rdma_done_notify(qp, xid);
1393 mutex_exit(&qp->rdlist_lock);
1394 rib_recv_rele(qp);
1395 (void) rib_free_svc_recv(s_recvp);
1396 continue;
1397 }
1398
1399 mutex_enter(&plugin_state_lock);
1400 mutex_enter(&conn->c_lock);
1401 if ((plugin_state == ACCEPT) &&
1402 (conn->c_state == C_CONNECTED)) {
1403 conn->c_ref++;
1404 mutex_exit(&conn->c_lock);
1405 while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1406 == NULL)
1407 (void) strwaitbuf(
1408 sizeof (*rdp), BPRI_LO);
1409 /*
1410 * The plugin is in the accept state, so the master
1411 * transport queue is still accepting requests and
1412 * we can call svc_queuereq() to queue this
1413 * received msg.
1414 */
1415 rdp = (rdma_recv_data_t *)mp->b_rptr;
1416 rdp->conn = conn;
1417 rdp->rpcmsg.addr =
1418 (caddr_t)(uintptr_t)s_recvp->vaddr;
1419 rdp->rpcmsg.type = RECV_BUFFER;
1420 rdp->rpcmsg.len = wc.wc_bytes_xfer;
1421 rdp->status = wc.wc_status;
1422 mp->b_wptr += sizeof (*rdp);
1423 svc_queuereq((queue_t *)rib_stat->q, mp);
1424 mutex_exit(&plugin_state_lock);
1425 } else {
1426 /*
1427 * The master transport for this is going
1428 * away and the queue is no longer accepting
1429 * requests for krpc, so don't do anything, just
1430 * free the msg.
1431 */
1432 mutex_exit(&conn->c_lock);
1433 mutex_exit(&plugin_state_lock);
1434 rib_rbuf_free(conn, RECV_BUFFER,
1435 (void *)(uintptr_t)s_recvp->vaddr);
1436 }
1437 } else {
1438 rib_rbuf_free(conn, RECV_BUFFER,
1439 (void *)(uintptr_t)s_recvp->vaddr);
1440 }
1441 rib_recv_rele(qp);
1442 (void) rib_free_svc_recv(s_recvp);
1443 }
1444 }
1445
1446 static void
1447 rib_attach_hca()
1448 {
1449 mutex_enter(&rib_stat->open_hca_lock);
1450 (void) rpcib_open_hcas(rib_stat);
1451 rib_listen(NULL);
1452 mutex_exit(&rib_stat->open_hca_lock);
1453 }
1454
1455 /*
1456 * Handles async events, including the DR events IBT_HCA_ATTACH_EVENT and IBT_HCA_DETACH_EVENT.
1457 */
1458 /* ARGSUSED */
1459 static void
1460 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1461 ibt_async_code_t code, ibt_async_event_t *event)
1462 {
1463 switch (code) {
1464 case IBT_HCA_ATTACH_EVENT:
1465 rib_attach_hca();
1466 break;
1467 case IBT_HCA_DETACH_EVENT:
1468 rib_detach_hca(hca_hdl);
1469 #ifdef DEBUG
1470 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1471 #endif
1472 break;
1473 case IBT_EVENT_PORT_UP:
1474 /*
1475 * A port is up. We should call rib_listen() since there is
1476 * a chance that rib_listen() may have failed during
1477 * rib_attach_hca() because the port had not been up yet.
1478 */
1479 rib_listen(NULL);
1480 #ifdef DEBUG
1481 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1482 #endif
1483 break;
1484 #ifdef DEBUG
1485 case IBT_EVENT_PATH_MIGRATED:
1486 cmn_err(CE_NOTE, "rib_async_handler(): "
1487 "IBT_EVENT_PATH_MIGRATED\n");
1488 break;
1489 case IBT_EVENT_SQD:
1490 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1491 break;
1492 case IBT_EVENT_COM_EST:
1493 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1494 break;
1495 case IBT_ERROR_CATASTROPHIC_CHAN:
1496 cmn_err(CE_NOTE, "rib_async_handler(): "
1497 "IBT_ERROR_CATASTROPHIC_CHAN\n");
1498 break;
1499 case IBT_ERROR_INVALID_REQUEST_CHAN:
1500 cmn_err(CE_NOTE, "rib_async_handler(): "
1501 "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1502 break;
1503 case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1504 cmn_err(CE_NOTE, "rib_async_handler(): "
1505 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1506 break;
1507 case IBT_ERROR_PATH_MIGRATE_REQ:
1508 cmn_err(CE_NOTE, "rib_async_handler(): "
1509 "IBT_ERROR_PATH_MIGRATE_REQ\n");
1510 break;
1511 case IBT_ERROR_CQ:
1512 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1513 break;
1514 case IBT_ERROR_PORT_DOWN:
1515 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1516 break;
1517 case IBT_ASYNC_OPAQUE1:
1518 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1519 break;
1520 case IBT_ASYNC_OPAQUE2:
1521 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1522 break;
1523 case IBT_ASYNC_OPAQUE3:
1524 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1525 break;
1526 case IBT_ASYNC_OPAQUE4:
1527 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1528 break;
1529 #endif
1530 default:
1531 break;
1532 }
1533 }
1534
1535 /*
1536 * Client's reachable function.
1537 */
1538 static rdma_stat
1539 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1540 {
1541 rdma_stat status;
1542 rpcib_ping_t rpt;
1543 struct netbuf saddr;
1544 CONN *conn;
1545
1546 bzero(&saddr, sizeof (struct netbuf));
1547 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1548
1549 if (status == RDMA_SUCCESS) {
1550 *handle = (void *)rpt.hca;
1551 /* release the reference */
1552 (void) rib_conn_release(conn);
1553 return (RDMA_SUCCESS);
1554 } else {
1555 *handle = NULL;
1556 DTRACE_PROBE(rpcib__i__pingfailed);
1557 return (RDMA_FAILED);
1558 }
1559 }
1560
1561 /* Client side qp creation */
1562 static rdma_stat
1563 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1564 {
1565 rib_qp_t *kqp = NULL;
1566 CONN *conn;
1567 rdma_clnt_cred_ctrl_t *cc_info;
1568
1569 ASSERT(qp != NULL);
1570 *qp = NULL;
1571
1572 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1573 conn = qptoc(kqp);
1574 kqp->hca = hca;
1575 kqp->rdmaconn.c_rdmamod = &rib_mod;
1576 kqp->rdmaconn.c_private = (caddr_t)kqp;
1577
1578 kqp->mode = RIB_CLIENT;
1579 kqp->chan_flags = IBT_BLOCKING;
1580 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1581 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1582 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1583 /*
1584 * Initialize
1585 */
1586 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1587 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1588 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1589 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1590 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1592 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1593 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1595 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596 /*
1597 * Initialize the client credit control
1598 * portion of the rdmaconn struct.
1599 */
1600 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1601 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1602 cc_info->clnt_cc_granted_ops = 0;
1603 cc_info->clnt_cc_in_flight_ops = 0;
1604 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1605
1606 *qp = kqp;
1607 return (RDMA_SUCCESS);
1608 }
1609
1610 /* Server side qp creation */
1611 static rdma_stat
1612 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1613 {
1614 rib_qp_t *kqp = NULL;
1615 ibt_chan_sizes_t chan_sizes;
1616 ibt_rc_chan_alloc_args_t qp_attr;
1617 ibt_status_t ibt_status;
1618 rdma_srv_cred_ctrl_t *cc_info;
1619
1620 *qp = NULL;
1621
1622 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1623 kqp->hca = hca;
1624 kqp->port_num = port;
1625 kqp->rdmaconn.c_rdmamod = &rib_mod;
1626 kqp->rdmaconn.c_private = (caddr_t)kqp;
1627
1628 /*
1629 * Create the qp handle
1630 */
1631 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1632 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1633 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1634 qp_attr.rc_pd = hca->pd_hdl;
1635 qp_attr.rc_hca_port_num = port;
1636 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1637 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1638 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1639 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1640 qp_attr.rc_clone_chan = NULL;
1641 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1642 qp_attr.rc_flags = IBT_WR_SIGNALED;
1643
1644 rw_enter(&hca->state_lock, RW_READER);
1645 if (hca->state != HCA_DETACHED) {
1646 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1647 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1648 &chan_sizes);
1649 } else {
1650 rw_exit(&hca->state_lock);
1651 goto fail;
1652 }
1653 rw_exit(&hca->state_lock);
1654
1655 if (ibt_status != IBT_SUCCESS) {
1656 DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1657 int, ibt_status);
1658 goto fail;
1659 }
1660
1661 kqp->mode = RIB_SERVER;
1662 kqp->chan_flags = IBT_BLOCKING;
1663 kqp->q = q; /* server ONLY */
1664
1665 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1666 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1667 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1668 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1669 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1670 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1672 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1674 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675 /*
1676 * Set the private data area to qp to be used in callbacks
1677 */
1678 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1679 kqp->rdmaconn.c_state = C_CONNECTED;
1680
1681 /*
1682 * Initialize the server credit control
1683 * portion of the rdmaconn struct.
1684 */
1685 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1686 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1687 cc_info->srv_cc_buffers_granted = preposted_rbufs;
1688 cc_info->srv_cc_cur_buffers_used = 0;
1689 cc_info->srv_cc_posted = preposted_rbufs;
1690
1691 *qp = kqp;
1692
1693 return (RDMA_SUCCESS);
1694 fail:
1695 if (kqp)
1696 kmem_free(kqp, sizeof (rib_qp_t));
1697
1698 return (RDMA_FAILED);
1699 }
1700
1701 /* ARGSUSED */
1702 ibt_cm_status_t
1703 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1704 ibt_cm_return_args_t *ret_args, void *priv_data,
1705 ibt_priv_data_len_t len)
1706 {
1707 rib_hca_t *hca;
1708
1709 hca = (rib_hca_t *)clnt_hdl;
1710
1711 switch (event->cm_type) {
1712
1713 /* got a connection close event */
1714 case IBT_CM_EVENT_CONN_CLOSED:
1715 {
1716 CONN *conn;
1717 rib_qp_t *qp;
1718
1719 /* check reason why connection was closed */
1720 switch (event->cm_event.closed) {
1721 case IBT_CM_CLOSED_DREP_RCVD:
1722 case IBT_CM_CLOSED_DREQ_TIMEOUT:
1723 case IBT_CM_CLOSED_DUP:
1724 case IBT_CM_CLOSED_ABORT:
1725 case IBT_CM_CLOSED_ALREADY:
1726 /*
1727 * These cases indicate the local end initiated
1728 * the closing of the channel. Nothing to do here.
1729 */
1730 break;
1731 default:
1732 /*
1733 * Reason for CONN_CLOSED event must be one of
1734 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1735 * or IBT_CM_CLOSED_STALE. These indicate cases where
1736 * the remote end is closing the channel. In these
1737 * cases free the channel and transition to the error
1738 * state.
1739 */
1740 qp = ibt_get_chan_private(event->cm_channel);
1741 conn = qptoc(qp);
1742 mutex_enter(&conn->c_lock);
1743 if (conn->c_state == C_DISCONN_PEND) {
1744 mutex_exit(&conn->c_lock);
1745 break;
1746 }
1747
1748 conn->c_state = C_ERROR_CONN;
1749
1750 /*
1751 * Free the conn if c_ref is down to 0 already
1752 */
1753 if (conn->c_ref == 0) {
1754 /*
1755 * Remove from list and free conn
1756 */
1757 conn->c_state = C_DISCONN_PEND;
1758 mutex_exit(&conn->c_lock);
1759 rw_enter(&hca->state_lock, RW_READER);
1760 if (hca->state != HCA_DETACHED)
1761 (void) rib_disconnect_channel(conn,
1762 &hca->cl_conn_list);
1763 rw_exit(&hca->state_lock);
1764 } else {
1765 /*
1766 * conn will be freed when c_ref goes to 0.
1767 * Indicate to cleaning thread not to close
1768 * the connection, but just free the channel.
1769 */
1770 conn->c_flags |= C_CLOSE_NOTNEEDED;
1771 mutex_exit(&conn->c_lock);
1772 }
1773 #ifdef DEBUG
1774 if (rib_debug)
1775 cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1776 "(CONN_CLOSED) channel disconnected");
1777 #endif
1778 break;
1779 }
1780 break;
1781 }
1782 default:
1783 break;
1784 }
1785 return (IBT_CM_ACCEPT);
1786 }
1787
1788 /*
1789 * Connect to the server.
1790 */
1791 rdma_stat
1792 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1793 {
1794 ibt_chan_open_args_t chan_args; /* channel args */
1795 ibt_chan_sizes_t chan_sizes;
1796 ibt_rc_chan_alloc_args_t qp_attr;
1797 ibt_status_t ibt_status;
1798 ibt_rc_returns_t ret_args; /* conn reject info */
1799 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
1800 ibt_ip_cm_info_t ipcm_info;
1801 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1802
1803
1804 (void) bzero(&chan_args, sizeof (chan_args));
1805 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1806 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1807
1808 ipcm_info.src_addr.family = rptp->srcip.family;
1809 switch (ipcm_info.src_addr.family) {
1810 case AF_INET:
1811 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1812 break;
1813 case AF_INET6:
1814 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1815 break;
1816 }
1817
1818 ipcm_info.dst_addr.family = rptp->srcip.family;
1819 switch (ipcm_info.dst_addr.family) {
1820 case AF_INET:
1821 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1822 break;
1823 case AF_INET6:
1824 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1825 break;
1826 }
1827
1828 ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1829
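/*
 * Pack the source/destination IP addresses and port into the CM
 * private data, following the IBTF IP addressing convention, so the
 * passive side can recover the endpoint information.
 */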
1830 ibt_status = ibt_format_ip_private_data(&ipcm_info,
1831 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1832
1833 if (ibt_status != IBT_SUCCESS) {
1834 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1835 return (-1);
1836 }
1837
1838 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1839 /* Alloc a RC channel */
1840 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1841 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1842 qp_attr.rc_pd = hca->pd_hdl;
1843 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1844 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1845 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1846 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1847 qp_attr.rc_clone_chan = NULL;
1848 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1849 qp_attr.rc_flags = IBT_WR_SIGNALED;
1850
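/* The IP service ID is derived from the NFS/RDMA port. */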
1851 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1852 chan_args.oc_path = &rptp->path;
1853
1854 chan_args.oc_cm_handler = rib_clnt_cm_handler;
1855 chan_args.oc_cm_clnt_private = (void *)hca;
1856 chan_args.oc_rdma_ra_out = 4;
1857 chan_args.oc_rdma_ra_in = 4;
1858 chan_args.oc_path_retry_cnt = 2;
1859 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1860 chan_args.oc_priv_data = cmp_ip_pvt;
1861 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1862
1863 refresh:
1864 rw_enter(&hca->state_lock, RW_READER);
1865 if (hca->state != HCA_DETACHED) {
1866 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1867 IBT_ACHAN_NO_FLAGS,
1868 &qp_attr, &qp->qp_hdl,
1869 &chan_sizes);
1870 } else {
1871 rw_exit(&hca->state_lock);
1872 return (RDMA_FAILED);
1873 }
1874 rw_exit(&hca->state_lock);
1875
1876 if (ibt_status != IBT_SUCCESS) {
1877 DTRACE_PROBE1(rpcib__i_conntosrv,
1878 int, ibt_status);
1879 return (RDMA_FAILED);
1880 }
1881
1882 /* Connect to the Server */
1883 (void) bzero(&ret_args, sizeof (ret_args));
1884 mutex_enter(&qp->cb_lock);
1885 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1886 IBT_BLOCKING, &chan_args, &ret_args);
1887 if (ibt_status != IBT_SUCCESS) {
1888 DTRACE_PROBE2(rpcib__i_openrctosrv,
1889 int, ibt_status, int, ret_args.rc_status);
1890
1891 (void) ibt_free_channel(qp->qp_hdl);
1892 qp->qp_hdl = NULL;
1893 mutex_exit(&qp->cb_lock);
1894 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1895 ret_args.rc_status == IBT_CM_CONN_STALE) {
1896 /*
1897 * Got IBT_CM_CONN_STALE probably because of stale
1898 * data on the passive end of a channel that existed
1899 * prior to reboot. Retry establishing a channel
1900 * REFRESH_ATTEMPTS times, during which time the
1901 * stale conditions on the server might clear up.
1902 */
1903 goto refresh;
1904 }
1905 return (RDMA_FAILED);
1906 }
1907 mutex_exit(&qp->cb_lock);
1908 /*
1909 * Set the private data area to qp to be used in callbacks
1910 */
1911 ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1912 return (RDMA_SUCCESS);
1913 }
1914
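/*
 * rib_ping_srv()
 * Check that the server at raddr is reachable over IB.  Walk the list
 * of attached HCAs and the local IB-capable addresses, asking IBTF for
 * an IP path to the destination; on success the chosen HCA and the
 * source/destination addresses are recorded in rptp for the caller.
 */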
1915 rdma_stat
1916 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1917 {
1918 uint_t i, addr_count;
1919 ibt_status_t ibt_status;
1920 uint8_t num_paths_p;
1921 ibt_ip_path_attr_t ipattr;
1922 ibt_path_ip_src_t srcip;
1923 rpcib_ipaddrs_t addrs4;
1924 rpcib_ipaddrs_t addrs6;
1925 struct sockaddr_in *sinp;
1926 struct sockaddr_in6 *sin6p;
1927 rdma_stat retval = RDMA_FAILED;
1928 rib_hca_t *hca;
1929
1930 if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1931 return (RDMA_INVAL);
1932 ASSERT(raddr->buf != NULL);
1933
1934 bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1935
1936 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1937 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1938 retval = RDMA_FAILED;
1939 goto done2;
1940 }
1941
1942 if (addr_type == AF_INET) {
1943 addr_count = addrs4.ri_count;
1944 sinp = (struct sockaddr_in *)raddr->buf;
1945 rptp->dstip.family = AF_INET;
1946 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1947 sinp = addrs4.ri_list;
1948 } else {
1949 addr_count = addrs6.ri_count;
1950 sin6p = (struct sockaddr_in6 *)raddr->buf;
1951 rptp->dstip.family = AF_INET6;
1952 rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1953 sin6p = addrs6.ri_list;
1954 }
1955
1956 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1957 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1958 rw_enter(&hca->state_lock, RW_READER);
1959 if (hca->state == HCA_DETACHED) {
1960 rw_exit(&hca->state_lock);
1961 continue;
1962 }
1963
1964 ipattr.ipa_dst_ip = &rptp->dstip;
1965 ipattr.ipa_hca_guid = hca->hca_guid;
1966 ipattr.ipa_ndst = 1;
1967 ipattr.ipa_max_paths = 1;
1968 ipattr.ipa_src_ip.family = rptp->dstip.family;
1969 for (i = 0; i < addr_count; i++) {
1970 num_paths_p = 0;
1971 if (addr_type == AF_INET) {
1972 ipattr.ipa_src_ip.un.ip4addr =
1973 sinp[i].sin_addr.s_addr;
1974 } else {
1975 ipattr.ipa_src_ip.un.ip6addr =
1976 sin6p[i].sin6_addr;
1977 }
1978 bzero(&srcip, sizeof (ibt_path_ip_src_t));
1979
1980 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1981 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1982 &num_paths_p, &srcip);
1983 if (ibt_status == IBT_SUCCESS &&
1984 num_paths_p != 0 &&
1985 rptp->path.pi_hca_guid == hca->hca_guid) {
1986 rptp->hca = hca;
1987 rw_exit(&hca->state_lock);
1988 if (addr_type == AF_INET) {
1989 rptp->srcip.family = AF_INET;
1990 rptp->srcip.un.ip4addr =
1991 srcip.ip_primary.un.ip4addr;
1992 } else {
1993 rptp->srcip.family = AF_INET6;
1994 rptp->srcip.un.ip6addr =
1995 srcip.ip_primary.un.ip6addr;
1996
1997 }
1998 retval = RDMA_SUCCESS;
1999 goto done1;
2000 }
2001 }
2002 rw_exit(&hca->state_lock);
2003 }
2004 done1:
2005 rw_exit(&rib_stat->hcas_list_lock);
2006 done2:
2007 if (addrs4.ri_size > 0)
2008 kmem_free(addrs4.ri_list, addrs4.ri_size);
2009 if (addrs6.ri_size > 0)
2010 kmem_free(addrs6.ri_list, addrs6.ri_size);
2011 return (retval);
2012 }
2013
2014 /*
2015 * Close channel, remove from connection list and
2016 * free up resources allocated for that channel.
2017 */
2018 rdma_stat
2019 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2020 {
2021 rib_qp_t *qp = ctoqp(conn);
2022 rib_hca_t *hca;
2023
2024 mutex_enter(&conn->c_lock);
2025 if (conn->c_timeout != NULL) {
2026 mutex_exit(&conn->c_lock);
2027 (void) untimeout(conn->c_timeout);
2028 mutex_enter(&conn->c_lock);
2029 }
2030
2031 while (conn->c_flags & C_CLOSE_PENDING) {
2032 cv_wait(&conn->c_cv, &conn->c_lock);
2033 }
2034 mutex_exit(&conn->c_lock);
2035
2036 /*
2037 * c_ref == 0 and connection is in C_DISCONN_PEND
2038 */
2039 hca = qp->hca;
2040 if (conn_list != NULL)
2041 (void) rib_rm_conn(conn, conn_list);
2042
2043 /*
2044 * There is only one case where we get here with
2045 * qp_hdl = NULL, which is during connection setup on
2046 * the client. In such a case there are no posted
2047 * send/recv buffers.
2048 */
2049 if (qp->qp_hdl != NULL) {
2050 mutex_enter(&qp->posted_rbufs_lock);
2051 while (qp->n_posted_rbufs)
2052 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2053 mutex_exit(&qp->posted_rbufs_lock);
2054
2055 mutex_enter(&qp->send_rbufs_lock);
2056 while (qp->n_send_rbufs)
2057 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2058 mutex_exit(&qp->send_rbufs_lock);
2059
2060 (void) ibt_free_channel(qp->qp_hdl);
2061 qp->qp_hdl = NULL;
2062 }
2063
2064 ASSERT(qp->rdlist == NULL);
2065
2066 if (qp->replylist != NULL) {
2067 (void) rib_rem_replylist(qp);
2068 }
2069
2070 cv_destroy(&qp->cb_conn_cv);
2071 cv_destroy(&qp->posted_rbufs_cv);
2072 cv_destroy(&qp->send_rbufs_cv);
2073 mutex_destroy(&qp->cb_lock);
2074 mutex_destroy(&qp->replylist_lock);
2075 mutex_destroy(&qp->posted_rbufs_lock);
2076 mutex_destroy(&qp->send_rbufs_lock);
2077 mutex_destroy(&qp->rdlist_lock);
2078
2079 cv_destroy(&conn->c_cv);
2080 mutex_destroy(&conn->c_lock);
2081
2082 if (conn->c_raddr.buf != NULL) {
2083 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2084 }
2085 if (conn->c_laddr.buf != NULL) {
2086 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2087 }
2088 if (conn->c_netid != NULL) {
2089 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2090 }
2091 if (conn->c_addrmask.buf != NULL) {
2092 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2093 }
2094
2095 /*
2096 * Credit control cleanup.
2097 */
2098 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2099 rdma_clnt_cred_ctrl_t *cc_info;
2100 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2101 cv_destroy(&cc_info->clnt_cc_cv);
2102 }
2103
2104 kmem_free(qp, sizeof (rib_qp_t));
2105
2106 /*
2107 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2108 * then the hca is no longer being used.
2109 */
2110 if (conn_list != NULL) {
2111 rw_enter(&hca->state_lock, RW_READER);
2112 if (hca->state == HCA_DETACHED) {
2113 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2114 if (hca->srv_conn_list.conn_hd == NULL) {
2115 rw_enter(&hca->cl_conn_list.conn_lock,
2116 RW_READER);
2117
2118 if (hca->cl_conn_list.conn_hd == NULL) {
2119 mutex_enter(&hca->inuse_lock);
2120 hca->inuse = FALSE;
2121 cv_signal(&hca->cb_cv);
2122 mutex_exit(&hca->inuse_lock);
2123 }
2124 rw_exit(&hca->cl_conn_list.conn_lock);
2125 }
2126 rw_exit(&hca->srv_conn_list.conn_lock);
2127 }
2128 rw_exit(&hca->state_lock);
2129 }
2130
2131 return (RDMA_SUCCESS);
2132 }
2133
2134 /*
2135 * All sends are done under the protection of
2136 * the wdesc->sendwait_lock. n_send_rbufs count
2137 * is protected using the send_rbufs_lock.
2138 * lock ordering is:
2139 * sendwait_lock -> send_rbufs_lock
2140 */
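/*
 * The pattern used by the send paths (rib_send_and_wait(), rib_write(),
 * rib_read()) for a signaled send is, in sketch form:
 *
 *	mutex_enter(&wdesc->sendwait_lock);
 *	(post the work request with IBT_WR_SEND_SIGNAL)
 *	rib_send_hold(qp);		count the outstanding send
 *	mutex_exit(&wdesc->sendwait_lock);
 *	(void) rib_sendwait(qp, wdesc);	rib_send_rele() is called from here
 */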
2141
2142 void
2143 rib_send_hold(rib_qp_t *qp)
2144 {
2145 mutex_enter(&qp->send_rbufs_lock);
2146 qp->n_send_rbufs++;
2147 mutex_exit(&qp->send_rbufs_lock);
2148 }
2149
2150 void
2151 rib_send_rele(rib_qp_t *qp)
2152 {
2153 mutex_enter(&qp->send_rbufs_lock);
2154 qp->n_send_rbufs--;
2155 if (qp->n_send_rbufs == 0)
2156 cv_signal(&qp->send_rbufs_cv);
2157 mutex_exit(&qp->send_rbufs_lock);
2158 }
2159
2160 void
2161 rib_recv_rele(rib_qp_t *qp)
2162 {
2163 mutex_enter(&qp->posted_rbufs_lock);
2164 qp->n_posted_rbufs--;
2165 if (qp->n_posted_rbufs == 0)
2166 cv_signal(&qp->posted_rbufs_cv);
2167 mutex_exit(&qp->posted_rbufs_lock);
2168 }
2169
2170 /*
2171 * Wait for send completion notification. Only on receiving a
2172  * notification, be it a successful or an error completion, is the
2173  * send_wid freed.
2174 */
2175 static rdma_stat
2176 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2177 {
2178 clock_t timout, cv_wait_ret;
2179 rdma_stat error = RDMA_SUCCESS;
2180 int i;
2181
2182 /*
2183 * Wait for send to complete
2184 */
2185 ASSERT(wd != NULL);
2186 mutex_enter(&wd->sendwait_lock);
2187 if (wd->status == (uint_t)SEND_WAIT) {
2188 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2189 ddi_get_lbolt();
2190
2191 if (qp->mode == RIB_SERVER) {
2192 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2193 &wd->sendwait_lock, timout)) > 0 &&
2194 wd->status == (uint_t)SEND_WAIT)
2195 ;
2196 switch (cv_wait_ret) {
2197 case -1: /* timeout */
2198 DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2199
2200 wd->cv_sig = 0; /* no signal needed */
2201 error = RDMA_TIMEDOUT;
2202 break;
2203 default: /* got send completion */
2204 break;
2205 }
2206 } else {
2207 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2208 &wd->sendwait_lock, timout)) > 0 &&
2209 wd->status == (uint_t)SEND_WAIT)
2210 ;
2211 switch (cv_wait_ret) {
2212 case -1: /* timeout */
2213 DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2214
2215 wd->cv_sig = 0; /* no signal needed */
2216 error = RDMA_TIMEDOUT;
2217 break;
2218 case 0: /* interrupted */
2219 DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2220
2221 wd->cv_sig = 0; /* no signal needed */
2222 error = RDMA_INTR;
2223 break;
2224 default: /* got send completion */
2225 break;
2226 }
2227 }
2228 }
2229
2230 if (wd->status != (uint_t)SEND_WAIT) {
2231 /* got send completion */
2232 if (wd->status != RDMA_SUCCESS) {
2233 switch (wd->status) {
2234 case RDMA_CONNLOST:
2235 error = RDMA_CONNLOST;
2236 break;
2237 default:
2238 error = RDMA_FAILED;
2239 break;
2240 }
2241 }
2242 for (i = 0; i < wd->nsbufs; i++) {
2243 rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2244 (void *)(uintptr_t)wd->sbufaddr[i]);
2245 }
2246
2247 rib_send_rele(qp);
2248
2249 mutex_exit(&wd->sendwait_lock);
2250 (void) rib_free_sendwait(wd);
2251
2252 } else {
2253 mutex_exit(&wd->sendwait_lock);
2254 }
2255 return (error);
2256 }
2257
2258 static struct send_wid *
2259 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2260 {
2261 struct send_wid *wd;
2262
2263 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2264 wd->xid = xid;
2265 wd->cv_sig = cv_sig;
2266 wd->qp = qp;
2267 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2268 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2269 wd->status = (uint_t)SEND_WAIT;
2270
2271 return (wd);
2272 }
2273
2274 static int
2275 rib_free_sendwait(struct send_wid *wdesc)
2276 {
2277 cv_destroy(&wdesc->wait_cv);
2278 mutex_destroy(&wdesc->sendwait_lock);
2279 kmem_free(wdesc, sizeof (*wdesc));
2280
2281 return (0);
2282 }
2283
2284 static rdma_stat
2285 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2286 {
2287 mutex_enter(&qp->replylist_lock);
2288 if (rep != NULL) {
2289 (void) rib_remreply(qp, rep);
2290 mutex_exit(&qp->replylist_lock);
2291 return (RDMA_SUCCESS);
2292 }
2293 mutex_exit(&qp->replylist_lock);
2294 return (RDMA_FAILED);
2295 }
2296
2297 /*
2298 * Send buffers are freed here only in case of error in posting
2299 * on QP. If the post succeeded, the send buffers are freed upon
2300 * send completion in rib_sendwait() or in the scq_handler.
2301 */
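/*
 * send_sig requests a signaled completion: a send_wid is allocated and
 * the work request is posted with IBT_WR_SEND_SIGNAL.  cv_sig, in
 * addition, makes the caller block in rib_sendwait() until that
 * completion (or a timeout/interrupt) arrives.
 */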
2302 rdma_stat
2303 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2304 int send_sig, int cv_sig, caddr_t *swid)
2305 {
2306 struct send_wid *wdesc;
2307 struct clist *clp;
2308 ibt_status_t ibt_status = IBT_SUCCESS;
2309 rdma_stat ret = RDMA_SUCCESS;
2310 ibt_send_wr_t tx_wr;
2311 int i, nds;
2312 ibt_wr_ds_t sgl[DSEG_MAX];
2313 uint_t total_msg_size;
2314 rib_qp_t *qp;
2315
2316 qp = ctoqp(conn);
2317
2318 ASSERT(cl != NULL);
2319
2320 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2321
2322 nds = 0;
2323 total_msg_size = 0;
2324 clp = cl;
2325 while (clp != NULL) {
2326 if (nds >= DSEG_MAX) {
2327 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2328 return (RDMA_FAILED);
2329 }
2330 sgl[nds].ds_va = clp->w.c_saddr;
2331 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2332 sgl[nds].ds_len = clp->c_len;
2333 total_msg_size += clp->c_len;
2334 clp = clp->c_next;
2335 nds++;
2336 }
2337
2338 if (send_sig) {
2339 /* Set SEND_SIGNAL flag. */
2340 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2341 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2342 *swid = (caddr_t)wdesc;
2343 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2344 mutex_enter(&wdesc->sendwait_lock);
2345 wdesc->nsbufs = nds;
2346 for (i = 0; i < nds; i++) {
2347 wdesc->sbufaddr[i] = sgl[i].ds_va;
2348 }
2349 } else {
2350 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2351 *swid = NULL;
2352 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2353 }
2354
2355 tx_wr.wr_opcode = IBT_WRC_SEND;
2356 tx_wr.wr_trans = IBT_RC_SRV;
2357 tx_wr.wr_nds = nds;
2358 tx_wr.wr_sgl = sgl;
2359
2360 mutex_enter(&conn->c_lock);
2361 if (conn->c_state == C_CONNECTED) {
2362 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2363 }
2364 if (conn->c_state != C_CONNECTED ||
2365 ibt_status != IBT_SUCCESS) {
2366 if (conn->c_state != C_DISCONN_PEND)
2367 conn->c_state = C_ERROR_CONN;
2368 mutex_exit(&conn->c_lock);
2369 if (send_sig) {
2370 for (i = 0; i < nds; i++) {
2371 rib_rbuf_free(conn, SEND_BUFFER,
2372 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2373 }
2374 mutex_exit(&wdesc->sendwait_lock);
2375 (void) rib_free_sendwait(wdesc);
2376 }
2377 return (RDMA_CONNLOST);
2378 }
2379
2380 mutex_exit(&conn->c_lock);
2381
2382 if (send_sig) {
2383 rib_send_hold(qp);
2384 mutex_exit(&wdesc->sendwait_lock);
2385 if (cv_sig) {
2386 /*
2387 * cv_wait for send to complete.
2388 * We can fail due to a timeout or signal or
2389 * unsuccessful send.
2390 */
2391 ret = rib_sendwait(qp, wdesc);
2392
2393 return (ret);
2394 }
2395 }
2396
2397 return (RDMA_SUCCESS);
2398 }
2399
2400
2401 rdma_stat
2402 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2403 {
2404 rdma_stat ret;
2405 caddr_t wd;
2406
2407 /* send-wait & cv_signal */
2408 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2409 return (ret);
2410 }
2411
2412 /*
2413  * Deprecated/obsolete interface, not currently used,
2414  * but earlier used for the READ-READ protocol.
2415 * Send RPC reply and wait for RDMA_DONE.
2416 */
2417 rdma_stat
2418 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2419 {
2420 rdma_stat ret = RDMA_SUCCESS;
2421 struct rdma_done_list *rd;
2422 clock_t cv_wait_ret;
2423 caddr_t *wid = NULL;
2424 rib_qp_t *qp = ctoqp(conn);
2425
2426 mutex_enter(&qp->rdlist_lock);
2427 rd = rdma_done_add(qp, msgid);
2428
2429 /* No cv_signal (whether send-wait or no-send-wait) */
2430 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2431
2432 if (ret != RDMA_SUCCESS) {
2433 rdma_done_rm(qp, rd);
2434 } else {
2435 /*
2436 * Wait for RDMA_DONE from remote end
2437 */
2438 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2439 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2440 TR_CLOCK_TICK);
2441
2442 rdma_done_rm(qp, rd);
2443
2444 if (cv_wait_ret < 0) {
2445 ret = RDMA_TIMEDOUT;
2446 }
2447 }
2448
2449 mutex_exit(&qp->rdlist_lock);
2450 return (ret);
2451 }
2452
2453 static struct recv_wid *
2454 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2455 {
2456 struct recv_wid *rwid;
2457
2458 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2459 rwid->xid = msgid;
2460 rwid->addr = sgl->ds_va;
2461 rwid->qp = qp;
2462
2463 return (rwid);
2464 }
2465
2466 static void
2467 rib_free_wid(struct recv_wid *rwid)
2468 {
2469 kmem_free(rwid, sizeof (struct recv_wid));
2470 }
2471
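/*
 * Post a single RECV_BUFFER work request for the reply to the call
 * identified by msgid, and queue a matching entry on the QP's reply
 * list so that rib_recv() can later wait for and claim the reply.
 */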
2472 rdma_stat
2473 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2474 {
2475 rib_qp_t *qp = ctoqp(conn);
2476 struct clist *clp = cl;
2477 struct reply *rep;
2478 struct recv_wid *rwid;
2479 int nds;
2480 ibt_wr_ds_t sgl[DSEG_MAX];
2481 ibt_recv_wr_t recv_wr;
2482 rdma_stat ret;
2483 ibt_status_t ibt_status;
2484
2485 /*
2486 * rdma_clnt_postrecv uses RECV_BUFFER.
2487 */
2488
2489 nds = 0;
2490 while (cl != NULL) {
2491 if (nds >= DSEG_MAX) {
2492 ret = RDMA_FAILED;
2493 goto done;
2494 }
2495 sgl[nds].ds_va = cl->w.c_saddr;
2496 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2497 sgl[nds].ds_len = cl->c_len;
2498 cl = cl->c_next;
2499 nds++;
2500 }
2501
2502 if (nds != 1) {
2503 ret = RDMA_FAILED;
2504 goto done;
2505 }
2506
2507 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2508 recv_wr.wr_nds = nds;
2509 recv_wr.wr_sgl = sgl;
2510
2511 rwid = rib_create_wid(qp, &sgl[0], msgid);
2512 if (rwid) {
2513 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2514 } else {
2515 ret = RDMA_NORESOURCE;
2516 goto done;
2517 }
2518 rep = rib_addreplylist(qp, msgid);
2519 if (!rep) {
2520 rib_free_wid(rwid);
2521 ret = RDMA_NORESOURCE;
2522 goto done;
2523 }
2524
2525 mutex_enter(&conn->c_lock);
2526
2527 if (conn->c_state == C_CONNECTED) {
2528 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2529 }
2530
2531 if (conn->c_state != C_CONNECTED ||
2532 ibt_status != IBT_SUCCESS) {
2533 if (conn->c_state != C_DISCONN_PEND)
2534 conn->c_state = C_ERROR_CONN;
2535 mutex_exit(&conn->c_lock);
2536 rib_free_wid(rwid);
2537 (void) rib_rem_rep(qp, rep);
2538 ret = RDMA_CONNLOST;
2539 goto done;
2540 }
2541
2542 mutex_enter(&qp->posted_rbufs_lock);
2543 qp->n_posted_rbufs++;
2544 mutex_exit(&qp->posted_rbufs_lock);
2545
2546 mutex_exit(&conn->c_lock);
2547 return (RDMA_SUCCESS);
2548
2549 done:
2550 while (clp != NULL) {
2551 rib_rbuf_free(conn, RECV_BUFFER,
2552 (void *)(uintptr_t)clp->w.c_saddr3);
2553 clp = clp->c_next;
2554 }
2555 return (ret);
2556 }
2557
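/*
 * Server-side receive post: a single RECV_BUFFER work request is
 * posted on the QP, with a freshly allocated svc_recv structure used
 * as the work request id so the completion handler can locate the
 * buffer later.
 */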
2558 rdma_stat
2559 rib_svc_post(CONN* conn, struct clist *cl)
2560 {
2561 rib_qp_t *qp = ctoqp(conn);
2562 struct svc_recv *s_recvp;
2563 int nds;
2564 ibt_wr_ds_t sgl[DSEG_MAX];
2565 ibt_recv_wr_t recv_wr;
2566 ibt_status_t ibt_status;
2567
2568 nds = 0;
2569 while (cl != NULL) {
2570 if (nds >= DSEG_MAX) {
2571 return (RDMA_FAILED);
2572 }
2573 sgl[nds].ds_va = cl->w.c_saddr;
2574 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2575 sgl[nds].ds_len = cl->c_len;
2576 cl = cl->c_next;
2577 nds++;
2578 }
2579
2580 if (nds != 1) {
2581 rib_rbuf_free(conn, RECV_BUFFER,
2582 (caddr_t)(uintptr_t)sgl[0].ds_va);
2583
2584 return (RDMA_FAILED);
2585 }
2586
2587 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2588 recv_wr.wr_nds = nds;
2589 recv_wr.wr_sgl = sgl;
2590
2591 s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2592 /* Use s_recvp's addr as wr id */
2593 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2594 mutex_enter(&conn->c_lock);
2595 if (conn->c_state == C_CONNECTED) {
2596 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2597 }
2598 if (conn->c_state != C_CONNECTED ||
2599 ibt_status != IBT_SUCCESS) {
2600 if (conn->c_state != C_DISCONN_PEND)
2601 conn->c_state = C_ERROR_CONN;
2602 mutex_exit(&conn->c_lock);
2603 rib_rbuf_free(conn, RECV_BUFFER,
2604 (caddr_t)(uintptr_t)sgl[0].ds_va);
2605 (void) rib_free_svc_recv(s_recvp);
2606
2607 return (RDMA_CONNLOST);
2608 }
2609 mutex_exit(&conn->c_lock);
2610
2611 return (RDMA_SUCCESS);
2612 }
2613
2614 /* Client */
2615 rdma_stat
2616 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2617 {
2618 return (rib_clnt_post(conn, cl, msgid));
2619 }
2620
2621 /* Client */
2622 rdma_stat
2623 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2624 {
2625 rib_qp_t *qp = ctoqp(conn);
2626 struct reply *rep;
2627
2628 mutex_enter(&qp->replylist_lock);
2629 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2630 if (rep->xid == msgid) {
2631 if (rep->vaddr_cq) {
2632 rib_rbuf_free(conn, RECV_BUFFER,
2633 (caddr_t)(uintptr_t)rep->vaddr_cq);
2634 }
2635 (void) rib_remreply(qp, rep);
2636 break;
2637 }
2638 }
2639 mutex_exit(&qp->replylist_lock);
2640
2641 return (RDMA_SUCCESS);
2642 }
2643
2644 /* Server */
2645 rdma_stat
2646 rib_post_recv(CONN *conn, struct clist *cl)
2647 {
2648 rib_qp_t *qp = ctoqp(conn);
2649
2650 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2651 mutex_enter(&qp->posted_rbufs_lock);
2652 qp->n_posted_rbufs++;
2653 mutex_exit(&qp->posted_rbufs_lock);
2654 return (RDMA_SUCCESS);
2655 }
2656 return (RDMA_FAILED);
2657 }
2658
2659 /*
2660 * Client side only interface to "recv" the rpc reply buf
2661 * posted earlier by rib_post_resp(conn, cl, msgid).
2662 */
2663 rdma_stat
2664 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2665 {
2666 struct reply *rep = NULL;
2667 clock_t timout, cv_wait_ret;
2668 rdma_stat ret = RDMA_SUCCESS;
2669 rib_qp_t *qp = ctoqp(conn);
2670
2671 /*
2672 * Find the reply structure for this msgid
2673 */
2674 mutex_enter(&qp->replylist_lock);
2675
2676 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2677 if (rep->xid == msgid)
2678 break;
2679 }
2680
2681 if (rep != NULL) {
2682 /*
2683 * If message not yet received, wait.
2684 */
2685 if (rep->status == (uint_t)REPLY_WAIT) {
2686 timout = ddi_get_lbolt() +
2687 drv_usectohz(REPLY_WAIT_TIME * 1000000);
2688
2689 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2690 &qp->replylist_lock, timout)) > 0 &&
2691 rep->status == (uint_t)REPLY_WAIT)
2692 ;
2693
2694 switch (cv_wait_ret) {
2695 case -1: /* timeout */
2696 ret = RDMA_TIMEDOUT;
2697 break;
2698 case 0:
2699 ret = RDMA_INTR;
2700 break;
2701 default:
2702 break;
2703 }
2704 }
2705
2706 if (rep->status == RDMA_SUCCESS) {
2707 struct clist *cl = NULL;
2708
2709 /*
2710 * Got message successfully
2711 */
2712 clist_add(&cl, 0, rep->bytes_xfer, NULL,
2713 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2714 *clp = cl;
2715 } else {
2716 if (rep->status != (uint_t)REPLY_WAIT) {
2717 /*
2718 * Got error in reply message. Free
2719 * recv buffer here.
2720 */
2721 ret = rep->status;
2722 rib_rbuf_free(conn, RECV_BUFFER,
2723 (caddr_t)(uintptr_t)rep->vaddr_cq);
2724 }
2725 }
2726 (void) rib_remreply(qp, rep);
2727 } else {
2728 /*
2729 * No matching reply structure found for given msgid on the
2730 * reply wait list.
2731 */
2732 ret = RDMA_INVAL;
2733 DTRACE_PROBE(rpcib__i__nomatchxid2);
2734 }
2735
2736 /*
2737 * Done.
2738 */
2739 mutex_exit(&qp->replylist_lock);
2740 return (ret);
2741 }
2742
2743 /*
2744 * RDMA write a buffer to the remote address.
2745 */
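/*
 * When the caller does not ask to wait, writes are posted unsignaled;
 * after more than max_unsignaled_rws consecutive unsignaled writes a
 * signaled write is issued and waited on so that completions are
 * reaped periodically.
 */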
2746 rdma_stat
2747 rib_write(CONN *conn, struct clist *cl, int wait)
2748 {
2749 ibt_send_wr_t tx_wr;
2750 int cv_sig;
2751 ibt_wr_ds_t sgl[DSEG_MAX];
2752 struct send_wid *wdesc;
2753 ibt_status_t ibt_status;
2754 rdma_stat ret = RDMA_SUCCESS;
2755 rib_qp_t *qp = ctoqp(conn);
2756 uint64_t n_writes = 0;
2757
2758 if (cl == NULL) {
2759 return (RDMA_FAILED);
2760 }
2761
2762 while ((cl != NULL)) {
2763 if (cl->c_len > 0) {
2764 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2765 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2766 tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2767 cl->c_dmemhandle.mrc_rmr; /* rkey */
2768 sgl[0].ds_va = cl->w.c_saddr;
2769 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2770 sgl[0].ds_len = cl->c_len;
2771
2772 if (wait) {
2773 cv_sig = 1;
2774 } else {
2775 if (n_writes > max_unsignaled_rws) {
2776 n_writes = 0;
2777 cv_sig = 1;
2778 } else {
2779 cv_sig = 0;
2780 }
2781 }
2782
2783 if (cv_sig) {
2784 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2785 wdesc = rib_init_sendwait(0, cv_sig, qp);
2786 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2787 mutex_enter(&wdesc->sendwait_lock);
2788 } else {
2789 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2790 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2791 }
2792 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2793 tx_wr.wr_trans = IBT_RC_SRV;
2794 tx_wr.wr_nds = 1;
2795 tx_wr.wr_sgl = sgl;
2796
2797 mutex_enter(&conn->c_lock);
2798 if (conn->c_state == C_CONNECTED) {
2799 ibt_status =
2800 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2801 }
2802 if (conn->c_state != C_CONNECTED ||
2803 ibt_status != IBT_SUCCESS) {
2804 if (conn->c_state != C_DISCONN_PEND)
2805 conn->c_state = C_ERROR_CONN;
2806 mutex_exit(&conn->c_lock);
2807 if (cv_sig) {
2808 mutex_exit(&wdesc->sendwait_lock);
2809 (void) rib_free_sendwait(wdesc);
2810 }
2811 return (RDMA_CONNLOST);
2812 }
2813
2814 mutex_exit(&conn->c_lock);
2815
2816 /*
2817 * Wait for send to complete
2818 */
2819 if (cv_sig) {
2820
2821 rib_send_hold(qp);
2822 mutex_exit(&wdesc->sendwait_lock);
2823
2824 ret = rib_sendwait(qp, wdesc);
2825 if (ret != 0)
2826 return (ret);
2827 }
2828 			n_writes++;
2829 }
2830 cl = cl->c_next;
2831 }
2832 return (RDMA_SUCCESS);
2833 }
2834
2835 /*
2836 * RDMA Read a buffer from the remote address.
2837 */
2838 rdma_stat
2839 rib_read(CONN *conn, struct clist *cl, int wait)
2840 {
2841 ibt_send_wr_t rx_wr;
2842 int cv_sig = 0;
2843 ibt_wr_ds_t sgl;
2844 struct send_wid *wdesc;
2845 ibt_status_t ibt_status = IBT_SUCCESS;
2846 rdma_stat ret = RDMA_SUCCESS;
2847 rib_qp_t *qp = ctoqp(conn);
2848
2849 if (cl == NULL) {
2850 return (RDMA_FAILED);
2851 }
2852
2853 while (cl != NULL) {
2854 bzero(&rx_wr, sizeof (ibt_send_wr_t));
2855 /*
2856 * Remote address is at the head chunk item in list.
2857 */
2858 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2859 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2860
2861 sgl.ds_va = cl->u.c_daddr;
2862 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2863 sgl.ds_len = cl->c_len;
2864
2865 /*
2866 * If there are multiple chunks to be read, and
2867 * wait is set, ask for signal only for the last chunk
2868 * and wait only on the last chunk. The completion of
2869 * RDMA_READ on last chunk ensures that reads on all
2870 * previous chunks are also completed.
2871 */
2872 if (wait && (cl->c_next == NULL)) {
2873 cv_sig = 1;
2874 wdesc = rib_init_sendwait(0, cv_sig, qp);
2875 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2876 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2877 mutex_enter(&wdesc->sendwait_lock);
2878 } else {
2879 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2880 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2881 }
2882 rx_wr.wr_opcode = IBT_WRC_RDMAR;
2883 rx_wr.wr_trans = IBT_RC_SRV;
2884 rx_wr.wr_nds = 1;
2885 rx_wr.wr_sgl = &sgl;
2886
2887 mutex_enter(&conn->c_lock);
2888 if (conn->c_state == C_CONNECTED) {
2889 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2890 }
2891 if (conn->c_state != C_CONNECTED ||
2892 ibt_status != IBT_SUCCESS) {
2893 if (conn->c_state != C_DISCONN_PEND)
2894 conn->c_state = C_ERROR_CONN;
2895 mutex_exit(&conn->c_lock);
2896 if (wait && (cl->c_next == NULL)) {
2897 mutex_exit(&wdesc->sendwait_lock);
2898 (void) rib_free_sendwait(wdesc);
2899 }
2900 return (RDMA_CONNLOST);
2901 }
2902
2903 mutex_exit(&conn->c_lock);
2904
2905 /*
2906 * Wait for send to complete if this is the
2907 * last item in the list.
2908 */
2909 if (wait && cl->c_next == NULL) {
2910 rib_send_hold(qp);
2911 mutex_exit(&wdesc->sendwait_lock);
2912
2913 ret = rib_sendwait(qp, wdesc);
2914
2915 if (ret != 0)
2916 return (ret);
2917 }
2918 cl = cl->c_next;
2919 }
2920 return (RDMA_SUCCESS);
2921 }
2922
2923 /*
2924 * rib_srv_cm_handler()
2925 * Connection Manager callback to handle RC connection requests.
2926 */
2927 /* ARGSUSED */
2928 static ibt_cm_status_t
2929 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2930 ibt_cm_return_args_t *ret_args, void *priv_data,
2931 ibt_priv_data_len_t len)
2932 {
2933 queue_t *q;
2934 rib_qp_t *qp;
2935 rib_hca_t *hca;
2936 rdma_stat status = RDMA_SUCCESS;
2937 int i;
2938 struct clist cl;
2939 rdma_buf_t rdbuf = {0};
2940 void *buf = NULL;
2941 CONN *conn;
2942 ibt_ip_cm_info_t ipinfo;
2943 struct sockaddr_in *s;
2944 struct sockaddr_in6 *s6;
2945 int sin_size = sizeof (struct sockaddr_in);
2946 int in_size = sizeof (struct in_addr);
2947 int sin6_size = sizeof (struct sockaddr_in6);
2948
2949 ASSERT(any != NULL);
2950 ASSERT(event != NULL);
2951
2952 hca = (rib_hca_t *)any;
2953
2954 /* got a connection request */
2955 switch (event->cm_type) {
2956 case IBT_CM_EVENT_REQ_RCV:
2957 /*
2958 * If the plugin is in the NO_ACCEPT state, bail out.
2959 */
2960 mutex_enter(&plugin_state_lock);
2961 if (plugin_state == NO_ACCEPT) {
2962 mutex_exit(&plugin_state_lock);
2963 return (IBT_CM_REJECT);
2964 }
2965 mutex_exit(&plugin_state_lock);
2966
2967 /*
2968 * Need to send a MRA MAD to CM so that it does not
2969 		 * time out on us.
2970 */
2971 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2972 event->cm_event.req.req_timeout * 8, NULL, 0);
2973
2974 mutex_enter(&rib_stat->open_hca_lock);
2975 q = rib_stat->q;
2976 mutex_exit(&rib_stat->open_hca_lock);
2977
2978 status = rib_svc_create_chan(hca, (caddr_t)q,
2979 event->cm_event.req.req_prim_hca_port, &qp);
2980
2981 if (status) {
2982 return (IBT_CM_REJECT);
2983 }
2984
2985 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2986 ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2987 ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2988 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2989
2990 /*
2991 * Pre-posts RECV buffers
2992 */
2993 conn = qptoc(qp);
2994 for (i = 0; i < preposted_rbufs; i++) {
2995 bzero(&rdbuf, sizeof (rdbuf));
2996 rdbuf.type = RECV_BUFFER;
2997 buf = rib_rbuf_alloc(conn, &rdbuf);
2998 if (buf == NULL) {
2999 /*
3000 * A connection is not established yet.
3001 * Just flush the channel. Buffers
3002 * posted till now will error out with
3003 * IBT_WC_WR_FLUSHED_ERR.
3004 */
3005 (void) ibt_flush_channel(qp->qp_hdl);
3006 (void) rib_disconnect_channel(conn, NULL);
3007 return (IBT_CM_REJECT);
3008 }
3009
3010 bzero(&cl, sizeof (cl));
3011 cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3012 cl.c_len = rdbuf.len;
3013 cl.c_smemhandle.mrc_lmr =
3014 rdbuf.handle.mrc_lmr; /* lkey */
3015 cl.c_next = NULL;
3016 status = rib_post_recv(conn, &cl);
3017 if (status != RDMA_SUCCESS) {
3018 /*
3019 * A connection is not established yet.
3020 * Just flush the channel. Buffers
3021 * posted till now will error out with
3022 * IBT_WC_WR_FLUSHED_ERR.
3023 */
3024 (void) ibt_flush_channel(qp->qp_hdl);
3025 (void) rib_disconnect_channel(conn, NULL);
3026 return (IBT_CM_REJECT);
3027 }
3028 }
3029 (void) rib_add_connlist(conn, &hca->srv_conn_list);
3030
3031 /*
3032 * Get the address translation
3033 */
3034 rw_enter(&hca->state_lock, RW_READER);
3035 if (hca->state == HCA_DETACHED) {
3036 rw_exit(&hca->state_lock);
3037 return (IBT_CM_REJECT);
3038 }
3039 rw_exit(&hca->state_lock);
3040
3041 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3042
3043 if (ibt_get_ip_data(event->cm_priv_data_len,
3044 event->cm_priv_data,
3045 &ipinfo) != IBT_SUCCESS) {
3046
3047 return (IBT_CM_REJECT);
3048 }
3049
3050 switch (ipinfo.src_addr.family) {
3051 case AF_INET:
3052
3053 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3054 KM_SLEEP);
3055 (void) strcpy(conn->c_netid, RIBNETID_TCP);
3056
3057 conn->c_raddr.maxlen =
3058 conn->c_raddr.len = sin_size;
3059 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3060
3061 s = (struct sockaddr_in *)conn->c_raddr.buf;
3062 s->sin_family = AF_INET;
3063 bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3064 &s->sin_addr, in_size);
3065
3066 conn->c_laddr.maxlen =
3067 conn->c_laddr.len = sin_size;
3068 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3069
3070 s = (struct sockaddr_in *)conn->c_laddr.buf;
3071 s->sin_family = AF_INET;
3072 bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3073 &s->sin_addr, in_size);
3074
3075 conn->c_addrmask.maxlen = conn->c_addrmask.len =
3076 sizeof (struct sockaddr_in);
3077 conn->c_addrmask.buf =
3078 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3079 ((struct sockaddr_in *)
3080 conn->c_addrmask.buf)->sin_addr.s_addr =
3081 (uint32_t)~0;
3082 ((struct sockaddr_in *)
3083 conn->c_addrmask.buf)->sin_family =
3084 (sa_family_t)~0;
3085 break;
3086
3087 case AF_INET6:
3088
3089 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3090 KM_SLEEP);
3091 (void) strcpy(conn->c_netid, RIBNETID_TCP6);
3092
3093 conn->c_raddr.maxlen =
3094 conn->c_raddr.len = sin6_size;
3095 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3096
3097 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3098 s6->sin6_family = AF_INET6;
3099 bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3100 &s6->sin6_addr,
3101 sizeof (struct in6_addr));
3102
3103 conn->c_laddr.maxlen =
3104 conn->c_laddr.len = sin6_size;
3105 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3106
3107 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3108 s6->sin6_family = AF_INET6;
3109 bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3110 &s6->sin6_addr,
3111 sizeof (struct in6_addr));
3112
3113 conn->c_addrmask.maxlen = conn->c_addrmask.len =
3114 sizeof (struct sockaddr_in6);
3115 conn->c_addrmask.buf =
3116 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3117 (void) memset(&((struct sockaddr_in6 *)
3118 conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
3119 sizeof (struct in6_addr));
3120 ((struct sockaddr_in6 *)
3121 conn->c_addrmask.buf)->sin6_family =
3122 (sa_family_t)~0;
3123 break;
3124
3125 default:
3126 return (IBT_CM_REJECT);
3127 }
3128
3129 break;
3130
3131 case IBT_CM_EVENT_CONN_CLOSED:
3132 {
3133 CONN *conn;
3134 rib_qp_t *qp;
3135
3136 switch (event->cm_event.closed) {
3137 case IBT_CM_CLOSED_DREP_RCVD:
3138 case IBT_CM_CLOSED_DREQ_TIMEOUT:
3139 case IBT_CM_CLOSED_DUP:
3140 case IBT_CM_CLOSED_ABORT:
3141 case IBT_CM_CLOSED_ALREADY:
3142 /*
3143 * These cases indicate the local end initiated
3144 * the closing of the channel. Nothing to do here.
3145 */
3146 break;
3147 default:
3148 /*
3149 * Reason for CONN_CLOSED event must be one of
3150 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3151 			 * or IBT_CM_CLOSED_STALE. These indicate cases where
3152 			 * the remote end is closing the channel. In these
3153 			 * cases free the channel and transition to the error
3154 			 * state.
3155 */
3156 qp = ibt_get_chan_private(event->cm_channel);
3157 conn = qptoc(qp);
3158 mutex_enter(&conn->c_lock);
3159 if (conn->c_state == C_DISCONN_PEND) {
3160 mutex_exit(&conn->c_lock);
3161 break;
3162 }
3163 conn->c_state = C_ERROR_CONN;
3164
3165 /*
3166 * Free the conn if c_ref goes down to 0
3167 */
3168 if (conn->c_ref == 0) {
3169 /*
3170 * Remove from list and free conn
3171 */
3172 conn->c_state = C_DISCONN_PEND;
3173 mutex_exit(&conn->c_lock);
3174 (void) rib_disconnect_channel(conn,
3175 &hca->srv_conn_list);
3176 } else {
3177 /*
3178 * conn will be freed when c_ref goes to 0.
3179 * Indicate to cleaning thread not to close
3180 * the connection, but just free the channel.
3181 */
3182 conn->c_flags |= C_CLOSE_NOTNEEDED;
3183 mutex_exit(&conn->c_lock);
3184 }
3185 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3186 break;
3187 }
3188 break;
3189 }
3190 case IBT_CM_EVENT_CONN_EST:
3191 /*
3192 * RTU received, hence connection established.
3193 */
3194 if (rib_debug > 1)
3195 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3196 "(CONN_EST) channel established");
3197 break;
3198
3199 default:
3200 if (rib_debug > 2) {
3201 /* Let CM handle the following events. */
3202 if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3203 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3204 "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3205 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3206 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3207 "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3208 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3209 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3210 "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3211 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3212 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3213 "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3214 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3215 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3216 "server recv'ed IBT_CM_EVENT_FAILURE\n");
3217 }
3218 }
3219 return (IBT_CM_DEFAULT);
3220 }
3221
3222 /* accept all other CM messages (i.e. let the CM handle them) */
3223 return (IBT_CM_ACCEPT);
3224 }
3225
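/*
 * Register the given service type (currently only NFS) with IBTF using
 * the IP service id derived from (protocol_num, dst_port), and bind it
 * on every active port/pkey of this HCA that is not already bound to
 * the service.
 */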
3226 static rdma_stat
3227 rib_register_service(rib_hca_t *hca, int service_type,
3228 uint8_t protocol_num, in_port_t dst_port)
3229 {
3230 ibt_srv_desc_t sdesc;
3231 ibt_hca_portinfo_t *port_infop;
3232 ib_svc_id_t srv_id;
3233 ibt_srv_hdl_t srv_hdl;
3234 uint_t port_size;
3235 uint_t pki, i, num_ports, nbinds;
3236 ibt_status_t ibt_status;
3237 rib_service_t *service;
3238 ib_pkey_t pkey;
3239
3240 /*
3241 * Query all ports for the given HCA
3242 */
3243 rw_enter(&hca->state_lock, RW_READER);
3244 if (hca->state != HCA_DETACHED) {
3245 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3246 &num_ports, &port_size);
3247 rw_exit(&hca->state_lock);
3248 } else {
3249 rw_exit(&hca->state_lock);
3250 return (RDMA_FAILED);
3251 }
3252 if (ibt_status != IBT_SUCCESS) {
3253 return (RDMA_FAILED);
3254 }
3255
3256 DTRACE_PROBE1(rpcib__i__regservice_numports,
3257 int, num_ports);
3258
3259 for (i = 0; i < num_ports; i++) {
3260 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3261 DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3262 int, i+1);
3263 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3264 DTRACE_PROBE1(rpcib__i__regservice__portactive,
3265 int, i+1);
3266 }
3267 }
3268
3269 /*
3270 * Get all the IP addresses on this system to register the
3271 * given "service type" on all DNS recognized IP addrs.
3272 	 * Each service type such as NFS will have all the system's
3273 * IP addresses as its different names. For now the only
3274 * type of service we support in RPCIB is NFS.
3275 */
3276 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3277 /*
3278 	 * Start registering and binding the service on the
3279 	 * active ports of this HCA.
3280 */
3281 nbinds = 0;
3282 for (service = rib_stat->service_list;
3283 service && (service->srv_type != service_type);
3284 service = service->next)
3285 ;
3286
3287 if (service == NULL) {
3288 /*
3289 * We use IP addresses as the service names for
3290 * service registration. Register each of them
3291 * with CM to obtain a svc_id and svc_hdl. We do not
3292 		 * register the service with the machine's loopback address.
3293 */
3294 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3295 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3296 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3297 sdesc.sd_handler = rib_srv_cm_handler;
3298 sdesc.sd_flags = 0;
3299 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3300 &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3301 1, &srv_hdl, &srv_id);
3302 if ((ibt_status != IBT_SUCCESS) &&
3303 (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3304 rw_exit(&rib_stat->service_list_lock);
3305 DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3306 int, ibt_status);
3307 ibt_free_portinfo(port_infop, port_size);
3308 return (RDMA_FAILED);
3309 }
3310
3311 /*
3312 * Allocate and prepare a service entry
3313 */
3314 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3315
3316 service->srv_type = service_type;
3317 service->srv_hdl = srv_hdl;
3318 service->srv_id = srv_id;
3319
3320 service->next = rib_stat->service_list;
3321 rib_stat->service_list = service;
3322 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3323 int, service->srv_type);
3324 } else {
3325 srv_hdl = service->srv_hdl;
3326 srv_id = service->srv_id;
3327 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3328 int, service->srv_type);
3329 }
3330
3331 for (i = 0; i < num_ports; i++) {
3332 ibt_sbind_hdl_t sbp;
3333 rib_hca_service_t *hca_srv;
3334 ib_gid_t gid;
3335
3336 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3337 continue;
3338
3339 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3340 pkey = port_infop[i].p_pkey_tbl[pki];
3341
3342 rw_enter(&hca->bound_services_lock, RW_READER);
3343 gid = port_infop[i].p_sgid_tbl[0];
3344 for (hca_srv = hca->bound_services; hca_srv;
3345 hca_srv = hca_srv->next) {
3346 if ((hca_srv->srv_id == service->srv_id) &&
3347 (hca_srv->gid.gid_prefix ==
3348 gid.gid_prefix) &&
3349 (hca_srv->gid.gid_guid == gid.gid_guid))
3350 break;
3351 }
3352 rw_exit(&hca->bound_services_lock);
3353 if (hca_srv != NULL) {
3354 /*
3355 				 * port is already bound to the service
3356 */
3357 DTRACE_PROBE1(
3358 rpcib__i__regservice__already__bound,
3359 int, i+1);
3360 nbinds++;
3361 continue;
3362 }
3363
3364 if ((pkey & IBSRM_HB) &&
3365 (pkey != IB_PKEY_INVALID_FULL)) {
3366
3367 sbp = NULL;
3368 ibt_status = ibt_bind_service(srv_hdl,
3369 gid, NULL, hca, &sbp);
3370
3371 if (ibt_status == IBT_SUCCESS) {
3372 hca_srv = kmem_zalloc(
3373 sizeof (rib_hca_service_t),
3374 KM_SLEEP);
3375 hca_srv->srv_id = srv_id;
3376 hca_srv->gid = gid;
3377 hca_srv->sbind_hdl = sbp;
3378
3379 rw_enter(&hca->bound_services_lock,
3380 RW_WRITER);
3381 hca_srv->next = hca->bound_services;
3382 hca->bound_services = hca_srv;
3383 rw_exit(&hca->bound_services_lock);
3384 nbinds++;
3385 }
3386
3387 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3388 int, ibt_status);
3389 }
3390 }
3391 }
3392 rw_exit(&rib_stat->service_list_lock);
3393
3394 ibt_free_portinfo(port_infop, port_size);
3395
3396 if (nbinds == 0) {
3397 return (RDMA_FAILED);
3398 } else {
3399 /*
3400 		 * Put this plugin into accept state, since at least
3401 * one registration was successful.
3402 */
3403 mutex_enter(&plugin_state_lock);
3404 plugin_state = ACCEPT;
3405 mutex_exit(&plugin_state_lock);
3406 return (RDMA_SUCCESS);
3407 }
3408 }
3409
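/*
 * Bring every attached and initialized HCA into the listening state by
 * registering the NFS service on it.  When rd is non-NULL it supplies
 * the KRPC queue and receives the overall listen status.
 */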
3410 void
3411 rib_listen(struct rdma_svc_data *rd)
3412 {
3413 rdma_stat status;
3414 int n_listening = 0;
3415 rib_hca_t *hca;
3416
3417 mutex_enter(&rib_stat->listen_lock);
3418 /*
3419 	 * If the rd parameter is NULL then it means that rib_stat->q is
3420 * already initialized by a call from RDMA and we just want to
3421 * add a newly attached HCA to the same listening state as other
3422 * HCAs.
3423 */
3424 if (rd == NULL) {
3425 if (rib_stat->q == NULL) {
3426 mutex_exit(&rib_stat->listen_lock);
3427 return;
3428 }
3429 } else {
3430 rib_stat->q = &rd->q;
3431 }
3432 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3433 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3434 /*
3435 * First check if a hca is still attached
3436 */
3437 rw_enter(&hca->state_lock, RW_READER);
3438 if (hca->state != HCA_INITED) {
3439 rw_exit(&hca->state_lock);
3440 continue;
3441 }
3442 rw_exit(&hca->state_lock);
3443
3444 /*
3445 * Right now the only service type is NFS. Hence
3446 		 * force-feed this value. Ideally the service type
3447 		 * should be communicated by passing it down in
3448 		 * rdma_svc_data.
3449 */
3450 status = rib_register_service(hca, NFS,
3451 IPPROTO_TCP, nfs_rdma_port);
3452 if (status == RDMA_SUCCESS)
3453 n_listening++;
3454 }
3455 rw_exit(&rib_stat->hcas_list_lock);
3456
3457 /*
3458 	 * If the service is active on at least one HCA, report success;
3459 	 * otherwise rd->err_code reports the failure.
3460 */
3461 if (rd) {
3462 if (n_listening > 0) {
3463 rd->active = 1;
3464 rd->err_code = RDMA_SUCCESS;
3465 } else {
3466 rd->active = 0;
3467 rd->err_code = RDMA_FAILED;
3468 }
3469 }
3470 mutex_exit(&rib_stat->listen_lock);
3471 }
3472
3473 /* XXXX */
3474 /* ARGSUSED */
3475 static void
3476 rib_listen_stop(struct rdma_svc_data *svcdata)
3477 {
3478 rib_hca_t *hca;
3479
3480 mutex_enter(&rib_stat->listen_lock);
3481 /*
3482 	 * KRPC called the RDMATF to stop the listeners; this means we
3483 	 * stop sending incoming or received requests to the KRPC master
3484 	 * transport handle for RDMA-IB. This also means that the
3485 	 * master transport handle, responsible for us, is going away.
3486 */
3487 mutex_enter(&plugin_state_lock);
3488 plugin_state = NO_ACCEPT;
3489 if (svcdata != NULL)
3490 svcdata->active = 0;
3491 mutex_exit(&plugin_state_lock);
3492
3493 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3494 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3495 /*
3496 * First check if a hca is still attached
3497 */
3498 rw_enter(&hca->state_lock, RW_READER);
3499 if (hca->state == HCA_DETACHED) {
3500 rw_exit(&hca->state_lock);
3501 continue;
3502 }
3503 rib_close_channels(&hca->srv_conn_list);
3504 rib_stop_services(hca);
3505 rw_exit(&hca->state_lock);
3506 }
3507 rw_exit(&rib_stat->hcas_list_lock);
3508
3509 /*
3510 * Avoid rib_listen() using the stale q field.
3511 * This could happen if a port goes up after all services
3512 * are already unregistered.
3513 */
3514 rib_stat->q = NULL;
3515 mutex_exit(&rib_stat->listen_lock);
3516 }
3517
3518 /*
3519 * Traverse the HCA's service list to unbind and deregister services.
3520 * For each bound service of HCA to be removed, first find the corresponding
3521 * service handle (srv_hdl) and then unbind the service by calling
3522 * ibt_unbind_service().
3523 */
3524 static void
3525 rib_stop_services(rib_hca_t *hca)
3526 {
3527 rib_hca_service_t *srv_list, *to_remove;
3528
3529 /*
3530 * unbind and deregister the services for this service type.
3531 * Right now there is only one service type. In future it will
3532 * be passed down to this function.
3533 */
3534 rw_enter(&hca->bound_services_lock, RW_READER);
3535 srv_list = hca->bound_services;
3536 hca->bound_services = NULL;
3537 rw_exit(&hca->bound_services_lock);
3538
3539 while (srv_list != NULL) {
3540 rib_service_t *sc;
3541
3542 to_remove = srv_list;
3543 srv_list = to_remove->next;
3544 rw_enter(&rib_stat->service_list_lock, RW_READER);
3545 for (sc = rib_stat->service_list;
3546 sc && (sc->srv_id != to_remove->srv_id);
3547 sc = sc->next)
3548 ;
3549 /*
3550 		 * If sc is NULL then the service doesn't exist anymore; it
3551 		 * was probably already removed completely from rib_stat.
3552 */
3553 if (sc != NULL)
3554 (void) ibt_unbind_service(sc->srv_hdl,
3555 to_remove->sbind_hdl);
3556 rw_exit(&rib_stat->service_list_lock);
3557 kmem_free(to_remove, sizeof (rib_hca_service_t));
3558 }
3559 }
3560
3561 static struct svc_recv *
3562 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3563 {
3564 struct svc_recv *recvp;
3565
3566 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3567 recvp->vaddr = sgl->ds_va;
3568 recvp->qp = qp;
3569 recvp->bytes_xfer = 0;
3570 return (recvp);
3571 }
3572
3573 static int
3574 rib_free_svc_recv(struct svc_recv *recvp)
3575 {
3576 kmem_free(recvp, sizeof (*recvp));
3577
3578 return (0);
3579 }
3580
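/*
 * Allocate a reply tracking entry for msgid and link it at the head of
 * the QP's reply list.  rib_recv() later waits on rep->wait_cv until
 * the receive completion path fills in the reply status.
 */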
3581 static struct reply *
3582 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3583 {
3584 struct reply *rep;
3585
3586
3587 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3588 if (rep == NULL) {
3589 DTRACE_PROBE(rpcib__i__addrreply__nomem);
3590 return (NULL);
3591 }
3592 rep->xid = msgid;
3593 rep->vaddr_cq = NULL;
3594 rep->bytes_xfer = 0;
3595 rep->status = (uint_t)REPLY_WAIT;
3596 rep->prev = NULL;
3597 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3598
3599 mutex_enter(&qp->replylist_lock);
3600 if (qp->replylist) {
3601 rep->next = qp->replylist;
3602 qp->replylist->prev = rep;
3603 }
3604 qp->rep_list_size++;
3605
3606 DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3607 int, qp->rep_list_size);
3608
3609 qp->replylist = rep;
3610 mutex_exit(&qp->replylist_lock);
3611
3612 return (rep);
3613 }
3614
3615 static rdma_stat
3616 rib_rem_replylist(rib_qp_t *qp)
3617 {
3618 struct reply *r, *n;
3619
3620 mutex_enter(&qp->replylist_lock);
3621 for (r = qp->replylist; r != NULL; r = n) {
3622 n = r->next;
3623 (void) rib_remreply(qp, r);
3624 }
3625 mutex_exit(&qp->replylist_lock);
3626
3627 return (RDMA_SUCCESS);
3628 }
3629
3630 static int
3631 rib_remreply(rib_qp_t *qp, struct reply *rep)
3632 {
3633
3634 ASSERT(MUTEX_HELD(&qp->replylist_lock));
3635 if (rep->prev) {
3636 rep->prev->next = rep->next;
3637 }
3638 if (rep->next) {
3639 rep->next->prev = rep->prev;
3640 }
3641 if (qp->replylist == rep)
3642 qp->replylist = rep->next;
3643
3644 cv_destroy(&rep->wait_cv);
3645 qp->rep_list_size--;
3646
3647 DTRACE_PROBE1(rpcib__i__remreply__listsize,
3648 int, qp->rep_list_size);
3649
3650 kmem_free(rep, sizeof (*rep));
3651
3652 return (0);
3653 }
3654
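/*
 * Register a caller-supplied buffer with the HCA and return the lkey,
 * rkey and registration handle in buf_handle.
 */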
3655 rdma_stat
3656 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3657 struct mrc *buf_handle)
3658 {
3659 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3660 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3661 rdma_stat status;
3662 rib_hca_t *hca = (ctoqp(conn))->hca;
3663
3664 /*
3665 * Note: ALL buffer pools use the same memory type RDMARW.
3666 */
3667 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3668 if (status == RDMA_SUCCESS) {
3669 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3670 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3671 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3672 } else {
3673 buf_handle->mrc_linfo = NULL;
3674 buf_handle->mrc_lmr = 0;
3675 buf_handle->mrc_rmr = 0;
3676 }
3677 return (status);
3678 }
3679
3680 static rdma_stat
3681 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3682 ibt_mr_flags_t spec,
3683 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3684 {
3685 ibt_mr_attr_t mem_attr;
3686 ibt_status_t ibt_status;
3687 mem_attr.mr_vaddr = (uintptr_t)buf;
3688 mem_attr.mr_len = (ib_msglen_t)size;
3689 mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3690 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3691 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3692 IBT_MR_ENABLE_WINDOW_BIND | spec;
3693
3694 rw_enter(&hca->state_lock, RW_READER);
3695 if (hca->state != HCA_DETACHED) {
3696 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3697 &mem_attr, mr_hdlp, mr_descp);
3698 rw_exit(&hca->state_lock);
3699 } else {
3700 rw_exit(&hca->state_lock);
3701 return (RDMA_FAILED);
3702 }
3703
3704 if (ibt_status != IBT_SUCCESS) {
3705 return (RDMA_FAILED);
3706 }
3707 return (RDMA_SUCCESS);
3708 }
3709
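/*
 * Like rib_registermem(), but cooperates with the registered buffer
 * cache (lrc): if the cache entry is already registered its cached
 * handles are returned, otherwise the whole cached buffer is
 * registered and the handles are remembered in the entry for reuse.
 */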
3710 rdma_stat
3711 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3712 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3713 {
3714 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3715 rib_lrc_entry_t *l;
3716 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3717 rdma_stat status;
3718 rib_hca_t *hca = (ctoqp(conn))->hca;
3719
3720 /*
3721 * Non-coherent memory registration.
3722 */
3723 l = (rib_lrc_entry_t *)lrc;
3724 if (l) {
3725 if (l->registered) {
3726 buf_handle->mrc_linfo =
3727 (uintptr_t)l->lrc_mhandle.mrc_linfo;
3728 buf_handle->mrc_lmr =
3729 (uint32_t)l->lrc_mhandle.mrc_lmr;
3730 buf_handle->mrc_rmr =
3731 (uint32_t)l->lrc_mhandle.mrc_rmr;
3732 *sync_handle = (RIB_SYNCMEM_HANDLE)
3733 (uintptr_t)l->lrc_mhandle.mrc_linfo;
3734 return (RDMA_SUCCESS);
3735 } else {
3736 /* Always register the whole buffer */
3737 buf = (caddr_t)l->lrc_buf;
3738 buflen = l->lrc_len;
3739 }
3740 }
3741 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3742
3743 if (status == RDMA_SUCCESS) {
3744 if (l) {
3745 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3746 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
3747 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
3748 l->registered = TRUE;
3749 }
3750 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3751 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3752 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3753 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3754 } else {
3755 buf_handle->mrc_linfo = NULL;
3756 buf_handle->mrc_lmr = 0;
3757 buf_handle->mrc_rmr = 0;
3758 }
3759 return (status);
3760 }
3761
3762 /* ARGSUSED */
3763 rdma_stat
3764 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3765 {
3766 rib_hca_t *hca = (ctoqp(conn))->hca;
3767 /*
3768 * Allow memory deregistration even if HCA is
3769 * getting detached. Need all outstanding
3770 * memory registrations to be deregistered
3771 * before HCA_DETACH_EVENT can be accepted.
3772 */
3773 (void) ibt_deregister_mr(hca->hca_hdl,
3774 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3775 return (RDMA_SUCCESS);
3776 }
3777
3778 /* ARGSUSED */
3779 rdma_stat
3780 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3781 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3782 {
3783 rib_lrc_entry_t *l;
3784 l = (rib_lrc_entry_t *)lrc;
3785 if (l)
3786 if (l->registered)
3787 return (RDMA_SUCCESS);
3788
3789 (void) rib_deregistermem(conn, buf, buf_handle);
3790
3791 return (RDMA_SUCCESS);
3792 }
3793
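/*
 * Sync the registered region for DMA: cpu != 0 makes data written by
 * incoming RDMA visible to the CPU (IBT_SYNC_WRITE), cpu == 0 makes
 * CPU stores visible to the HCA for outgoing I/O (IBT_SYNC_READ).
 */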
3794 /* ARGSUSED */
3795 rdma_stat
3796 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3797 int len, int cpu)
3798 {
3799 ibt_status_t status;
3800 rib_hca_t *hca = (ctoqp(conn))->hca;
3801 ibt_mr_sync_t mr_segment;
3802
3803 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3804 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3805 mr_segment.ms_len = (ib_memlen_t)len;
3806 if (cpu) {
3807 /* make incoming data visible to memory */
3808 mr_segment.ms_flags = IBT_SYNC_WRITE;
3809 } else {
3810 /* make memory changes visible to IO */
3811 mr_segment.ms_flags = IBT_SYNC_READ;
3812 }
3813 rw_enter(&hca->state_lock, RW_READER);
3814 if (hca->state != HCA_DETACHED) {
3815 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3816 rw_exit(&hca->state_lock);
3817 } else {
3818 rw_exit(&hca->state_lock);
3819 return (RDMA_FAILED);
3820 }
3821
3822 if (status == IBT_SUCCESS)
3823 return (RDMA_SUCCESS);
3824 else {
3825 return (RDMA_FAILED);
3826 }
3827 }
3828
3829 /*
3830 * XXXX ????
3831 */
3832 static rdma_stat
3833 rib_getinfo(rdma_info_t *info)
3834 {
3835 /*
3836 * XXXX Hack!
3837 */
3838 info->addrlen = 16;
3839 info->mts = 1000000;
3840 info->mtu = 1000000;
3841
3842 return (RDMA_SUCCESS);
3843 }
3844
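/*
 * Create and register a pool of 'num' fixed-size buffers of the given
 * type (RPC_MSG_SZ for SEND_BUFFER, RPC_BUF_SIZE for RECV_BUFFER).
 * Each buffer gets its own memory registration so it can be handed
 * out and completed individually.
 */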
3845 rib_bufpool_t *
3846 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3847 {
3848 rib_bufpool_t *rbp = NULL;
3849 bufpool_t *bp = NULL;
3850 caddr_t buf;
3851 ibt_mr_attr_t mem_attr;
3852 ibt_status_t ibt_status;
3853 int i, j;
3854
3855 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3856
3857 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3858 num * sizeof (void *), KM_SLEEP);
3859
3860 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3861 bp->numelems = num;
3862
3863
3864 switch (ptype) {
3865 case SEND_BUFFER:
3866 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3867 bp->rsize = RPC_MSG_SZ;
3868 break;
3869 case RECV_BUFFER:
3870 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3871 bp->rsize = RPC_BUF_SIZE;
3872 break;
3873 default:
3874 goto fail;
3875 }
3876
3877 /*
3878 * Register the pool.
3879 */
3880 bp->bufsize = num * bp->rsize;
3881 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3882 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3883 sizeof (ibt_mr_hdl_t), KM_SLEEP);
3884 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3885 sizeof (ibt_mr_desc_t), KM_SLEEP);
3886 rw_enter(&hca->state_lock, RW_READER);
3887
3888 if (hca->state == HCA_DETACHED) {
3889 rw_exit(&hca->state_lock);
3890 goto fail;
3891 }
3892
3893 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3894 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3895 mem_attr.mr_vaddr = (uintptr_t)buf;
3896 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3897 mem_attr.mr_as = NULL;
3898 ibt_status = ibt_register_mr(hca->hca_hdl,
3899 hca->pd_hdl, &mem_attr,
3900 &rbp->mr_hdl[i],
3901 &rbp->mr_desc[i]);
3902 if (ibt_status != IBT_SUCCESS) {
3903 for (j = 0; j < i; j++) {
3904 (void) ibt_deregister_mr(hca->hca_hdl,
3905 rbp->mr_hdl[j]);
3906 }
3907 rw_exit(&hca->state_lock);
3908 goto fail;
3909 }
3910 }
3911 rw_exit(&hca->state_lock);
3912 buf = (caddr_t)bp->buf;
3913 for (i = 0; i < num; i++, buf += bp->rsize) {
3914 bp->buflist[i] = (void *)buf;
3915 }
3916 bp->buffree = num - 1; /* no. of free buffers */
3917 rbp->bpool = bp;
3918
3919 return (rbp);
3920 fail:
3921 if (bp) {
3922 if (bp->buf)
3923 kmem_free(bp->buf, bp->bufsize);
3924 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3925 }
3926 if (rbp) {
3927 if (rbp->mr_hdl)
3928 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3929 if (rbp->mr_desc)
3930 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3931 kmem_free(rbp, sizeof (rib_bufpool_t));
3932 }
3933 return (NULL);
3934 }
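
/*
 * Illustrative sketch only (not part of the original code): a caller
 * typically pairs rib_rbufpool_create() with rib_rbufpool_destroy(),
 * one pool per buffer type.  The element count shown here is a made-up
 * example value, not the count used elsewhere in this driver.
 *
 *	hca->send_pool = rib_rbufpool_create(hca, SEND_BUFFER, 32);
 *	hca->recv_pool = rib_rbufpool_create(hca, RECV_BUFFER, 32);
 *	if (hca->send_pool == NULL || hca->recv_pool == NULL)
 *		(clean up and fail the HCA setup)
 *	...
 *	rib_rbufpool_destroy(hca, SEND_BUFFER);
 *	rib_rbufpool_destroy(hca, RECV_BUFFER);
 */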
3935
3936 static void
3937 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3938 {
3939 int i;
3940 rib_bufpool_t *rbp = NULL;
3941 bufpool_t *bp;
3942
3943 /*
3944 * Obtain pool address based on type of pool
3945 */
3946 switch (ptype) {
3947 case SEND_BUFFER:
3948 rbp = hca->send_pool;
3949 break;
3950 case RECV_BUFFER:
3951 rbp = hca->recv_pool;
3952 break;
3953 default:
3954 return;
3955 }
3956 if (rbp == NULL)
3957 return;
3958
3959 bp = rbp->bpool;
3960
3961 /*
3962 * Deregister the pool memory and free it.
3963 */
3964 for (i = 0; i < bp->numelems; i++) {
3965 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3966 }
3967 }
3968
3969 static void
3970 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3971 {
3972
3973 rib_bufpool_t *rbp = NULL;
3974 bufpool_t *bp;
3975
3976 /*
3977 * Obtain pool address based on type of pool
3978 */
3979 switch (ptype) {
3980 case SEND_BUFFER:
3981 rbp = hca->send_pool;
3982 break;
3983 case RECV_BUFFER:
3984 rbp = hca->recv_pool;
3985 break;
3986 default:
3987 return;
3988 }
3989 if (rbp == NULL)
3990 return;
3991
3992 bp = rbp->bpool;
3993
3994 /*
3995 * Free the pool memory.
3996 */
3997 if (rbp->mr_hdl)
3998 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3999
4000 if (rbp->mr_desc)
4001 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4002 if (bp->buf)
4003 kmem_free(bp->buf, bp->bufsize);
4004 mutex_destroy(&bp->buflock);
4005 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4006 kmem_free(rbp, sizeof (rib_bufpool_t));
4007 }
4008
4009 void
4010 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4011 {
4012 /*
4013 * Deregister the pool memory and free it.
4014 */
4015 rib_rbufpool_deregister(hca, ptype);
4016 rib_rbufpool_free(hca, ptype);
4017 }
4018
4019 /*
4020 * Fetch a buffer from the pool of type specified in rdbuf->type.
4021 */
4022 static rdma_stat
4023 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4024 {
4025 rib_lrc_entry_t *rlep;
4026
4027 if (rdbuf->type == RDMA_LONG_BUFFER) {
4028 rlep = rib_get_cache_buf(conn, rdbuf->len);
4029 rdbuf->rb_private = (caddr_t)rlep;
4030 rdbuf->addr = rlep->lrc_buf;
4031 rdbuf->handle = rlep->lrc_mhandle;
4032 return (RDMA_SUCCESS);
4033 }
4034
4035 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4036 if (rdbuf->addr) {
4037 switch (rdbuf->type) {
4038 case SEND_BUFFER:
4039 rdbuf->len = RPC_MSG_SZ; /* 1K */
4040 break;
4041 case RECV_BUFFER:
4042 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4043 break;
4044 default:
4045 rdbuf->len = 0;
4046 }
4047 return (RDMA_SUCCESS);
4048 } else
4049 return (RDMA_FAILED);
4050 }
4051
4052 /*
4053 * Fetch a buffer of specified type.
4054 * Note that rdbuf->handle is mw's rkey.
4055 */
4056 static void *
4057 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4058 {
4059 rib_qp_t *qp = ctoqp(conn);
4060 rib_hca_t *hca = qp->hca;
4061 rdma_btype ptype = rdbuf->type;
4062 void *buf;
4063 rib_bufpool_t *rbp = NULL;
4064 bufpool_t *bp;
4065 int i;
4066
4067 /*
4068 * Obtain pool address based on type of pool
4069 */
4070 switch (ptype) {
4071 case SEND_BUFFER:
4072 rbp = hca->send_pool;
4073 break;
4074 case RECV_BUFFER:
4075 rbp = hca->recv_pool;
4076 break;
4077 default:
4078 return (NULL);
4079 }
4080 if (rbp == NULL)
4081 return (NULL);
4082
4083 bp = rbp->bpool;
4084
4085 mutex_enter(&bp->buflock);
4086 if (bp->buffree < 0) {
4087 mutex_exit(&bp->buflock);
4088 return (NULL);
4089 }
4090
4091 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4092 buf = bp->buflist[bp->buffree];
4093 rdbuf->addr = buf;
4094 rdbuf->len = bp->rsize;
4095 for (i = bp->numelems - 1; i >= 0; i--) {
4096 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4097 rdbuf->handle.mrc_rmr =
4098 (uint32_t)rbp->mr_desc[i].md_rkey;
4099 rdbuf->handle.mrc_linfo =
4100 (uintptr_t)rbp->mr_hdl[i];
4101 rdbuf->handle.mrc_lmr =
4102 (uint32_t)rbp->mr_desc[i].md_lkey;
4103 bp->buffree--;
4104
4105 mutex_exit(&bp->buflock);
4106
4107 return (buf);
4108 }
4109 }
4110
4111 mutex_exit(&bp->buflock);
4112
4113 return (NULL);
4114 }
4115
4116 static void
4117 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4118 {
4119
4120 if (rdbuf->type == RDMA_LONG_BUFFER) {
4121 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4122 rdbuf->rb_private = NULL;
4123 return;
4124 }
4125 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4126 }
4127
4128 static void
4129 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4130 {
4131 rib_qp_t *qp = ctoqp(conn);
4132 rib_hca_t *hca = qp->hca;
4133 rib_bufpool_t *rbp = NULL;
4134 bufpool_t *bp;
4135
4136 /*
4137 * Obtain pool address based on type of pool
4138 */
4139 switch (ptype) {
4140 case SEND_BUFFER:
4141 rbp = hca->send_pool;
4142 break;
4143 case RECV_BUFFER:
4144 rbp = hca->recv_pool;
4145 break;
4146 default:
4147 return;
4148 }
4149 if (rbp == NULL)
4150 return;
4151
4152 bp = rbp->bpool;
4153
4154 mutex_enter(&bp->buflock);
4155 if (++bp->buffree >= bp->numelems) {
4156 /*
4157 * Should never happen
4158 */
4159 bp->buffree--;
4160 } else {
4161 bp->buflist[bp->buffree] = buf;
4162 }
4163 mutex_exit(&bp->buflock);
4164 }
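
/*
 * Illustrative sketch only: how a registered send buffer is typically
 * borrowed from and returned to the pool through the rdma_buf_t
 * interface.  Variable names here are hypothetical.
 *
 *	rdma_buf_t sbuf;
 *
 *	bzero(&sbuf, sizeof (sbuf));
 *	sbuf.type = SEND_BUFFER;
 *	if (rib_reg_buf_alloc(conn, &sbuf) != RDMA_SUCCESS)
 *		return (RDMA_FAILED);
 *	(sbuf.addr, sbuf.len and sbuf.handle now describe a registered
 *	send buffer)
 *	...
 *	rib_reg_buf_free(conn, &sbuf);
 */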
4165
4166 static rdma_stat
4167 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4168 {
4169 rw_enter(&connlist->conn_lock, RW_WRITER);
4170 if (connlist->conn_hd) {
4171 cn->c_next = connlist->conn_hd;
4172 connlist->conn_hd->c_prev = cn;
4173 }
4174 connlist->conn_hd = cn;
4175 rw_exit(&connlist->conn_lock);
4176
4177 return (RDMA_SUCCESS);
4178 }
4179
4180 static rdma_stat
4181 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4182 {
4183 rw_enter(&connlist->conn_lock, RW_WRITER);
4184 if (cn->c_prev) {
4185 cn->c_prev->c_next = cn->c_next;
4186 }
4187 if (cn->c_next) {
4188 cn->c_next->c_prev = cn->c_prev;
4189 }
4190 if (connlist->conn_hd == cn)
4191 connlist->conn_hd = cn->c_next;
4192 rw_exit(&connlist->conn_lock);
4193
4194 return (RDMA_SUCCESS);
4195 }
4196
4197 /* ARGSUSED */
4198 static rdma_stat
4199 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4200 int addr_type, void *handle, CONN **conn)
4201 {
4202 rdma_stat status;
4203 rpcib_ping_t rpt;
4204
4205 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4206 return (status);
4207 }
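
/*
 * Illustrative sketch only: the usual caller pattern for the conn
 * get/release pair.  Variable names here are hypothetical.
 *
 *	CONN *conn;
 *
 *	if (rib_conn_get(&src, &dst, addr_type, NULL, &conn) != RDMA_SUCCESS)
 *		return (RDMA_FAILED);
 *	(post sends/receives on conn while holding the c_ref taken above)
 *	...
 *	(void) rib_conn_release(conn);
 */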
4208
4209 /*
4210 * rib_find_hca_connection
4211 *
4212 * if there is an existing connection to the specified address then
4213 * it will be returned in conn, otherwise conn will be set to NULL.
4214 * Also cleans up any connection that is in error state.
4215 */
4216 static int
4217 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4218 struct netbuf *d_svcaddr, CONN **conn)
4219 {
4220 CONN *cn;
4221 clock_t cv_stat, timout;
4222
4223 *conn = NULL;
4224 again:
4225 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4226 cn = hca->cl_conn_list.conn_hd;
4227 while (cn != NULL) {
4228 /*
4229 * First, clear up any connection in the ERROR state
4230 */
4231 mutex_enter(&cn->c_lock);
4232 if (cn->c_state == C_ERROR_CONN) {
4233 if (cn->c_ref == 0) {
4234 /*
4235 * Remove connection from list and destroy it.
4236 */
4237 cn->c_state = C_DISCONN_PEND;
4238 mutex_exit(&cn->c_lock);
4239 rw_exit(&hca->cl_conn_list.conn_lock);
4240 rib_conn_close((void *)cn);
4241 goto again;
4242 }
4243 mutex_exit(&cn->c_lock);
4244 cn = cn->c_next;
4245 continue;
4246 }
4247 if (cn->c_state == C_DISCONN_PEND) {
4248 mutex_exit(&cn->c_lock);
4249 cn = cn->c_next;
4250 continue;
4251 }
4252
4253 		/*
4254 		 * The source address is only checked when one is
4255 		 * supplied, which is the case for retries.
4256 		 */
4257 if ((cn->c_raddr.len == d_svcaddr->len) &&
4258 (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4259 d_svcaddr->len) == 0) &&
4260 ((s_svcaddr->len == 0) ||
4261 ((cn->c_laddr.len == s_svcaddr->len) &&
4262 (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4263 s_svcaddr->len) == 0)))) {
4264 /*
4265 * Our connection. Give up conn list lock
4266 * as we are done traversing the list.
4267 */
4268 rw_exit(&hca->cl_conn_list.conn_lock);
4269 if (cn->c_state == C_CONNECTED) {
4270 cn->c_ref++; /* sharing a conn */
4271 mutex_exit(&cn->c_lock);
4272 *conn = cn;
4273 return (RDMA_SUCCESS);
4274 }
4275 if (cn->c_state == C_CONN_PEND) {
4276 /*
4277 * Hold a reference to this conn before
4278 * we give up the lock.
4279 */
4280 cn->c_ref++;
4281 timout = ddi_get_lbolt() +
4282 drv_usectohz(CONN_WAIT_TIME * 1000000);
4283 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4284 &cn->c_lock, timout)) > 0 &&
4285 cn->c_state == C_CONN_PEND)
4286 ;
4287 if (cv_stat == 0) {
4288 (void) rib_conn_release_locked(cn);
4289 return (RDMA_INTR);
4290 }
4291 if (cv_stat < 0) {
4292 (void) rib_conn_release_locked(cn);
4293 return (RDMA_TIMEDOUT);
4294 }
4295 if (cn->c_state == C_CONNECTED) {
4296 *conn = cn;
4297 mutex_exit(&cn->c_lock);
4298 return (RDMA_SUCCESS);
4299 } else {
4300 (void) rib_conn_release_locked(cn);
4301 return (RDMA_TIMEDOUT);
4302 }
4303 }
4304 }
4305 mutex_exit(&cn->c_lock);
4306 cn = cn->c_next;
4307 }
4308 rw_exit(&hca->cl_conn_list.conn_lock);
4309 *conn = NULL;
4310 return (RDMA_FAILED);
4311 }
4312
4313 /*
4314  * Connection management.
4315  * IBTF does not support recycling of channels, so a connection is only
4316  * ever in one of four states: C_CONN_PEND, C_CONNECTED, C_ERROR_CONN or
4317  * C_DISCONN_PEND.  There is no C_IDLE state.
4318  * C_CONN_PEND state: connection establishment to the server is in progress.
4319  * C_CONNECTED state: a connection is placed in this state when it is created.
4320  * It has an RC channel associated with it; ibt_post_send/recv are allowed
4321  * only in this state.
4322  * C_ERROR_CONN state: a connection transitions to this state when WRs on the
4323  * channel are completed in error, an IBT_CM_EVENT_CONN_CLOSED event
4324  * happens on the channel, or an IBT_HCA_DETACH_EVENT occurs on the HCA.
4325  * C_DISCONN_PEND state: when a connection is in C_ERROR_CONN state and
4326  * c_ref drops to 0 (indicating that RPC holds no more references to the
4327  * connection), the connection is to be destroyed.  A connection transitions
4328  * into this state while it is being destroyed.
4329  */
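/*
 * A rough sketch of the resulting state transitions (illustration only,
 * derived from the description above):
 *
 *	rib_connect(): C_CONN_PEND ------success------> C_CONNECTED
 *	                    |                                |
 *	                    | connect failed                 | WR error,
 *	                    v                                | CONN_CLOSED,
 *	               C_ERROR_CONN <------------------------+ HCA detach
 *	                    |
 *	                    | c_ref drops to 0
 *	                    v
 *	              C_DISCONN_PEND ---> channel destroyed
 */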
4330 /* ARGSUSED */
4331 static rdma_stat
4332 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4333 int addr_type, rpcib_ping_t *rpt, CONN **conn)
4334 {
4335 CONN *cn;
4336 int status;
4337 rib_hca_t *hca;
4338 rib_qp_t *qp;
4339 int s_addr_len;
4340 char *s_addr_buf;
4341
4342 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4343 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4344 rw_enter(&hca->state_lock, RW_READER);
4345 if (hca->state != HCA_DETACHED) {
4346 status = rib_find_hca_connection(hca, s_svcaddr,
4347 d_svcaddr, conn);
4348 rw_exit(&hca->state_lock);
4349 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4350 rw_exit(&rib_stat->hcas_list_lock);
4351 return (status);
4352 }
4353 } else
4354 rw_exit(&hca->state_lock);
4355 }
4356 rw_exit(&rib_stat->hcas_list_lock);
4357
4358 /*
4359 * No existing connection found, establish a new connection.
4360 */
4361 bzero(rpt, sizeof (rpcib_ping_t));
4362
4363 status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4364 if (status != RDMA_SUCCESS) {
4365 return (RDMA_FAILED);
4366 }
4367 hca = rpt->hca;
4368
4369 if (rpt->srcip.family == AF_INET) {
4370 s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4371 s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4372 } else if (rpt->srcip.family == AF_INET6) {
4373 s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4374 s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4375 } else {
4376 return (RDMA_FAILED);
4377 }
4378
4379 /*
4380 * Channel to server doesn't exist yet, create one.
4381 */
4382 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4383 return (RDMA_FAILED);
4384 }
4385 cn = qptoc(qp);
4386 cn->c_state = C_CONN_PEND;
4387 cn->c_ref = 1;
4388
4389 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4390 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4391 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4392
4393 if (rpt->srcip.family == AF_INET) {
4394 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4395 (void) strcpy(cn->c_netid, RIBNETID_TCP);
4396
4397 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4398 sizeof (struct sockaddr_in);
4399 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4400
4401 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4402 (uint32_t)~0;
4403 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4404 (ushort_t)~0;
4405
4406 } else {
4407 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4408 (void) strcpy(cn->c_netid, RIBNETID_TCP6);
4409
4410 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4411 sizeof (struct sockaddr_in6);
4412 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4413
4414 (void) memset(
4415 &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4416 (uchar_t)~0, sizeof (struct in6_addr));
4417 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4418 (sa_family_t)~0;
4419 }
4420
4421 	/*
4422 	 * Add to conn list.
4423 	 * We gave up the READER lock above.  In the time since then,
4424 	 * another thread might have created the connection we are
4425 	 * establishing here.  For now that is quite all right - there
4426 	 * might be two connections between a pair of hosts instead
4427 	 * of one.  If we really wanted to close that window, we would
4428 	 * need to recheck the list after acquiring the
4429 	 * WRITER lock.
4430 	 */
4431 (void) rib_add_connlist(cn, &hca->cl_conn_list);
4432 status = rib_conn_to_srv(hca, qp, rpt);
4433 mutex_enter(&cn->c_lock);
4434
4435 if (cn->c_flags & C_CLOSE_PENDING) {
4436 		/*
4437 		 * This handles the case where the module or
4438 		 * HCA detached while the connection was being
4439 		 * established. In such a case close the
4440 		 * connection immediately if this is the
4441 		 * only reference.
4442 		 */
4443 if (cn->c_ref == 1) {
4444 cn->c_ref--;
4445 cn->c_state = C_DISCONN_PEND;
4446 mutex_exit(&cn->c_lock);
4447 rib_conn_close((void *)cn);
4448 return (RDMA_FAILED);
4449 }
4450
4451 /*
4452 * Connection to be closed later when c_ref = 0
4453 */
4454 status = RDMA_FAILED;
4455 }
4456
4457 if (status == RDMA_SUCCESS) {
4458 cn->c_state = C_CONNECTED;
4459 *conn = cn;
4460 } else {
4461 cn->c_state = C_ERROR_CONN;
4462 cn->c_ref--;
4463 }
4464 cv_signal(&cn->c_cv);
4465 mutex_exit(&cn->c_lock);
4466 return (status);
4467 }
4468
4469 static void
4470 rib_conn_close(void *rarg)
4471 {
4472 CONN *conn = (CONN *)rarg;
4473 rib_qp_t *qp = ctoqp(conn);
4474
4475 mutex_enter(&conn->c_lock);
4476 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4477
4478 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4479
4480 /*
4481 * Live connection in CONNECTED state.
4482 */
4483 if (conn->c_state == C_CONNECTED) {
4484 conn->c_state = C_ERROR_CONN;
4485 }
4486 mutex_exit(&conn->c_lock);
4487
4488 rib_close_a_channel(conn);
4489
4490 mutex_enter(&conn->c_lock);
4491 conn->c_flags &= ~C_CLOSE_PENDING;
4492 }
4493
4494 mutex_exit(&conn->c_lock);
4495
4496 if (qp->mode == RIB_SERVER)
4497 (void) rib_disconnect_channel(conn,
4498 &qp->hca->srv_conn_list);
4499 else
4500 (void) rib_disconnect_channel(conn,
4501 &qp->hca->cl_conn_list);
4502 }
4503
4504 static void
4505 rib_conn_timeout_call(void *carg)
4506 {
4507 time_t idle_time;
4508 CONN *conn = (CONN *)carg;
4509 rib_hca_t *hca = ctoqp(conn)->hca;
4510 int error;
4511
4512 mutex_enter(&conn->c_lock);
4513 if ((conn->c_ref > 0) ||
4514 (conn->c_state == C_DISCONN_PEND)) {
4515 conn->c_timeout = NULL;
4516 mutex_exit(&conn->c_lock);
4517 return;
4518 }
4519
4520 idle_time = (gethrestime_sec() - conn->c_last_used);
4521
4522 if ((idle_time <= rib_conn_timeout) &&
4523 (conn->c_state != C_ERROR_CONN)) {
4524 		/*
4525 		 * There was activity after the last timeout, so
4526 		 * extend the conn's life, unless the conn is
4527 		 * already in error state.
4528 		 */
4529 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4530 SEC_TO_TICK(rib_conn_timeout - idle_time));
4531 mutex_exit(&conn->c_lock);
4532 return;
4533 }
4534
4535 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4536 (void *)conn, DDI_NOSLEEP);
4537
4538 /*
4539 * If taskq dispatch fails above, then reset the timeout
4540 * to try again after 10 secs.
4541 */
4542
4543 if (error != DDI_SUCCESS) {
4544 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4545 SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4546 mutex_exit(&conn->c_lock);
4547 return;
4548 }
4549
4550 conn->c_state = C_DISCONN_PEND;
4551 mutex_exit(&conn->c_lock);
4552 }
4553
4554 static rdma_stat
4555 rib_conn_release(CONN *conn)
4556 {
4557 mutex_enter(&conn->c_lock);
4558 return (rib_conn_release_locked(conn));
4559 }
4560
4561 /*
4562 * Expects conn->c_lock to be held on entry.
4563 * c_lock released on return
4564 */
4565 static rdma_stat
4566 rib_conn_release_locked(CONN *conn)
4567 {
4568 conn->c_ref--;
4569
4570 conn->c_last_used = gethrestime_sec();
4571 if (conn->c_ref > 0) {
4572 mutex_exit(&conn->c_lock);
4573 return (RDMA_SUCCESS);
4574 }
4575
4576 /*
4577 * If a conn is C_ERROR_CONN, close the channel.
4578 */
4579 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4580 conn->c_state = C_DISCONN_PEND;
4581 mutex_exit(&conn->c_lock);
4582 rib_conn_close((void *)conn);
4583 return (RDMA_SUCCESS);
4584 }
4585
4586 /*
4587 * c_ref == 0, set a timeout for conn release
4588 */
4589
4590 if (conn->c_timeout == NULL) {
4591 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4592 SEC_TO_TICK(rib_conn_timeout));
4593 }
4594
4595 mutex_exit(&conn->c_lock);
4596 return (RDMA_SUCCESS);
4597 }
4598
4599 /*
4600 * Add at front of list
4601 */
4602 static struct rdma_done_list *
4603 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4604 {
4605 struct rdma_done_list *rd;
4606
4607 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4608
4609 rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4610 rd->xid = xid;
4611 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4612
4613 rd->prev = NULL;
4614 rd->next = qp->rdlist;
4615 if (qp->rdlist != NULL)
4616 qp->rdlist->prev = rd;
4617 qp->rdlist = rd;
4618
4619 return (rd);
4620 }
4621
4622 static void
4623 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4624 {
4625 struct rdma_done_list *r;
4626
4627 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4628
4629 r = rd->next;
4630 if (r != NULL) {
4631 r->prev = rd->prev;
4632 }
4633
4634 r = rd->prev;
4635 if (r != NULL) {
4636 r->next = rd->next;
4637 } else {
4638 qp->rdlist = rd->next;
4639 }
4640
4641 cv_destroy(&rd->rdma_done_cv);
4642 kmem_free(rd, sizeof (*rd));
4643 }
4644
4645 static void
4646 rdma_done_rem_list(rib_qp_t *qp)
4647 {
4648 struct rdma_done_list *r, *n;
4649
4650 mutex_enter(&qp->rdlist_lock);
4651 for (r = qp->rdlist; r != NULL; r = n) {
4652 n = r->next;
4653 rdma_done_rm(qp, r);
4654 }
4655 mutex_exit(&qp->rdlist_lock);
4656 }
4657
4658 static void
4659 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4660 {
4661 struct rdma_done_list *r = qp->rdlist;
4662
4663 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4664
4665 while (r) {
4666 if (r->xid == xid) {
4667 cv_signal(&r->rdma_done_cv);
4668 return;
4669 } else {
4670 r = r->next;
4671 }
4672 }
4673 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4674 int, xid);
4675 }
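
/*
 * Illustrative sketch only: the rdma_done list lets a sender wait for
 * an RDMA_DONE indication for a given xid.  A waiter would look roughly
 * like this (names hypothetical, timeout handling omitted):
 *
 *	struct rdma_done_list *rd;
 *
 *	mutex_enter(&qp->rdlist_lock);
 *	rd = rdma_done_add(qp, xid);
 *	(void) cv_wait_sig(&rd->rdma_done_cv, &qp->rdlist_lock);
 *	rdma_done_rm(qp, rd);
 *	mutex_exit(&qp->rdlist_lock);
 *
 * while the completion path calls rdma_done_notify(qp, xid) with
 * rdlist_lock held to wake the waiter.
 */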
4676
4677 /*
4678 * Expects conn->c_lock to be held by the caller.
4679 */
4680
4681 static void
4682 rib_close_a_channel(CONN *conn)
4683 {
4684 rib_qp_t *qp;
4685 qp = ctoqp(conn);
4686
4687 if (qp->qp_hdl == NULL) {
4688 /* channel already freed */
4689 return;
4690 }
4691
4692 /*
4693 * Call ibt_close_rc_channel in blocking mode
4694 * with no callbacks.
4695 */
4696 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4697 NULL, 0, NULL, NULL, 0);
4698 }
4699
4700 /*
4701 * Goes through all connections and closes the channel
4702 * This will cause all the WRs on those channels to be
4703 * flushed.
4704 */
4705 static void
4706 rib_close_channels(rib_conn_list_t *connlist)
4707 {
4708 CONN *conn, *tmp;
4709
4710 rw_enter(&connlist->conn_lock, RW_READER);
4711 conn = connlist->conn_hd;
4712 while (conn != NULL) {
4713 mutex_enter(&conn->c_lock);
4714 tmp = conn->c_next;
4715 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4716
4717 if (conn->c_state == C_CONN_PEND) {
4718 conn->c_flags |= C_CLOSE_PENDING;
4719 goto next;
4720 }
4721
4722 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4723
4724 /*
4725 * Live connection in CONNECTED state.
4726 */
4727 if (conn->c_state == C_CONNECTED)
4728 conn->c_state = C_ERROR_CONN;
4729 mutex_exit(&conn->c_lock);
4730
4731 rib_close_a_channel(conn);
4732
4733 mutex_enter(&conn->c_lock);
4734 conn->c_flags &= ~C_CLOSE_PENDING;
4735 /* Signal a pending rib_disconnect_channel() */
4736 cv_signal(&conn->c_cv);
4737 }
4738 next:
4739 mutex_exit(&conn->c_lock);
4740 conn = tmp;
4741 }
4742 rw_exit(&connlist->conn_lock);
4743 }
4744
4745 /*
4746 * Frees up all connections that are no longer being referenced
4747 */
4748 static void
4749 rib_purge_connlist(rib_conn_list_t *connlist)
4750 {
4751 CONN *conn;
4752
4753 top:
4754 rw_enter(&connlist->conn_lock, RW_READER);
4755 conn = connlist->conn_hd;
4756 while (conn != NULL) {
4757 mutex_enter(&conn->c_lock);
4758
4759 /*
4760 * At this point connection is either in ERROR
4761 * or DISCONN_PEND state. If in DISCONN_PEND state
4762 * then some other thread is culling that connection.
4763 * If not and if c_ref is 0, then destroy the connection.
4764 */
4765 if (conn->c_ref == 0 &&
4766 conn->c_state != C_DISCONN_PEND) {
4767 /*
4768 * Cull the connection
4769 */
4770 conn->c_state = C_DISCONN_PEND;
4771 mutex_exit(&conn->c_lock);
4772 rw_exit(&connlist->conn_lock);
4773 (void) rib_disconnect_channel(conn, connlist);
4774 goto top;
4775 } else {
4776 /*
4777 * conn disconnect already scheduled or will
4778 * happen from conn_release when c_ref drops to 0.
4779 */
4780 mutex_exit(&conn->c_lock);
4781 }
4782 conn = conn->c_next;
4783 }
4784 rw_exit(&connlist->conn_lock);
4785
4786 /*
4787 * At this point, only connections with c_ref != 0 are on the list
4788 */
4789 }
4790
4791 /*
4792 * Free all the HCA resources and close
4793 * the hca.
4794 */
4795
4796 static void
4797 rib_free_hca(rib_hca_t *hca)
4798 {
4799 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4800 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4801 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4802 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4803
4804 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4805 kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4806 kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4807 kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4808
4809 rib_rbufpool_destroy(hca, RECV_BUFFER);
4810 rib_rbufpool_destroy(hca, SEND_BUFFER);
4811 rib_destroy_cache(hca);
4812 if (rib_mod.rdma_count == 0)
4813 (void) rdma_unregister_mod(&rib_mod);
4814 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4815 (void) ibt_close_hca(hca->hca_hdl);
4816 hca->hca_hdl = NULL;
4817 }
4818
4819
4820 static void
4821 rib_stop_hca_services(rib_hca_t *hca)
4822 {
4823 rib_stop_services(hca);
4824 rib_close_channels(&hca->cl_conn_list);
4825 rib_close_channels(&hca->srv_conn_list);
4826
4827 rib_purge_connlist(&hca->cl_conn_list);
4828 rib_purge_connlist(&hca->srv_conn_list);
4829
4830 if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4831 kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4832 GLOBAL_ZONEID);
4833 stats_enabled = FALSE;
4834 }
4835
4836 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4837 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4838 if (hca->srv_conn_list.conn_hd == NULL &&
4839 hca->cl_conn_list.conn_hd == NULL) {
4840 /*
4841 * conn_lists are NULL, so destroy
4842 * buffers, close hca and be done.
4843 */
4844 rib_free_hca(hca);
4845 }
4846 rw_exit(&hca->cl_conn_list.conn_lock);
4847 rw_exit(&hca->srv_conn_list.conn_lock);
4848
4849 if (hca->hca_hdl != NULL) {
4850 mutex_enter(&hca->inuse_lock);
4851 while (hca->inuse)
4852 cv_wait(&hca->cb_cv, &hca->inuse_lock);
4853 mutex_exit(&hca->inuse_lock);
4854
4855 rib_free_hca(hca);
4856 }
4857 rw_destroy(&hca->bound_services_lock);
4858
4859 if (hca->cleanup_helper != NULL) {
4860 ddi_taskq_destroy(hca->cleanup_helper);
4861 hca->cleanup_helper = NULL;
4862 }
4863 }
4864
4865 /*
4866 * Cleans and closes up all uses of the HCA
4867 */
4868 static void
4869 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4870 {
4871 rib_hca_t *hca = NULL;
4872 rib_hca_t **hcap;
4873
4874 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4875 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4876 hca = *hcap;
4877 rw_enter(&hca->state_lock, RW_WRITER);
4878 if (hca->hca_hdl == hca_hdl) {
4879 /*
4880 * Mark as detached and remove from
4881 * hca list.
4882 */
4883 hca->state = HCA_DETACHED;
4884 *hcap = hca->next;
4885 rib_stat->nhca_inited--;
4886 rib_mod.rdma_count--;
4887 rw_exit(&hca->state_lock);
4888 break;
4889 }
4890 rw_exit(&hca->state_lock);
4891 }
4892 rw_exit(&rib_stat->hcas_list_lock);
4893
4894 if (hca == NULL)
4895 return;
4896 ASSERT(hca->hca_hdl == hca_hdl);
4897
4898 /*
4899 * Stop all services on the HCA
4900 * Go through cl_conn_list and close all rc_channels
4901 * Go through svr_conn_list and close all rc_channels
4902 * Free connections whose c_ref has dropped to 0
4903 * Destroy all CQs
4904 	 * Deregister and release all buffer pool memory after all
4905 * connections are destroyed
4906 * Free the protection domain
4907 * ibt_close_hca()
4908 */
4909 rib_stop_hca_services(hca);
4910
4911 kmem_free(hca, sizeof (*hca));
4912 }
4913
4914 static void
4915 rib_server_side_cache_reclaim(void *argp)
4916 {
4917 cache_avl_struct_t *rcas;
4918 rib_lrc_entry_t *rb;
4919 rib_hca_t *hca = (rib_hca_t *)argp;
4920
4921 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4922 rcas = avl_first(&hca->avl_tree);
4923 if (rcas != NULL)
4924 avl_remove(&hca->avl_tree, rcas);
4925
4926 while (rcas != NULL) {
4927 while (rcas->r.forw != &rcas->r) {
4928 rcas->elements--;
4929 rb = rcas->r.forw;
4930 remque(rb);
4931 if (rb->registered)
4932 (void) rib_deregistermem_via_hca(hca,
4933 rb->lrc_buf, rb->lrc_mhandle);
4934
4935 hca->cache_allocation -= rb->lrc_len;
4936 kmem_free(rb->lrc_buf, rb->lrc_len);
4937 kmem_free(rb, sizeof (rib_lrc_entry_t));
4938 }
4939 mutex_destroy(&rcas->node_lock);
4940 kmem_cache_free(hca->server_side_cache, rcas);
4941 rcas = avl_first(&hca->avl_tree);
4942 if (rcas != NULL)
4943 avl_remove(&hca->avl_tree, rcas);
4944 }
4945 rw_exit(&hca->avl_rw_lock);
4946 }
4947
4948 static void
4949 rib_server_side_cache_cleanup(void *argp)
4950 {
4951 cache_avl_struct_t *rcas;
4952 rib_lrc_entry_t *rb;
4953 rib_hca_t *hca = (rib_hca_t *)argp;
4954
4955 mutex_enter(&hca->cache_allocation_lock);
4956 if (hca->cache_allocation < cache_limit) {
4957 mutex_exit(&hca->cache_allocation_lock);
4958 return;
4959 }
4960 mutex_exit(&hca->cache_allocation_lock);
4961
4962 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4963 rcas = avl_last(&hca->avl_tree);
4964 if (rcas != NULL)
4965 avl_remove(&hca->avl_tree, rcas);
4966
4967 while (rcas != NULL) {
4968 while (rcas->r.forw != &rcas->r) {
4969 rcas->elements--;
4970 rb = rcas->r.forw;
4971 remque(rb);
4972 if (rb->registered)
4973 (void) rib_deregistermem_via_hca(hca,
4974 rb->lrc_buf, rb->lrc_mhandle);
4975
4976 hca->cache_allocation -= rb->lrc_len;
4977
4978 kmem_free(rb->lrc_buf, rb->lrc_len);
4979 kmem_free(rb, sizeof (rib_lrc_entry_t));
4980 }
4981 mutex_destroy(&rcas->node_lock);
4982 if (hca->server_side_cache) {
4983 kmem_cache_free(hca->server_side_cache, rcas);
4984 }
4985
4986 if (hca->cache_allocation < cache_limit) {
4987 rw_exit(&hca->avl_rw_lock);
4988 return;
4989 }
4990
4991 rcas = avl_last(&hca->avl_tree);
4992 if (rcas != NULL)
4993 avl_remove(&hca->avl_tree, rcas);
4994 }
4995 rw_exit(&hca->avl_rw_lock);
4996 }
4997
4998 static int
4999 avl_compare(const void *t1, const void *t2)
5000 {
5001 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5002 return (0);
5003
5004 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5005 return (-1);
5006
5007 return (1);
5008 }
5009
5010 static void
5011 rib_destroy_cache(rib_hca_t *hca)
5012 {
5013 if (hca->avl_init) {
5014 rib_server_side_cache_reclaim((void *)hca);
5015 if (hca->server_side_cache) {
5016 kmem_cache_destroy(hca->server_side_cache);
5017 hca->server_side_cache = NULL;
5018 }
5019 avl_destroy(&hca->avl_tree);
5020 mutex_destroy(&hca->cache_allocation_lock);
5021 rw_destroy(&hca->avl_rw_lock);
5022 }
5023 hca->avl_init = FALSE;
5024 }
5025
5026 static void
5027 rib_force_cleanup(void *hca)
5028 {
5029 if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5030 (void) ddi_taskq_dispatch(
5031 ((rib_hca_t *)hca)->cleanup_helper,
5032 rib_server_side_cache_cleanup,
5033 (void *)hca, DDI_NOSLEEP);
5034 }
5035
5036 static rib_lrc_entry_t *
5037 rib_get_cache_buf(CONN *conn, uint32_t len)
5038 {
5039 cache_avl_struct_t cas, *rcas;
5040 rib_hca_t *hca = (ctoqp(conn))->hca;
5041 rib_lrc_entry_t *reply_buf;
5042 avl_index_t where = NULL;
5043 uint64_t c_alloc = 0;
5044
5045 if (!hca->avl_init)
5046 goto error_alloc;
5047
5048 cas.len = len;
5049
5050 rw_enter(&hca->avl_rw_lock, RW_READER);
5051
5052 mutex_enter(&hca->cache_allocation_lock);
5053 c_alloc = hca->cache_allocation;
5054 mutex_exit(&hca->cache_allocation_lock);
5055
5056 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5057 &where)) == NULL) {
5058 		/* Are we above the cache limit? */
5059 if ((c_alloc + len) >= cache_limit) {
5060 rib_force_cleanup((void *)hca);
5061 rw_exit(&hca->avl_rw_lock);
5062 mutex_enter(&hca->cache_allocation_lock);
5063 hca->cache_misses_above_the_limit ++;
5064 mutex_exit(&hca->cache_allocation_lock);
5065
5066 /* Allocate and register the buffer directly */
5067 goto error_alloc;
5068 }
5069
5070 rw_exit(&hca->avl_rw_lock);
5071 rw_enter(&hca->avl_rw_lock, RW_WRITER);
5072
5073 /* Recheck to make sure no other thread added the entry in */
5074 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5075 &cas, &where)) == NULL) {
5076 /* Allocate an avl tree entry */
5077 rcas = (cache_avl_struct_t *)
5078 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5079
5080 bzero(rcas, sizeof (cache_avl_struct_t));
5081 rcas->elements = 0;
5082 rcas->r.forw = &rcas->r;
5083 rcas->r.back = &rcas->r;
5084 rcas->len = len;
5085 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5086 avl_insert(&hca->avl_tree, rcas, where);
5087 }
5088 }
5089
5090 mutex_enter(&rcas->node_lock);
5091
5092 if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5093 reply_buf = rcas->r.forw;
5094 remque(reply_buf);
5095 rcas->elements--;
5096 mutex_exit(&rcas->node_lock);
5097 rw_exit(&hca->avl_rw_lock);
5098
5099 mutex_enter(&hca->cache_allocation_lock);
5100 hca->cache_hits++;
5101 hca->cache_allocation -= len;
5102 mutex_exit(&hca->cache_allocation_lock);
5103 } else {
5104 		/* Are we above the cache limit? */
5105 mutex_exit(&rcas->node_lock);
5106 if ((c_alloc + len) >= cache_limit) {
5107 rib_force_cleanup((void *)hca);
5108 rw_exit(&hca->avl_rw_lock);
5109
5110 mutex_enter(&hca->cache_allocation_lock);
5111 hca->cache_misses_above_the_limit++;
5112 mutex_exit(&hca->cache_allocation_lock);
5113 /* Allocate and register the buffer directly */
5114 goto error_alloc;
5115 }
5116 rw_exit(&hca->avl_rw_lock);
5117 mutex_enter(&hca->cache_allocation_lock);
5118 hca->cache_misses++;
5119 mutex_exit(&hca->cache_allocation_lock);
5120 /* Allocate a reply_buf entry */
5121 reply_buf = (rib_lrc_entry_t *)
5122 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5123 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5124 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5125 reply_buf->lrc_len = len;
5126 reply_buf->registered = FALSE;
5127 reply_buf->avl_node = (void *)rcas;
5128 }
5129
5130 return (reply_buf);
5131
5132 error_alloc:
5133 reply_buf = (rib_lrc_entry_t *)
5134 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5135 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5136 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5137 reply_buf->lrc_len = len;
5138 reply_buf->registered = FALSE;
5139 reply_buf->avl_node = NULL;
5140
5141 return (reply_buf);
5142 }
5143
5144 /*
5145  * Return a pre-registered buffer back to the cache (without
5146  * unregistering the buffer).
5147  */
5148
5149 static void
5150 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5151 {
5152 cache_avl_struct_t cas, *rcas;
5153 avl_index_t where = NULL;
5154 rib_hca_t *hca = (ctoqp(conn))->hca;
5155
5156 if (!hca->avl_init)
5157 goto error_free;
5158
5159 cas.len = reg_buf->lrc_len;
5160 rw_enter(&hca->avl_rw_lock, RW_READER);
5161 if ((rcas = (cache_avl_struct_t *)
5162 avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5163 rw_exit(&hca->avl_rw_lock);
5164 goto error_free;
5165 } else {
5166 cas.len = reg_buf->lrc_len;
5167 mutex_enter(&rcas->node_lock);
5168 insque(reg_buf, &rcas->r);
5169 rcas->elements ++;
5170 mutex_exit(&rcas->node_lock);
5171 rw_exit(&hca->avl_rw_lock);
5172 mutex_enter(&hca->cache_allocation_lock);
5173 hca->cache_allocation += cas.len;
5174 mutex_exit(&hca->cache_allocation_lock);
5175 }
5176
5177 return;
5178
5179 error_free:
5180
5181 if (reg_buf->registered)
5182 (void) rib_deregistermem_via_hca(hca,
5183 reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5184 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5185 kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5186 }
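
/*
 * Illustrative sketch only: the long-reply cache is consumed through
 * the RDMA_LONG_BUFFER path of rib_reg_buf_alloc()/rib_reg_buf_free()
 * above, roughly as follows (names hypothetical):
 *
 *	rdma_buf_t longbuf;
 *
 *	bzero(&longbuf, sizeof (longbuf));
 *	longbuf.type = RDMA_LONG_BUFFER;
 *	longbuf.len = reply_size;
 *	if (rib_reg_buf_alloc(conn, &longbuf) == RDMA_SUCCESS) {
 *		(use longbuf.addr / longbuf.handle for the long reply)
 *		rib_reg_buf_free(conn, &longbuf);
 *	}
 */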
5187
5188 static rdma_stat
5189 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5190 uint_t buflen, struct mrc *buf_handle)
5191 {
5192 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
5193 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
5194 rdma_stat status;
5195
5196
5197 /*
5198 * Note: ALL buffer pools use the same memory type RDMARW.
5199 */
5200 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5201 if (status == RDMA_SUCCESS) {
5202 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5203 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5204 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5205 } else {
5206 buf_handle->mrc_linfo = NULL;
5207 buf_handle->mrc_lmr = 0;
5208 buf_handle->mrc_rmr = 0;
5209 }
5210 return (status);
5211 }
5212
5213 /* ARGSUSED */
5214 static rdma_stat
5215 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5216 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5217 {
5218
5219 (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5220 return (RDMA_SUCCESS);
5221 }
5222
5223 /* ARGSUSED */
5224 static rdma_stat
5225 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5226 {
5227
5228 (void) ibt_deregister_mr(hca->hca_hdl,
5229 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5230 return (RDMA_SUCCESS);
5231 }
5232
5233 /*
5234 * Check if the IP interface named by `lifrp' is RDMA-capable.
5235 */
5236 static boolean_t
5237 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5238 {
5239 char ifname[LIFNAMSIZ];
5240 char *cp;
5241
5242 if (lifrp->lifr_type == IFT_IB)
5243 return (B_TRUE);
5244
5245 /*
5246 * Strip off the logical interface portion before getting
5247 * intimate with the name.
5248 */
5249 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5250 if ((cp = strchr(ifname, ':')) != NULL)
5251 *cp = '\0';
5252
5253 return (strcmp("lo0", ifname) == 0);
5254 }
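
/*
 * For example (illustration only; interface names are hypothetical): an
 * IPoIB interface such as "ibd0", or a logical such as "ibd0:1", reports
 * IFT_IB and is accepted by the first check; "lo0" (and logicals like
 * "lo0:1", once the logical portion is stripped) is accepted by the name
 * comparison; an Ethernet interface such as "e1000g0" is rejected.
 */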
5255
5256 static int
5257 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5258 {
5259 vnode_t *kkvp, *vp;
5260 TIUSER *tiptr;
5261 struct strioctl iocb;
5262 k_sigset_t smask;
5263 int err = 0;
5264
5265 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5266 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5267 &tiptr, CRED()) == 0) {
5268 vp = tiptr->fp->f_vnode;
5269 } else {
5270 VN_RELE(kkvp);
5271 return (EPROTO);
5272 }
5273 } else {
5274 return (EPROTO);
5275 }
5276
5277 iocb.ic_cmd = cmd;
5278 iocb.ic_timout = 0;
5279 iocb.ic_len = len;
5280 iocb.ic_dp = (caddr_t)arg;
5281 sigintr(&smask, 0);
5282 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5283 sigunintr(&smask);
5284 (void) t_kclose(tiptr, 0);
5285 VN_RELE(kkvp);
5286 return (err);
5287 }
5288
5289 /*
5290 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5291 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5292 */
5293 static int
5294 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5295 {
5296 int err;
5297 struct lifnum lifn;
5298
5299 bzero(&lifn, sizeof (struct lifnum));
5300 lifn.lifn_family = AF_UNSPEC;
5301
5302 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5303 if (err != 0)
5304 return (err);
5305
5306 /*
5307 * Pad the interface count to account for additional interfaces that
5308 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5309 */
5310 lifn.lifn_count += 4;
5311
5312 bzero(lifcp, sizeof (struct lifconf));
5313 lifcp->lifc_family = AF_UNSPEC;
5314 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5315 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5316
5317 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5318 if (err != 0) {
5319 kmem_free(lifcp->lifc_buf, *bufsizep);
5320 return (err);
5321 }
5322 return (0);
5323 }
5324
5325 static boolean_t
5326 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5327 {
5328 uint_t i, nifs;
5329 uint_t bufsize;
5330 struct lifconf lifc;
5331 struct lifreq *lifrp;
5332 struct sockaddr_in *sinp;
5333 struct sockaddr_in6 *sin6p;
5334
5335 bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5336 bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5337
5338 if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5339 return (B_FALSE);
5340
5341 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5342 kmem_free(lifc.lifc_buf, bufsize);
5343 return (B_FALSE);
5344 }
5345
5346 /*
5347 * Worst case is that all of the addresses are IB-capable and have
5348 * the same address family, so size our buffers accordingly.
5349 */
5350 addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5351 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5352 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5353 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5354
5355 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5356 if (!rpcib_rdma_capable_interface(lifrp))
5357 continue;
5358
5359 if (lifrp->lifr_addr.ss_family == AF_INET) {
5360 sinp = addrs4->ri_list;
5361 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5362 sizeof (struct sockaddr_in));
5363 } else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5364 sin6p = addrs6->ri_list;
5365 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5366 sizeof (struct sockaddr_in6));
5367 }
5368 }
5369
5370 kmem_free(lifc.lifc_buf, bufsize);
5371 return (B_TRUE);
5372 }
5373
5374 /* ARGSUSED */
5375 static int
5376 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5377 {
5378 rib_hca_t *hca;
5379
5380 if (KSTAT_WRITE == rw) {
5381 return (EACCES);
5382 }
5383
5384 rpcib_kstat.cache_limit.value.ui64 =
5385 (uint64_t)cache_limit;
5386 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5387 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5388 rpcib_kstat.cache_allocation.value.ui64 +=
5389 (uint64_t)hca->cache_allocation;
5390 rpcib_kstat.cache_hits.value.ui64 +=
5391 (uint64_t)hca->cache_hits;
5392 rpcib_kstat.cache_misses.value.ui64 +=
5393 (uint64_t)hca->cache_misses;
5394 rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5395 (uint64_t)hca->cache_misses_above_the_limit;
5396 }
5397 rw_exit(&rib_stat->hcas_list_lock);
5398 return (0);
5399 }
5400