1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
25 /* All Rights Reserved */
26 /*
27 * Portions of this source code were derived from Berkeley
28 * 4.3 BSD under license from the Regents of the University of
29 * California.
30 */
31
32 /*
33 * Server side of RPC over RDMA in the kernel.
34 */
35
36 #include <sys/param.h>
37 #include <sys/types.h>
38 #include <sys/user.h>
39 #include <sys/sysmacros.h>
40 #include <sys/proc.h>
41 #include <sys/file.h>
42 #include <sys/errno.h>
43 #include <sys/kmem.h>
44 #include <sys/debug.h>
45 #include <sys/systm.h>
46 #include <sys/cmn_err.h>
47 #include <sys/kstat.h>
48 #include <sys/vtrace.h>
49 #include <sys/debug.h>
50
51 #include <rpc/types.h>
52 #include <rpc/xdr.h>
53 #include <rpc/auth.h>
54 #include <rpc/clnt.h>
55 #include <rpc/rpc_msg.h>
56 #include <rpc/svc.h>
57 #include <rpc/rpc_rdma.h>
58 #include <sys/ddi.h>
59 #include <sys/sunddi.h>
60
61 #include <inet/common.h>
62 #include <inet/ip.h>
63 #include <inet/ip6.h>
64
65 #include <nfs/nfs.h>
66 #include <sys/sdt.h>
67
68 #define SVC_RDMA_SUCCESS 0
69 #define SVC_RDMA_FAIL -1
70
71 #define SVC_CREDIT_FACTOR (0.5)
72
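/*
 * A note on the macro below: it checks the verifier flavor of an accepted
 * reply, and is used when sizing a long reply to decide whether extra room
 * (MAX_AUTH_BYTES) should be reserved for RPCSEC_GSS overhead.
 */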
73 #define MSG_IS_RPCSEC_GSS(msg) \
74 ((msg)->rm_reply.rp_acpt.ar_verf.oa_flavor == RPCSEC_GSS)
75
76
77 uint32_t rdma_bufs_granted = RDMA_BUFS_GRANT;
78
79 /*
80 * RDMA transport specific data associated with SVCMASTERXPRT
81 */
82 struct rdma_data {
83 SVCMASTERXPRT *rd_xprt; /* back ptr to SVCMASTERXPRT */
84 struct rdma_svc_data rd_data; /* rdma data */
85 rdma_mod_t *r_mod; /* RDMA module containing ops ptr */
86 };
87
88 /*
89 * Plugin connection specific data stashed away in clone SVCXPRT
90 */
91 struct clone_rdma_data {
92 bool_t cloned; /* xprt cloned for thread processing */
93 CONN *conn; /* RDMA connection */
94 rdma_buf_t rpcbuf; /* RPC req/resp buffer */
95 struct clist *cl_reply; /* reply chunk buffer info */
96 struct clist *cl_wlist; /* write list clist */
97 };
98
99
100 #define MAXADDRLEN 128 /* max length for address mask */
101
102 /*
103 * Routines exported through ops vector.
104 */
105 static bool_t svc_rdma_krecv(SVCXPRT *, mblk_t *, struct rpc_msg *);
106 static bool_t svc_rdma_ksend(SVCXPRT *, struct rpc_msg *);
107 static bool_t svc_rdma_kgetargs(SVCXPRT *, xdrproc_t, caddr_t);
108 static bool_t svc_rdma_kfreeargs(SVCXPRT *, xdrproc_t, caddr_t);
109 void svc_rdma_kdestroy(SVCMASTERXPRT *);
110 static int svc_rdma_kdup(struct svc_req *, caddr_t, int,
111 struct dupreq **, bool_t *);
112 static void svc_rdma_kdupdone(struct dupreq *, caddr_t,
113 void (*)(), int, int);
114 static int32_t *svc_rdma_kgetres(SVCXPRT *, int);
115 static void svc_rdma_kfreeres(SVCXPRT *);
116 static void svc_rdma_kclone_destroy(SVCXPRT *);
117 static void svc_rdma_kstart(SVCMASTERXPRT *);
118 void svc_rdma_kstop(SVCMASTERXPRT *);
119 static void svc_rdma_kclone_xprt(SVCXPRT *, SVCXPRT *);
120 static void svc_rdma_ktattrs(SVCXPRT *, int, void **);
121
122 static int svc_process_long_reply(SVCXPRT *, xdrproc_t,
123 caddr_t, struct rpc_msg *, bool_t, int *,
124 int *, int *, unsigned int *);
125
126 static int svc_compose_rpcmsg(SVCXPRT *, CONN *, xdrproc_t,
127 caddr_t, rdma_buf_t *, XDR **, struct rpc_msg *,
128 bool_t, uint_t *);
129 static bool_t rpcmsg_length(xdrproc_t,
130 caddr_t,
131 struct rpc_msg *, bool_t, int);
132
133 /*
134 * Server transport operations vector.
135 */
136 struct svc_ops rdma_svc_ops = {
137 svc_rdma_krecv, /* Get requests */
138 svc_rdma_kgetargs, /* Deserialize arguments */
139 svc_rdma_ksend, /* Send reply */
140 svc_rdma_kfreeargs, /* Free argument data space */
141 svc_rdma_kdestroy, /* Destroy transport handle */
142 svc_rdma_kdup, /* Check entry in dup req cache */
143 svc_rdma_kdupdone, /* Mark entry in dup req cache as done */
144 svc_rdma_kgetres, /* Get pointer to response buffer */
145 svc_rdma_kfreeres, /* Destroy pre-serialized response header */
146 svc_rdma_kclone_destroy, /* Destroy a clone xprt */
147 svc_rdma_kstart, /* Tell `ready-to-receive' to rpcmod */
148 svc_rdma_kclone_xprt, /* Transport specific clone xprt */
149 svc_rdma_ktattrs /* Get Transport Attributes */
150 };
151
152 /*
153 * Server statistics
154 * NOTE: This structure type is duplicated in the NFS fast path.
155 */
156 struct {
157 kstat_named_t rscalls;
158 kstat_named_t rsbadcalls;
159 kstat_named_t rsnullrecv;
160 kstat_named_t rsbadlen;
161 kstat_named_t rsxdrcall;
162 kstat_named_t rsdupchecks;
163 kstat_named_t rsdupreqs;
164 kstat_named_t rslongrpcs;
165 kstat_named_t rstotalreplies;
166 kstat_named_t rstotallongreplies;
167 kstat_named_t rstotalinlinereplies;
168 } rdmarsstat = {
169 { "calls", KSTAT_DATA_UINT64 },
170 { "badcalls", KSTAT_DATA_UINT64 },
171 { "nullrecv", KSTAT_DATA_UINT64 },
172 { "badlen", KSTAT_DATA_UINT64 },
173 { "xdrcall", KSTAT_DATA_UINT64 },
174 { "dupchecks", KSTAT_DATA_UINT64 },
175 { "dupreqs", KSTAT_DATA_UINT64 },
176 { "longrpcs", KSTAT_DATA_UINT64 },
177 { "totalreplies", KSTAT_DATA_UINT64 },
178 { "totallongreplies", KSTAT_DATA_UINT64 },
179 { "totalinlinereplies", KSTAT_DATA_UINT64 },
180 };
181
182 kstat_named_t *rdmarsstat_ptr = (kstat_named_t *)&rdmarsstat;
183 uint_t rdmarsstat_ndata = sizeof (rdmarsstat) / sizeof (kstat_named_t);
184
185 #define RSSTAT_INCR(x) atomic_add_64(&rdmarsstat.x.value.ui64, 1)
186 /*
187 * Create a transport record.
188 * The transport record, output buffer, and private data structure
189 * are allocated. The output buffer is serialized into using xdrmem.
190 * There is one transport record per user process which implements a
191 * set of services.
192 */
193 /* ARGSUSED */
194 int
195 svc_rdma_kcreate(char *netid, SVC_CALLOUT_TABLE *sct, int id,
196 rdma_xprt_group_t *started_xprts)
197 {
198 int error = 0;
199 SVCMASTERXPRT *xprt;
200 struct rdma_data *rd;
201 rdma_registry_t *rmod;
202 rdma_xprt_record_t *xprt_rec;
203 queue_t *q;
204 /*
205 * modload the RDMA plugins if not already done.
206 */
207 if (!rdma_modloaded) {
208 /*CONSTANTCONDITION*/
209 ASSERT(sizeof (struct clone_rdma_data) <= SVC_P2LEN);
210
211 mutex_enter(&rdma_modload_lock);
212 if (!rdma_modloaded) {
213 error = rdma_modload();
214 }
215 mutex_exit(&rdma_modload_lock);
216
217 if (error)
218 return (error);
219 }
220
221 /*
222 * master_xprt_count is the count of master transport handles
223 * that were successfully created and are ready to receive
224 * requests for RDMA-based access.
225 */
226 error = 0;
227 xprt_rec = NULL;
228 rw_enter(&rdma_lock, RW_READER);
229 if (rdma_mod_head == NULL) {
230 started_xprts->rtg_count = 0;
231 rw_exit(&rdma_lock);
232 if (rdma_dev_available)
233 return (EPROTONOSUPPORT);
234 else
235 return (ENODEV);
236 }
237
238 /*
239 * If we have reached here, then at least one RDMA plugin has loaded.
240 * Create a master_xprt and make it start listening on the device.
241 * If an error is generated, record it; we might need to shut down
242 * the master_xprt.
243 * SVC_START() calls svc_rdma_kstart which calls plugin binding
244 * routines.
245 */
246 for (rmod = rdma_mod_head; rmod != NULL; rmod = rmod->r_next) {
247
248 /*
249 * One SVCMASTERXPRT per RDMA plugin.
250 */
251 xprt = kmem_zalloc(sizeof (*xprt), KM_SLEEP);
252 xprt->xp_ops = &rdma_svc_ops;
253 xprt->xp_sct = sct;
254 xprt->xp_type = T_RDMA;
255 mutex_init(&xprt->xp_req_lock, NULL, MUTEX_DEFAULT, NULL);
256 mutex_init(&xprt->xp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
257 xprt->xp_req_head = (mblk_t *)0;
258 xprt->xp_req_tail = (mblk_t *)0;
259 xprt->xp_threads = 0;
260 xprt->xp_detached_threads = 0;
261
262 rd = kmem_zalloc(sizeof (*rd), KM_SLEEP);
263 xprt->xp_p2 = (caddr_t)rd;
264 rd->rd_xprt = xprt;
265 rd->r_mod = rmod->r_mod;
266
267 q = &rd->rd_data.q;
268 xprt->xp_wq = q;
269 q->q_ptr = &rd->rd_xprt;
270 xprt->xp_netid = NULL;
271
272 /*
273 * Each of the plugins will have their own Service ID
274 * to listener specific mapping, like port number for VI
275 * and service name for IB.
276 */
277 rd->rd_data.svcid = id;
278 error = svc_xprt_register(xprt, id);
279 if (error) {
280 DTRACE_PROBE(krpc__e__svcrdma__xprt__reg);
281 goto cleanup;
282 }
283
284 SVC_START(xprt);
285 if (!rd->rd_data.active) {
286 svc_xprt_unregister(xprt);
287 error = rd->rd_data.err_code;
288 goto cleanup;
289 }
290
291 /*
292 * This is set only when at least one transport has been
293 * successfully created. We insert the pointer to the created
294 * RDMA master xprt into a separately maintained list. This way
295 * we can easily reference it later for cleanup, when the NFS
296 * kRPC service pool is going away/unregistered.
297 */
298 started_xprts->rtg_count++;
299 xprt_rec = kmem_alloc(sizeof (*xprt_rec), KM_SLEEP);
300 xprt_rec->rtr_xprt_ptr = xprt;
301 xprt_rec->rtr_next = started_xprts->rtg_listhead;
302 started_xprts->rtg_listhead = xprt_rec;
303 continue;
304 cleanup:
305 SVC_DESTROY(xprt);
306 if (error == RDMA_FAILED)
307 error = EPROTONOSUPPORT;
308 }
309
310 rw_exit(&rdma_lock);
311
312 /*
313 * Don't return an error if even a single plugin was started
314 * successfully.
315 */
316 if (started_xprts->rtg_count == 0)
317 return (error);
318 return (0);
319 }
320
321 /*
322 * Cleanup routine for freeing up memory allocated by
323 * svc_rdma_kcreate()
324 */
325 void
326 svc_rdma_kdestroy(SVCMASTERXPRT *xprt)
327 {
328 struct rdma_data *rd = (struct rdma_data *)xprt->xp_p2;
329
330
331 mutex_destroy(&xprt->xp_req_lock);
332 mutex_destroy(&xprt->xp_thread_lock);
333 kmem_free(rd, sizeof (*rd));
334 kmem_free(xprt, sizeof (*xprt));
335 }
336
337
338 static void
339 svc_rdma_kstart(SVCMASTERXPRT *xprt)
340 {
341 struct rdma_svc_data *svcdata;
342 rdma_mod_t *rmod;
343
344 svcdata = &((struct rdma_data *)xprt->xp_p2)->rd_data;
345 rmod = ((struct rdma_data *)xprt->xp_p2)->r_mod;
346
347 /*
348 * Create a listener for the module at this port.
349 */
350
351 if (rmod->rdma_count != 0)
352 (*rmod->rdma_ops->rdma_svc_listen)(svcdata);
353 else
354 svcdata->err_code = RDMA_FAILED;
355 }
356
357 void
358 svc_rdma_kstop(SVCMASTERXPRT *xprt)
359 {
360 struct rdma_svc_data *svcdata;
361 rdma_mod_t *rmod;
362
363 svcdata = &((struct rdma_data *)xprt->xp_p2)->rd_data;
364 rmod = ((struct rdma_data *)xprt->xp_p2)->r_mod;
365
366 /*
367 * Call the stop listener routine for each plugin. If rdma_count is
368 * already zero, set active to zero.
369 */
370 if (rmod->rdma_count != 0)
371 (*rmod->rdma_ops->rdma_svc_stop)(svcdata);
372 else
373 svcdata->active = 0;
374 if (svcdata->active)
375 DTRACE_PROBE(krpc__e__svcrdma__kstop);
376 }
377
378 /* ARGSUSED */
379 static void
380 svc_rdma_kclone_destroy(SVCXPRT *clone_xprt)
381 {
382
383 struct clone_rdma_data *cdrp;
384 cdrp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
385
386 /*
387 * Only free buffers and release connection when cloned is set.
388 */
389 if (cdrp->cloned != TRUE)
390 return;
391
392 rdma_buf_free(cdrp->conn, &cdrp->rpcbuf);
393 if (cdrp->cl_reply) {
394 clist_free(cdrp->cl_reply);
395 cdrp->cl_reply = NULL;
396 }
397 RDMA_REL_CONN(cdrp->conn);
398
399 cdrp->cloned = 0;
400 }
401
402 /*
403 * Clone the xprt specific information. It will be freed by
404 * SVC_CLONE_DESTROY.
405 */
406 static void
407 svc_rdma_kclone_xprt(SVCXPRT *src_xprt, SVCXPRT *dst_xprt)
408 {
409 struct clone_rdma_data *srcp2;
410 struct clone_rdma_data *dstp2;
411
412 srcp2 = (struct clone_rdma_data *)src_xprt->xp_p2buf;
413 dstp2 = (struct clone_rdma_data *)dst_xprt->xp_p2buf;
414
415 if (srcp2->conn != NULL) {
416 srcp2->cloned = TRUE;
417 *dstp2 = *srcp2;
418 }
419 }
420
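/*
 * Return a transport attribute for a clone xprt.  Only SVC_TATTR_ADDRMASK
 * is handled here: it hands back a pointer to the connection's address
 * mask.  Any other attribute leaves *tattr set to NULL.
 */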
421 static void
422 svc_rdma_ktattrs(SVCXPRT *clone_xprt, int attrflag, void **tattr)
423 {
424 CONN *conn;
425 *tattr = NULL;
426
427 switch (attrflag) {
428 case SVC_TATTR_ADDRMASK:
429 conn = ((struct clone_rdma_data *)clone_xprt->xp_p2buf)->conn;
430 ASSERT(conn != NULL);
431 if (conn)
432 *tattr = (void *)&conn->c_addrmask;
433 }
434 }
435
436 static bool_t
437 svc_rdma_krecv(SVCXPRT *clone_xprt, mblk_t *mp, struct rpc_msg *msg)
438 {
439 XDR *xdrs;
440 CONN *conn;
441 rdma_recv_data_t *rdp = (rdma_recv_data_t *)mp->b_rptr;
442 struct clone_rdma_data *crdp;
443 struct clist *cl = NULL;
444 struct clist *wcl = NULL;
445 struct clist *cllong = NULL;
446
447 rdma_stat status;
448 uint32_t vers, op, pos, xid;
449 uint32_t rdma_credit;
450 uint32_t wcl_total_length = 0;
451 bool_t wwl = FALSE;
452
453 crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
454 RSSTAT_INCR(rscalls);
455 conn = rdp->conn;
456
457 status = rdma_svc_postrecv(conn);
458 if (status != RDMA_SUCCESS) {
459 DTRACE_PROBE(krpc__e__svcrdma__krecv__postrecv);
460 goto badrpc_call;
461 }
462
463 xdrs = &clone_xprt->xp_xdrin;
464 xdrmem_create(xdrs, rdp->rpcmsg.addr, rdp->rpcmsg.len, XDR_DECODE);
465 xid = *(uint32_t *)rdp->rpcmsg.addr;
466 XDR_SETPOS(xdrs, sizeof (uint32_t));
467
468 if (! xdr_u_int(xdrs, &vers) ||
469 ! xdr_u_int(xdrs, &rdma_credit) ||
470 ! xdr_u_int(xdrs, &op)) {
471 DTRACE_PROBE(krpc__e__svcrdma__krecv__uint);
472 goto xdr_err;
473 }
474
475 /* Checking if the status of the recv operation was normal */
476 if (rdp->status != 0) {
477 DTRACE_PROBE1(krpc__e__svcrdma__krecv__invalid__status,
478 int, rdp->status);
479 goto badrpc_call;
480 }
481
482 if (! xdr_do_clist(xdrs, &cl)) {
483 DTRACE_PROBE(krpc__e__svcrdma__krecv__do__clist);
484 goto xdr_err;
485 }
486
487 if (!xdr_decode_wlist_svc(xdrs, &wcl, &wwl, &wcl_total_length, conn)) {
488 DTRACE_PROBE(krpc__e__svcrdma__krecv__decode__wlist);
489 if (cl)
490 clist_free(cl);
491 goto xdr_err;
492 }
493 crdp->cl_wlist = wcl;
494
495 crdp->cl_reply = NULL;
496 (void) xdr_decode_reply_wchunk(xdrs, &crdp->cl_reply);
497
498 /*
499 * A chunk at 0 offset indicates that the RPC call message
500 * is in a chunk. Get the RPC call message chunk.
501 */
502 if (cl != NULL && op == RDMA_NOMSG) {
503
504 /* Remove RPC call message chunk from chunklist */
505 cllong = cl;
506 cl = cl->c_next;
507 cllong->c_next = NULL;
508
509
510 /* Allocate and register memory for the RPC call msg chunk */
511 cllong->rb_longbuf.type = RDMA_LONG_BUFFER;
512 cllong->rb_longbuf.len = cllong->c_len > LONG_REPLY_LEN ?
513 cllong->c_len : LONG_REPLY_LEN;
514
515 if (rdma_buf_alloc(conn, &cllong->rb_longbuf)) {
516 clist_free(cllong);
517 goto cll_malloc_err;
518 }
519
520 cllong->u.c_daddr3 = cllong->rb_longbuf.addr;
521
522 if (cllong->u.c_daddr == NULL) {
523 DTRACE_PROBE(krpc__e__svcrdma__krecv__nomem);
524 rdma_buf_free(conn, &cllong->rb_longbuf);
525 clist_free(cllong);
526 goto cll_malloc_err;
527 }
528
529 status = clist_register(conn, cllong, CLIST_REG_DST);
530 if (status) {
531 DTRACE_PROBE(krpc__e__svcrdma__krecv__clist__reg);
532 rdma_buf_free(conn, &cllong->rb_longbuf);
533 clist_free(cllong);
534 goto cll_malloc_err;
535 }
536
537 /*
538 * Now read the RPC call message in
539 */
540 status = RDMA_READ(conn, cllong, WAIT);
541 if (status) {
542 DTRACE_PROBE(krpc__e__svcrdma__krecv__read);
543 (void) clist_deregister(conn, cllong);
544 rdma_buf_free(conn, &cllong->rb_longbuf);
545 clist_free(cllong);
546 goto cll_malloc_err;
547 }
548
549 status = clist_syncmem(conn, cllong, CLIST_REG_DST);
550 (void) clist_deregister(conn, cllong);
551
552 xdrrdma_create(xdrs, (caddr_t)(uintptr_t)cllong->u.c_daddr3,
553 cllong->c_len, 0, cl, XDR_DECODE, conn);
554
555 crdp->rpcbuf = cllong->rb_longbuf;
556 crdp->rpcbuf.len = cllong->c_len;
557 clist_free(cllong);
558 RDMA_BUF_FREE(conn, &rdp->rpcmsg);
559 } else {
560 pos = XDR_GETPOS(xdrs);
561 xdrrdma_create(xdrs, rdp->rpcmsg.addr + pos,
562 rdp->rpcmsg.len - pos, 0, cl, XDR_DECODE, conn);
563 crdp->rpcbuf = rdp->rpcmsg;
564
565 /* Use xdrrdmablk_ops to indicate there is a read chunk list */
566 if (cl != NULL) {
567 int32_t flg = XDR_RDMA_RLIST_REG;
568
569 XDR_CONTROL(xdrs, XDR_RDMA_SET_FLAGS, &flg);
570 xdrs->x_ops = &xdrrdmablk_ops;
571 }
572 }
573
574 if (crdp->cl_wlist) {
575 int32_t flg = XDR_RDMA_WLIST_REG;
576
577 XDR_CONTROL(xdrs, XDR_RDMA_SET_WLIST, crdp->cl_wlist);
578 XDR_CONTROL(xdrs, XDR_RDMA_SET_FLAGS, &flg);
579 }
580
581 if (! xdr_callmsg(xdrs, msg)) {
582 DTRACE_PROBE(krpc__e__svcrdma__krecv__callmsg);
583 RSSTAT_INCR(rsxdrcall);
584 goto callmsg_err;
585 }
586
587 /*
588 * Point the remote transport address in the service_transport
589 * handle at the address in the request.
590 */
591 clone_xprt->xp_rtaddr.buf = conn->c_raddr.buf;
592 clone_xprt->xp_rtaddr.len = conn->c_raddr.len;
593 clone_xprt->xp_rtaddr.maxlen = conn->c_raddr.len;
594
595 clone_xprt->xp_lcladdr.buf = conn->c_laddr.buf;
596 clone_xprt->xp_lcladdr.len = conn->c_laddr.len;
597 clone_xprt->xp_lcladdr.maxlen = conn->c_laddr.len;
598
599 /*
600 * In case of RDMA, connection management is
601 * entirely done in the rpcib module and netid in the
602 * SVCMASTERXPRT is NULL. Initialize the clone netid
603 * from the connection.
604 */
605
606 clone_xprt->xp_netid = conn->c_netid;
607
608 clone_xprt->xp_xid = xid;
609 crdp->conn = conn;
610
611 freeb(mp);
612
613 return (TRUE);
614
615 callmsg_err:
616 rdma_buf_free(conn, &crdp->rpcbuf);
617
618 cll_malloc_err:
619 if (cl)
620 clist_free(cl);
621 xdr_err:
622 XDR_DESTROY(xdrs);
623
624 badrpc_call:
625 RDMA_BUF_FREE(conn, &rdp->rpcmsg);
626 RDMA_REL_CONN(conn);
627 freeb(mp);
628 RSSTAT_INCR(rsbadcalls);
629 return (FALSE);
630 }
631
632 static int
633 svc_process_long_reply(SVCXPRT * clone_xprt,
634 xdrproc_t xdr_results, caddr_t xdr_location,
635 struct rpc_msg *msg, bool_t has_args, int *msglen,
636 int *freelen, int *numchunks, unsigned int *final_len)
637 {
638 int status;
639 XDR xdrslong;
640 struct clist *wcl = NULL;
641 int count = 0;
642 int alloc_len;
643 char *memp;
644 rdma_buf_t long_rpc = {0};
645 struct clone_rdma_data *crdp;
646
647 crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
648
649 bzero(&xdrslong, sizeof (xdrslong));
650
651 /* Choose a size for the long rpc response */
652 if (MSG_IS_RPCSEC_GSS(msg)) {
653 alloc_len = RNDUP(MAX_AUTH_BYTES + *msglen);
654 } else {
655 alloc_len = RNDUP(*msglen);
656 }
657
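/*
 * A sizing note on the code below: requests of up to 64K are rounded up
 * to one of three buckets -- 16K, 32K, or 64K -- presumably so that
 * long-reply buffers come from a small set of sizes; anything larger
 * keeps the rounded-up length computed above.
 */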
658 if (alloc_len <= 64 * 1024) {
659 if (alloc_len > 32 * 1024) {
660 alloc_len = 64 * 1024;
661 } else {
662 if (alloc_len > 16 * 1024) {
663 alloc_len = 32 * 1024;
664 } else {
665 alloc_len = 16 * 1024;
666 }
667 }
668 }
669
670 long_rpc.type = RDMA_LONG_BUFFER;
671 long_rpc.len = alloc_len;
672 if (rdma_buf_alloc(crdp->conn, &long_rpc)) {
673 return (SVC_RDMA_FAIL);
674 }
675
676 memp = long_rpc.addr;
677 xdrmem_create(&xdrslong, memp, alloc_len, XDR_ENCODE);
678
679 msg->rm_xid = clone_xprt->xp_xid;
680
681 if (!(xdr_replymsg(&xdrslong, msg) &&
682 (!has_args || SVCAUTH_WRAP(&clone_xprt->xp_auth, &xdrslong,
683 xdr_results, xdr_location)))) {
684 rdma_buf_free(crdp->conn, &long_rpc);
685 DTRACE_PROBE(krpc__e__svcrdma__longrep__authwrap);
686 return (SVC_RDMA_FAIL);
687 }
688
689 *final_len = XDR_GETPOS(&xdrslong);
690
691 DTRACE_PROBE1(krpc__i__replylen, uint_t, *final_len);
692 *numchunks = 0;
693 *freelen = 0;
694
695 wcl = crdp->cl_reply;
696 wcl->rb_longbuf = long_rpc;
697
698 count = *final_len;
699 while ((wcl != NULL) && (count > 0)) {
700
701 if (wcl->c_dmemhandle.mrc_rmr == 0)
702 break;
703
704 DTRACE_PROBE2(krpc__i__write__chunks, uint32_t, count,
705 uint32_t, wcl->c_len);
706
707 if (wcl->c_len > count) {
708 wcl->c_len = count;
709 }
710 wcl->w.c_saddr3 = (caddr_t)memp;
711
712 count -= wcl->c_len;
713 *numchunks += 1;
714 memp += wcl->c_len;
715 wcl = wcl->c_next;
716 }
717
718 /*
719 * Make the rest of the chunks 0-len.
720 */
721 while (wcl != NULL) {
722 if (wcl->c_dmemhandle.mrc_rmr == 0)
723 break;
724 wcl->c_len = 0;
725 wcl = wcl->c_next;
726 }
727
728 wcl = crdp->cl_reply;
729
730 /*
731 * MUST fail if there is still more data.
732 */
733 if (count > 0) {
734 rdma_buf_free(crdp->conn, &long_rpc);
735 DTRACE_PROBE(krpc__e__svcrdma__longrep__dlen__clist);
736 return (SVC_RDMA_FAIL);
737 }
738
739 if (clist_register(crdp->conn, wcl, CLIST_REG_SOURCE) != RDMA_SUCCESS) {
740 rdma_buf_free(crdp->conn, &long_rpc);
741 DTRACE_PROBE(krpc__e__svcrdma__longrep__clistreg);
742 return (SVC_RDMA_FAIL);
743 }
744
745 status = clist_syncmem(crdp->conn, wcl, CLIST_REG_SOURCE);
746
747 if (status) {
748 (void) clist_deregister(crdp->conn, wcl);
749 rdma_buf_free(crdp->conn, &long_rpc);
750 DTRACE_PROBE(krpc__e__svcrdma__longrep__syncmem);
751 return (SVC_RDMA_FAIL);
752 }
753
754 status = RDMA_WRITE(crdp->conn, wcl, WAIT);
755
756 (void) clist_deregister(crdp->conn, wcl);
757 rdma_buf_free(crdp->conn, &wcl->rb_longbuf);
758
759 if (status != RDMA_SUCCESS) {
760 DTRACE_PROBE(krpc__e__svcrdma__longrep__write);
761 return (SVC_RDMA_FAIL);
762 }
763
764 return (SVC_RDMA_SUCCESS);
765 }
766
767
768 static int
769 svc_compose_rpcmsg(SVCXPRT * clone_xprt, CONN * conn, xdrproc_t xdr_results,
770 caddr_t xdr_location, rdma_buf_t *rpcreply, XDR ** xdrs,
771 struct rpc_msg *msg, bool_t has_args, uint_t *len)
772 {
773 /*
774 * Get a pre-allocated buffer for rpc reply
775 */
776 rpcreply->type = SEND_BUFFER;
777 if (rdma_buf_alloc(conn, rpcreply)) {
778 DTRACE_PROBE(krpc__e__svcrdma__rpcmsg__reply__nofreebufs);
779 return (SVC_RDMA_FAIL);
780 }
781
782 xdrrdma_create(*xdrs, rpcreply->addr, rpcreply->len,
783 0, NULL, XDR_ENCODE, conn);
784
785 msg->rm_xid = clone_xprt->xp_xid;
786
787 if (has_args) {
788 if (!(xdr_replymsg(*xdrs, msg) &&
789 (!has_args ||
790 SVCAUTH_WRAP(&clone_xprt->xp_auth, *xdrs,
791 xdr_results, xdr_location)))) {
792 rdma_buf_free(conn, rpcreply);
793 DTRACE_PROBE(
794 krpc__e__svcrdma__rpcmsg__reply__authwrap1);
795 return (SVC_RDMA_FAIL);
796 }
797 } else {
798 if (!xdr_replymsg(*xdrs, msg)) {
799 rdma_buf_free(conn, rpcreply);
800 DTRACE_PROBE(
801 krpc__e__svcrdma__rpcmsg__reply__authwrap2);
802 return (SVC_RDMA_FAIL);
803 }
804 }
805
806 *len = XDR_GETPOS(*xdrs);
807
808 return (SVC_RDMA_SUCCESS);
809 }
810
811 /*
812 * Send rpc reply.
813 */
814 static bool_t
815 svc_rdma_ksend(SVCXPRT * clone_xprt, struct rpc_msg *msg)
816 {
817 XDR *xdrs_rpc = &(clone_xprt->xp_xdrout);
818 XDR xdrs_rhdr;
819 CONN *conn = NULL;
820 rdma_buf_t rbuf_resp = {0}, rbuf_rpc_resp = {0};
821
822 struct clone_rdma_data *crdp;
823 struct clist *cl_read = NULL;
824 struct clist *cl_send = NULL;
825 struct clist *cl_write = NULL;
826 xdrproc_t xdr_results; /* results XDR encoding function */
827 caddr_t xdr_location; /* response results pointer */
828
829 int retval = FALSE;
830 int status, msglen, num_wreply_segments = 0;
831 uint32_t rdma_credit = 0;
832 int freelen = 0;
833 bool_t has_args;
834 uint_t final_resp_len, rdma_response_op, vers;
835
836 bzero(&xdrs_rhdr, sizeof (XDR));
837 crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
838 conn = crdp->conn;
839
840 /*
841 * If there is a result procedure specified in the reply message,
842 * it will be processed in xdr_replymsg and SVCAUTH_WRAP.
843 * We need to make sure it won't be processed twice, so we null
844 * it for xdr_replymsg here.
845 */
846 has_args = FALSE;
847 if (msg->rm_reply.rp_stat == MSG_ACCEPTED &&
848 msg->rm_reply.rp_acpt.ar_stat == SUCCESS) {
849 if ((xdr_results = msg->acpted_rply.ar_results.proc) != NULL) {
850 has_args = TRUE;
851 xdr_location = msg->acpted_rply.ar_results.where;
852 msg->acpted_rply.ar_results.proc = xdr_void;
853 msg->acpted_rply.ar_results.where = NULL;
854 }
855 }
856
857 /*
858 * Given the limit on the inline response size (RPC_MSG_SZ),
859 * there is a need to make a guess as to the overall size of
860 * the response. If the resultant size is beyond the inline
861 * size, then the server needs to use the "reply chunk list"
862 * provided by the client (if the client provided one). An
863 * example of this type of response would be a READDIR
864 * response (e.g. a small directory read would fit in RPC_MSG_SZ,
865 * and that is the preference, but a large one may not fit).
866 *
867 * Combine the encoded size and the size of the true results
868 * and then make the decision about where to encode and send results.
869 *
870 * One important note, this calculation is ignoring the size
871 * of the encoding of the authentication overhead. The reason
872 * for this is rooted in the complexities of access to the
873 * encoded size of RPCSEC_GSS related authentication,
874 * integrity, and privacy.
875 *
876 * If it turns out that the encoded authentication bumps the
877 * response over the RPC_MSG_SZ limit, then the server may need to
878 * attempt to encode for the reply chunk list.
879 */
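/*
 * In short, the logic below is: if the estimated size fits under
 * RPC_MSG_SZ, try to encode an inline reply (RDMA_MSG); if the estimate
 * is too big, or the inline encode fails, fall back to the client's
 * reply chunk list and send an RDMA_NOMSG header after the reply body
 * has been pushed with RDMA_WRITE by svc_process_long_reply().
 */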
880
881 /*
882 * Calculate the size of the RPC response header and the
883 * encoded results.
884 */
885 msglen = xdr_sizeof(xdr_replymsg, msg);
886
887 if (msglen > 0) {
888 RSSTAT_INCR(rstotalreplies);
889 }
890 if (has_args)
891 msglen += xdrrdma_sizeof(xdr_results, xdr_location,
892 rdma_minchunk, NULL, NULL);
893
894 DTRACE_PROBE1(krpc__i__svcrdma__ksend__msglen, int, msglen);
895
896 status = SVC_RDMA_SUCCESS;
897
898 if (msglen < RPC_MSG_SZ) {
899 /*
900 * Looks like the response will fit in the inline
901 * response; let's try
902 */
903 RSSTAT_INCR(rstotalinlinereplies);
904
905 rdma_response_op = RDMA_MSG;
906
907 status = svc_compose_rpcmsg(clone_xprt, conn, xdr_results,
908 xdr_location, &rbuf_rpc_resp, &xdrs_rpc, msg,
909 has_args, &final_resp_len);
910
911 DTRACE_PROBE1(krpc__i__srdma__ksend__compose_status,
912 int, status);
913 DTRACE_PROBE1(krpc__i__srdma__ksend__compose_len,
914 int, final_resp_len);
915
916 if (status == SVC_RDMA_SUCCESS && crdp->cl_reply) {
917 clist_free(crdp->cl_reply);
918 crdp->cl_reply = NULL;
919 }
920 }
921
922 /*
923 * If the encode failed (size?) or the message really is
924 * larger than what is allowed, try the response chunk list.
925 */
926 if (status != SVC_RDMA_SUCCESS || msglen >= RPC_MSG_SZ) {
927 /*
928 * attempting to use a reply chunk list when there
929 * isn't one won't get very far...
930 */
931 if (crdp->cl_reply == NULL) {
932 DTRACE_PROBE(krpc__e__svcrdma__ksend__noreplycl);
933 goto out;
934 }
935
936 RSSTAT_INCR(rstotallongreplies);
937
938 msglen = xdr_sizeof(xdr_replymsg, msg);
939 msglen += xdrrdma_sizeof(xdr_results, xdr_location, 0,
940 NULL, NULL);
941
942 status = svc_process_long_reply(clone_xprt, xdr_results,
943 xdr_location, msg, has_args, &msglen, &freelen,
944 &num_wreply_segments, &final_resp_len);
945
946 DTRACE_PROBE1(krpc__i__svcrdma__ksend__longreplen,
947 int, final_resp_len);
948
949 if (status != SVC_RDMA_SUCCESS) {
950 DTRACE_PROBE(krpc__e__svcrdma__ksend__compose__failed);
951 goto out;
952 }
953
954 rdma_response_op = RDMA_NOMSG;
955 }
956
957 DTRACE_PROBE1(krpc__i__svcrdma__ksend__rdmamsg__len,
958 int, final_resp_len);
959
960 rbuf_resp.type = SEND_BUFFER;
961 if (rdma_buf_alloc(conn, &rbuf_resp)) {
962 rdma_buf_free(conn, &rbuf_rpc_resp);
963 DTRACE_PROBE(krpc__e__svcrdma__ksend__nofreebufs);
964 goto out;
965 }
966
967 rdma_credit = rdma_bufs_granted;
968
969 vers = RPCRDMA_VERS;
970 xdrmem_create(&xdrs_rhdr, rbuf_resp.addr, rbuf_resp.len, XDR_ENCODE);
971 (*(uint32_t *)rbuf_resp.addr) = msg->rm_xid;
972 /* Skip xid and set the xdr position accordingly. */
973 XDR_SETPOS(&xdrs_rhdr, sizeof (uint32_t));
974 if (!xdr_u_int(&xdrs_rhdr, &vers) ||
975 !xdr_u_int(&xdrs_rhdr, &rdma_credit) ||
976 !xdr_u_int(&xdrs_rhdr, &rdma_response_op)) {
977 rdma_buf_free(conn, &rbuf_rpc_resp);
978 rdma_buf_free(conn, &rbuf_resp);
979 DTRACE_PROBE(krpc__e__svcrdma__ksend__uint);
980 goto out;
981 }
982
983 /*
984 * Now XDR the read chunk list, which is actually always NULL.
985 */
986 (void) xdr_encode_rlist_svc(&xdrs_rhdr, cl_read);
987
988 /*
989 * Encode the write list -- we already drove the RDMA_WRITEs.
990 */
991 cl_write = crdp->cl_wlist;
992 if (!xdr_encode_wlist(&xdrs_rhdr, cl_write)) {
993 DTRACE_PROBE(krpc__e__svcrdma__ksend__enc__wlist);
994 rdma_buf_free(conn, &rbuf_rpc_resp);
995 rdma_buf_free(conn, &rbuf_resp);
996 goto out;
997 }
998
999 /*
1000 * XDR encode the RDMA_REPLY write chunk
1001 */
1002 if (!xdr_encode_reply_wchunk(&xdrs_rhdr, crdp->cl_reply,
1003 num_wreply_segments)) {
1004 rdma_buf_free(conn, &rbuf_rpc_resp);
1005 rdma_buf_free(conn, &rbuf_resp);
1006 goto out;
1007 }
1008
1009 clist_add(&cl_send, 0, XDR_GETPOS(&xdrs_rhdr), &rbuf_resp.handle,
1010 rbuf_resp.addr, NULL, NULL);
1011
1012 if (rdma_response_op == RDMA_MSG) {
1013 clist_add(&cl_send, 0, final_resp_len, &rbuf_rpc_resp.handle,
1014 rbuf_rpc_resp.addr, NULL, NULL);
1015 }
1016
1017 status = RDMA_SEND(conn, cl_send, msg->rm_xid);
1018
1019 if (status == RDMA_SUCCESS) {
1020 retval = TRUE;
1021 }
1022
1023 out:
1024 /*
1025 * Free up sendlist chunks
1026 */
1027 if (cl_send != NULL)
1028 clist_free(cl_send);
1029
1030 /*
1031 * Destroy private data for xdr rdma
1032 */
1033 if (clone_xprt->xp_xdrout.x_ops != NULL) {
1034 XDR_DESTROY(&(clone_xprt->xp_xdrout));
1035 }
1036
1037 if (crdp->cl_reply) {
1038 clist_free(crdp->cl_reply);
1039 crdp->cl_reply = NULL;
1040 }
1041
1042 /*
1043 * This is completely disgusting. If public is set it is
1044 * a pointer to a structure whose first field is the address
1045 * of the function to free that structure and any related
1046 * stuff. (see rrokfree in nfs_xdr.c).
1047 */
1048 if (xdrs_rpc->x_public) {
1049 /* LINTED pointer alignment */
1050 (**((int (**)()) xdrs_rpc->x_public)) (xdrs_rpc->x_public);
1051 }
1052
1053 if (xdrs_rhdr.x_ops != NULL) {
1054 XDR_DESTROY(&xdrs_rhdr);
1055 }
1056
1057 return (retval);
1058 }
1059
1060 /*
1061 * Deserialize arguments.
1062 */
1063 static bool_t
1064 svc_rdma_kgetargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args, caddr_t args_ptr)
1065 {
1066 if ((SVCAUTH_UNWRAP(&clone_xprt->xp_auth, &clone_xprt->xp_xdrin,
1067 xdr_args, args_ptr)) != TRUE)
1068 return (FALSE);
1069 return (TRUE);
1070 }
1071
1072 static bool_t
1073 svc_rdma_kfreeargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args,
1074 caddr_t args_ptr)
1075 {
1076 struct clone_rdma_data *crdp;
1077 bool_t retval = TRUE;
1078
1079 /*
1080 * If the cloned bit is true, then this transport-specific
1081 * rdma data has been duplicated into another cloned xprt. Do
1082 * not free it or release the connection; it is still in use. The
1083 * buffers will be freed and the connection released later by
1084 * SVC_CLONE_DESTROY().
1085 */
1086 crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
1087 if (crdp->cloned == TRUE) {
1088 crdp->cloned = 0;
1089 return (TRUE);
1090 }
1091
1092 /*
1093 * Free the args if needed, then XDR_DESTROY.
1094 */
1095 if (args_ptr) {
1096 XDR *xdrs = &clone_xprt->xp_xdrin;
1097
1098 xdrs->x_op = XDR_FREE;
1099 retval = (*xdr_args)(xdrs, args_ptr);
1100 }
1101
1102 XDR_DESTROY(&(clone_xprt->xp_xdrin));
1103 rdma_buf_free(crdp->conn, &crdp->rpcbuf);
1104 if (crdp->cl_reply) {
1105 clist_free(crdp->cl_reply);
1106 crdp->cl_reply = NULL;
1107 }
1108 RDMA_REL_CONN(crdp->conn);
1109
1110 return (retval);
1111 }
1112
1113 /* ARGSUSED */
1114 static int32_t *
1115 svc_rdma_kgetres(SVCXPRT *clone_xprt, int size)
1116 {
1117 return (NULL);
1118 }
1119
1120 /* ARGSUSED */
1121 static void
1122 svc_rdma_kfreeres(SVCXPRT *clone_xprt)
1123 {
1124 }
1125
1126 /*
1127 * The dup caching routines below provide a cache of non-failure
1128 * transaction IDs. RPC service routines can use this to detect
1129 * retransmissions and re-send a non-failure response.
1130 */
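/*
 * Typical flow, as a sketch: a service calls svc_rdma_kdup() before doing
 * the work; DUP_NEW means the xid has not been seen before, DUP_INPROGRESS
 * means another thread is already working on it, and DUP_DONE means the
 * cached response has been copied into 'res' and can simply be re-sent.
 * After replying, the service calls svc_rdma_kdupdone() to mark the entry
 * DUP_DONE (or DUP_DROP) and stash the response.
 */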
1131
1132 /*
1133 * MAXDUPREQS is the number of cached items. It should be adjusted
1134 * to the service load so that there is likely to be a response entry
1135 * when the first retransmission comes in.
1136 */
1137 #define MAXDUPREQS 1024
1138
1139 /*
1140 * This should be appropriately scaled to MAXDUPREQS.
1141 */
1142 #define DRHASHSZ 257
1143
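/*
 * DRHASHSZ is 257, a prime, so the modulo form of XIDHASH() below is the
 * one actually compiled in; the mask form only applies if DRHASHSZ is
 * ever changed to a power of two.
 */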
1144 #if ((DRHASHSZ & (DRHASHSZ - 1)) == 0)
1145 #define XIDHASH(xid) ((xid) & (DRHASHSZ - 1))
1146 #else
1147 #define XIDHASH(xid) ((xid) % DRHASHSZ)
1148 #endif
1149 #define DRHASH(dr) XIDHASH((dr)->dr_xid)
1150 #define REQTOXID(req) ((req)->rq_xprt->xp_xid)
1151
1152 static int rdmandupreqs = 0;
1153 int rdmamaxdupreqs = MAXDUPREQS;
1154 static kmutex_t rdmadupreq_lock;
1155 static struct dupreq *rdmadrhashtbl[DRHASHSZ];
1156 static int rdmadrhashstat[DRHASHSZ];
1157
1158 static void unhash(struct dupreq *);
1159
1160 /*
1161 * rdmadrmru points to the head of a circular linked list in lru order.
1162 * rdmadrmru->dr_next == drlru
1163 */
1164 struct dupreq *rdmadrmru;
1165
1166 /*
1167 * svc_rdma_kdup searches the request cache and returns 0 if the
1168 * request is not found in the cache. If it is found, then it
1169 * returns the state of the request (in progress or done) and
1170 * the status or attributes that were part of the original reply.
1171 */
1172 static int
1173 svc_rdma_kdup(struct svc_req *req, caddr_t res, int size, struct dupreq **drpp,
1174 bool_t *dupcachedp)
1175 {
1176 struct dupreq *dr;
1177 uint32_t xid;
1178 uint32_t drhash;
1179 int status;
1180
1181 xid = REQTOXID(req);
1182 mutex_enter(&rdmadupreq_lock);
1183 RSSTAT_INCR(rsdupchecks);
1184 /*
1185 * Check to see whether an entry already exists in the cache.
1186 */
1187 dr = rdmadrhashtbl[XIDHASH(xid)];
1188 while (dr != NULL) {
1189 if (dr->dr_xid == xid &&
1190 dr->dr_proc == req->rq_proc &&
1191 dr->dr_prog == req->rq_prog &&
1192 dr->dr_vers == req->rq_vers &&
1193 dr->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
1194 bcmp((caddr_t)dr->dr_addr.buf,
1195 (caddr_t)req->rq_xprt->xp_rtaddr.buf,
1196 dr->dr_addr.len) == 0) {
1197 status = dr->dr_status;
1198 if (status == DUP_DONE) {
1199 bcopy(dr->dr_resp.buf, res, size);
1200 if (dupcachedp != NULL)
1201 *dupcachedp = (dr->dr_resfree != NULL);
1202 } else {
1203 dr->dr_status = DUP_INPROGRESS;
1204 *drpp = dr;
1205 }
1206 RSSTAT_INCR(rsdupreqs);
1207 mutex_exit(&rdmadupreq_lock);
1208 return (status);
1209 }
1210 dr = dr->dr_chain;
1211 }
1212
1213 /*
1214 * There wasn't an entry; either allocate a new one or recycle
1215 * an old one.
1216 */
1217 if (rdmandupreqs < rdmamaxdupreqs) {
1218 dr = kmem_alloc(sizeof (*dr), KM_NOSLEEP);
1219 if (dr == NULL) {
1220 mutex_exit(&rdmadupreq_lock);
1221 return (DUP_ERROR);
1222 }
1223 dr->dr_resp.buf = NULL;
1224 dr->dr_resp.maxlen = 0;
1225 dr->dr_addr.buf = NULL;
1226 dr->dr_addr.maxlen = 0;
1227 if (rdmadrmru) {
1228 dr->dr_next = rdmadrmru->dr_next;
1229 rdmadrmru->dr_next = dr;
1230 } else {
1231 dr->dr_next = dr;
1232 }
1233 rdmandupreqs++;
1234 } else {
1235 dr = rdmadrmru->dr_next;
1236 while (dr->dr_status == DUP_INPROGRESS) {
1237 dr = dr->dr_next;
1238 if (dr == rdmadrmru->dr_next) {
1239 mutex_exit(&rdmadupreq_lock);
1240 return (DUP_ERROR);
1241 }
1242 }
1243 unhash(dr);
1244 if (dr->dr_resfree) {
1245 (*dr->dr_resfree)(dr->dr_resp.buf);
1246 }
1247 }
1248 dr->dr_resfree = NULL;
1249 rdmadrmru = dr;
1250
1251 dr->dr_xid = REQTOXID(req);
1252 dr->dr_prog = req->rq_prog;
1253 dr->dr_vers = req->rq_vers;
1254 dr->dr_proc = req->rq_proc;
1255 if (dr->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
1256 if (dr->dr_addr.buf != NULL)
1257 kmem_free(dr->dr_addr.buf, dr->dr_addr.maxlen);
1258 dr->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
1259 dr->dr_addr.buf = kmem_alloc(dr->dr_addr.maxlen, KM_NOSLEEP);
1260 if (dr->dr_addr.buf == NULL) {
1261 dr->dr_addr.maxlen = 0;
1262 dr->dr_status = DUP_DROP;
1263 mutex_exit(&rdmadupreq_lock);
1264 return (DUP_ERROR);
1265 }
1266 }
1267 dr->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
1268 bcopy(req->rq_xprt->xp_rtaddr.buf, dr->dr_addr.buf, dr->dr_addr.len);
1269 if (dr->dr_resp.maxlen < size) {
1270 if (dr->dr_resp.buf != NULL)
1271 kmem_free(dr->dr_resp.buf, dr->dr_resp.maxlen);
1272 dr->dr_resp.maxlen = (unsigned int)size;
1273 dr->dr_resp.buf = kmem_alloc(size, KM_NOSLEEP);
1274 if (dr->dr_resp.buf == NULL) {
1275 dr->dr_resp.maxlen = 0;
1276 dr->dr_status = DUP_DROP;
1277 mutex_exit(&rdmadupreq_lock);
1278 return (DUP_ERROR);
1279 }
1280 }
1281 dr->dr_status = DUP_INPROGRESS;
1282
1283 drhash = (uint32_t)DRHASH(dr);
1284 dr->dr_chain = rdmadrhashtbl[drhash];
1285 rdmadrhashtbl[drhash] = dr;
1286 rdmadrhashstat[drhash]++;
1287 mutex_exit(&rdmadupreq_lock);
1288 *drpp = dr;
1289 return (DUP_NEW);
1290 }
1291
1292 /*
1293 * svc_rdma_kdupdone marks the request done (DUP_DONE or DUP_DROP)
1294 * and stores the response.
1295 */
1296 static void
1297 svc_rdma_kdupdone(struct dupreq *dr, caddr_t res, void (*dis_resfree)(),
1298 int size, int status)
1299 {
1300 ASSERT(dr->dr_resfree == NULL);
1301 if (status == DUP_DONE) {
1302 bcopy(res, dr->dr_resp.buf, size);
1303 dr->dr_resfree = dis_resfree;
1304 }
1305 dr->dr_status = status;
1306 }
1307
1308 /*
1309 * This routine expects that the mutex, rdmadupreq_lock, is already held.
1310 */
1311 static void
1312 unhash(struct dupreq *dr)
1313 {
1314 struct dupreq *drt;
1315 struct dupreq *drtprev = NULL;
1316 uint32_t drhash;
1317
1318 ASSERT(MUTEX_HELD(&rdmadupreq_lock));
1319
1320 drhash = (uint32_t)DRHASH(dr);
1321 drt = rdmadrhashtbl[drhash];
1322 while (drt != NULL) {
1323 if (drt == dr) {
1324 rdmadrhashstat[drhash]--;
1325 if (drtprev == NULL) {
1326 rdmadrhashtbl[drhash] = drt->dr_chain;
1327 } else {
1328 drtprev->dr_chain = drt->dr_chain;
1329 }
1330 return;
1331 }
1332 drtprev = drt;
1333 drt = drt->dr_chain;
1334 }
1335 }
1336
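/*
 * Collapse the client's write chunk list into a single iovec: the base
 * is the first chunk's address and the length is the total of all chunk
 * lengths.  Returns FALSE if the transport is not RDMA.
 */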
1337 bool_t
1338 rdma_get_wchunk(struct svc_req *req, iovec_t *iov, struct clist *wlist)
1339 {
1340 struct clist *clist;
1341 uint32_t tlen;
1342
1343 if (req->rq_xprt->xp_type != T_RDMA) {
1344 return (FALSE);
1345 }
1346
1347 tlen = 0;
1348 clist = wlist;
1349 while (clist) {
1350 tlen += clist->c_len;
1351 clist = clist->c_next;
1352 }
1353
1354 /*
1355 * Set iov to addr+len of the first segment of the first wchunk of
1356 * the wlist sent by the client. krecv() already malloc'd a buffer
1357 * large enough, but registration is deferred until we write
1358 * the buffer back to the (NFS) client using RDMA_WRITE.
1359 */
1360 iov->iov_base = (caddr_t)(uintptr_t)wlist->w.c_saddr;
1361 iov->iov_len = tlen;
1362
1363 return (TRUE);
1364 }
1365
1366 /*
1367 * Routine to set up the read chunk lists.
1368 */
1369
1370 int
1371 rdma_setup_read_chunks(struct clist *wcl, uint32_t count, int *wcl_len)
1372 {
1373 int data_len, avail_len;
1374 uint_t round_len;
1375
1376 data_len = avail_len = 0;
1377
1378 while (wcl != NULL && count > 0) {
1379 if (wcl->c_dmemhandle.mrc_rmr == 0)
1380 break;
1381
1382 if (wcl->c_len < count) {
1383 data_len += wcl->c_len;
1384 avail_len = 0;
1385 } else {
1386 data_len += count;
1387 avail_len = wcl->c_len - count;
1388 wcl->c_len = count;
1389 }
1390 count -= wcl->c_len;
1391
1392 if (count == 0)
1393 break;
1394
1395 wcl = wcl->c_next;
1396 }
1397
1398 /*
1399 * MUST fail if there is still more data.
1400 */
1401 if (count > 0) {
1402 DTRACE_PROBE2(krpc__e__rdma_setup_read_chunks_clist_len,
1403 int, data_len, int, count);
1404 return (FALSE);
1405 }
1406
1407 /*
1408 * Round up the last chunk to a 4-byte boundary.
1409 */
1410 *wcl_len = roundup(data_len, BYTES_PER_XDR_UNIT);
1411 round_len = *wcl_len - data_len;
1412
1413 if (round_len) {
1414
1415 /*
1416 * If there is space in the current chunk,
1417 * add the roundup to the chunk.
1418 */
1419 if (avail_len >= round_len) {
1420 wcl->c_len += round_len;
1421 } else {
1422 /*
1423 * try the next one.
1424 */
1425 wcl = wcl->c_next;
1426 if ((wcl == NULL) || (wcl->c_len < round_len)) {
1427 DTRACE_PROBE1(
1428 krpc__e__rdma_setup_read_chunks_rndup,
1429 int, round_len);
1430 return (FALSE);
1431 }
1432 wcl->c_len = round_len;
1433 }
1434 }
1435
1436 wcl = wcl->c_next;
1437
1438 /*
1439 * Make the rest of the chunks 0-len.
1440 */
1441
1442 clist_zero_len(wcl);
1443
1444 return (TRUE);
1445 }
1446