xref: /spdk/module/accel/mlx5/accel_mlx5.c (revision d4d015a572e1af7b2818e44218c1e661a61545ec)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  */
4 
5 #include "spdk/env.h"
6 #include "spdk/thread.h"
7 #include "spdk/queue.h"
8 #include "spdk/log.h"
9 #include "spdk/string.h"
10 #include "spdk/likely.h"
11 #include "spdk/dma.h"
12 #include "spdk/json.h"
13 #include "spdk/util.h"
14 
15 #include "spdk_internal/mlx5.h"
16 #include "spdk_internal/rdma_utils.h"
17 #include "spdk/accel_module.h"
18 #include "spdk_internal/assert.h"
19 #include "spdk_internal/sgl.h"
20 #include "accel_mlx5.h"
21 
22 #include <infiniband/mlx5dv.h>
23 #include <rdma/rdma_cma.h>
24 
25 #define ACCEL_MLX5_QP_SIZE (256u)
26 #define ACCEL_MLX5_NUM_REQUESTS (2048u - 1)
27 #define ACCEL_MLX5_RECOVER_POLLER_PERIOD_US (10000)
28 #define ACCEL_MLX5_MAX_SGE (16u)
29 #define ACCEL_MLX5_MAX_WC (64u)
30 #define ACCEL_MLX5_MAX_MKEYS_IN_TASK (16u)
31 
32 /* Assume we have up to 16 devices */
33 #define ACCEL_MLX5_ALLOWED_DEVS_MAX_LEN ((SPDK_MLX5_DEV_MAX_NAME_LEN + 1) * 16)
34 
35 #define ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, task)	\
36 do {							\
37 	assert((qp)->wrs_submitted < (qp)->wrs_max);	\
38 	(qp)->wrs_submitted++;				\
39 	assert((task)->num_wrs < UINT16_MAX);		\
40 	(task)->num_wrs++;				\
41 } while (0)
42 
43 #define ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, task)	\
44 do {									\
45 	assert((dev)->wrs_in_cq < (dev)->wrs_in_cq_max);		\
46 	(dev)->wrs_in_cq++;						\
47 	assert((qp)->wrs_submitted < (qp)->wrs_max);			\
48 	(qp)->wrs_submitted++;						\
49 	assert((task)->num_wrs < UINT16_MAX);				\
50 	(task)->num_wrs++;						\
51 } while (0)
52 
53 struct accel_mlx5_io_channel;
54 struct accel_mlx5_task;
55 
56 struct accel_mlx5_dev_ctx {
57 	struct ibv_context *context;
58 	struct ibv_pd *pd;
59 	struct spdk_memory_domain *domain;
60 	struct spdk_mempool *psv_pool;
61 	TAILQ_ENTRY(accel_mlx5_dev_ctx) link;
62 	struct spdk_mlx5_psv **psvs;
63 	bool crypto_mkeys;
64 	bool sig_mkeys;
65 	bool crypto_multi_block;
66 };
67 
68 enum accel_mlx5_opcode {
69 	ACCEL_MLX5_OPC_COPY,
70 	ACCEL_MLX5_OPC_CRYPTO,
71 	ACCEL_MLX5_OPC_CRC32C,
72 	ACCEL_MLX5_OPC_LAST
73 };
74 
75 struct accel_mlx5_stats {
76 	uint64_t crypto_umrs;
77 	uint64_t sig_umrs;
78 	uint64_t rdma_reads;
79 	uint64_t rdma_writes;
80 	uint64_t polls;
81 	uint64_t idle_polls;
82 	uint64_t completions;
83 	uint64_t nomem_qdepth;
84 	uint64_t nomem_mkey;
85 	uint64_t opcodes[ACCEL_MLX5_OPC_LAST];
86 };
87 
88 struct accel_mlx5_module {
89 	struct spdk_accel_module_if module;
90 	struct accel_mlx5_stats stats;
91 	struct spdk_spinlock lock;
92 	struct accel_mlx5_dev_ctx *dev_ctxs;
93 	uint32_t num_ctxs;
94 	struct accel_mlx5_attr attr;
95 	char **allowed_devs;
96 	size_t allowed_devs_count;
97 	bool initialized;
98 	bool enabled;
99 	bool crypto_supported;
100 	bool crc32c_supported;
101 };
102 
103 struct accel_mlx5_sge {
104 	uint32_t src_sge_count;
105 	uint32_t dst_sge_count;
106 	struct ibv_sge src_sge[ACCEL_MLX5_MAX_SGE];
107 	struct ibv_sge dst_sge[ACCEL_MLX5_MAX_SGE];
108 };
109 
110 struct accel_mlx5_iov_sgl {
111 	struct iovec	*iov;
112 	uint32_t	iovcnt;
113 	uint32_t	iov_offset;
114 };
115 
116 struct accel_mlx5_psv_wrapper {
117 	uint32_t psv_index;
118 	struct {
119 		uint32_t error : 1;
120 		uint32_t reserved : 31;
121 	} bits;
122 	/* The mlx5 engine requires DMA-able memory. Use this member to hold a copy of the user's crc value
123 	 * since we don't know what kind of memory the user's buffer resides in. */
124 	uint32_t crc;
125 	uint32_t crc_lkey;
126 };
127 
128 struct accel_mlx5_task {
129 	struct spdk_accel_task base;
130 	struct accel_mlx5_iov_sgl src;
131 	struct accel_mlx5_iov_sgl dst;
132 	struct accel_mlx5_qp *qp;
133 	STAILQ_ENTRY(accel_mlx5_task) link;
134 	uint16_t num_reqs;
135 	uint16_t num_completed_reqs;
136 	uint16_t num_submitted_reqs;
137 	uint16_t num_ops; /* number of allocated mkeys or number of operations */
138 	uint16_t num_wrs; /* Number of outstanding operations which consume qp slot */
139 	union {
140 		struct {
141 			uint16_t blocks_per_req;
142 			uint16_t num_processed_blocks;
143 			uint16_t num_blocks;
144 		};
145 		struct {
146 			struct accel_mlx5_psv_wrapper *psv;
147 			uint32_t last_umr_len;
148 			uint8_t last_mkey_idx;
149 		};
150 	};
151 	union {
152 		uint8_t raw;
153 		struct {
154 			uint8_t inplace : 1;
155 			uint8_t enc_order : 2;
156 			uint8_t mlx5_opcode: 5;
157 		};
158 	};
159 	/* Keep this array last since not all elements might be accessed; this reduces the amount of data
160 	 * to be cached. */
161 	struct spdk_mlx5_mkey_pool_obj *mkeys[ACCEL_MLX5_MAX_MKEYS_IN_TASK];
162 };
163 
164 SPDK_STATIC_ASSERT(ACCEL_MLX5_MAX_MKEYS_IN_TASK <= UINT8_MAX, "uint8_t is used to iterate mkeys");
165 
166 struct accel_mlx5_qp {
167 	struct spdk_mlx5_qp *qp;
168 	struct ibv_qp *verbs_qp;
169 	struct accel_mlx5_dev *dev;
170 	struct accel_mlx5_io_channel *ch;
171 	/* Tasks submitted to HW. We can't complete a task, even in the error case, until we reap
172 	 * completions for all submitted requests. */
173 	STAILQ_HEAD(, accel_mlx5_task) in_hw;
174 	uint16_t wrs_submitted;
175 	uint16_t wrs_max;
176 	bool recovering;
177 	struct spdk_poller *recover_poller;
178 };
179 
180 struct accel_mlx5_dev {
181 	struct accel_mlx5_qp qp;
182 	struct spdk_mlx5_cq *cq;
183 	struct spdk_mlx5_mkey_pool *crypto_mkeys;
184 	struct spdk_mlx5_mkey_pool *sig_mkeys;
185 	struct spdk_rdma_utils_mem_map *mmap;
186 	struct accel_mlx5_dev_ctx *dev_ctx;
187 	uint16_t wrs_in_cq;
188 	uint16_t wrs_in_cq_max;
189 	uint16_t crypto_split_blocks;
190 	bool crypto_multi_block;
191 	/* Pending tasks waiting for resources (mkeys or qp/cq slots) */
192 	STAILQ_HEAD(, accel_mlx5_task) nomem;
193 	TAILQ_ENTRY(accel_mlx5_dev) link;
194 	struct accel_mlx5_stats stats;
195 };
196 
197 struct accel_mlx5_io_channel {
198 	struct accel_mlx5_dev *devs;
199 	struct spdk_poller *poller;
200 	uint32_t num_devs;
201 	/* Index in \b devs to be used for operations in a round-robin manner */
202 	uint32_t dev_idx;
203 };
204 
205 struct accel_mlx5_task_operations {
206 	int (*init)(struct accel_mlx5_task *task);
207 	int (*process)(struct accel_mlx5_task *task);
208 	int (*cont)(struct accel_mlx5_task *task);
209 	void (*complete)(struct accel_mlx5_task *task);
210 };
211 
212 struct accel_mlx5_psv_pool_iter_cb_args {
213 	struct accel_mlx5_dev_ctx *dev;
214 	struct spdk_rdma_utils_mem_map *map;
215 	int rc;
216 };
217 
218 struct accel_mlx5_dump_stats_ctx {
219 	struct accel_mlx5_stats total;
220 	struct spdk_json_write_ctx *w;
221 	enum accel_mlx5_dump_state_level level;
222 	accel_mlx5_dump_stat_done_cb cb;
223 	void *ctx;
224 };
225 
226 static struct accel_mlx5_module g_accel_mlx5;
227 
228 static inline void
229 accel_mlx5_iov_sgl_init(struct accel_mlx5_iov_sgl *s, struct iovec *iov, uint32_t iovcnt)
230 {
231 	s->iov = iov;
232 	s->iovcnt = iovcnt;
233 	s->iov_offset = 0;
234 }
235 
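/* Advance the iov sgl by 'step' bytes: grow the offset within the current iov entry and move to the
 * following entries once the current one is fully consumed. */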
236 static inline void
237 accel_mlx5_iov_sgl_advance(struct accel_mlx5_iov_sgl *s, uint32_t step)
238 {
239 	s->iov_offset += step;
240 	while (s->iovcnt > 0) {
241 		assert(s->iov != NULL);
242 		if (s->iov_offset < s->iov->iov_len) {
243 			break;
244 		}
245 
246 		s->iov_offset -= s->iov->iov_len;
247 		s->iov++;
248 		s->iovcnt--;
249 	}
250 }
251 
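/* Move the iov sgl backwards by 'step' bytes, stepping back to previous iov entries as needed.
 * 'max_iovs' is the total number of entries in the underlying iov array and bounds the unwind. */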
252 static inline void
253 accel_mlx5_iov_sgl_unwind(struct accel_mlx5_iov_sgl *s, uint32_t max_iovs, uint32_t step)
254 {
255 	SPDK_DEBUGLOG(accel_mlx5, "iov %p, iovcnt %u, max %u, offset %u, step %u\n", s->iov, s->iovcnt,
256 		      max_iovs, s->iov_offset, step);
257 	while (s->iovcnt <= max_iovs) {
258 		assert(s->iov != NULL);
259 		if (s->iov_offset >= step) {
260 			s->iov_offset -= step;
261 			SPDK_DEBUGLOG(accel_mlx5, "\tEND, iov %p, iovcnt %u, offset %u\n", s->iov, s->iovcnt,
262 				      s->iov_offset);
263 			return;
264 		}
265 		step -= s->iov_offset;
266 		s->iov--;
267 		s->iovcnt++;
268 		s->iov_offset = s->iov->iov_len;
269 		SPDK_DEBUGLOG(accel_mlx5, "\tiov %p, iovcnt %u, offset %u, step %u\n", s->iov, s->iovcnt,
270 			      s->iov_offset, step);
271 	}
272 
273 	SPDK_ERRLOG("Can't unwind iovs, remaining %u\n", step);
274 	assert(0);
275 }
276 
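/* Shrink the filled sge array by 'step' bytes starting from the last entry. Returns the number of
 * sge entries that remain in use after the unwind. */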
277 static inline int
278 accel_mlx5_sge_unwind(struct ibv_sge *sge, uint32_t sge_count, uint32_t step)
279 {
280 	int i;
281 
282 	assert(sge_count > 0);
283 	SPDK_DEBUGLOG(accel_mlx5, "sge %p, count %u, step %u\n", sge, sge_count, step);
284 	for (i = (int)sge_count - 1; i >= 0; i--) {
285 		if (sge[i].length > step) {
286 			sge[i].length -= step;
287 			SPDK_DEBUGLOG(accel_mlx5, "\tsge[%u] len %u, step %u\n", i, sge[i].length, step);
288 			return (int)i + 1;
289 		}
290 		SPDK_DEBUGLOG(accel_mlx5, "\tsge[%u] len %u, step %u\n", i, sge[i].length, step);
291 		step -= sge[i].length;
292 	}
293 
294 	SPDK_ERRLOG("Can't unwind sge, remaining %u\n", step);
295 	assert(step == 0);
296 
297 	return 0;
298 }
299 
300 static inline void
301 accel_mlx5_crypto_task_complete(struct accel_mlx5_task *task)
302 {
303 	struct accel_mlx5_dev *dev = task->qp->dev;
304 
305 	assert(task->num_ops);
306 	spdk_mlx5_mkey_pool_put_bulk(dev->crypto_mkeys, task->mkeys, task->num_ops);
307 	spdk_accel_task_complete(&task->base, 0);
308 }
309 
310 static inline void
311 accel_mlx5_task_fail(struct accel_mlx5_task *task, int rc)
312 {
313 	struct accel_mlx5_dev *dev = task->qp->dev;
314 
315 	assert(task->num_reqs == task->num_completed_reqs);
316 	SPDK_DEBUGLOG(accel_mlx5, "Fail task %p, opc %d, rc %d\n", task, task->base.op_code, rc);
317 
318 	if (task->num_ops) {
319 		if (task->mlx5_opcode == ACCEL_MLX5_OPC_CRYPTO) {
320 			spdk_mlx5_mkey_pool_put_bulk(dev->crypto_mkeys, task->mkeys, task->num_ops);
321 		}
322 		if (task->mlx5_opcode == ACCEL_MLX5_OPC_CRC32C) {
323 			spdk_mlx5_mkey_pool_put_bulk(dev->sig_mkeys, task->mkeys, task->num_ops);
324 			spdk_mempool_put(dev->dev_ctx->psv_pool, task->psv);
325 		}
326 	}
327 	spdk_accel_task_complete(&task->base, rc);
328 }
329 
330 static int
331 accel_mlx5_translate_addr(void *addr, size_t size, struct spdk_memory_domain *domain,
332 			  void *domain_ctx, struct accel_mlx5_dev *dev, struct ibv_sge *sge)
333 {
334 	struct spdk_rdma_utils_memory_translation map_translation;
335 	struct spdk_memory_domain_translation_result domain_translation;
336 	struct spdk_memory_domain_translation_ctx local_ctx;
337 	int rc;
338 
339 	if (domain) {
340 		domain_translation.size = sizeof(struct spdk_memory_domain_translation_result);
341 		local_ctx.size = sizeof(local_ctx);
342 		local_ctx.rdma.ibv_qp = dev->qp.verbs_qp;
343 		rc = spdk_memory_domain_translate_data(domain, domain_ctx, dev->dev_ctx->domain,
344 						       &local_ctx, addr, size, &domain_translation);
345 		if (spdk_unlikely(rc || domain_translation.iov_count != 1)) {
346 			SPDK_ERRLOG("Memory domain translation failed, addr %p, length %zu, iovcnt %u\n", addr, size,
347 				    domain_translation.iov_count);
348 			if (rc == 0) {
349 				rc = -EINVAL;
350 			}
351 
352 			return rc;
353 		}
354 		sge->lkey = domain_translation.rdma.lkey;
355 		sge->addr = (uint64_t) domain_translation.iov.iov_base;
356 		sge->length = domain_translation.iov.iov_len;
357 	} else {
358 		rc = spdk_rdma_utils_get_translation(dev->mmap, addr, size,
359 						     &map_translation);
360 		if (spdk_unlikely(rc)) {
361 			SPDK_ERRLOG("Memory translation failed, addr %p, length %zu\n", addr, size);
362 			return rc;
363 		}
364 		sge->lkey = spdk_rdma_utils_memory_translation_get_lkey(&map_translation);
365 		sge->addr = (uint64_t)addr;
366 		sge->length = size;
367 	}
368 
369 	return 0;
370 }
371 
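/* Fill up to ACCEL_MLX5_MAX_SGE entries of 'sge' with translated addresses taken from 'iovs',
 * consuming at most 'len' bytes. The number of bytes that could not be covered is stored in
 * '_remaining'. Returns the number of filled sge entries or a negative errno on translation failure. */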
372 static inline int
373 accel_mlx5_fill_block_sge(struct accel_mlx5_dev *dev, struct ibv_sge *sge,
374 			  struct accel_mlx5_iov_sgl *iovs, uint32_t len, uint32_t *_remaining,
375 			  struct spdk_memory_domain *domain, void *domain_ctx)
376 {
377 	void *addr;
378 	uint32_t remaining = len;
379 	uint32_t size;
380 	int i = 0;
381 	int rc;
382 
383 	while (remaining && i < (int)ACCEL_MLX5_MAX_SGE) {
384 		size = spdk_min(remaining, iovs->iov->iov_len - iovs->iov_offset);
385 		addr = (void *)iovs->iov->iov_base + iovs->iov_offset;
386 		rc = accel_mlx5_translate_addr(addr, size, domain, domain_ctx, dev, &sge[i]);
387 		if (spdk_unlikely(rc)) {
388 			return rc;
389 		}
390 		SPDK_DEBUGLOG(accel_mlx5, "\t sge[%d]: lkey %u, len %u, addr %"PRIx64"\n", i, sge[i].lkey,
391 			      sge[i].length, sge[i].addr);
392 		accel_mlx5_iov_sgl_advance(iovs, size);
393 		i++;
394 		assert(remaining >= size);
395 		remaining -= size;
396 	}
397 	*_remaining = remaining;
398 
399 	return i;
400 }
401 
402 static inline bool
403 accel_mlx5_compare_iovs(struct iovec *v1, struct iovec *v2, uint32_t iovcnt)
404 {
405 	return memcmp(v1, v2, sizeof(*v1) * iovcnt) == 0;
406 }
407 
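/* Return the number of WQEs that can still be submitted to the qp, or 0 if there is no free CQ slot
 * for one more signaled completion. */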
408 static inline uint16_t
409 accel_mlx5_dev_get_available_slots(struct accel_mlx5_dev *dev, struct accel_mlx5_qp *qp)
410 {
411 	assert(qp->wrs_max >= qp->wrs_submitted);
412 	assert(dev->wrs_in_cq_max >= dev->wrs_in_cq);
413 
414 	/* Each task produces only 1 CQE (only the last WR is signaled), so we need 1 free CQ slot */
415 	if (spdk_unlikely(dev->wrs_in_cq == dev->wrs_in_cq_max)) {
416 		return 0;
417 	}
418 
419 	return qp->wrs_max - qp->wrs_submitted;
420 }
421 
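/* Allocate up to ACCEL_MLX5_MAX_MKEYS_IN_TASK mkeys from 'pool' for the remaining requests of the
 * task. Returns the number of allocated mkeys (also stored in task->num_ops) or 0 if the pool is
 * empty. */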
422 static inline uint32_t
423 accel_mlx5_task_alloc_mkeys(struct accel_mlx5_task *task, struct spdk_mlx5_mkey_pool *pool)
424 {
425 	uint32_t num_ops;
426 	int rc;
427 
428 	assert(task->num_reqs > task->num_completed_reqs);
429 	num_ops = task->num_reqs - task->num_completed_reqs;
430 	num_ops = spdk_min(num_ops, ACCEL_MLX5_MAX_MKEYS_IN_TASK);
431 	if (!num_ops) {
432 		return 0;
433 	}
434 	rc = spdk_mlx5_mkey_pool_get_bulk(pool, task->mkeys, num_ops);
435 	if (spdk_unlikely(rc)) {
436 		return 0;
437 	}
438 	assert(num_ops <= UINT16_MAX);
439 	task->num_ops = num_ops;
440 
441 	return num_ops;
442 }
443 
444 static inline uint8_t
445 bs_to_bs_selector(uint32_t bs)
446 {
447 	switch (bs) {
448 	case 512:
449 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_512;
450 	case 520:
451 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_520;
452 	case 4096:
453 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_4096;
454 	case 4160:
455 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_4160;
456 	default:
457 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_RESERVED;
458 	}
459 }
460 
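/* Build src (and, for out-of-place tasks, dst) sges for 'num_blocks' data blocks and post a crypto
 * UMR that binds them to 'mkey'. If the payload is too fragmented, the UMR may cover fewer blocks
 * than requested; the number of blocks actually covered is added to num_processed_blocks. */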
461 static inline int
462 accel_mlx5_configure_crypto_umr(struct accel_mlx5_task *mlx5_task, struct accel_mlx5_sge *sge,
463 				uint32_t mkey, uint32_t num_blocks, struct spdk_mlx5_crypto_dek_data *dek_data)
464 {
465 	struct spdk_mlx5_umr_crypto_attr cattr;
466 	struct spdk_mlx5_umr_attr umr_attr;
467 	struct accel_mlx5_qp *qp = mlx5_task->qp;
468 	struct accel_mlx5_dev *dev = qp->dev;
469 	struct spdk_accel_task *task = &mlx5_task->base;
470 	uint32_t length, remaining = 0, block_size = task->block_size;
471 	int rc;
472 
473 	length = num_blocks * block_size;
474 	SPDK_DEBUGLOG(accel_mlx5, "task %p, domain %p, len %u, blocks %u\n", task, task->src_domain, length,
475 		      num_blocks);
476 	rc = accel_mlx5_fill_block_sge(dev, sge->src_sge, &mlx5_task->src,  length, &remaining,
477 				       task->src_domain, task->src_domain_ctx);
478 	if (spdk_unlikely(rc <= 0)) {
479 		if (rc == 0) {
480 			rc = -EINVAL;
481 		}
482 		SPDK_ERRLOG("failed set src sge, rc %d\n", rc);
483 		return rc;
484 	}
485 	sge->src_sge_count = rc;
486 	if (spdk_unlikely(remaining)) {
487 		uint32_t new_len = length - remaining;
488 		uint32_t aligned_len, updated_num_blocks;
489 
490 		SPDK_DEBUGLOG(accel_mlx5, "Incorrect src iovs, handled %u out of %u bytes\n", new_len, length);
491 		if (new_len < block_size) {
492 			/* We need to process at least 1 block. If the buffer is too fragmented, we can't do
493 			 * anything. */
494 			return -ERANGE;
495 		}
496 
497 		/* Regular integer division, we need to round down to the previous block boundary */
498 		updated_num_blocks = new_len / block_size;
499 		assert(updated_num_blocks);
500 		assert(updated_num_blocks < num_blocks);
501 		aligned_len = updated_num_blocks * block_size;
502 
503 		if (aligned_len < new_len) {
504 			uint32_t dt = new_len - aligned_len;
505 
506 			/* We can't process a partial block, we need to unwind the src iov_sgl and sge to the
507 			 * previous block boundary */
508 			SPDK_DEBUGLOG(accel_mlx5, "task %p, unwind src sge for %u bytes\n", task, dt);
509 			accel_mlx5_iov_sgl_unwind(&mlx5_task->src, task->s.iovcnt, dt);
510 			sge->src_sge_count = accel_mlx5_sge_unwind(sge->src_sge, sge->src_sge_count, dt);
511 			if (!sge->src_sge_count) {
512 				return -ERANGE;
513 			}
514 		}
515 		SPDK_DEBUGLOG(accel_mlx5, "task %p, UMR len %u -> %u\n", task, length, aligned_len);
516 		length = aligned_len;
517 		num_blocks = updated_num_blocks;
518 	}
519 
520 	cattr.xts_iv = task->iv + mlx5_task->num_processed_blocks;
521 	cattr.keytag = 0;
522 	cattr.dek_obj_id = dek_data->dek_obj_id;
523 	cattr.tweak_mode = dek_data->tweak_mode;
524 	cattr.enc_order = mlx5_task->enc_order;
525 	cattr.bs_selector = bs_to_bs_selector(mlx5_task->base.block_size);
526 	if (spdk_unlikely(cattr.bs_selector == SPDK_MLX5_BLOCK_SIZE_SELECTOR_RESERVED)) {
527 		SPDK_ERRLOG("unsupported block size %u\n", mlx5_task->base.block_size);
528 		return -EINVAL;
529 	}
530 	umr_attr.mkey = mkey;
531 	umr_attr.sge = sge->src_sge;
532 
533 	if (!mlx5_task->inplace) {
534 		SPDK_DEBUGLOG(accel_mlx5, "task %p, dst sge, domain %p, len %u\n", task, task->dst_domain, length);
535 		rc = accel_mlx5_fill_block_sge(dev, sge->dst_sge, &mlx5_task->dst, length, &remaining,
536 					       task->dst_domain, task->dst_domain_ctx);
537 		if (spdk_unlikely(rc <= 0)) {
538 			if (rc == 0) {
539 				rc = -EINVAL;
540 			}
541 			SPDK_ERRLOG("failed set dst sge, rc %d\n", rc);
542 			return rc;
543 		}
544 		sge->dst_sge_count = rc;
545 		if (spdk_unlikely(remaining)) {
546 			uint32_t new_len = length - remaining;
547 			uint32_t aligned_len, updated_num_blocks, dt;
548 
549 			SPDK_DEBUGLOG(accel_mlx5, "Incorrect dst iovs, handled %u out of %u bytes\n", new_len, length);
550 			if (new_len < block_size) {
551 				/* We need to process at least 1 block. If the buffer is too fragmented, we can't do
552 				 * anything. */
553 				return -ERANGE;
554 			}
555 
556 			/* Regular integer division, we need to round down to the previous block boundary */
557 			updated_num_blocks = new_len / block_size;
558 			assert(updated_num_blocks);
559 			assert(updated_num_blocks < num_blocks);
560 			aligned_len = updated_num_blocks * block_size;
561 
562 			if (aligned_len < new_len) {
563 				dt = new_len - aligned_len;
564 				assert(dt > 0 && dt < length);
565 				/* We can't process a partial block, we need to unwind the src and dst iov_sgl and sge
566 				 * to the previous block boundary */
567 				SPDK_DEBUGLOG(accel_mlx5, "task %p, unwind dst sge for %u bytes\n", task, dt);
568 				accel_mlx5_iov_sgl_unwind(&mlx5_task->dst, task->d.iovcnt, dt);
569 				sge->dst_sge_count = accel_mlx5_sge_unwind(sge->dst_sge, sge->dst_sge_count, dt);
570 				assert(sge->dst_sge_count > 0 && sge->dst_sge_count <= ACCEL_MLX5_MAX_SGE);
571 				if (!sge->dst_sge_count) {
572 					return -ERANGE;
573 				}
574 			}
575 			assert(length > aligned_len);
576 			dt = length - aligned_len;
577 			SPDK_DEBUGLOG(accel_mlx5, "task %p, unwind src sge for %u bytes\n", task, dt);
578 			/* The same for the src iov_sgl and sge. In the worst case we can unwind SRC twice */
579 			accel_mlx5_iov_sgl_unwind(&mlx5_task->src, task->s.iovcnt, dt);
580 			sge->src_sge_count = accel_mlx5_sge_unwind(sge->src_sge, sge->src_sge_count, dt);
581 			assert(sge->src_sge_count > 0 && sge->src_sge_count <= ACCEL_MLX5_MAX_SGE);
582 			if (!sge->src_sge_count) {
583 				return -ERANGE;
584 			}
585 			SPDK_DEBUGLOG(accel_mlx5, "task %p, UMR len %u -> %u\n", task, length, aligned_len);
586 			length = aligned_len;
587 			num_blocks = updated_num_blocks;
588 		}
589 	}
590 
591 	SPDK_DEBUGLOG(accel_mlx5,
592 		      "task %p: bs %u, iv %"PRIu64", enc_on_tx %d, tweak_mode %d, len %u, mkey %x, blocks %u\n",
593 		      mlx5_task, task->block_size, cattr.xts_iv, mlx5_task->enc_order, cattr.tweak_mode, length, mkey,
594 		      num_blocks);
595 
596 	umr_attr.sge_count = sge->src_sge_count;
597 	umr_attr.umr_len = length;
598 	assert((uint32_t)mlx5_task->num_processed_blocks + num_blocks <= UINT16_MAX);
599 	mlx5_task->num_processed_blocks += num_blocks;
600 
601 	rc = spdk_mlx5_umr_configure_crypto(qp->qp, &umr_attr, &cattr, 0, 0);
602 
603 	return rc;
604 }
605 
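/* Submit as many UMR + RDMA_READ pairs as allocated mkeys and qp slots allow. Only the last
 * RDMA_READ is signaled, so the whole batch completes with a single CQE. */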
606 static inline int
607 accel_mlx5_crypto_task_process(struct accel_mlx5_task *mlx5_task)
608 {
609 	struct accel_mlx5_sge sges[ACCEL_MLX5_MAX_MKEYS_IN_TASK];
610 	struct spdk_mlx5_crypto_dek_data dek_data;
611 	struct accel_mlx5_qp *qp = mlx5_task->qp;
612 	struct accel_mlx5_dev *dev = qp->dev;
613 	/* First RDMA after UMR must have a SMALL_FENCE */
614 	uint32_t first_rdma_fence = SPDK_MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
615 	uint16_t num_blocks;
616 	uint16_t num_ops = spdk_min(mlx5_task->num_reqs - mlx5_task->num_completed_reqs,
617 				    mlx5_task->num_ops);
618 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
619 	uint16_t i;
620 	int rc;
621 
622 	assert(qp_slot > 1);
623 	num_ops = spdk_min(num_ops, qp_slot >> 1);
624 	if (spdk_unlikely(!num_ops)) {
625 		return -EINVAL;
626 	}
627 
628 	rc = spdk_mlx5_crypto_get_dek_data(mlx5_task->base.crypto_key->priv, dev->dev_ctx->pd, &dek_data);
629 	if (spdk_unlikely(rc)) {
630 		return rc;
631 	}
632 
633 	mlx5_task->num_wrs = 0;
634 	SPDK_DEBUGLOG(accel_mlx5, "begin, task, %p, reqs: total %u, submitted %u, completed %u\n",
635 		      mlx5_task, mlx5_task->num_reqs, mlx5_task->num_submitted_reqs, mlx5_task->num_completed_reqs);
636 	for (i = 0; i < num_ops; i++) {
637 		if (mlx5_task->num_submitted_reqs + i + 1 == mlx5_task->num_reqs) {
638 			/* The last request may consume fewer blocks than calculated if crypto_multi_block is true */
639 			assert(mlx5_task->num_blocks > mlx5_task->num_submitted_reqs);
640 			num_blocks = mlx5_task->num_blocks - mlx5_task->num_processed_blocks;
641 		} else {
642 			num_blocks = mlx5_task->blocks_per_req;
643 		}
644 
645 		rc = accel_mlx5_configure_crypto_umr(mlx5_task, &sges[i], mlx5_task->mkeys[i]->mkey, num_blocks,
646 						     &dek_data);
647 		if (spdk_unlikely(rc)) {
648 			SPDK_ERRLOG("UMR configure failed with %d\n", rc);
649 			return rc;
650 		}
651 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
652 		dev->stats.crypto_umrs++;
653 	}
654 
655 	/* Loop over `num_ops - 1` operations; the last one is posted separately with a signaled completion */
656 	for (i = 0; i < num_ops - 1; i++) {
657 		/* The UMR mkey is the remote source of the RDMA_READ - data flows from the UMR to the sge */
658 		if (mlx5_task->inplace) {
659 			rc = spdk_mlx5_qp_rdma_read(qp->qp, sges[i].src_sge, sges[i].src_sge_count, 0,
660 						    mlx5_task->mkeys[i]->mkey, 0, first_rdma_fence);
661 		} else {
662 			rc = spdk_mlx5_qp_rdma_read(qp->qp, sges[i].dst_sge, sges[i].dst_sge_count, 0,
663 						    mlx5_task->mkeys[i]->mkey, 0, first_rdma_fence);
664 		}
665 		if (spdk_unlikely(rc)) {
666 			SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
667 			return rc;
668 		}
669 
670 		first_rdma_fence = 0;
671 		assert(mlx5_task->num_submitted_reqs < mlx5_task->num_reqs);
672 		assert(mlx5_task->num_submitted_reqs < UINT16_MAX);
673 		mlx5_task->num_submitted_reqs++;
674 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
675 		dev->stats.rdma_reads++;
676 	}
677 
678 	if (mlx5_task->inplace) {
679 		rc = spdk_mlx5_qp_rdma_read(qp->qp, sges[i].src_sge, sges[i].src_sge_count, 0,
680 					    mlx5_task->mkeys[i]->mkey, (uint64_t)mlx5_task, first_rdma_fence | SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
681 	} else {
682 		rc = spdk_mlx5_qp_rdma_read(qp->qp, sges[i].dst_sge, sges[i].dst_sge_count, 0,
683 					    mlx5_task->mkeys[i]->mkey, (uint64_t)mlx5_task, first_rdma_fence | SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
684 	}
685 	if (spdk_unlikely(rc)) {
686 		SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
687 		return rc;
688 	}
689 
690 	assert(mlx5_task->num_submitted_reqs < mlx5_task->num_reqs);
691 	assert(mlx5_task->num_submitted_reqs < UINT16_MAX);
692 	mlx5_task->num_submitted_reqs++;
693 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, mlx5_task);
694 	dev->stats.rdma_reads++;
695 	STAILQ_INSERT_TAIL(&qp->in_hw, mlx5_task, link);
696 
697 	if (spdk_unlikely(mlx5_task->num_submitted_reqs == mlx5_task->num_reqs &&
698 			  mlx5_task->num_blocks > mlx5_task->num_processed_blocks)) {
699 		/* We hit the "out of sge entries" case with a highly fragmented payload.
700 		 * In that case accel_mlx5_configure_crypto_umr handled fewer data blocks
701 		 * than expected. That means we need at least 1 more request to complete
702 		 * this task; this request will be executed once all submitted ones have
703 		 * completed. */
704 		SPDK_DEBUGLOG(accel_mlx5, "task %p, processed %u/%u blocks, add extra req\n", mlx5_task,
705 			      mlx5_task->num_processed_blocks, mlx5_task->num_blocks);
706 		mlx5_task->num_reqs++;
707 	}
708 
709 	SPDK_DEBUGLOG(accel_mlx5, "end, task, %p, reqs: total %u, submitted %u, completed %u\n", mlx5_task,
710 		      mlx5_task->num_reqs, mlx5_task->num_submitted_reqs, mlx5_task->num_completed_reqs);
711 
712 	return 0;
713 }
714 
715 static inline int
716 accel_mlx5_crypto_task_continue(struct accel_mlx5_task *task)
717 {
718 	struct accel_mlx5_qp *qp = task->qp;
719 	struct accel_mlx5_dev *dev = qp->dev;
720 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
721 
722 	assert(task->num_reqs > task->num_completed_reqs);
723 	if (task->num_ops == 0) {
724 		/* No mkeys allocated, try to allocate now */
725 		if (spdk_unlikely(!accel_mlx5_task_alloc_mkeys(task, dev->crypto_mkeys))) {
726 			/* Pool is empty, queue this task */
727 			STAILQ_INSERT_TAIL(&dev->nomem, task, link);
728 			dev->stats.nomem_mkey++;
729 			return -ENOMEM;
730 		}
731 	}
732 	/* We need to post at least 1 UMR and 1 RDMA operation */
733 	if (spdk_unlikely(qp_slot < 2)) {
734 		/* QP is full, queue this task */
735 		STAILQ_INSERT_TAIL(&dev->nomem, task, link);
736 		task->qp->dev->stats.nomem_qdepth++;
737 		return -ENOMEM;
738 	}
739 
740 	return accel_mlx5_crypto_task_process(task);
741 }
742 
743 static inline int
744 accel_mlx5_crypto_task_init(struct accel_mlx5_task *mlx5_task)
745 {
746 	struct spdk_accel_task *task = &mlx5_task->base;
747 	struct accel_mlx5_dev *dev = mlx5_task->qp->dev;
748 	uint64_t src_nbytes = task->nbytes;
749 #ifdef DEBUG
750 	uint64_t dst_nbytes;
751 	uint32_t i;
752 #endif
753 	bool crypto_key_ok;
754 
755 	crypto_key_ok = (task->crypto_key && task->crypto_key->module_if == &g_accel_mlx5.module &&
756 			 task->crypto_key->priv);
757 	if (spdk_unlikely((task->nbytes % mlx5_task->base.block_size != 0) || !crypto_key_ok)) {
758 		if (crypto_key_ok) {
759 			SPDK_ERRLOG("src length %"PRIu64" is not a multiple of the block size %u\n", task->nbytes,
760 				    mlx5_task->base.block_size);
761 		} else {
762 			SPDK_ERRLOG("Wrong crypto key provided\n");
763 		}
764 		return -EINVAL;
765 	}
766 
767 	assert(src_nbytes / mlx5_task->base.block_size <= UINT16_MAX);
768 	mlx5_task->num_blocks = src_nbytes / mlx5_task->base.block_size;
769 	accel_mlx5_iov_sgl_init(&mlx5_task->src, task->s.iovs, task->s.iovcnt);
770 	if (task->d.iovcnt == 0 || (task->d.iovcnt == task->s.iovcnt &&
771 				    accel_mlx5_compare_iovs(task->d.iovs, task->s.iovs, task->s.iovcnt))) {
772 		mlx5_task->inplace = 1;
773 	} else {
774 #ifdef DEBUG
775 		dst_nbytes = 0;
776 		for (i = 0; i < task->d.iovcnt; i++) {
777 			dst_nbytes += task->d.iovs[i].iov_len;
778 		}
779 
780 		if (spdk_unlikely(src_nbytes != dst_nbytes)) {
781 			return -EINVAL;
782 		}
783 #endif
784 		mlx5_task->inplace = 0;
785 		accel_mlx5_iov_sgl_init(&mlx5_task->dst, task->d.iovs, task->d.iovcnt);
786 	}
787 
788 	if (dev->crypto_multi_block) {
789 		if (dev->crypto_split_blocks) {
790 			assert(SPDK_CEIL_DIV(mlx5_task->num_blocks, dev->crypto_split_blocks) <= UINT16_MAX);
791 			mlx5_task->num_reqs = SPDK_CEIL_DIV(mlx5_task->num_blocks, dev->crypto_split_blocks);
792 			/* The last req may consume fewer blocks */
793 			mlx5_task->blocks_per_req = spdk_min(mlx5_task->num_blocks, dev->crypto_split_blocks);
794 		} else {
795 			if (task->s.iovcnt > ACCEL_MLX5_MAX_SGE || task->d.iovcnt > ACCEL_MLX5_MAX_SGE) {
796 				uint32_t max_sge_count = spdk_max(task->s.iovcnt, task->d.iovcnt);
797 
798 				assert(SPDK_CEIL_DIV(max_sge_count, ACCEL_MLX5_MAX_SGE) <= UINT16_MAX);
799 				mlx5_task->num_reqs = SPDK_CEIL_DIV(max_sge_count, ACCEL_MLX5_MAX_SGE);
800 				mlx5_task->blocks_per_req = SPDK_CEIL_DIV(mlx5_task->num_blocks, mlx5_task->num_reqs);
801 			} else {
802 				mlx5_task->num_reqs = 1;
803 				mlx5_task->blocks_per_req = mlx5_task->num_blocks;
804 			}
805 		}
806 	} else {
807 		mlx5_task->num_reqs = mlx5_task->num_blocks;
808 		mlx5_task->blocks_per_req = 1;
809 	}
810 
811 	if (spdk_unlikely(!accel_mlx5_task_alloc_mkeys(mlx5_task, dev->crypto_mkeys))) {
812 		/* Pool is empty, queue this task */
813 		SPDK_DEBUGLOG(accel_mlx5, "no reqs in pool, dev %s\n", dev->dev_ctx->context->device->name);
814 		dev->stats.nomem_mkey++;
815 		return -ENOMEM;
816 	}
817 	if (spdk_unlikely(accel_mlx5_dev_get_available_slots(dev, &dev->qp) < 2)) {
818 		/* Queue is full, queue this task */
819 		SPDK_DEBUGLOG(accel_mlx5, "dev %s qp %p is full\n", dev->dev_ctx->context->device->name,
820 			      mlx5_task->qp);
821 		dev->stats.nomem_qdepth++;
822 		return -ENOMEM;
823 	}
824 
825 	SPDK_DEBUGLOG(accel_mlx5, "task %p, src_iovs %u, dst_iovs %u, num_reqs %u, "
826 		      "blocks/req %u, blocks %u, inplace %d\n", task, task->s.iovcnt, task->d.iovcnt,
827 		      mlx5_task->num_reqs, mlx5_task->blocks_per_req, mlx5_task->num_blocks, mlx5_task->inplace);
828 
829 	return 0;
830 }
831 
832 static inline void
833 accel_mlx5_copy_task_complete(struct accel_mlx5_task *mlx5_task)
834 {
835 	spdk_accel_task_complete(&mlx5_task->base, 0);
836 }
837 
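/* Post a single RDMA_WRITE that copies data from up to ACCEL_MLX5_MAX_SGE src iov entries into a
 * contiguous range of the current dst iov entry. */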
838 static inline int
839 accel_mlx5_copy_task_process_one(struct accel_mlx5_task *mlx5_task, struct accel_mlx5_qp *qp,
840 				 uint64_t wrid, uint32_t fence)
841 {
842 	struct spdk_accel_task *task = &mlx5_task->base;
843 	struct accel_mlx5_sge sge;
844 	uint32_t remaining = 0;
845 	uint32_t dst_len;
846 	int rc;
847 
848 	/* Limit a single RDMA_WRITE to the length of the dst buffer. Not all src buffers may fit into one
849 	 * dst buffer due to the ACCEL_MLX5_MAX_SGE limitation. If that is the case, remaining is not zero. */
850 	assert(mlx5_task->dst.iov->iov_len > mlx5_task->dst.iov_offset);
851 	dst_len = mlx5_task->dst.iov->iov_len - mlx5_task->dst.iov_offset;
852 	rc = accel_mlx5_fill_block_sge(qp->dev, sge.src_sge, &mlx5_task->src, dst_len, &remaining,
853 				       task->src_domain, task->src_domain_ctx);
854 	if (spdk_unlikely(rc <= 0)) {
855 		if (rc == 0) {
856 			rc = -EINVAL;
857 		}
858 		SPDK_ERRLOG("failed set src sge, rc %d\n", rc);
859 		return rc;
860 	}
861 	sge.src_sge_count = rc;
862 	assert(dst_len > remaining);
863 	dst_len -= remaining;
864 
865 	rc = accel_mlx5_fill_block_sge(qp->dev, sge.dst_sge, &mlx5_task->dst, dst_len,  &remaining,
866 				       task->dst_domain, task->dst_domain_ctx);
867 	if (spdk_unlikely(rc != 1)) {
868 		/* We use a single dst entry, so any result other than 1 is an error */
869 		if (rc == 0) {
870 			rc = -EINVAL;
871 		}
872 		SPDK_ERRLOG("failed set dst sge, rc %d\n", rc);
873 		return rc;
874 	}
875 	if (spdk_unlikely(remaining)) {
876 		SPDK_ERRLOG("Incorrect dst length, remaining %u\n", remaining);
877 		assert(0);
878 		return -EINVAL;
879 	}
880 
881 	rc = spdk_mlx5_qp_rdma_write(mlx5_task->qp->qp, sge.src_sge, sge.src_sge_count,
882 				     sge.dst_sge[0].addr, sge.dst_sge[0].lkey, wrid, fence);
883 	if (spdk_unlikely(rc)) {
884 		SPDK_ERRLOG("RDMA WRITE failed with %d\n", rc);
885 		return rc;
886 	}
887 	qp->dev->stats.rdma_writes++;
888 
889 	return 0;
890 }
891 
892 static inline int
893 accel_mlx5_copy_task_process(struct accel_mlx5_task *mlx5_task)
894 {
895 
896 	struct accel_mlx5_qp *qp = mlx5_task->qp;
897 	struct accel_mlx5_dev *dev = qp->dev;
898 	uint16_t i;
899 	int rc;
900 
901 	mlx5_task->num_wrs = 0;
902 	assert(mlx5_task->num_reqs > 0);
903 	assert(mlx5_task->num_ops > 0);
904 
905 	/* Handle n-1 reqs in order to simplify wrid and fence handling */
906 	for (i = 0; i < mlx5_task->num_ops - 1; i++) {
907 		rc = accel_mlx5_copy_task_process_one(mlx5_task, qp, 0, 0);
908 		if (spdk_unlikely(rc)) {
909 			return rc;
910 		}
911 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
912 		mlx5_task->num_submitted_reqs++;
913 	}
914 
915 	rc = accel_mlx5_copy_task_process_one(mlx5_task, qp, (uint64_t)mlx5_task,
916 					      SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
917 	if (spdk_unlikely(rc)) {
918 		return rc;
919 	}
920 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, mlx5_task);
921 	mlx5_task->num_submitted_reqs++;
922 	STAILQ_INSERT_TAIL(&qp->in_hw, mlx5_task, link);
923 
924 	SPDK_DEBUGLOG(accel_mlx5, "end, copy task, %p\n", mlx5_task);
925 
926 	return 0;
927 }
928 
929 static inline int
930 accel_mlx5_copy_task_continue(struct accel_mlx5_task *task)
931 {
932 	struct accel_mlx5_qp *qp = task->qp;
933 	struct accel_mlx5_dev *dev = qp->dev;
934 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
935 
936 	task->num_ops = spdk_min(qp_slot, task->num_reqs - task->num_completed_reqs);
937 	if (spdk_unlikely(task->num_ops == 0)) {
938 		STAILQ_INSERT_TAIL(&dev->nomem, task, link);
939 		dev->stats.nomem_qdepth++;
940 		return -ENOMEM;
941 	}
942 	return accel_mlx5_copy_task_process(task);
943 }
944 
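/* Calculate the number of RDMA_WRITE operations needed to copy 'src_iov' into 'dst_iov', given that
 * each operation targets a contiguous range within a single dst entry and uses at most
 * ACCEL_MLX5_MAX_SGE src entries. */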
945 static inline uint32_t
946 accel_mlx5_get_copy_task_count(struct iovec *src_iov, uint32_t src_iovcnt,
947 			       struct iovec *dst_iov, uint32_t dst_iovcnt)
948 {
949 	uint32_t src = 0;
950 	uint32_t dst = 0;
951 	uint64_t src_offset = 0;
952 	uint64_t dst_offset = 0;
953 	uint32_t num_ops = 0;
954 	uint32_t src_sge_count = 0;
955 
956 	while (src < src_iovcnt && dst < dst_iovcnt) {
957 		uint64_t src_len = src_iov[src].iov_len - src_offset;
958 		uint64_t dst_len = dst_iov[dst].iov_len - dst_offset;
959 
960 		if (dst_len < src_len) {
961 			dst_offset = 0;
962 			src_offset += dst_len;
963 			dst++;
964 			num_ops++;
965 			src_sge_count = 0;
966 		} else if (src_len < dst_len) {
967 			dst_offset += src_len;
968 			src_offset = 0;
969 			src++;
970 			if (++src_sge_count >= ACCEL_MLX5_MAX_SGE) {
971 				num_ops++;
972 				src_sge_count = 0;
973 			}
974 		} else {
975 			dst_offset = 0;
976 			src_offset = 0;
977 			dst++;
978 			src++;
979 			num_ops++;
980 			src_sge_count = 0;
981 		}
982 	}
983 
984 	assert(src == src_iovcnt);
985 	assert(dst == dst_iovcnt);
986 	assert(src_offset == 0);
987 	assert(dst_offset == 0);
988 	return num_ops;
989 }
990 
991 static inline int
992 accel_mlx5_copy_task_init(struct accel_mlx5_task *mlx5_task)
993 {
994 	struct spdk_accel_task *task = &mlx5_task->base;
995 	struct accel_mlx5_qp *qp = mlx5_task->qp;
996 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(qp->dev, qp);
997 
998 	if (spdk_likely(task->s.iovcnt <= ACCEL_MLX5_MAX_SGE)) {
999 		mlx5_task->num_reqs = task->d.iovcnt;
1000 	} else if (task->d.iovcnt == 1) {
1001 		mlx5_task->num_reqs = SPDK_CEIL_DIV(task->s.iovcnt, ACCEL_MLX5_MAX_SGE);
1002 	} else {
1003 		mlx5_task->num_reqs = accel_mlx5_get_copy_task_count(task->s.iovs, task->s.iovcnt,
1004 				      task->d.iovs, task->d.iovcnt);
1005 	}
1006 	mlx5_task->inplace = 0;
1007 	accel_mlx5_iov_sgl_init(&mlx5_task->src, task->s.iovs, task->s.iovcnt);
1008 	accel_mlx5_iov_sgl_init(&mlx5_task->dst, task->d.iovs, task->d.iovcnt);
1009 	mlx5_task->num_ops = spdk_min(qp_slot, mlx5_task->num_reqs);
1010 	if (spdk_unlikely(!mlx5_task->num_ops)) {
1011 		qp->dev->stats.nomem_qdepth++;
1012 		return -ENOMEM;
1013 	}
1014 	SPDK_DEBUGLOG(accel_mlx5, "copy task num_reqs %u, num_ops %u\n", mlx5_task->num_reqs,
1015 		      mlx5_task->num_ops);
1016 
1017 	return 0;
1018 }
1019 
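/* Consume up to '*len' bytes from 'iov', updating '*iov_offset' to point into the first entry that
 * is not fully consumed. On return, '*len' holds the bytes that did not fit into 'iovcnt' entries.
 * Returns the number of fully consumed iov entries. */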
1020 static inline uint32_t
1021 accel_mlx5_advance_iovec(struct iovec *iov, uint32_t iovcnt, size_t *iov_offset, size_t *len)
1022 {
1023 	uint32_t i;
1024 	size_t iov_len;
1025 
1026 	for (i = 0; *len != 0 && i < iovcnt; i++) {
1027 		iov_len = iov[i].iov_len - *iov_offset;
1028 
1029 		if (iov_len < *len) {
1030 			*iov_offset = 0;
1031 			*len -= iov_len;
1032 			continue;
1033 		}
1034 		if (iov_len == *len) {
1035 			*iov_offset = 0;
1036 			i++;
1037 		} else { /* iov_len > *len */
1038 			*iov_offset += *len;
1039 		}
1040 		*len = 0;
1041 		break;
1042 	}
1043 
1044 	return i;
1045 }
1046 
1047 static inline void
1048 accel_mlx5_crc_task_complete(struct accel_mlx5_task *mlx5_task)
1049 {
1050 	struct accel_mlx5_dev *dev = mlx5_task->qp->dev;
1051 
1052 	*mlx5_task->base.crc_dst = mlx5_task->psv->crc ^ UINT32_MAX;
1053 	/* Normal task completion without allocated mkeys is not possible */
1054 	assert(mlx5_task->num_ops);
1055 	spdk_mlx5_mkey_pool_put_bulk(dev->sig_mkeys, mlx5_task->mkeys, mlx5_task->num_ops);
1056 	spdk_mempool_put(dev->dev_ctx->psv_pool, mlx5_task->psv);
1057 	spdk_accel_task_complete(&mlx5_task->base, 0);
1058 }
1059 
1060 static inline int
1061 accel_mlx5_crc_task_configure_umr(struct accel_mlx5_task *mlx5_task, struct ibv_sge *sge,
1062 				  uint32_t sge_count, struct spdk_mlx5_mkey_pool_obj *mkey,
1063 				  enum spdk_mlx5_umr_sig_domain sig_domain, uint32_t umr_len,
1064 				  bool sig_init, bool sig_check_gen)
1065 {
1066 	struct spdk_mlx5_umr_sig_attr sattr = {
1067 		.seed = mlx5_task->base.seed ^ UINT32_MAX,
1068 		.psv_index = mlx5_task->psv->psv_index,
1069 		.domain = sig_domain,
1070 		.sigerr_count = mkey->sig.sigerr_count,
1071 		.raw_data_size = umr_len,
1072 		.init = sig_init,
1073 		.check_gen = sig_check_gen,
1074 	};
1075 	struct spdk_mlx5_umr_attr umr_attr = {
1076 		.mkey = mkey->mkey,
1077 		.umr_len = umr_len,
1078 		.sge_count = sge_count,
1079 		.sge = sge,
1080 	};
1081 
1082 	return spdk_mlx5_umr_configure_sig(mlx5_task->qp->qp, &umr_attr, &sattr, 0, 0);
1083 }
1084 
1085 static inline int
1086 accel_mlx5_crc_task_fill_sge(struct accel_mlx5_task *mlx5_task, struct accel_mlx5_sge *sge)
1087 {
1088 	struct spdk_accel_task *task = &mlx5_task->base;
1089 	struct accel_mlx5_qp *qp = mlx5_task->qp;
1090 	struct accel_mlx5_dev *dev = qp->dev;
1091 	uint32_t remaining;
1092 	int rc;
1093 
1094 	rc = accel_mlx5_fill_block_sge(dev, sge->src_sge, &mlx5_task->src, task->nbytes, &remaining,
1095 				       task->src_domain, task->src_domain_ctx);
1096 	if (spdk_unlikely(rc <= 0)) {
1097 		if (rc == 0) {
1098 			rc = -EINVAL;
1099 		}
1100 		SPDK_ERRLOG("failed set src sge, rc %d\n", rc);
1101 		return rc;
1102 	}
1103 	assert(remaining == 0);
1104 	sge->src_sge_count = rc;
1105 
1106 	if (!mlx5_task->inplace) {
1107 		rc = accel_mlx5_fill_block_sge(dev, sge->dst_sge, &mlx5_task->dst, task->nbytes, &remaining,
1108 					       task->dst_domain, task->dst_domain_ctx);
1109 		if (spdk_unlikely(rc <= 0)) {
1110 			if (rc == 0) {
1111 				rc = -EINVAL;
1112 			}
1113 			SPDK_ERRLOG("failed set dst sge, rc %d\n", rc);
1114 			return rc;
1115 		}
1116 		assert(remaining == 0);
1117 		sge->dst_sge_count = rc;
1118 	}
1119 
1120 	return 0;
1121 }
1122 
1123 static inline int
1124 accel_mlx5_crc_task_process_one_req(struct accel_mlx5_task *mlx5_task)
1125 {
1126 	struct accel_mlx5_sge sges;
1127 	struct accel_mlx5_qp *qp = mlx5_task->qp;
1128 	struct accel_mlx5_dev *dev = qp->dev;
1129 	uint32_t num_ops = spdk_min(mlx5_task->num_reqs - mlx5_task->num_completed_reqs,
1130 				    mlx5_task->num_ops);
1131 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
1132 	uint32_t rdma_fence = SPDK_MLX5_WQE_CTRL_STRONG_ORDERING;
1133 	struct ibv_sge *sge;
1134 	int rc;
1135 	uint16_t sge_count;
1136 
1137 	num_ops = spdk_min(num_ops, qp_slot >> 1);
1138 	if (spdk_unlikely(!num_ops)) {
1139 		return -EINVAL;
1140 	}
1141 
1142 	mlx5_task->num_wrs = 0;
1143 	/* At this moment we have as many requests as can be submitted to a qp */
1144 	rc = accel_mlx5_crc_task_fill_sge(mlx5_task, &sges);
1145 	if (spdk_unlikely(rc)) {
1146 		return rc;
1147 	}
1148 	rc = accel_mlx5_crc_task_configure_umr(mlx5_task, sges.src_sge, sges.src_sge_count,
1149 					       mlx5_task->mkeys[0], SPDK_MLX5_UMR_SIG_DOMAIN_WIRE, mlx5_task->base.nbytes, true, true);
1150 	if (spdk_unlikely(rc)) {
1151 		SPDK_ERRLOG("UMR configure failed with %d\n", rc);
1152 		return rc;
1153 	}
1154 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1155 	dev->stats.sig_umrs++;
1156 
1157 	if (mlx5_task->inplace) {
1158 		sge = sges.src_sge;
1159 		sge_count = sges.src_sge_count;
1160 	} else {
1161 		sge = sges.dst_sge;
1162 		sge_count = sges.dst_sge_count;
1163 	}
1164 
1165 	/*
1166 	 * Add the crc destination to the end of sges. A free entry must be available for CRC
1167 	 * because the task init function reserved it.
1168 	 */
1169 	assert(sge_count < ACCEL_MLX5_MAX_SGE);
1170 	sge[sge_count].lkey = mlx5_task->psv->crc_lkey;
1171 	sge[sge_count].addr = (uintptr_t)&mlx5_task->psv->crc;
1172 	sge[sge_count++].length = sizeof(uint32_t);
1173 
1174 	if (spdk_unlikely(mlx5_task->psv->bits.error)) {
1175 		rc = spdk_mlx5_qp_set_psv(qp->qp, mlx5_task->psv->psv_index, *mlx5_task->base.crc_dst, 0, 0);
1176 		if (spdk_unlikely(rc)) {
1177 			SPDK_ERRLOG("SET_PSV failed with %d\n", rc);
1178 			return rc;
1179 		}
1180 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1181 	}
1182 
1183 	rc = spdk_mlx5_qp_rdma_read(qp->qp, sge, sge_count, 0, mlx5_task->mkeys[0]->mkey,
1184 				    (uint64_t)mlx5_task, rdma_fence | SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
1185 	if (spdk_unlikely(rc)) {
1186 		SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
1187 		return rc;
1188 	}
1189 	mlx5_task->num_submitted_reqs++;
1190 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, mlx5_task);
1191 	dev->stats.rdma_reads++;
1192 
1193 	return 0;
1194 }
1195 
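/* Build UMR sges from 'umr_iovs', stopping when either ACCEL_MLX5_MAX_SGE UMR entries are filled or
 * the covered range would span more than ACCEL_MLX5_MAX_SGE entries of 'rdma_iovs', so that the
 * paired RDMA operation can describe the same range. Both sgls are advanced by the produced length,
 * which is stored in '*len'. Returns the number of filled sge entries. */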
1196 static inline int
1197 accel_mlx5_crc_task_fill_umr_sge(struct accel_mlx5_qp *qp, struct ibv_sge *sge,
1198 				 struct accel_mlx5_iov_sgl *umr_iovs, struct spdk_memory_domain *domain,
1199 				 void *domain_ctx, struct accel_mlx5_iov_sgl *rdma_iovs, size_t *len)
1200 {
1201 	int umr_idx = 0;
1202 	int rdma_idx = 0;
1203 	int umr_iovcnt = spdk_min(umr_iovs->iovcnt, (int)ACCEL_MLX5_MAX_SGE);
1204 	int rdma_iovcnt = spdk_min(rdma_iovs->iovcnt, (int)ACCEL_MLX5_MAX_SGE);
1205 	size_t umr_iov_offset;
1206 	size_t rdma_iov_offset;
1207 	size_t umr_len = 0;
1208 	void *sge_addr;
1209 	size_t sge_len;
1210 	size_t umr_sge_len;
1211 	size_t rdma_sge_len;
1212 	int rc;
1213 
1214 	umr_iov_offset = umr_iovs->iov_offset;
1215 	rdma_iov_offset = rdma_iovs->iov_offset;
1216 
1217 	while (umr_idx < umr_iovcnt && rdma_idx < rdma_iovcnt) {
1218 		umr_sge_len = umr_iovs->iov[umr_idx].iov_len - umr_iov_offset;
1219 		rdma_sge_len = rdma_iovs->iov[rdma_idx].iov_len - rdma_iov_offset;
1220 		sge_addr = umr_iovs->iov[umr_idx].iov_base + umr_iov_offset;
1221 
1222 		if (umr_sge_len == rdma_sge_len) {
1223 			rdma_idx++;
1224 			umr_iov_offset = 0;
1225 			rdma_iov_offset = 0;
1226 			sge_len = umr_sge_len;
1227 		} else if (umr_sge_len < rdma_sge_len) {
1228 			umr_iov_offset = 0;
1229 			rdma_iov_offset += umr_sge_len;
1230 			sge_len = umr_sge_len;
1231 		} else {
1232 			size_t remaining;
1233 
1234 			remaining = umr_sge_len - rdma_sge_len;
1235 			while (remaining) {
1236 				rdma_idx++;
1237 				if (rdma_idx == (int)ACCEL_MLX5_MAX_SGE) {
1238 					break;
1239 				}
1240 				rdma_sge_len = rdma_iovs->iov[rdma_idx].iov_len;
1241 				if (remaining == rdma_sge_len) {
1242 					rdma_idx++;
1243 					rdma_iov_offset = 0;
1244 					umr_iov_offset = 0;
1245 					remaining = 0;
1246 					break;
1247 				}
1248 				if (remaining < rdma_sge_len) {
1249 					rdma_iov_offset = remaining;
1250 					umr_iov_offset = 0;
1251 					remaining = 0;
1252 					break;
1253 				}
1254 				remaining -= rdma_sge_len;
1255 			}
1256 			sge_len = umr_sge_len - remaining;
1257 		}
1258 		rc = accel_mlx5_translate_addr(sge_addr, sge_len, domain, domain_ctx, qp->dev, &sge[umr_idx]);
1259 		if (spdk_unlikely(rc)) {
1260 			return -EINVAL;
1261 		}
1262 		SPDK_DEBUGLOG(accel_mlx5, "\t sge[%d] lkey %u, addr %p, len %u\n", umr_idx, sge[umr_idx].lkey,
1263 			      (void *)sge[umr_idx].addr, sge[umr_idx].length);
1264 		umr_len += sge_len;
1265 		umr_idx++;
1266 	}
1267 	accel_mlx5_iov_sgl_advance(umr_iovs, umr_len);
1268 	accel_mlx5_iov_sgl_advance(rdma_iovs, umr_len);
1269 	*len = umr_len;
1270 
1271 	return umr_idx;
1272 }
1273 
1274 static inline int
1275 accel_mlx5_crc_task_process_multi_req(struct accel_mlx5_task *mlx5_task)
1276 {
1277 	size_t umr_len[ACCEL_MLX5_MAX_MKEYS_IN_TASK];
1278 	struct ibv_sge sges[ACCEL_MLX5_MAX_SGE];
1279 	struct spdk_accel_task *task = &mlx5_task->base;
1280 	struct accel_mlx5_qp *qp = mlx5_task->qp;
1281 	struct accel_mlx5_dev *dev = qp->dev;
1282 	struct accel_mlx5_iov_sgl umr_sgl;
1283 	struct accel_mlx5_iov_sgl *umr_sgl_ptr;
1284 	struct accel_mlx5_iov_sgl rdma_sgl;
1285 	uint64_t umr_offset;
1286 	uint32_t rdma_fence = SPDK_MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
1287 	int sge_count;
1288 	uint32_t remaining;
1289 	int rc;
1290 	uint16_t i;
1291 	uint16_t num_ops = spdk_min(mlx5_task->num_reqs - mlx5_task->num_completed_reqs,
1292 				    mlx5_task->num_ops);
1293 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
1294 	bool sig_init, sig_check_gen = false;
1295 
1296 	num_ops = spdk_min(num_ops, qp_slot >> 1);
1297 	if (spdk_unlikely(!num_ops)) {
1298 		return -EINVAL;
1299 	}
1300 	/* Init signature on the first UMR */
1301 	sig_init = !mlx5_task->num_submitted_reqs;
1302 
1303 	/*
1304 	 * accel_mlx5_crc_task_fill_umr_sge() and accel_mlx5_fill_block_sge() advance an IOV during iteration
1305 	 * on it. We must copy accel_mlx5_iov_sgl to iterate twice or more on the same IOV.
1306 	 *
1307 	 * In the in-place case, we iterate on the source IOV three times. That's why we need two copies of
1308 	 * the source accel_mlx5_iov_sgl.
1309 	 *
1310 	 * In the out-of-place case, we iterate on the source IOV once and on the destination IOV two times.
1311 	 * So, we need one copy of the destination accel_mlx5_iov_sgl.
1312 	 */
1313 	if (mlx5_task->inplace) {
1314 		accel_mlx5_iov_sgl_init(&umr_sgl, mlx5_task->src.iov, mlx5_task->src.iovcnt);
1315 		umr_sgl_ptr = &umr_sgl;
1316 		accel_mlx5_iov_sgl_init(&rdma_sgl, mlx5_task->src.iov, mlx5_task->src.iovcnt);
1317 	} else {
1318 		umr_sgl_ptr = &mlx5_task->src;
1319 		accel_mlx5_iov_sgl_init(&rdma_sgl, mlx5_task->dst.iov, mlx5_task->dst.iovcnt);
1320 	}
1321 	mlx5_task->num_wrs = 0;
1322 	for (i = 0; i < num_ops; i++) {
1323 		/*
1324 		 * The last request may have only CRC. Skip UMR in this case because the MKey from
1325 		 * the previous request is used.
1326 		 */
1327 		if (umr_sgl_ptr->iovcnt == 0) {
1328 			assert((mlx5_task->num_completed_reqs + i + 1) == mlx5_task->num_reqs);
1329 			break;
1330 		}
1331 		sge_count = accel_mlx5_crc_task_fill_umr_sge(qp, sges, umr_sgl_ptr, task->src_domain,
1332 				task->src_domain_ctx, &rdma_sgl, &umr_len[i]);
1333 		if (spdk_unlikely(sge_count <= 0)) {
1334 			rc = (sge_count == 0) ? -EINVAL : sge_count;
1335 			SPDK_ERRLOG("failed set UMR sge, rc %d\n", rc);
1336 			return rc;
1337 		}
1338 		if (umr_sgl_ptr->iovcnt == 0) {
1339 			/*
1340 			 * We post RDMA without UMR if the last request has only CRC. We use an MKey from
1341 			 * the last UMR in this case. Since the last request can be postponed to the next
1342 			 * call of this function, we must save the MKey to the task structure.
1343 			 */
1344 			mlx5_task->last_umr_len = umr_len[i];
1345 			mlx5_task->last_mkey_idx = i;
1346 			sig_check_gen = true;
1347 		}
1348 		rc = accel_mlx5_crc_task_configure_umr(mlx5_task, sges, sge_count, mlx5_task->mkeys[i],
1349 						       SPDK_MLX5_UMR_SIG_DOMAIN_WIRE, umr_len[i], sig_init,
1350 						       sig_check_gen);
1351 		if (spdk_unlikely(rc)) {
1352 			SPDK_ERRLOG("UMR configure failed with %d\n", rc);
1353 			return rc;
1354 		}
1355 		sig_init = false;
1356 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1357 		dev->stats.sig_umrs++;
1358 	}
1359 
1360 	if (spdk_unlikely(mlx5_task->psv->bits.error)) {
1361 		rc = spdk_mlx5_qp_set_psv(qp->qp, mlx5_task->psv->psv_index, *mlx5_task->base.crc_dst, 0, 0);
1362 		if (spdk_unlikely(rc)) {
1363 			SPDK_ERRLOG("SET_PSV failed with %d\n", rc);
1364 			return rc;
1365 		}
1366 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1367 	}
1368 
1369 	for (i = 0; i < num_ops - 1; i++) {
1370 		if (mlx5_task->inplace) {
1371 			sge_count = accel_mlx5_fill_block_sge(dev, sges, &mlx5_task->src, umr_len[i], &remaining,
1372 							      task->src_domain, task->src_domain_ctx);
1373 		} else {
1374 			sge_count = accel_mlx5_fill_block_sge(dev, sges, &mlx5_task->dst, umr_len[i], &remaining,
1375 							      task->dst_domain, task->dst_domain_ctx);
1376 		}
1377 		if (spdk_unlikely(sge_count <= 0)) {
1378 			rc = (sge_count == 0) ? -EINVAL : sge_count;
1379 			SPDK_ERRLOG("failed set RDMA sge, rc %d\n", rc);
1380 			return rc;
1381 		}
1382 		rc = spdk_mlx5_qp_rdma_read(qp->qp, sges, sge_count, 0, mlx5_task->mkeys[i]->mkey,
1383 					    0, rdma_fence);
1384 		if (spdk_unlikely(rc)) {
1385 			SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
1386 			return rc;
1387 		}
1388 		mlx5_task->num_submitted_reqs++;
1389 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1390 		dev->stats.rdma_reads++;
1391 		rdma_fence = SPDK_MLX5_WQE_CTRL_STRONG_ORDERING;
1392 	}
1393 	if ((mlx5_task->inplace && mlx5_task->src.iovcnt == 0) || (!mlx5_task->inplace &&
1394 			mlx5_task->dst.iovcnt == 0)) {
1395 		/*
1396 		 * The last RDMA does not carry any data, only the CRC. It also does not have a paired MKey;
1397 		 * the CRC is handled by the MKey from the previous UMR in this case.
1398 		 */
1399 		sge_count = 0;
1400 		umr_offset = mlx5_task->last_umr_len;
1401 	} else {
1402 		umr_offset = 0;
1403 		mlx5_task->last_mkey_idx = i;
1404 		if (mlx5_task->inplace) {
1405 			sge_count = accel_mlx5_fill_block_sge(dev, sges, &mlx5_task->src, umr_len[i], &remaining,
1406 							      task->src_domain, task->src_domain_ctx);
1407 		} else {
1408 			sge_count = accel_mlx5_fill_block_sge(dev, sges, &mlx5_task->dst, umr_len[i], &remaining,
1409 							      task->dst_domain, task->dst_domain_ctx);
1410 		}
1411 		if (spdk_unlikely(sge_count <= 0)) {
1412 			rc = (sge_count == 0) ? -EINVAL : sge_count;
1413 			SPDK_ERRLOG("failed set RDMA sge, rc %d\n", rc);
1414 			return rc;
1415 		}
1416 		assert(remaining == 0);
1417 	}
1418 	if ((mlx5_task->num_completed_reqs + i + 1) == mlx5_task->num_reqs) {
1419 		/* Ensure that there is a free sge for the CRC destination. */
1420 		assert(sge_count < (int)ACCEL_MLX5_MAX_SGE);
1421 		/* Add the crc destination to the end of sges. */
1422 		sges[sge_count].lkey = mlx5_task->psv->crc_lkey;
1423 		sges[sge_count].addr = (uintptr_t)&mlx5_task->psv->crc;
1424 		sges[sge_count++].length = sizeof(uint32_t);
1425 	}
1426 	rdma_fence |= SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE;
1427 	rc = spdk_mlx5_qp_rdma_read(qp->qp, sges, sge_count, umr_offset,
1428 				    mlx5_task->mkeys[mlx5_task->last_mkey_idx]->mkey,
1429 				    (uint64_t)mlx5_task, rdma_fence);
1430 	if (spdk_unlikely(rc)) {
1431 		SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
1432 		return rc;
1433 	}
1434 	mlx5_task->num_submitted_reqs++;
1435 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, mlx5_task);
1436 	dev->stats.rdma_reads++;
1437 
1438 	return 0;
1439 }
1440 
1441 static inline int
1442 accel_mlx5_crc_task_process(struct accel_mlx5_task *mlx5_task)
1443 {
1444 	int rc;
1445 
1446 	assert(mlx5_task->mlx5_opcode == ACCEL_MLX5_OPC_CRC32C);
1447 
1448 	SPDK_DEBUGLOG(accel_mlx5, "begin, crc task, %p, reqs: total %u, submitted %u, completed %u\n",
1449 		      mlx5_task, mlx5_task->num_reqs, mlx5_task->num_submitted_reqs, mlx5_task->num_completed_reqs);
1450 
1451 	if (mlx5_task->num_reqs == 1) {
1452 		rc = accel_mlx5_crc_task_process_one_req(mlx5_task);
1453 	} else {
1454 		rc = accel_mlx5_crc_task_process_multi_req(mlx5_task);
1455 	}
1456 
1457 	if (rc == 0) {
1458 		STAILQ_INSERT_TAIL(&mlx5_task->qp->in_hw, mlx5_task, link);
1459 		SPDK_DEBUGLOG(accel_mlx5, "end, crc task, %p, reqs: total %u, submitted %u, completed %u\n",
1460 			      mlx5_task, mlx5_task->num_reqs, mlx5_task->num_submitted_reqs,
1461 			      mlx5_task->num_completed_reqs);
1462 	}
1463 
1464 	return rc;
1465 }
1466 
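/* Allocate signature mkeys and a PSV for the task. If the PSV is in the error state, an extra
 * SET_PSV WQE is needed to reset it, so also check that the qp has enough free slots; otherwise
 * release the allocated resources and return -ENOMEM. */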
1467 static inline int
1468 accel_mlx5_task_alloc_crc_ctx(struct accel_mlx5_task *task, uint32_t qp_slot)
1469 {
1470 	struct accel_mlx5_qp *qp = task->qp;
1471 	struct accel_mlx5_dev *dev = qp->dev;
1472 
1473 	if (spdk_unlikely(!accel_mlx5_task_alloc_mkeys(task, dev->sig_mkeys))) {
1474 		SPDK_DEBUGLOG(accel_mlx5, "no mkeys in signature mkey pool, dev %s\n",
1475 			      dev->dev_ctx->context->device->name);
1476 		dev->stats.nomem_mkey++;
1477 		return -ENOMEM;
1478 	}
1479 	task->psv = spdk_mempool_get(dev->dev_ctx->psv_pool);
1480 	if (spdk_unlikely(!task->psv)) {
1481 		SPDK_DEBUGLOG(accel_mlx5, "no reqs in psv pool, dev %s\n", dev->dev_ctx->context->device->name);
1482 		spdk_mlx5_mkey_pool_put_bulk(dev->sig_mkeys, task->mkeys, task->num_ops);
1483 		task->num_ops = 0;
1484 		dev->stats.nomem_mkey++;
1485 		return -ENOMEM;
1486 	}
1487 	/* One extra slot is needed for SET_PSV WQE to reset the error state in PSV. */
1488 	if (spdk_unlikely(task->psv->bits.error)) {
1489 		uint32_t n_slots = task->num_ops * 2 + 1;
1490 
1491 		if (qp_slot < n_slots) {
1492 			spdk_mempool_put(dev->dev_ctx->psv_pool, task->psv);
1493 			spdk_mlx5_mkey_pool_put_bulk(dev->sig_mkeys, task->mkeys, task->num_ops);
1494 			dev->stats.nomem_qdepth++;
1495 			task->num_ops = 0;
1496 			return -ENOMEM;
1497 		}
1498 	}
1499 
1500 	return 0;
1501 }
1502 
1503 static inline int
1504 accel_mlx5_crc_task_continue(struct accel_mlx5_task *task)
1505 {
1506 	struct accel_mlx5_qp *qp = task->qp;
1507 	struct accel_mlx5_dev *dev = qp->dev;
1508 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
1509 	int rc;
1510 
1511 	assert(task->num_reqs > task->num_completed_reqs);
1512 	if (task->num_ops == 0) {
1513 		/* No mkeys allocated, try to allocate now. */
1514 		rc = accel_mlx5_task_alloc_crc_ctx(task, qp_slot);
1515 		if (spdk_unlikely(rc)) {
1516 			STAILQ_INSERT_TAIL(&dev->nomem, task, link);
1517 			return -ENOMEM;
1518 		}
1519 	}
1520 	/* We need to post at least 1 UMR and 1 RDMA operation */
1521 	if (spdk_unlikely(qp_slot < 2)) {
1522 		STAILQ_INSERT_TAIL(&dev->nomem, task, link);
1523 		dev->stats.nomem_qdepth++;
1524 		return -ENOMEM;
1525 	}
1526 
1527 	return accel_mlx5_crc_task_process(task);
1528 }
1529 
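/* Calculate the number of UMR + RDMA operations needed for an out-of-place CRC task so that each
 * operation fits into ACCEL_MLX5_MAX_SGE entries on both src and dst sides, with one dst entry kept
 * in reserve for the CRC value. */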
1530 static inline uint32_t
1531 accel_mlx5_get_crc_task_count(struct iovec *src_iov, uint32_t src_iovcnt, struct iovec *dst_iov,
1532 			      uint32_t dst_iovcnt)
1533 {
1534 	uint32_t src_idx = 0;
1535 	uint32_t dst_idx = 0;
1536 	uint32_t num_ops = 1;
1537 	uint32_t num_src_sge = 1;
1538 	uint32_t num_dst_sge = 1;
1539 	size_t src_offset = 0;
1540 	size_t dst_offset = 0;
1541 	uint32_t num_sge;
1542 	size_t src_len;
1543 	size_t dst_len;
1544 
1545 	/* One operation is enough if both iovs fit into ACCEL_MLX5_MAX_SGE. One SGE is reserved for CRC on dst_iov. */
1546 	if (src_iovcnt <= ACCEL_MLX5_MAX_SGE && (dst_iovcnt + 1) <= ACCEL_MLX5_MAX_SGE) {
1547 		return 1;
1548 	}
1549 
1550 	while (src_idx < src_iovcnt && dst_idx < dst_iovcnt) {
1551 		if (num_src_sge > ACCEL_MLX5_MAX_SGE || num_dst_sge > ACCEL_MLX5_MAX_SGE) {
1552 			num_ops++;
1553 			num_src_sge = 1;
1554 			num_dst_sge = 1;
1555 		}
1556 		src_len = src_iov[src_idx].iov_len - src_offset;
1557 		dst_len = dst_iov[dst_idx].iov_len - dst_offset;
1558 
1559 		if (src_len == dst_len) {
1560 			num_src_sge++;
1561 			num_dst_sge++;
1562 			src_offset = 0;
1563 			dst_offset = 0;
1564 			src_idx++;
1565 			dst_idx++;
1566 			continue;
1567 		}
1568 		if (src_len < dst_len) {
1569 			/* Advance src_iov to reach the point that corresponds to the end of the current dst_iov. */
1570 			num_sge = accel_mlx5_advance_iovec(&src_iov[src_idx],
1571 							   spdk_min(ACCEL_MLX5_MAX_SGE + 1 - num_src_sge,
1572 									   src_iovcnt - src_idx),
1573 							   &src_offset, &dst_len);
1574 			src_idx += num_sge;
1575 			num_src_sge += num_sge;
1576 			if (dst_len != 0) {
1577 				/*
1578 				 * ACCEL_MLX5_MAX_SGE is reached on src_iov, and dst_len bytes
1579 				 * are left on the current dst_iov.
1580 				 */
1581 				dst_offset = dst_iov[dst_idx].iov_len - dst_len;
1582 			} else {
1583 				/* The src_iov advance is completed, shift to the next dst_iov. */
1584 				dst_idx++;
1585 				num_dst_sge++;
1586 				dst_offset = 0;
1587 			}
1588 		} else { /* src_len > dst_len */
1589 			/* Advance dst_iov to reach the point that corresponds to the end of the current src_iov. */
1590 			num_sge = accel_mlx5_advance_iovec(&dst_iov[dst_idx],
1591 							   spdk_min(ACCEL_MLX5_MAX_SGE + 1 - num_dst_sge,
1592 									   dst_iovcnt - dst_idx),
1593 							   &dst_offset, &src_len);
1594 			dst_idx += num_sge;
1595 			num_dst_sge += num_sge;
1596 			if (src_len != 0) {
1597 				/*
1598 				 * ACCEL_MLX5_MAX_SGE is reached on dst_iov, and src_len bytes
1599 				 * are left on the current src_iov.
1600 				 */
1601 				src_offset = src_iov[src_idx].iov_len - src_len;
1602 			} else {
1603 				/* The dst_iov advance is completed, shift to the next src_iov. */
1604 				src_idx++;
1605 				num_src_sge++;
1606 				src_offset = 0;
1607 			}
1608 		}
1609 	}
1610 	/* An extra operation is needed if no space is left on dst_iov because CRC takes one SGE. */
1611 	if (num_dst_sge > ACCEL_MLX5_MAX_SGE) {
1612 		num_ops++;
1613 	}
1614 
1615 	/* The loop above must reach the end of both iovs at the same time because their total lengths are equal. */
1616 	assert(src_idx == src_iovcnt);
1617 	assert(dst_idx == dst_iovcnt);
1618 	assert(src_offset == 0);
1619 	assert(dst_offset == 0);
1620 
1621 	return num_ops;
1622 }
1623 
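/*
 * First-time setup of a CRC32C/COPY_CRC32C task: compute the number of
 * requests needed to cover the iovs, allocate mkeys and a PSV, and return
 * -ENOMEM if the QP cannot currently fit one UMR and one RDMA operation.
 */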
1624 static inline int
1625 accel_mlx5_crc_task_init(struct accel_mlx5_task *mlx5_task)
1626 {
1627 	struct spdk_accel_task *task = &mlx5_task->base;
1628 	struct accel_mlx5_qp *qp = mlx5_task->qp;
1629 	uint32_t qp_slot = accel_mlx5_dev_get_available_slots(qp->dev, qp);
1630 	int rc;
1631 
1632 	accel_mlx5_iov_sgl_init(&mlx5_task->src, task->s.iovs, task->s.iovcnt);
1633 	if (mlx5_task->inplace) {
1634 		/* One entry is reserved for CRC */
1635 		mlx5_task->num_reqs = SPDK_CEIL_DIV(mlx5_task->src.iovcnt + 1, ACCEL_MLX5_MAX_SGE);
1636 	} else {
1637 		accel_mlx5_iov_sgl_init(&mlx5_task->dst, task->d.iovs, task->d.iovcnt);
1638 		mlx5_task->num_reqs = accel_mlx5_get_crc_task_count(mlx5_task->src.iov, mlx5_task->src.iovcnt,
1639 				      mlx5_task->dst.iov, mlx5_task->dst.iovcnt);
1640 	}
1641 
1642 	rc = accel_mlx5_task_alloc_crc_ctx(mlx5_task, qp_slot);
1643 	if (spdk_unlikely(rc)) {
1644 		return rc;
1645 	}
1646 
1647 	if (spdk_unlikely(qp_slot < 2)) {
1648 		/* Queue is full; return -ENOMEM so the caller queues this task for retry */
1649 		SPDK_DEBUGLOG(accel_mlx5, "dev %s qp %p is full\n", qp->dev->dev_ctx->context->device->name,
1650 			      mlx5_task->qp);
1651 		qp->dev->stats.nomem_qdepth++;
1652 		return -ENOMEM;
1653 	}
1654 	return 0;
1655 }
1656 
1657 static int
1658 accel_mlx5_task_op_not_implemented(struct accel_mlx5_task *mlx5_task)
1659 {
1660 	SPDK_ERRLOG("wrong function called\n");
1661 	SPDK_UNREACHABLE();
1662 }
1663 
1664 static void
1665 accel_mlx5_task_op_not_implemented_v(struct accel_mlx5_task *mlx5_task)
1666 {
1667 	SPDK_ERRLOG("wrong function called\n");
1668 	SPDK_UNREACHABLE();
1669 }
1670 
1671 static int
1672 accel_mlx5_task_op_not_supported(struct accel_mlx5_task *mlx5_task)
1673 {
1674 	SPDK_ERRLOG("Unsupported opcode %d\n", mlx5_task->base.op_code);
1675 
1676 	return -ENOTSUP;
1677 }
1678 
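/*
 * Per-opcode dispatch table: init is called once at submission, process posts
 * the WQEs, cont resumes a partially completed task and complete finishes it.
 * The ACCEL_MLX5_OPC_LAST entry is a guard for unsupported opcodes.
 */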
1679 static struct accel_mlx5_task_operations g_accel_mlx5_tasks_ops[] = {
1680 	[ACCEL_MLX5_OPC_COPY] = {
1681 		.init = accel_mlx5_copy_task_init,
1682 		.process = accel_mlx5_copy_task_process,
1683 		.cont = accel_mlx5_copy_task_continue,
1684 		.complete = accel_mlx5_copy_task_complete,
1685 	},
1686 	[ACCEL_MLX5_OPC_CRYPTO] = {
1687 		.init = accel_mlx5_crypto_task_init,
1688 		.process = accel_mlx5_crypto_task_process,
1689 		.cont = accel_mlx5_crypto_task_continue,
1690 		.complete = accel_mlx5_crypto_task_complete,
1691 	},
1692 	[ACCEL_MLX5_OPC_CRC32C] = {
1693 		.init = accel_mlx5_crc_task_init,
1694 		.process = accel_mlx5_crc_task_process,
1695 		.cont = accel_mlx5_crc_task_continue,
1696 		.complete = accel_mlx5_crc_task_complete,
1697 	},
1698 	[ACCEL_MLX5_OPC_LAST] = {
1699 		.init = accel_mlx5_task_op_not_supported,
1700 		.process = accel_mlx5_task_op_not_implemented,
1701 		.cont = accel_mlx5_task_op_not_implemented,
1702 		.complete = accel_mlx5_task_op_not_implemented_v
1703 	},
1704 };
1705 
1706 static inline void
1707 accel_mlx5_task_complete(struct accel_mlx5_task *task)
1708 {
1709 	assert(task->num_reqs == task->num_completed_reqs);
1710 	SPDK_DEBUGLOG(accel_mlx5, "Complete task %p, opc %d\n", task, task->base.op_code);
1711 
1712 	g_accel_mlx5_tasks_ops[task->mlx5_opcode].complete(task);
1713 }
1714 
1715 static inline int
1716 accel_mlx5_task_continue(struct accel_mlx5_task *task)
1717 {
1718 	struct accel_mlx5_qp *qp = task->qp;
1719 	struct accel_mlx5_dev *dev = qp->dev;
1720 
1721 	if (spdk_unlikely(qp->recovering)) {
1722 		STAILQ_INSERT_TAIL(&dev->nomem, task, link);
1723 		return 0;
1724 	}
1725 
1726 	return g_accel_mlx5_tasks_ops[task->mlx5_opcode].cont(task);
1727 }

1728 static inline void
1729 accel_mlx5_task_init_opcode(struct accel_mlx5_task *mlx5_task)
1730 {
1731 	uint8_t base_opcode = mlx5_task->base.op_code;
1732 
1733 	switch (base_opcode) {
1734 	case SPDK_ACCEL_OPC_COPY:
1735 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_COPY;
1736 		break;
1737 	case SPDK_ACCEL_OPC_ENCRYPT:
1738 		assert(g_accel_mlx5.crypto_supported);
1739 		mlx5_task->enc_order = SPDK_MLX5_ENCRYPTION_ORDER_ENCRYPTED_RAW_WIRE;
1740 		mlx5_task->mlx5_opcode =  ACCEL_MLX5_OPC_CRYPTO;
1741 		break;
1742 	case SPDK_ACCEL_OPC_DECRYPT:
1743 		assert(g_accel_mlx5.crypto_supported);
1744 		mlx5_task->enc_order = SPDK_MLX5_ENCRYPTION_ORDER_ENCRYPTED_RAW_MEMORY;
1745 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_CRYPTO;
1746 		break;
1747 	case SPDK_ACCEL_OPC_CRC32C:
1748 		mlx5_task->inplace = 1;
1749 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_CRC32C;
1750 		break;
1751 	case SPDK_ACCEL_OPC_COPY_CRC32C:
1752 		mlx5_task->inplace = 0;
1753 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_CRC32C;
1754 		break;
1755 	default:
1756 		SPDK_ERRLOG("wrong opcode %d\n", base_opcode);
1757 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_LAST;
1758 	}
1759 }
1760 
1761 static inline void
1762 accel_mlx5_task_reset(struct accel_mlx5_task *mlx5_task)
1763 {
1764 	mlx5_task->num_completed_reqs = 0;
1765 	mlx5_task->num_submitted_reqs = 0;
1766 	mlx5_task->num_ops = 0;
1767 	mlx5_task->num_processed_blocks = 0;
1768 	mlx5_task->raw = 0;
1769 }
1770 
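/*
 * Accel framework entry point: pick the next device of the channel in
 * round-robin fashion, map the generic opcode to an mlx5 opcode and run the
 * task's init/process callbacks. Tasks that hit -ENOMEM or a recovering QP are
 * parked on the per-device nomem list and resubmitted from the poller.
 */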
1771 static int
1772 accel_mlx5_submit_tasks(struct spdk_io_channel *_ch, struct spdk_accel_task *task)
1773 {
1774 	struct accel_mlx5_io_channel *ch = spdk_io_channel_get_ctx(_ch);
1775 	struct accel_mlx5_task *mlx5_task = SPDK_CONTAINEROF(task, struct accel_mlx5_task, base);
1776 	struct accel_mlx5_dev *dev;
1777 	int rc;
1778 
1779 	/* We should not receive any tasks if the module was not enabled */
1780 	assert(g_accel_mlx5.enabled);
1781 
1782 	dev = &ch->devs[ch->dev_idx];
1783 	ch->dev_idx++;
1784 	if (ch->dev_idx == ch->num_devs) {
1785 		ch->dev_idx = 0;
1786 	}
1787 
1788 	mlx5_task->qp = &dev->qp;
1789 	accel_mlx5_task_reset(mlx5_task);
1790 	accel_mlx5_task_init_opcode(mlx5_task);
1791 
1792 	dev->stats.opcodes[mlx5_task->mlx5_opcode]++;
1793 	rc = g_accel_mlx5_tasks_ops[mlx5_task->mlx5_opcode].init(mlx5_task);
1794 	if (spdk_unlikely(rc)) {
1795 		if (rc == -ENOMEM) {
1796 			SPDK_DEBUGLOG(accel_mlx5, "no reqs to handle new task %p (required %u), put to queue\n", mlx5_task,
1797 				      mlx5_task->num_reqs);
1798 			STAILQ_INSERT_TAIL(&dev->nomem, mlx5_task, link);
1799 			return 0;
1800 		}
1801 		SPDK_ERRLOG("Task opc %d init failed, rc %d\n", task->op_code, rc);
1802 		return rc;
1803 	}
1804 
1805 	if (spdk_unlikely(mlx5_task->qp->recovering)) {
1806 		STAILQ_INSERT_TAIL(&dev->nomem, mlx5_task, link);
1807 		return 0;
1808 	}
1809 
1810 	return g_accel_mlx5_tasks_ops[mlx5_task->mlx5_opcode].process(mlx5_task);
1811 }
1812 
1813 static void accel_mlx5_recover_qp(struct accel_mlx5_qp *qp);
1814 
1815 static int
1816 accel_mlx5_recover_qp_poller(void *arg)
1817 {
1818 	struct accel_mlx5_qp *qp = arg;
1819 
1820 	spdk_poller_unregister(&qp->recover_poller);
1821 	accel_mlx5_recover_qp(qp);
1822 	return SPDK_POLLER_BUSY;
1823 }
1824 
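/*
 * Destroy the broken QP and create a fresh one on the same CQ. If creation
 * fails, retry later from a one-shot recovery poller.
 */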
1825 static void
1826 accel_mlx5_recover_qp(struct accel_mlx5_qp *qp)
1827 {
1828 	struct accel_mlx5_dev *dev = qp->dev;
1829 	struct spdk_mlx5_qp_attr mlx5_qp_attr = {};
1830 	int rc;
1831 
1832 	SPDK_NOTICELOG("Recovering qp %p, core %u\n", qp, spdk_env_get_current_core());
1833 	if (qp->qp) {
1834 		spdk_mlx5_qp_destroy(qp->qp);
1835 		qp->qp = NULL;
1836 	}
1837 
1838 	mlx5_qp_attr.cap.max_send_wr = g_accel_mlx5.attr.qp_size;
1839 	mlx5_qp_attr.cap.max_recv_wr = 0;
1840 	mlx5_qp_attr.cap.max_send_sge = ACCEL_MLX5_MAX_SGE;
1841 	mlx5_qp_attr.cap.max_inline_data = sizeof(struct ibv_sge) * ACCEL_MLX5_MAX_SGE;
1842 
1843 	rc = spdk_mlx5_qp_create(dev->dev_ctx->pd, dev->cq, &mlx5_qp_attr, &qp->qp);
1844 	if (rc) {
1845 		SPDK_ERRLOG("Failed to create mlx5 dma QP, rc %d. Retry in %d usec\n",
1846 			    rc, ACCEL_MLX5_RECOVER_POLLER_PERIOD_US);
1847 		qp->recover_poller = SPDK_POLLER_REGISTER(accel_mlx5_recover_qp_poller, qp,
1848 				     ACCEL_MLX5_RECOVER_POLLER_PERIOD_US);
1849 		return;
1850 	}
1851 
1852 	qp->recovering = false;
1853 }
1854 
1855 static inline void
1856 accel_mlx5_process_error_cpl(struct spdk_mlx5_cq_completion *wc, struct accel_mlx5_task *task)
1857 {
1858 	struct accel_mlx5_qp *qp = task->qp;
1859 
1860 	if (wc->status != IBV_WC_WR_FLUSH_ERR) {
1861 		SPDK_WARNLOG("RDMA: qp %p, task %p, WC status %d, core %u\n",
1862 			     qp, task, wc->status, spdk_env_get_current_core());
1863 	} else {
1864 		SPDK_DEBUGLOG(accel_mlx5,
1865 			      "RDMA: qp %p, task %p, WC status %d, core %u\n",
1866 			      qp, task, wc->status, spdk_env_get_current_core());
1867 	}
1868 
1869 	qp->recovering = true;
1870 	assert(task->num_completed_reqs <= task->num_submitted_reqs);
1871 	if (task->num_completed_reqs == task->num_submitted_reqs) {
1872 		STAILQ_REMOVE_HEAD(&qp->in_hw, link);
1873 		accel_mlx5_task_fail(task, -EIO);
1874 	}
1875 }
1876 
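/*
 * Reap up to ACCEL_MLX5_MAX_WC completions: update per-task and per-QP WR
 * accounting, complete or continue the affected tasks and start QP recovery
 * once all outstanding WRs of a failed QP have been flushed.
 */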
1877 static inline int64_t
1878 accel_mlx5_poll_cq(struct accel_mlx5_dev *dev)
1879 {
1880 	struct spdk_mlx5_cq_completion wc[ACCEL_MLX5_MAX_WC];
1881 	struct accel_mlx5_task *task;
1882 	struct accel_mlx5_qp *qp;
1883 	int reaped, i, rc;
1884 	uint16_t completed;
1885 
1886 	dev->stats.polls++;
1887 	reaped = spdk_mlx5_cq_poll_completions(dev->cq, wc, ACCEL_MLX5_MAX_WC);
1888 	if (spdk_unlikely(reaped < 0)) {
1889 		SPDK_ERRLOG("Error polling CQ! (%d): %s\n", errno, spdk_strerror(errno));
1890 		return reaped;
1891 	} else if (reaped == 0) {
1892 		dev->stats.idle_polls++;
1893 		return 0;
1894 	}
1895 	dev->stats.completions += reaped;
1896 
1897 	SPDK_DEBUGLOG(accel_mlx5, "Reaped %d cpls on dev %s\n", reaped,
1898 		      dev->dev_ctx->context->device->name);
1899 
1900 	for (i = 0; i < reaped; i++) {
1901 		if (spdk_unlikely(!wc[i].wr_id)) {
1902 			/* Unsignaled completion with error, ignore */
1903 			continue;
1904 		}
1905 		task = (struct accel_mlx5_task *)wc[i].wr_id;
1906 		qp = task->qp;
1907 		assert(task == STAILQ_FIRST(&qp->in_hw) && "submission mismatch");
1908 		assert(task->num_submitted_reqs > task->num_completed_reqs);
1909 		completed = task->num_submitted_reqs - task->num_completed_reqs;
1910 		assert((uint32_t)task->num_completed_reqs + completed <= UINT16_MAX);
1911 		task->num_completed_reqs += completed;
1912 		assert(qp->wrs_submitted >= task->num_wrs);
1913 		qp->wrs_submitted -= task->num_wrs;
1914 		assert(dev->wrs_in_cq > 0);
1915 		dev->wrs_in_cq--;
1916 
1917 		if (wc[i].status) {
1918 			accel_mlx5_process_error_cpl(&wc[i], task);
1919 			if (qp->wrs_submitted == 0) {
1920 				assert(STAILQ_EMPTY(&qp->in_hw));
1921 				accel_mlx5_recover_qp(qp);
1922 			}
1923 			continue;
1924 		}
1925 
1926 		SPDK_DEBUGLOG(accel_mlx5, "task %p, remaining %u\n", task,
1927 			      task->num_reqs - task->num_completed_reqs);
1928 		if (task->num_completed_reqs == task->num_reqs) {
1929 			STAILQ_REMOVE_HEAD(&qp->in_hw, link);
1930 			accel_mlx5_task_complete(task);
1931 		} else {
1932 			assert(task->num_submitted_reqs < task->num_reqs);
1933 			assert(task->num_completed_reqs == task->num_submitted_reqs);
1934 			STAILQ_REMOVE_HEAD(&qp->in_hw, link);
1935 			rc = accel_mlx5_task_continue(task);
1936 			if (spdk_unlikely(rc)) {
1937 				if (rc != -ENOMEM) {
1938 					accel_mlx5_task_fail(task, rc);
1939 				}
1940 			}
1941 		}
1942 	}
1943 
1944 	return reaped;
1945 }
1946 
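/* Retry tasks that previously failed with -ENOMEM, walking the nomem list at most once. */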
1947 static inline void
1948 accel_mlx5_resubmit_nomem_tasks(struct accel_mlx5_dev *dev)
1949 {
1950 	struct accel_mlx5_task *task, *tmp, *last;
1951 	int rc;
1952 
1953 	last = STAILQ_LAST(&dev->nomem, accel_mlx5_task, link);
1954 	STAILQ_FOREACH_SAFE(task, &dev->nomem, link, tmp) {
1955 		STAILQ_REMOVE_HEAD(&dev->nomem, link);
1956 		rc = accel_mlx5_task_continue(task);
1957 		if (spdk_unlikely(rc)) {
1958 			if (rc != -ENOMEM) {
1959 				accel_mlx5_task_fail(task, rc);
1960 			}
1961 			break;
1962 		}
1963 		/* If the qpair is recovering, the task is put back on the nomem list and 0 is returned. In that case
1964 		 * the saved tail pointer lets us walk the list exactly once and then stop this FOREACH loop. */
1965 		if (task == last) {
1966 			break;
1967 		}
1968 	}
1969 }
1970 
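/*
 * Per-channel poller: poll the CQ of every device with WRs in flight, flush
 * pending sends on its QP (spdk_mlx5_qp_complete_send) and retry tasks parked
 * on the nomem list.
 */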
1971 static int
1972 accel_mlx5_poller(void *ctx)
1973 {
1974 	struct accel_mlx5_io_channel *ch = ctx;
1975 	struct accel_mlx5_dev *dev;
1976 
1977 	int64_t completions = 0, rc;
1978 	uint32_t i;
1979 
1980 	for (i = 0; i < ch->num_devs; i++) {
1981 		dev = &ch->devs[i];
1982 		if (dev->wrs_in_cq) {
1983 			rc = accel_mlx5_poll_cq(dev);
1984 			if (spdk_unlikely(rc < 0)) {
1985 				SPDK_ERRLOG("Error %"PRId64" on CQ, dev %s\n", rc, dev->dev_ctx->context->device->name);
1986 			}
1987 			completions += rc;
1988 			if (dev->qp.wrs_submitted) {
1989 				spdk_mlx5_qp_complete_send(dev->qp.qp);
1990 			}
1991 		}
1992 		if (!STAILQ_EMPTY(&dev->nomem)) {
1993 			accel_mlx5_resubmit_nomem_tasks(dev);
1994 		}
1995 	}
1996 
1997 	return !!completions;
1998 }
1999 
2000 static bool
2001 accel_mlx5_supports_opcode(enum spdk_accel_opcode opc)
2002 {
2003 	assert(g_accel_mlx5.enabled);
2004 
2005 	switch (opc) {
2006 	case SPDK_ACCEL_OPC_COPY:
2007 		return true;
2008 	case SPDK_ACCEL_OPC_ENCRYPT:
2009 	case SPDK_ACCEL_OPC_DECRYPT:
2010 		return g_accel_mlx5.crypto_supported;
2011 	case SPDK_ACCEL_OPC_CRC32C:
2012 	case SPDK_ACCEL_OPC_COPY_CRC32C:
2013 		return g_accel_mlx5.crc32c_supported;
2014 	default:
2015 		return false;
2016 	}
2017 }
2018 
2019 static struct spdk_io_channel *
2020 accel_mlx5_get_io_channel(void)
2021 {
2022 	assert(g_accel_mlx5.enabled);
2023 	return spdk_get_io_channel(&g_accel_mlx5);
2024 }
2025 
2026 static int
2027 accel_mlx5_create_qp(struct accel_mlx5_dev *dev, struct accel_mlx5_qp *qp)
2028 {
2029 	struct spdk_mlx5_qp_attr mlx5_qp_attr = {};
2030 	int rc;
2031 
2032 	mlx5_qp_attr.cap.max_send_wr = g_accel_mlx5.attr.qp_size;
2033 	mlx5_qp_attr.cap.max_recv_wr = 0;
2034 	mlx5_qp_attr.cap.max_send_sge = ACCEL_MLX5_MAX_SGE;
2035 	mlx5_qp_attr.cap.max_inline_data = sizeof(struct ibv_sge) * ACCEL_MLX5_MAX_SGE;
2036 
2037 	rc = spdk_mlx5_qp_create(dev->dev_ctx->pd, dev->cq, &mlx5_qp_attr, &qp->qp);
2038 	if (rc) {
2039 		return rc;
2040 	}
2041 
2042 	STAILQ_INIT(&qp->in_hw);
2043 	qp->dev = dev;
2044 	qp->verbs_qp = spdk_mlx5_qp_get_verbs_qp(qp->qp);
2045 	assert(qp->verbs_qp);
2046 	qp->wrs_max = g_accel_mlx5.attr.qp_size;
2047 
2048 	return 0;
2049 }
2050 
2051 static void
2052 accel_mlx5_add_stats(struct accel_mlx5_stats *stats, const struct accel_mlx5_stats *to_add)
2053 {
2054 	int i;
2055 
2056 	stats->crypto_umrs += to_add->crypto_umrs;
2057 	stats->sig_umrs += to_add->sig_umrs;
2058 	stats->rdma_reads += to_add->rdma_reads;
2059 	stats->rdma_writes += to_add->rdma_writes;
2060 	stats->polls += to_add->polls;
2061 	stats->idle_polls += to_add->idle_polls;
2062 	stats->completions += to_add->completions;
2063 	stats->nomem_qdepth += to_add->nomem_qdepth;
2064 	stats->nomem_mkey += to_add->nomem_mkey;
2065 	for (i = 0; i < ACCEL_MLX5_OPC_LAST; i++) {
2066 		stats->opcodes[i] += to_add->opcodes[i];
2067 	}
2068 }
2069 
2070 static void
2071 accel_mlx5_destroy_cb(void *io_device, void *ctx_buf)
2072 {
2073 	struct accel_mlx5_io_channel *ch = ctx_buf;
2074 	struct accel_mlx5_dev *dev;
2075 	uint32_t i;
2076 
2077 	spdk_poller_unregister(&ch->poller);
2078 	for (i = 0; i < ch->num_devs; i++) {
2079 		dev = &ch->devs[i];
2080 		spdk_mlx5_qp_destroy(dev->qp.qp);
2081 		if (dev->cq) {
2082 			spdk_mlx5_cq_destroy(dev->cq);
2083 		}
2084 		spdk_poller_unregister(&dev->qp.recover_poller);
2085 		if (dev->crypto_mkeys) {
2086 			spdk_mlx5_mkey_pool_put_ref(dev->crypto_mkeys);
2087 		}
2088 		if (dev->sig_mkeys) {
2089 			spdk_mlx5_mkey_pool_put_ref(dev->sig_mkeys);
2090 		}
2091 		spdk_rdma_utils_free_mem_map(&dev->mmap);
2092 		spdk_spin_lock(&g_accel_mlx5.lock);
2093 		accel_mlx5_add_stats(&g_accel_mlx5.stats, &dev->stats);
2094 		spdk_spin_unlock(&g_accel_mlx5.lock);
2095 	}
2096 	free(ch->devs);
2097 }
2098 
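/*
 * I/O channel constructor: for each device context take references to the
 * shared mkey pools and create a per-channel CQ, QP and memory map, then
 * register the channel poller.
 */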
2099 static int
2100 accel_mlx5_create_cb(void *io_device, void *ctx_buf)
2101 {
2102 	struct spdk_mlx5_cq_attr cq_attr = {};
2103 	struct accel_mlx5_io_channel *ch = ctx_buf;
2104 	struct accel_mlx5_dev_ctx *dev_ctx;
2105 	struct accel_mlx5_dev *dev;
2106 	uint32_t i;
2107 	int rc;
2108 
2109 	ch->devs = calloc(g_accel_mlx5.num_ctxs, sizeof(*ch->devs));
2110 	if (!ch->devs) {
2111 		SPDK_ERRLOG("Memory allocation failed\n");
2112 		return -ENOMEM;
2113 	}
2114 
2115 	for (i = 0; i < g_accel_mlx5.num_ctxs; i++) {
2116 		dev_ctx = &g_accel_mlx5.dev_ctxs[i];
2117 		dev = &ch->devs[i];
2118 		dev->dev_ctx = dev_ctx;
2119 
2120 		if (dev_ctx->crypto_mkeys) {
2121 			dev->crypto_mkeys = spdk_mlx5_mkey_pool_get_ref(dev_ctx->pd, SPDK_MLX5_MKEY_POOL_FLAG_CRYPTO);
2122 			if (!dev->crypto_mkeys) {
2123 				SPDK_ERRLOG("Failed to get crypto mkey pool channel, dev %s\n", dev_ctx->context->device->name);
2124 				/* Should not happen since the mkey pool is created during accel_mlx5 initialization.
2125 				 * We cannot get here if pool creation failed. */
2126 				assert(0);
2127 				goto err_out;
2128 			}
2129 		}
2130 		if (dev_ctx->sig_mkeys) {
2131 			dev->sig_mkeys = spdk_mlx5_mkey_pool_get_ref(dev_ctx->pd, SPDK_MLX5_MKEY_POOL_FLAG_SIGNATURE);
2132 			if (!dev->sig_mkeys) {
2133 				SPDK_ERRLOG("Failed to get sig mkey pool channel, dev %s\n", dev_ctx->context->device->name);
2134 				/* Should not happen since the mkey pool is created during accel_mlx5 initialization.
2135 				 * We cannot get here if pool creation failed. */
2136 				assert(0);
2137 				goto err_out;
2138 			}
2139 		}
2140 
2141 		memset(&cq_attr, 0, sizeof(cq_attr));
2142 		cq_attr.cqe_cnt = g_accel_mlx5.attr.qp_size;
2143 		cq_attr.cqe_size = 64;
2144 		cq_attr.cq_context = dev;
2145 
2146 		ch->num_devs++;
2147 		rc = spdk_mlx5_cq_create(dev_ctx->pd, &cq_attr, &dev->cq);
2148 		if (rc) {
2149 			SPDK_ERRLOG("Failed to create mlx5 CQ, rc %d\n", rc);
2150 			goto err_out;
2151 		}
2152 
2153 		rc = accel_mlx5_create_qp(dev, &dev->qp);
2154 		if (rc) {
2155 			SPDK_ERRLOG("Failed to create mlx5 QP, rc %d\n", rc);
2156 			goto err_out;
2157 		}
2158 
2159 		dev->mmap = spdk_rdma_utils_create_mem_map(dev_ctx->pd, NULL,
2160 				IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE);
2161 		if (!dev->mmap) {
2162 			SPDK_ERRLOG("Failed to create memory map\n");
2163 			rc = -ENOMEM;
2164 			goto err_out;
2165 		}
2166 		dev->crypto_multi_block = dev_ctx->crypto_multi_block;
2167 		dev->crypto_split_blocks = dev_ctx->crypto_multi_block ? g_accel_mlx5.attr.crypto_split_blocks : 0;
2168 		dev->wrs_in_cq_max = g_accel_mlx5.attr.qp_size;
2169 		STAILQ_INIT(&dev->nomem);
2170 	}
2171 
2172 	ch->poller = SPDK_POLLER_REGISTER(accel_mlx5_poller, ch, 0);
2173 
2174 	return 0;
2175 
2176 err_out:
2177 	accel_mlx5_destroy_cb(&g_accel_mlx5, ctx_buf);
2178 	return rc;
2179 }
2180 
2181 void
2182 accel_mlx5_get_default_attr(struct accel_mlx5_attr *attr)
2183 {
2184 	assert(attr);
2185 
2186 	attr->qp_size = ACCEL_MLX5_QP_SIZE;
2187 	attr->num_requests = ACCEL_MLX5_NUM_REQUESTS;
2188 	attr->allowed_devs = NULL;
2189 	attr->crypto_split_blocks = 0;
2190 }
2191 
2192 static void
2193 accel_mlx5_allowed_devs_free(void)
2194 {
2195 	size_t i;
2196 
2197 	if (!g_accel_mlx5.allowed_devs) {
2198 		return;
2199 	}
2200 
2201 	for (i = 0; i < g_accel_mlx5.allowed_devs_count; i++) {
2202 		free(g_accel_mlx5.allowed_devs[i]);
2203 	}
2204 	free(g_accel_mlx5.attr.allowed_devs);
2205 	free(g_accel_mlx5.allowed_devs);
2206 	g_accel_mlx5.attr.allowed_devs = NULL;
2207 	g_accel_mlx5.allowed_devs = NULL;
2208 	g_accel_mlx5.allowed_devs_count = 0;
2209 }
2210 
2211 static int
2212 accel_mlx5_allowed_devs_parse(const char *allowed_devs)
2213 {
2214 	char *str, *tmp, *tok;
2215 	size_t devs_count = 0;
2216 
2217 	str = strdup(allowed_devs);
2218 	if (!str) {
2219 		return -ENOMEM;
2220 	}
2221 
2222 	accel_mlx5_allowed_devs_free();
2223 
2224 	tmp = str;
2225 	while ((tmp = strchr(tmp, ',')) != NULL) {
2226 		tmp++;
2227 		devs_count++;
2228 	}
2229 	devs_count++;
2230 
2231 	g_accel_mlx5.allowed_devs = calloc(devs_count, sizeof(char *));
2232 	if (!g_accel_mlx5.allowed_devs) {
2233 		free(str);
2234 		return -ENOMEM;
2235 	}
2236 
2237 	devs_count = 0;
2238 	tok = strtok(str, ",");
2239 	while (tok) {
2240 		g_accel_mlx5.allowed_devs[devs_count] = strdup(tok);
2241 		if (!g_accel_mlx5.allowed_devs[devs_count]) {
2242 			free(str);
2243 			accel_mlx5_allowed_devs_free();
2244 			return -ENOMEM;
2245 		}
2246 		tok = strtok(NULL, ",");
2247 		devs_count++;
2248 		g_accel_mlx5.allowed_devs_count++;
2249 	}
2250 
2251 	free(str);
2252 
2253 	return 0;
2254 }
2255 
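/*
 * Enable the module, typically via the mlx5_scan_accel_module RPC: validate
 * the attributes, store a copy of them and register the module with the accel
 * framework.
 */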
2256 int
2257 accel_mlx5_enable(struct accel_mlx5_attr *attr)
2258 {
2259 	int rc;
2260 
2261 	if (g_accel_mlx5.enabled) {
2262 		return -EEXIST;
2263 	}
2264 	if (attr) {
2265 		if (attr->num_requests / spdk_env_get_core_count() < ACCEL_MLX5_MAX_MKEYS_IN_TASK) {
2266 			SPDK_ERRLOG("num requests per core must not be less than %u, current value %u\n",
2267 				    ACCEL_MLX5_MAX_MKEYS_IN_TASK, attr->num_requests / spdk_env_get_core_count());
2268 			return -EINVAL;
2269 		}
2270 		if (attr->qp_size < 8) {
2271 			SPDK_ERRLOG("qp_size must be at least 8\n");
2272 			return -EINVAL;
2273 		}
2274 		g_accel_mlx5.attr = *attr;
2275 		g_accel_mlx5.attr.allowed_devs = NULL;
2276 
2277 		if (attr->allowed_devs) {
2278 			/* Contains a copy of user's string */
2279 			g_accel_mlx5.attr.allowed_devs = strndup(attr->allowed_devs, ACCEL_MLX5_ALLOWED_DEVS_MAX_LEN);
2280 			if (!g_accel_mlx5.attr.allowed_devs) {
2281 				return -ENOMEM;
2282 			}
2283 			rc = accel_mlx5_allowed_devs_parse(g_accel_mlx5.attr.allowed_devs);
2284 			if (rc) {
2285 				return rc;
2286 			}
2287 			rc = spdk_mlx5_crypto_devs_allow((const char *const *)g_accel_mlx5.allowed_devs,
2288 							 g_accel_mlx5.allowed_devs_count);
2289 			if (rc) {
2290 				accel_mlx5_allowed_devs_free();
2291 				return rc;
2292 			}
2293 		}
2294 	} else {
2295 		accel_mlx5_get_default_attr(&g_accel_mlx5.attr);
2296 	}
2297 
2298 	g_accel_mlx5.enabled = true;
2299 	spdk_accel_module_list_add(&g_accel_mlx5.module);
2300 
2301 	return 0;
2302 }
2303 
2304 static void
2305 accel_mlx5_psvs_release(struct accel_mlx5_dev_ctx *dev_ctx)
2306 {
2307 	uint32_t i, num_psvs, num_psvs_in_pool;
2308 
2309 	if (!dev_ctx->psvs) {
2310 		return;
2311 	}
2312 
2313 	num_psvs = g_accel_mlx5.attr.num_requests;
2314 
2315 	for (i = 0; i < num_psvs; i++) {
2316 		if (dev_ctx->psvs[i]) {
2317 			spdk_mlx5_destroy_psv(dev_ctx->psvs[i]);
2318 			dev_ctx->psvs[i] = NULL;
2319 		}
2320 	}
2321 	free(dev_ctx->psvs);
2322 
2323 	if (!dev_ctx->psv_pool) {
2324 		return;
2325 	}
2326 	num_psvs_in_pool = spdk_mempool_count(dev_ctx->psv_pool);
2327 	if (num_psvs_in_pool != num_psvs) {
2328 		SPDK_ERRLOG("Expected %u PSVs in the pool, but got only %u\n", num_psvs, num_psvs_in_pool);
2329 	}
2330 	spdk_mempool_free(dev_ctx->psv_pool);
2331 }
2332 
2333 static void
2334 accel_mlx5_free_resources(void)
2335 {
2336 	struct accel_mlx5_dev_ctx *dev_ctx;
2337 	uint32_t i;
2338 
2339 	for (i = 0; i < g_accel_mlx5.num_ctxs; i++) {
2340 		dev_ctx = &g_accel_mlx5.dev_ctxs[i];
2341 		accel_mlx5_psvs_release(dev_ctx);
2342 		if (dev_ctx->pd) {
2343 			if (dev_ctx->crypto_mkeys) {
2344 				spdk_mlx5_mkey_pool_destroy(SPDK_MLX5_MKEY_POOL_FLAG_CRYPTO, dev_ctx->pd);
2345 			}
2346 			if (dev_ctx->sig_mkeys) {
2347 				spdk_mlx5_mkey_pool_destroy(SPDK_MLX5_MKEY_POOL_FLAG_SIGNATURE, dev_ctx->pd);
2348 			}
2349 			spdk_rdma_utils_put_pd(dev_ctx->pd);
2350 		}
2351 		if (dev_ctx->domain) {
2352 			spdk_rdma_utils_put_memory_domain(dev_ctx->domain);
2353 		}
2354 	}
2355 
2356 	free(g_accel_mlx5.dev_ctxs);
2357 	g_accel_mlx5.dev_ctxs = NULL;
2358 	g_accel_mlx5.initialized = false;
2359 }
2360 
2361 static void
2362 accel_mlx5_deinit_cb(void *ctx)
2363 {
2364 	accel_mlx5_free_resources();
2365 	spdk_spin_destroy(&g_accel_mlx5.lock);
2366 	spdk_accel_module_finish();
2367 }
2368 
2369 static void
2370 accel_mlx5_deinit(void *ctx)
2371 {
2372 	if (g_accel_mlx5.allowed_devs) {
2373 		accel_mlx5_allowed_devs_free();
2374 	}
2375 	spdk_mlx5_crypto_devs_allow(NULL, 0);
2376 	if (g_accel_mlx5.initialized) {
2377 		spdk_io_device_unregister(&g_accel_mlx5, accel_mlx5_deinit_cb);
2378 	} else {
2379 		spdk_accel_module_finish();
2380 	}
2381 }
2382 
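/* Create an mkey pool on the given PD; 3/4 of the mkeys are spread across per-thread caches. */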
2383 static int
2384 accel_mlx5_mkeys_create(struct ibv_pd *pd, uint32_t num_mkeys, uint32_t flags)
2385 {
2386 	struct spdk_mlx5_mkey_pool_param pool_param = {};
2387 
2388 	pool_param.mkey_count = num_mkeys;
2389 	pool_param.cache_per_thread = num_mkeys * 3 / 4 / spdk_env_get_core_count();
2390 	pool_param.flags = flags;
2391 
2392 	return spdk_mlx5_mkey_pool_init(&pool_param, pd);
2393 }
2394 
2395 static void
2396 accel_mlx5_set_psv_in_pool(struct spdk_mempool *mp, void *cb_arg, void *_psv, unsigned obj_idx)
2397 {
2398 	struct spdk_rdma_utils_memory_translation translation = {};
2399 	struct accel_mlx5_psv_pool_iter_cb_args *args = cb_arg;
2400 	struct accel_mlx5_psv_wrapper *wrapper = _psv;
2401 	struct accel_mlx5_dev_ctx *dev_ctx = args->dev;
2402 	int rc;
2403 
2404 	if (args->rc) {
2405 		return;
2406 	}
2407 	assert(obj_idx < g_accel_mlx5.attr.num_requests);
2408 	assert(dev_ctx->psvs[obj_idx] != NULL);
2409 	memset(wrapper, 0, sizeof(*wrapper));
2410 	wrapper->psv_index = dev_ctx->psvs[obj_idx]->index;
2411 
2412 	rc = spdk_rdma_utils_get_translation(args->map, &wrapper->crc, sizeof(uint32_t), &translation);
2413 	if (rc) {
2414 		SPDK_ERRLOG("Memory translation failed, addr %p, length %zu\n", &wrapper->crc, sizeof(uint32_t));
2415 		args->rc = -EINVAL;
2416 	} else {
2417 		wrapper->crc_lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation);
2418 	}
2419 }
2420 
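/*
 * Create one PSV per request and wrap them in a mempool of
 * accel_mlx5_psv_wrapper objects whose embedded CRC buffer is registered for
 * local DMA access.
 */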
2421 static int
2422 accel_mlx5_psvs_create(struct accel_mlx5_dev_ctx *dev_ctx)
2423 {
2424 	struct accel_mlx5_psv_pool_iter_cb_args args = {
2425 		.dev = dev_ctx
2426 	};
2427 	char pool_name[32];
2428 	uint32_t i;
2429 	uint32_t num_psvs = g_accel_mlx5.attr.num_requests;
2430 	uint32_t cache_size;
2431 	int rc;
2432 
2433 	dev_ctx->psvs = calloc(num_psvs, (sizeof(struct spdk_mlx5_psv *)));
2434 	if (!dev_ctx->psvs) {
2435 		SPDK_ERRLOG("Failed to alloc PSVs array\n");
2436 		return -ENOMEM;
2437 	}
2438 	for (i = 0; i < num_psvs; i++) {
2439 		dev_ctx->psvs[i] = spdk_mlx5_create_psv(dev_ctx->pd);
2440 		if (!dev_ctx->psvs[i]) {
2441 			SPDK_ERRLOG("Failed to create PSV on dev %s\n", dev_ctx->context->device->name);
2442 			return -EINVAL;
2443 		}
2444 	}
2445 
2446 	rc = snprintf(pool_name, sizeof(pool_name), "accel_psv_%s", dev_ctx->context->device->name);
2447 	if (rc < 0) {
2448 		assert(0);
2449 		return -EINVAL;
2450 	}
2451 	cache_size = num_psvs * 3 / 4 / spdk_env_get_core_count();
2452 	args.map = spdk_rdma_utils_create_mem_map(dev_ctx->pd, NULL,
2453 			IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE);
2454 	if (!args.map) {
2455 		return -ENOMEM;
2456 	}
2457 	dev_ctx->psv_pool = spdk_mempool_create_ctor(pool_name, num_psvs,
2458 			    sizeof(struct accel_mlx5_psv_wrapper),
2459 			    cache_size, SPDK_ENV_SOCKET_ID_ANY,
2460 			    accel_mlx5_set_psv_in_pool, &args);
2461 	spdk_rdma_utils_free_mem_map(&args.map);
2462 	if (!dev_ctx->psv_pool) {
2463 		SPDK_ERRLOG("Failed to create PSV memory pool\n");
2464 		return -ENOMEM;
2465 	}
2466 	if (args.rc) {
2467 		SPDK_ERRLOG("Failed to init PSV memory pool objects, rc %d\n", args.rc);
2468 		return args.rc;
2469 	}
2470 
2471 	return 0;
2472 }
2473 
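/*
 * Per-device initialization: get a PD and memory domain and, depending on the
 * negotiated capabilities, create the crypto mkey, signature mkey and PSV pools.
 */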
2475 static int
2476 accel_mlx5_dev_ctx_init(struct accel_mlx5_dev_ctx *dev_ctx, struct ibv_context *dev,
2477 			struct spdk_mlx5_device_caps *caps)
2478 {
2479 	struct ibv_pd *pd;
2480 	int rc;
2481 
2482 	pd = spdk_rdma_utils_get_pd(dev);
2483 	if (!pd) {
2484 		SPDK_ERRLOG("Failed to get PD for context %p, dev %s\n", dev, dev->device->name);
2485 		return -EINVAL;
2486 	}
2487 	dev_ctx->context = dev;
2488 	dev_ctx->pd = pd;
2489 	dev_ctx->domain = spdk_rdma_utils_get_memory_domain(pd);
2490 	if (!dev_ctx->domain) {
2491 		return -ENOMEM;
2492 	}
2493 
2494 	if (g_accel_mlx5.crypto_supported) {
2495 		dev_ctx->crypto_multi_block = caps->crypto.multi_block_be_tweak;
2496 		if (!dev_ctx->crypto_multi_block && g_accel_mlx5.attr.crypto_split_blocks) {
2497 			SPDK_WARNLOG("\"crypto_split_blocks\" is set but dev %s doesn't support multi block crypto\n",
2498 				     dev->device->name);
2499 		}
2500 		rc = accel_mlx5_mkeys_create(pd, g_accel_mlx5.attr.num_requests, SPDK_MLX5_MKEY_POOL_FLAG_CRYPTO);
2501 		if (rc) {
2502 			SPDK_ERRLOG("Failed to create crypto mkeys pool, rc %d, dev %s\n", rc, dev->device->name);
2503 			return rc;
2504 		}
2505 		dev_ctx->crypto_mkeys = true;
2506 	}
2507 	if (g_accel_mlx5.crc32c_supported) {
2508 		rc = accel_mlx5_mkeys_create(pd, g_accel_mlx5.attr.num_requests,
2509 					     SPDK_MLX5_MKEY_POOL_FLAG_SIGNATURE);
2510 		if (rc) {
2511 			SPDK_ERRLOG("Failed to create signature mkeys pool, rc %d, dev %s\n", rc, dev->device->name);
2512 			return rc;
2513 		}
2514 		dev_ctx->sig_mkeys = true;
2515 		rc = accel_mlx5_psvs_create(dev_ctx);
2516 		if (rc) {
2517 			SPDK_ERRLOG("Failed to create PSVs pool, rc %d, dev %s\n", rc, dev->device->name);
2518 			return rc;
2519 		}
2520 	}
2521 
2522 	return 0;
2523 }
2524 
2525 static struct ibv_context **
2526 accel_mlx5_get_devices(int *_num_devs)
2527 {
2528 	struct ibv_context **rdma_devs, **rdma_devs_out = NULL, *dev;
2529 	struct ibv_device_attr dev_attr;
2530 	size_t j;
2531 	int num_devs = 0, i, rc;
2532 	int num_devs_out = 0;
2533 	bool dev_allowed;
2534 
2535 	rdma_devs = rdma_get_devices(&num_devs);
2536 	if (!rdma_devs || !num_devs) {
2537 		*_num_devs = 0;
2538 		return NULL;
2539 	}
2540 
2541 	rdma_devs_out = calloc(num_devs + 1, sizeof(struct ibv_context *));
2542 	if (!rdma_devs_out) {
2543 		SPDK_ERRLOG("Memory allocation failed\n");
2544 		rdma_free_devices(rdma_devs);
2545 		*_num_devs = 0;
2546 		return NULL;
2547 	}
2548 
2549 	for (i = 0; i < num_devs; i++) {
2550 		dev = rdma_devs[i];
2551 		rc = ibv_query_device(dev, &dev_attr);
2552 		if (rc) {
2553 			SPDK_ERRLOG("Failed to query dev %s, skipping\n", dev->device->name);
2554 			continue;
2555 		}
2556 		if (dev_attr.vendor_id != SPDK_MLX5_VENDOR_ID_MELLANOX) {
2557 			SPDK_DEBUGLOG(accel_mlx5, "dev %s is not Mellanox device, skipping\n", dev->device->name);
2558 			continue;
2559 		}
2560 
2561 		if (g_accel_mlx5.allowed_devs_count) {
2562 			dev_allowed = false;
2563 			for (j = 0; j < g_accel_mlx5.allowed_devs_count; j++) {
2564 				if (strcmp(g_accel_mlx5.allowed_devs[j], dev->device->name) == 0) {
2565 					dev_allowed = true;
2566 					break;
2567 				}
2568 			}
2569 			if (!dev_allowed) {
2570 				continue;
2571 			}
2572 		}
2573 
2574 		rdma_devs_out[num_devs_out] = dev;
2575 		num_devs_out++;
2576 	}
2577 
2578 	rdma_free_devices(rdma_devs);
2579 	*_num_devs = num_devs_out;
2580 
2581 	return rdma_devs_out;
2582 }
2583 
2584 static inline bool
2585 accel_mlx5_dev_supports_crypto(struct spdk_mlx5_device_caps *caps)
2586 {
2587 	return caps->crypto_supported && !caps->crypto.wrapped_import_method_aes_xts &&
2588 	       (caps->crypto.single_block_le_tweak ||
2589 		caps->crypto.multi_block_le_tweak || caps->crypto.multi_block_be_tweak);
2590 }
2591 
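/*
 * Module init: query the capabilities of all allowed devices and enable an
 * offload only if every device supports it. If no allowed_devs list was given,
 * use the single device with the widest offload support.
 */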
2592 static int
2593 accel_mlx5_init(void)
2594 {
2595 	struct spdk_mlx5_device_caps *caps;
2596 	struct ibv_context **rdma_devs, *dev;
2597 	int num_devs = 0,  rc = 0, i;
2598 	int best_dev = -1, first_dev = 0;
2599 	int best_dev_stat = 0, dev_stat;
2600 	bool supports_crypto;
2601 	bool find_best_dev = g_accel_mlx5.allowed_devs_count == 0;
2602 
2603 	if (!g_accel_mlx5.enabled) {
2604 		return -EINVAL;
2605 	}
2606 
2607 	spdk_spin_init(&g_accel_mlx5.lock);
2608 	rdma_devs = accel_mlx5_get_devices(&num_devs);
2609 	if (!rdma_devs || !num_devs) {
2610 		return -ENODEV;
2611 	}
2612 	caps = calloc(num_devs, sizeof(*caps));
2613 	if (!caps) {
2614 		rc = -ENOMEM;
2615 		goto cleanup;
2616 	}
2617 
2618 	g_accel_mlx5.crypto_supported = true;
2619 	g_accel_mlx5.crc32c_supported = true;
2620 	g_accel_mlx5.num_ctxs = 0;
2621 
2622 	/* Iterate devices. We support an offload if all devices support it */
2623 	for (i = 0; i < num_devs; i++) {
2624 		dev = rdma_devs[i];
2625 
2626 		rc = spdk_mlx5_device_query_caps(dev, &caps[i]);
2627 		if (rc) {
2628 			SPDK_ERRLOG("Failed to query device caps, dev %s\n", dev->device->name);
2629 			goto cleanup;
2630 		}
2631 		supports_crypto = accel_mlx5_dev_supports_crypto(&caps[i]);
2632 		if (!supports_crypto) {
2633 			SPDK_DEBUGLOG(accel_mlx5, "Disable crypto support because dev %s doesn't support it\n",
2634 				      rdma_devs[i]->device->name);
2635 			g_accel_mlx5.crypto_supported = false;
2636 		}
2637 		if (!caps[i].crc32c_supported) {
2638 			SPDK_DEBUGLOG(accel_mlx5, "Disable crc32c support because dev %s doesn't support it\n",
2639 				      rdma_devs[i]->device->name);
2640 			g_accel_mlx5.crc32c_supported = false;
2641 		}
2642 		if (find_best_dev) {
2643 			/* Find device which supports max number of offloads */
2644 			dev_stat = (int)supports_crypto + (int)caps[i].crc32c_supported;
2645 			if (dev_stat > best_dev_stat) {
2646 				best_dev_stat = dev_stat;
2647 				best_dev = i;
2648 			}
2649 		}
2650 	}
2651 
2652 	/* User didn't specify devices to use, try to select the best one */
2653 	if (find_best_dev) {
2654 		if (best_dev == -1) {
2655 			best_dev = 0;
2656 		}
2657 		g_accel_mlx5.crypto_supported = accel_mlx5_dev_supports_crypto(&caps[best_dev]);
2658 		g_accel_mlx5.crc32c_supported = caps[best_dev].crc32c_supported;
2659 		SPDK_NOTICELOG("Select dev %s, crypto %d, crc32c %d\n", rdma_devs[best_dev]->device->name,
2660 			       g_accel_mlx5.crypto_supported, g_accel_mlx5.crc32c_supported);
2661 		first_dev = best_dev;
2662 		num_devs = 1;
2663 		if (g_accel_mlx5.crypto_supported) {
2664 			const char *const dev_name[] = { rdma_devs[best_dev]->device->name };
2665 			/* Let mlx5 library know which device to use */
2666 			spdk_mlx5_crypto_devs_allow(dev_name, 1);
2667 		}
2668 	} else {
2669 		SPDK_NOTICELOG("Found %d devices, crypto %d\n", num_devs, g_accel_mlx5.crypto_supported);
2670 	}
2671 
2672 	g_accel_mlx5.dev_ctxs = calloc(num_devs, sizeof(*g_accel_mlx5.dev_ctxs));
2673 	if (!g_accel_mlx5.dev_ctxs) {
2674 		SPDK_ERRLOG("Memory allocation failed\n");
2675 		rc = -ENOMEM;
2676 		goto cleanup;
2677 	}
2678 
2679 	for (i = first_dev; i < first_dev + num_devs; i++) {
2680 		rc = accel_mlx5_dev_ctx_init(&g_accel_mlx5.dev_ctxs[g_accel_mlx5.num_ctxs++],
2681 					     rdma_devs[i], &caps[i]);
2682 		if (rc) {
2683 			goto cleanup;
2684 		}
2685 	}
2686 
2687 	SPDK_NOTICELOG("Accel framework mlx5 initialized, found %d devices.\n", num_devs);
2688 	spdk_io_device_register(&g_accel_mlx5, accel_mlx5_create_cb, accel_mlx5_destroy_cb,
2689 				sizeof(struct accel_mlx5_io_channel), "accel_mlx5");
2690 	g_accel_mlx5.initialized = true;
2691 	free(rdma_devs);
2692 	free(caps);
2693 
2694 	return 0;
2695 
2696 cleanup:
2697 	free(rdma_devs);
2698 	free(caps);
2699 	accel_mlx5_free_resources();
2700 	spdk_spin_destroy(&g_accel_mlx5.lock);
2701 
2702 	return rc;
2703 }
2704 
2705 static void
2706 accel_mlx5_write_config_json(struct spdk_json_write_ctx *w)
2707 {
2708 	if (g_accel_mlx5.enabled) {
2709 		spdk_json_write_object_begin(w);
2710 		spdk_json_write_named_string(w, "method", "mlx5_scan_accel_module");
2711 		spdk_json_write_named_object_begin(w, "params");
2712 		spdk_json_write_named_uint16(w, "qp_size", g_accel_mlx5.attr.qp_size);
2713 		spdk_json_write_named_uint32(w, "num_requests", g_accel_mlx5.attr.num_requests);
2714 		if (g_accel_mlx5.attr.allowed_devs) {
2715 			spdk_json_write_named_string(w, "allowed_devs", g_accel_mlx5.attr.allowed_devs);
2716 		}
2717 		spdk_json_write_named_uint16(w, "crypto_split_blocks", g_accel_mlx5.attr.crypto_split_blocks);
2718 		spdk_json_write_object_end(w);
2719 		spdk_json_write_object_end(w);
2720 	}
2721 }
2722 
2723 static size_t
2724 accel_mlx5_get_ctx_size(void)
2725 {
2726 	return sizeof(struct accel_mlx5_task);
2727 }
2728 
2729 static int
2730 accel_mlx5_crypto_key_init(struct spdk_accel_crypto_key *key)
2731 {
2732 	struct spdk_mlx5_crypto_dek_create_attr attr = {};
2733 	struct spdk_mlx5_crypto_keytag *keytag;
2734 	int rc;
2735 
2736 	if (!key || !key->key || !key->key2 || !key->key_size || !key->key2_size) {
2737 		return -EINVAL;
2738 	}
2739 
2740 	attr.dek = calloc(1, key->key_size + key->key2_size);
2741 	if (!attr.dek) {
2742 		return -ENOMEM;
2743 	}
2744 
2745 	memcpy(attr.dek, key->key, key->key_size);
2746 	memcpy(attr.dek + key->key_size, key->key2, key->key2_size);
2747 	attr.dek_len = key->key_size + key->key2_size;
2748 
2749 	rc = spdk_mlx5_crypto_keytag_create(&attr, &keytag);
2750 	spdk_memset_s(attr.dek, attr.dek_len, 0, attr.dek_len);
2751 	free(attr.dek);
2752 	if (rc) {
2753 		SPDK_ERRLOG("Failed to create a keytag, rc %d\n", rc);
2754 		return rc;
2755 	}
2756 
2757 	key->priv = keytag;
2758 
2759 	return 0;
2760 }
2761 
2762 static void
2763 accel_mlx5_crypto_key_deinit(struct spdk_accel_crypto_key *key)
2764 {
2765 	if (!key || key->module_if != &g_accel_mlx5.module || !key->priv) {
2766 		return;
2767 	}
2768 
2769 	spdk_mlx5_crypto_keytag_destroy(key->priv);
2770 }
2771 
2772 static void
2773 accel_mlx5_dump_stats_json(struct spdk_json_write_ctx *w, const char *header,
2774 			   const struct accel_mlx5_stats *stats)
2775 {
2776 	double idle_polls_percentage = 0;
2777 	double cpls_per_poll = 0;
2778 	uint64_t total_tasks = 0;
2779 	int i;
2780 
2781 	if (stats->polls) {
2782 		idle_polls_percentage = (double) stats->idle_polls * 100 / stats->polls;
2783 	}
2784 	if (stats->polls > stats->idle_polls) {
2785 		cpls_per_poll = (double) stats->completions / (stats->polls - stats->idle_polls);
2786 	}
2787 	for (i = 0; i < ACCEL_MLX5_OPC_LAST; i++) {
2788 		total_tasks += stats->opcodes[i];
2789 	}
2790 
2791 	spdk_json_write_named_object_begin(w, header);
2792 
2793 	spdk_json_write_named_object_begin(w, "umrs");
2794 	spdk_json_write_named_uint64(w, "crypto_umrs", stats->crypto_umrs);
2795 	spdk_json_write_named_uint64(w, "sig_umrs", stats->sig_umrs);
2796 	spdk_json_write_named_uint64(w, "total", stats->crypto_umrs + stats->sig_umrs);
2797 	spdk_json_write_object_end(w);
2798 
2799 	spdk_json_write_named_object_begin(w, "rdma");
2800 	spdk_json_write_named_uint64(w, "read", stats->rdma_reads);
2801 	spdk_json_write_named_uint64(w, "write", stats->rdma_writes);
2802 	spdk_json_write_named_uint64(w, "total", stats->rdma_reads + stats->rdma_writes);
2803 	spdk_json_write_object_end(w);
2804 
2805 	spdk_json_write_named_object_begin(w, "polling");
2806 	spdk_json_write_named_uint64(w, "polls", stats->polls);
2807 	spdk_json_write_named_uint64(w, "idle_polls", stats->idle_polls);
2808 	spdk_json_write_named_uint64(w, "completions", stats->completions);
2809 	spdk_json_write_named_double(w, "idle_polls_percentage", idle_polls_percentage);
2810 	spdk_json_write_named_double(w, "cpls_per_poll", cpls_per_poll);
2811 	spdk_json_write_named_uint64(w, "nomem_qdepth", stats->nomem_qdepth);
2812 	spdk_json_write_named_uint64(w, "nomem_mkey", stats->nomem_mkey);
2813 	spdk_json_write_object_end(w);
2814 
2815 	spdk_json_write_named_object_begin(w, "tasks");
2816 	spdk_json_write_named_uint64(w, "copy", stats->opcodes[ACCEL_MLX5_OPC_COPY]);
2817 	spdk_json_write_named_uint64(w, "crypto", stats->opcodes[ACCEL_MLX5_OPC_CRYPTO]);
2818 	spdk_json_write_named_uint64(w, "crc32c", stats->opcodes[ACCEL_MLX5_OPC_CRC32C]);
2819 	spdk_json_write_named_uint64(w, "total", total_tasks);
2820 	spdk_json_write_object_end(w);
2821 
2822 	spdk_json_write_object_end(w);
2823 }
2824 
2825 static void
2826 accel_mlx5_dump_channel_stat(struct spdk_io_channel_iter *i)
2827 {
2828 	struct accel_mlx5_stats ch_stat = {};
2829 	struct accel_mlx5_dump_stats_ctx *ctx;
2830 	struct spdk_io_channel *_ch;
2831 	struct accel_mlx5_io_channel *ch;
2832 	struct accel_mlx5_dev *dev;
2833 	uint32_t j;
2834 
2835 	ctx = spdk_io_channel_iter_get_ctx(i);
2836 	_ch = spdk_io_channel_iter_get_channel(i);
2837 	ch = spdk_io_channel_get_ctx(_ch);
2838 
2839 	if (ctx->level != ACCEL_MLX5_DUMP_STAT_LEVEL_TOTAL) {
2840 		spdk_json_write_object_begin(ctx->w);
2841 		spdk_json_write_named_object_begin(ctx->w, spdk_thread_get_name(spdk_get_thread()));
2842 	}
2843 	if (ctx->level == ACCEL_MLX5_DUMP_STAT_LEVEL_DEV) {
2844 		spdk_json_write_named_array_begin(ctx->w, "devices");
2845 	}
2846 
2847 	for (j = 0; j < ch->num_devs; j++) {
2848 		dev = &ch->devs[j];
2849 		/* Save grand total and channel stats */
2850 		accel_mlx5_add_stats(&ctx->total, &dev->stats);
2851 		accel_mlx5_add_stats(&ch_stat, &dev->stats);
2852 		if (ctx->level == ACCEL_MLX5_DUMP_STAT_LEVEL_DEV) {
2853 			spdk_json_write_object_begin(ctx->w);
2854 			accel_mlx5_dump_stats_json(ctx->w, dev->dev_ctx->context->device->name, &dev->stats);
2855 			spdk_json_write_object_end(ctx->w);
2856 		}
2857 	}
2858 
2859 	if (ctx->level == ACCEL_MLX5_DUMP_STAT_LEVEL_DEV) {
2860 		spdk_json_write_array_end(ctx->w);
2861 	}
2862 	if (ctx->level != ACCEL_MLX5_DUMP_STAT_LEVEL_TOTAL) {
2863 		accel_mlx5_dump_stats_json(ctx->w, "channel_total", &ch_stat);
2864 		spdk_json_write_object_end(ctx->w);
2865 		spdk_json_write_object_end(ctx->w);
2866 	}
2867 
2868 	spdk_for_each_channel_continue(i, 0);
2869 }
2870 
2871 static void
2872 accel_mlx5_dump_channel_stat_done(struct spdk_io_channel_iter *i, int status)
2873 {
2874 	struct accel_mlx5_dump_stats_ctx *ctx;
2875 
2876 	ctx = spdk_io_channel_iter_get_ctx(i);
2877 
2878 	spdk_spin_lock(&g_accel_mlx5.lock);
2879 	/* Add statistics from destroyed channels */
2880 	accel_mlx5_add_stats(&ctx->total, &g_accel_mlx5.stats);
2881 	spdk_spin_unlock(&g_accel_mlx5.lock);
2882 
2883 	if (ctx->level != ACCEL_MLX5_DUMP_STAT_LEVEL_TOTAL) {
2884 		/* channels[] */
2885 		spdk_json_write_array_end(ctx->w);
2886 	}
2887 
2888 	accel_mlx5_dump_stats_json(ctx->w, "total", &ctx->total);
2889 
2890 	/* End the whole response that was started in accel_mlx5_dump_stats */
2891 	spdk_json_write_object_end(ctx->w);
2892 
2893 	ctx->cb(ctx->ctx, 0);
2894 	free(ctx);
2895 }
2896 
2897 int
2898 accel_mlx5_dump_stats(struct spdk_json_write_ctx *w, enum accel_mlx5_dump_state_level level,
2899 		      accel_mlx5_dump_stat_done_cb cb, void *ctx)
2900 {
2901 	struct accel_mlx5_dump_stats_ctx *stat_ctx;
2902 
2903 	if (!w || !cb) {
2904 		return -EINVAL;
2905 	}
2906 	if (!g_accel_mlx5.initialized) {
2907 		return -ENODEV;
2908 	}
2909 
2910 	stat_ctx = calloc(1, sizeof(*stat_ctx));
2911 	if (!stat_ctx) {
2912 		return -ENOMEM;
2913 	}
2914 	stat_ctx->cb = cb;
2915 	stat_ctx->ctx = ctx;
2916 	stat_ctx->level = level;
2917 	stat_ctx->w = w;
2918 
2919 	spdk_json_write_object_begin(w);
2920 
2921 	if (level != ACCEL_MLX5_DUMP_STAT_LEVEL_TOTAL) {
2922 		spdk_json_write_named_array_begin(w, "channels");
2923 	}
2924 
2925 	spdk_for_each_channel(&g_accel_mlx5, accel_mlx5_dump_channel_stat, stat_ctx,
2926 			      accel_mlx5_dump_channel_stat_done);
2927 
2928 	return 0;
2929 }
2930 
2931 static bool
2932 accel_mlx5_crypto_supports_cipher(enum spdk_accel_cipher cipher, size_t key_size)
2933 {
2934 	switch (cipher) {
2935 	case SPDK_ACCEL_CIPHER_AES_XTS:
2936 		return key_size == SPDK_ACCEL_AES_XTS_128_KEY_SIZE || key_size == SPDK_ACCEL_AES_XTS_256_KEY_SIZE;
2937 	default:
2938 		return false;
2939 	}
2940 }
2941 
2942 static int
2943 accel_mlx5_get_memory_domains(struct spdk_memory_domain **domains, int array_size)
2944 {
2945 	int i, size;
2946 
2947 	if (!domains || !array_size) {
2948 		return (int)g_accel_mlx5.num_ctxs;
2949 	}
2950 
2951 	size = spdk_min(array_size, (int)g_accel_mlx5.num_ctxs);
2952 
2953 	for (i = 0; i < size; i++) {
2954 		domains[i] = g_accel_mlx5.dev_ctxs[i].domain;
2955 	}
2956 
2957 	return (int)g_accel_mlx5.num_ctxs;
2958 }
2959 
2960 static struct accel_mlx5_module g_accel_mlx5 = {
2961 	.module = {
2962 		.module_init		= accel_mlx5_init,
2963 		.module_fini		= accel_mlx5_deinit,
2964 		.write_config_json	= accel_mlx5_write_config_json,
2965 		.get_ctx_size		= accel_mlx5_get_ctx_size,
2966 		.name			= "mlx5",
2967 		.supports_opcode	= accel_mlx5_supports_opcode,
2968 		.get_io_channel		= accel_mlx5_get_io_channel,
2969 		.submit_tasks		= accel_mlx5_submit_tasks,
2970 		.crypto_key_init	= accel_mlx5_crypto_key_init,
2971 		.crypto_key_deinit	= accel_mlx5_crypto_key_deinit,
2972 		.crypto_supports_cipher	= accel_mlx5_crypto_supports_cipher,
2973 		.get_memory_domains	= accel_mlx5_get_memory_domains,
2974 	}
2975 };
2976 
2977 SPDK_LOG_REGISTER_COMPONENT(accel_mlx5)
2978