xref: /spdk/module/accel/mlx5/accel_mlx5.c (revision cc533a3e572d8a2256a4e2c932c1dc0c86786c4a)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  */
4 
5 #include "spdk/env.h"
6 #include "spdk/thread.h"
7 #include "spdk/queue.h"
8 #include "spdk/log.h"
9 #include "spdk/string.h"
10 #include "spdk/likely.h"
11 #include "spdk/dma.h"
12 #include "spdk/json.h"
13 #include "spdk/util.h"
14 
15 #include "spdk_internal/mlx5.h"
16 #include "spdk_internal/rdma_utils.h"
17 #include "spdk/accel_module.h"
18 #include "spdk_internal/assert.h"
19 #include "spdk_internal/sgl.h"
20 #include "accel_mlx5.h"
21 
22 #include <infiniband/mlx5dv.h>
23 #include <rdma/rdma_cma.h>
24 
/* Sizing and tuning constants of the module.
 * NOTE(review): QP_SIZE, NUM_REQUESTS, RECOVER_POLLER_PERIOD_US and MAX_WC are not used in the
 * part of the file reviewed here - confirm their meaning against the users. */
25 #define ACCEL_MLX5_QP_SIZE (256u)
26 #define ACCEL_MLX5_NUM_REQUESTS (2048u - 1)
27 #define ACCEL_MLX5_RECOVER_POLLER_PERIOD_US (10000)
/* Capacity of the src_sge/dst_sge arrays in struct accel_mlx5_sge, i.e. the maximum number of
 * SGEs that accel_mlx5_fill_block_sge() produces for one operation */
28 #define ACCEL_MLX5_MAX_SGE (16u)
29 #define ACCEL_MLX5_MAX_WC (64u)
/* Capacity of the mkeys array in struct accel_mlx5_task; accel_mlx5_task_alloc_mkeys() never
 * takes more than this number of mkeys from a pool at once */
30 #define ACCEL_MLX5_MAX_MKEYS_IN_TASK (16u)
31 
32 /* Assume we have up to 16 devices */
33 #define ACCEL_MLX5_ALLOWED_DEVS_MAX_LEN ((SPDK_MLX5_DEV_MAX_NAME_LEN + 1) * 16)
34 
/*
 * Account for one work request posted to \b qp on behalf of \b task: increments the number of
 * WRs outstanding on the qp and the number of WRs owned by the task. In debug builds asserts
 * that the qp limit (wrs_max) and the range of the task counter are not exceeded.
 */
#define ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, task)	\
do {							\
	assert((qp)->wrs_submitted < (qp)->wrs_max);	\
	(qp)->wrs_submitted++;				\
	assert((task)->num_wrs < UINT16_MAX);		\
	(task)->num_wrs++;				\
} while (0)

/*
 * Same as ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED, for a work request which was posted with a CQ update
 * requested: additionally accounts for the CQ slot in \b dev.
 */
#define ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, task)	\
do {									\
	assert((dev)->wrs_in_cq < (dev)->wrs_in_cq_max);		\
	(dev)->wrs_in_cq++;						\
	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, task);			\
} while (0)
52 
/* Forward declarations of structures which are defined later in this file */
53 struct accel_mlx5_io_channel;
54 struct accel_mlx5_task;
55 
/* Per-device context, referenced by struct accel_mlx5_dev through its dev_ctx pointer.
 * NOTE(review): creation and ownership of the members are outside of the reviewed part. */
56 struct accel_mlx5_dev_ctx {
57 	struct ibv_context *context;
	/* Passed to spdk_mlx5_crypto_get_dek_data() when a crypto task is processed */
58 	struct ibv_pd *pd;
	/* Passed as the destination domain when buffers of a foreign memory domain are translated */
59 	struct spdk_memory_domain *domain;
	/* Pool which accel_mlx5_task::psv of a CRC32C task is returned to */
60 	struct spdk_mempool *psv_pool;
61 	TAILQ_ENTRY(accel_mlx5_dev_ctx) link;
62 	struct spdk_mlx5_psv **psvs;
63 	bool crypto_mkeys;
64 	bool sig_mkeys;
65 	bool crypto_multi_block;
66 };
67 
/* Kind of operation executed by a task, kept in accel_mlx5_task::mlx5_opcode */
68 enum accel_mlx5_opcode {
69 	ACCEL_MLX5_OPC_COPY,
70 	ACCEL_MLX5_OPC_CRYPTO,
71 	ACCEL_MLX5_OPC_CRC32C,
	/* Not an operation: number of opcodes, sizes accel_mlx5_stats::opcodes */
72 	ACCEL_MLX5_OPC_LAST
73 };
74 
/* Statistics counters; embedded into struct accel_mlx5_dev and struct accel_mlx5_module.
 * NOTE(review): sig_umrs, polls, idle_polls, completions and opcodes are updated outside of the
 * reviewed part of the file. */
75 struct accel_mlx5_stats {
	/* Incremented for every crypto UMR configured by accel_mlx5_crypto_task_process() */
76 	uint64_t crypto_umrs;
77 	uint64_t sig_umrs;
	/* Incremented for every RDMA_READ posted */
78 	uint64_t rdma_reads;
	/* Incremented for every RDMA_WRITE posted */
79 	uint64_t rdma_writes;
80 	uint64_t polls;
81 	uint64_t idle_polls;
82 	uint64_t completions;
	/* Number of times a task could not be submitted since qp or cq had no free slots */
83 	uint64_t nomem_qdepth;
	/* Number of times a task could not be submitted since mkeys could not be allocated */
84 	uint64_t nomem_mkey;
	/* One counter per opcode - presumably indexed by enum accel_mlx5_opcode, TODO confirm */
85 	uint64_t opcodes[ACCEL_MLX5_OPC_LAST];
86 };
87 
/* Module-wide state, the only instance visible in this part of the file is g_accel_mlx5.
 * NOTE(review): most of the members are used outside of the reviewed part - confirm there. */
88 struct accel_mlx5_module {
	/* Crypto keys accepted by accel_mlx5_crypto_task_init() must have module_if pointing here */
89 	struct spdk_accel_module_if module;
90 	struct accel_mlx5_stats stats;
91 	struct spdk_spinlock lock;
92 	struct accel_mlx5_dev_ctx *dev_ctxs;
93 	uint32_t num_ctxs;
94 	struct accel_mlx5_attr attr;
95 	char **allowed_devs;
96 	size_t allowed_devs_count;
97 	bool initialized;
98 	bool enabled;
99 	bool crypto_supported;
100 	bool crc32c_supported;
101 };
102 
/* Source and destination SGE lists of one operation. The counts hold the number of valid
 * entries in the corresponding array, as returned by accel_mlx5_fill_block_sge(). */
103 struct accel_mlx5_sge {
104 	uint32_t src_sge_count;
105 	uint32_t dst_sge_count;
106 	struct ibv_sge src_sge[ACCEL_MLX5_MAX_SGE];
107 	struct ibv_sge dst_sge[ACCEL_MLX5_MAX_SGE];
108 };
109 
/* Cursor over an array of iovec elements, see accel_mlx5_iov_sgl_init/advance/unwind */
110 struct accel_mlx5_iov_sgl {
	/* Current element */
111 	struct iovec	*iov;
	/* Number of elements left, including the current one */
112 	uint32_t	iovcnt;
	/* Offset in bytes inside of the current element */
113 	uint32_t	iov_offset;
114 };
115 
/* Per-task PSV state, referenced by accel_mlx5_task::psv and returned to
 * accel_mlx5_dev_ctx::psv_pool when a CRC32C task fails.
 * NOTE(review): the members are used outside of the reviewed part of the file. */
116 struct accel_mlx5_psv_wrapper {
117 	uint32_t psv_index;
118 	struct {
119 		uint32_t error : 1;
120 		uint32_t reserved : 31;
121 	} bits;
122 	/* mlx5 engine requires DMAable memory, use this member to copy user's crc value since we don't know which
123 	 * memory it is in */
124 	uint32_t crc;
125 	uint32_t crc_lkey;
126 };
127 
/* Module specific task, wraps the generic accel task */
128 struct accel_mlx5_task {
129 	struct spdk_accel_task base;
	/* Cursors over source and destination iovs of the base task */
130 	struct accel_mlx5_iov_sgl src;
131 	struct accel_mlx5_iov_sgl dst;
	/* qp which the task is executed on */
132 	struct accel_mlx5_qp *qp;
	/* Links the task into accel_mlx5_qp::in_hw or accel_mlx5_dev::nomem */
133 	STAILQ_ENTRY(accel_mlx5_task) link;
	/* Number of requests the task is split into. A crypto task may get an extra request when
	 * fragmented buffers made previous requests handle fewer blocks than planned */
134 	uint16_t num_reqs;
135 	uint16_t num_completed_reqs;
	/* Number of requests whose RDMA operation has been posted */
136 	uint16_t num_submitted_reqs;
137 	uint16_t num_ops; /* number of allocated mkeys or number of operations */
138 	uint16_t num_wrs; /* Number of outstanding operations which consume qp slot */
139 	union {
		/* State of a crypto task */
140 		struct {
141 			uint16_t blocks_per_req;
142 			uint16_t num_processed_blocks;
143 			uint16_t num_blocks;
144 		};
		/* psv is released when a CRC32C task fails - presumably the whole struct is the
		 * state of a CRC32C task, TODO confirm */
145 		struct {
146 			struct accel_mlx5_psv_wrapper *psv;
147 			uint32_t last_umr_len;
148 			uint8_t last_mkey_idx;
149 		};
150 	};
151 	union {
152 		uint8_t raw;
153 		struct {
			/* Set when a crypto task has no destination iovs or they are equal to source iovs */
154 			uint8_t inplace : 1;
			/* Copied to the enc_order attribute of a crypto UMR */
155 			uint8_t enc_order : 2;
			/* One of enum accel_mlx5_opcode */
156 			uint8_t mlx5_opcode: 5;
157 		};
158 	};
159 	/* Keep this array last since not all elements might be accessed, this reduces amount of data to be
160 	 * cached */
161 	struct spdk_mlx5_mkey_pool_obj *mkeys[ACCEL_MLX5_MAX_MKEYS_IN_TASK];
162 };
163 
/* Compile time check that an index of accel_mlx5_task::mkeys fits into uint8_t
 * (presumably because of accel_mlx5_task::last_mkey_idx - TODO confirm) */
164 SPDK_STATIC_ASSERT(ACCEL_MLX5_MAX_MKEYS_IN_TASK <= UINT8_MAX, "uint8_t is used to iterate mkeys");
165 
/* Queue pair wrapper, embedded into struct accel_mlx5_dev */
166 struct accel_mlx5_qp {
	/* qp which RDMA operations and UMRs are posted to */
167 	struct spdk_mlx5_qp *qp;
	/* Passed to memory domain translation as the rdma.ibv_qp of the translation context */
168 	struct ibv_qp *verbs_qp;
	/* Device which this qp belongs to */
169 	struct accel_mlx5_dev *dev;
170 	struct accel_mlx5_io_channel *ch;
171 	/* tasks submitted to HW. We can't complete a task even in error case until we reap completions for all
172 	 * submitted requests */
173 	STAILQ_HEAD(, accel_mlx5_task) in_hw;
	/* Number of WRs posted to the qp and not yet released, limited by wrs_max */
174 	uint16_t wrs_submitted;
175 	uint16_t wrs_max;
	/* NOTE(review): recovery state is handled outside of the reviewed part of the file */
176 	bool recovering;
177 	struct spdk_poller *recover_poller;
178 };
179 
/* Device as seen by one io channel (accel_mlx5_io_channel::devs): a qp, a cq and the pools
 * which tasks take resources from */
180 struct accel_mlx5_dev {
181 	struct accel_mlx5_qp qp;
182 	struct spdk_mlx5_cq *cq;
	/* Pool of mkeys used by crypto tasks */
183 	struct spdk_mlx5_mkey_pool *crypto_mkeys;
	/* Pool of mkeys released by failed CRC32C tasks */
184 	struct spdk_mlx5_mkey_pool *sig_mkeys;
	/* Used to get lkey of buffers which do not belong to a memory domain */
185 	struct spdk_rdma_utils_mem_map *mmap;
186 	struct accel_mlx5_dev_ctx *dev_ctx;
	/* Number of signaled WRs which occupy a cq slot, limited by wrs_in_cq_max */
187 	uint16_t wrs_in_cq;
188 	uint16_t wrs_in_cq_max;
	/* When not zero and crypto_multi_block is set, max number of blocks in one crypto request */
189 	uint16_t crypto_split_blocks;
	/* When not set, every crypto request handles exactly one block */
190 	bool crypto_multi_block;
191 	/* Pending tasks waiting for requests resources */
192 	STAILQ_HEAD(, accel_mlx5_task) nomem;
193 	TAILQ_ENTRY(accel_mlx5_dev) link;
194 	struct accel_mlx5_stats stats;
195 };
196 
/* Per io channel state.
 * NOTE(review): the channel is created and polled outside of the reviewed part of the file. */
197 struct accel_mlx5_io_channel {
198 	struct accel_mlx5_dev *devs;
199 	struct spdk_poller *poller;
200 	uint32_t num_devs;
201 	/* Index in \b devs to be used for operations in round-robin way */
202 	uint32_t dev_idx;
203 };
204 
/* Set of handlers of one kind of task. The signatures match the *_task_init, *_task_process,
 * *_task_continue and *_task_complete functions of this file.
 * NOTE(review): the table of handlers and its users are outside of the reviewed part. */
205 struct accel_mlx5_task_operations {
206 	int (*init)(struct accel_mlx5_task *task);
207 	int (*process)(struct accel_mlx5_task *task);
208 	int (*cont)(struct accel_mlx5_task *task);
209 	void (*complete)(struct accel_mlx5_task *task);
210 };
211 
/* Arguments of a PSV pool iteration callback.
 * NOTE(review): the callback itself is outside of the reviewed part - presumably rc carries its
 * result back to the caller, TODO confirm. */
212 struct accel_mlx5_psv_pool_iter_cb_args {
213 	struct accel_mlx5_dev_ctx *dev;
214 	struct spdk_rdma_utils_mem_map *map;
215 	int rc;
216 };
217 
/* Context of a statistics dump.
 * NOTE(review): the dump is implemented outside of the reviewed part - presumably cb is called
 * with ctx once the dump is done, TODO confirm. */
218 struct accel_mlx5_dump_stats_ctx {
219 	struct accel_mlx5_stats total;
220 	struct spdk_json_write_ctx *w;
221 	enum accel_mlx5_dump_state_level level;
222 	accel_mlx5_dump_stat_done_cb cb;
223 	void *ctx;
224 };
225 
/* Module instance; crypto keys are checked to belong to g_accel_mlx5.module */
226 static struct accel_mlx5_module g_accel_mlx5;
227 
228 static inline void
229 accel_mlx5_iov_sgl_init(struct accel_mlx5_iov_sgl *s, struct iovec *iov, uint32_t iovcnt)
230 {
231 	s->iov = iov;
232 	s->iovcnt = iovcnt;
233 	s->iov_offset = 0;
234 }
235 
236 static inline void
237 accel_mlx5_iov_sgl_advance(struct accel_mlx5_iov_sgl *s, uint32_t step)
238 {
239 	s->iov_offset += step;
240 	while (s->iovcnt > 0) {
241 		assert(s->iov != NULL);
242 		if (s->iov_offset < s->iov->iov_len) {
243 			break;
244 		}
245 
246 		s->iov_offset -= s->iov->iov_len;
247 		s->iov++;
248 		s->iovcnt--;
249 	}
250 }
251 
252 static inline void
253 accel_mlx5_iov_sgl_unwind(struct accel_mlx5_iov_sgl *s, uint32_t max_iovs, uint32_t step)
254 {
255 	SPDK_DEBUGLOG(accel_mlx5, "iov %p, iovcnt %u, max %u, offset %u, step %u\n", s->iov, s->iovcnt,
256 		      max_iovs, s->iov_offset, step);
257 	while (s->iovcnt <= max_iovs) {
258 		assert(s->iov != NULL);
259 		if (s->iov_offset >= step) {
260 			s->iov_offset -= step;
261 			SPDK_DEBUGLOG(accel_mlx5, "\tEND, iov %p, iovcnt %u, offset %u\n", s->iov, s->iovcnt,
262 				      s->iov_offset);
263 			return;
264 		}
265 		step -= s->iov_offset;
266 		s->iov--;
267 		s->iovcnt++;
268 		s->iov_offset = s->iov->iov_len;
269 		SPDK_DEBUGLOG(accel_mlx5, "\tiov %p, iovcnt %u, offset %u, step %u\n", s->iov, s->iovcnt,
270 			      s->iov_offset, step);
271 	}
272 
273 	SPDK_ERRLOG("Can't unwind iovs, remaining  %u\n", step);
274 	assert(0);
275 }
276 
277 static inline int
278 accel_mlx5_sge_unwind(struct ibv_sge *sge, uint32_t sge_count, uint32_t step)
279 {
280 	int i;
281 
282 	assert(sge_count > 0);
283 	SPDK_DEBUGLOG(accel_mlx5, "sge %p, count %u, step %u\n", sge, sge_count, step);
284 	for (i = (int)sge_count - 1; i >= 0; i--) {
285 		if (sge[i].length > step) {
286 			sge[i].length -= step;
287 			SPDK_DEBUGLOG(accel_mlx5, "\tsge[%u] len %u, step %u\n", i, sge[i].length, step);
288 			return (int)i + 1;
289 		}
290 		SPDK_DEBUGLOG(accel_mlx5, "\tsge[%u] len %u, step %u\n", i, sge[i].length, step);
291 		step -= sge[i].length;
292 	}
293 
294 	SPDK_ERRLOG("Can't unwind sge, remaining  %u\n", step);
295 	assert(step == 0);
296 
297 	return 0;
298 }
299 
300 static inline void
301 accel_mlx5_crypto_task_complete(struct accel_mlx5_task *task)
302 {
303 	struct accel_mlx5_dev *dev = task->qp->dev;
304 
305 	assert(task->num_ops);
306 	spdk_mlx5_mkey_pool_put_bulk(dev->crypto_mkeys, task->mkeys, task->num_ops);
307 	spdk_accel_task_complete(&task->base, 0);
308 }
309 
310 static inline void
311 accel_mlx5_task_fail(struct accel_mlx5_task *task, int rc)
312 {
313 	struct accel_mlx5_dev *dev = task->qp->dev;
314 
315 	assert(task->num_reqs == task->num_completed_reqs);
316 	SPDK_DEBUGLOG(accel_mlx5, "Fail task %p, opc %d, rc %d\n", task, task->base.op_code, rc);
317 
318 	if (task->num_ops) {
319 		if (task->mlx5_opcode == ACCEL_MLX5_OPC_CRYPTO) {
320 			spdk_mlx5_mkey_pool_put_bulk(dev->crypto_mkeys, task->mkeys, task->num_ops);
321 		}
322 		if (task->mlx5_opcode == ACCEL_MLX5_OPC_CRC32C) {
323 			spdk_mlx5_mkey_pool_put_bulk(dev->sig_mkeys, task->mkeys, task->num_ops);
324 			spdk_mempool_put(dev->dev_ctx->psv_pool, task->psv);
325 		}
326 	}
327 	spdk_accel_task_complete(&task->base, rc);
328 }
329 
330 static int
331 accel_mlx5_translate_addr(void *addr, size_t size, struct spdk_memory_domain *domain,
332 			  void *domain_ctx, struct accel_mlx5_dev *dev, struct ibv_sge *sge)
333 {
334 	struct spdk_rdma_utils_memory_translation map_translation;
335 	struct spdk_memory_domain_translation_result domain_translation;
336 	struct spdk_memory_domain_translation_ctx local_ctx;
337 	int rc;
338 
339 	if (domain) {
340 		domain_translation.size = sizeof(struct spdk_memory_domain_translation_result);
341 		local_ctx.size = sizeof(local_ctx);
342 		local_ctx.rdma.ibv_qp = dev->qp.verbs_qp;
343 		rc = spdk_memory_domain_translate_data(domain, domain_ctx, dev->dev_ctx->domain,
344 						       &local_ctx, addr, size, &domain_translation);
345 		if (spdk_unlikely(rc || domain_translation.iov_count != 1)) {
346 			SPDK_ERRLOG("Memory domain translation failed, addr %p, length %zu, iovcnt %u\n", addr, size,
347 				    domain_translation.iov_count);
348 			if (rc == 0) {
349 				rc = -EINVAL;
350 			}
351 
352 			return rc;
353 		}
354 		sge->lkey = domain_translation.rdma.lkey;
355 		sge->addr = (uint64_t) domain_translation.iov.iov_base;
356 		sge->length = domain_translation.iov.iov_len;
357 	} else {
358 		rc = spdk_rdma_utils_get_translation(dev->mmap, addr, size,
359 						     &map_translation);
360 		if (spdk_unlikely(rc)) {
361 			SPDK_ERRLOG("Memory translation failed, addr %p, length %zu\n", addr, size);
362 			return rc;
363 		}
364 		sge->lkey = spdk_rdma_utils_memory_translation_get_lkey(&map_translation);
365 		sge->addr = (uint64_t)addr;
366 		sge->length = size;
367 	}
368 
369 	return 0;
370 }
371 
372 static inline int
373 accel_mlx5_fill_block_sge(struct accel_mlx5_dev *dev, struct ibv_sge *sge,
374 			  struct accel_mlx5_iov_sgl *iovs, uint32_t len, uint32_t *_remaining,
375 			  struct spdk_memory_domain *domain, void *domain_ctx)
376 {
377 	void *addr;
378 	uint32_t remaining = len;
379 	uint32_t size;
380 	int i = 0;
381 	int rc;
382 
383 	while (remaining && i < (int)ACCEL_MLX5_MAX_SGE) {
384 		size = spdk_min(remaining, iovs->iov->iov_len - iovs->iov_offset);
385 		addr = (void *)iovs->iov->iov_base + iovs->iov_offset;
386 		rc = accel_mlx5_translate_addr(addr, size, domain, domain_ctx, dev, &sge[i]);
387 		if (spdk_unlikely(rc)) {
388 			return rc;
389 		}
390 		SPDK_DEBUGLOG(accel_mlx5, "\t sge[%d]: lkey %u, len %u, addr %"PRIx64"\n", i, sge[i].lkey,
391 			      sge[i].length, sge[i].addr);
392 		accel_mlx5_iov_sgl_advance(iovs, size);
393 		i++;
394 		assert(remaining >= size);
395 		remaining -= size;
396 	}
397 	*_remaining = remaining;
398 
399 	return i;
400 }
401 
/* Check if the first \b iovcnt elements of \b v1 and \b v2 describe the same buffers,
 * i.e. base addresses and lengths are equal element by element */
static inline bool
accel_mlx5_compare_iovs(struct iovec *v1, struct iovec *v2, uint32_t iovcnt)
{
	const size_t nbytes = sizeof(*v1) * iovcnt;

	return !memcmp(v1, v2, nbytes);
}
407 
408 static inline uint16_t
409 accel_mlx5_dev_get_available_slots(struct accel_mlx5_dev *dev, struct accel_mlx5_qp *qp)
410 {
411 	assert(qp->wrs_max >= qp->wrs_submitted);
412 	assert(dev->wrs_in_cq_max >= dev->wrs_in_cq);
413 
414 	/* Each time we produce only 1 CQE, so we need 1 CQ slot */
415 	if (spdk_unlikely(dev->wrs_in_cq == dev->wrs_in_cq_max)) {
416 		return 0;
417 	}
418 
419 	return qp->wrs_max - qp->wrs_submitted;
420 }
421 
422 static inline uint32_t
423 accel_mlx5_task_alloc_mkeys(struct accel_mlx5_task *task, struct spdk_mlx5_mkey_pool *pool)
424 {
425 	uint32_t num_ops;
426 	int rc;
427 
428 	assert(task->num_reqs > task->num_completed_reqs);
429 	num_ops = task->num_reqs - task->num_completed_reqs;
430 	num_ops = spdk_min(num_ops, ACCEL_MLX5_MAX_MKEYS_IN_TASK);
431 	if (!num_ops) {
432 		return 0;
433 	}
434 	rc = spdk_mlx5_mkey_pool_get_bulk(pool, task->mkeys, num_ops);
435 	if (spdk_unlikely(rc)) {
436 		return 0;
437 	}
438 	assert(num_ops <= UINT16_MAX);
439 	task->num_ops = num_ops;
440 
441 	return num_ops;
442 }
443 
444 static inline uint8_t
445 bs_to_bs_selector(uint32_t bs)
446 {
447 	switch (bs) {
448 	case 512:
449 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_512;
450 	case 520:
451 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_520;
452 	case 4096:
453 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_4096;
454 	case 4160:
455 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_4160;
456 	default:
457 		return SPDK_MLX5_BLOCK_SIZE_SELECTOR_RESERVED;
458 	}
459 }
460 
461 static inline int
462 accel_mlx5_configure_crypto_umr(struct accel_mlx5_task *mlx5_task, struct accel_mlx5_sge *sge,
463 				uint32_t mkey, uint32_t num_blocks, struct spdk_mlx5_crypto_dek_data *dek_data)
464 {
465 	struct spdk_mlx5_umr_crypto_attr cattr;
466 	struct spdk_mlx5_umr_attr umr_attr;
467 	struct accel_mlx5_qp *qp = mlx5_task->qp;
468 	struct accel_mlx5_dev *dev = qp->dev;
469 	struct spdk_accel_task *task = &mlx5_task->base;
470 	uint32_t length, remaining = 0, block_size = task->block_size;
471 	int rc;
472 
473 	length = num_blocks * block_size;
474 	SPDK_DEBUGLOG(accel_mlx5, "task %p, domain %p, len %u, blocks %u\n", task, task->src_domain, length,
475 		      num_blocks);
476 	rc = accel_mlx5_fill_block_sge(dev, sge->src_sge, &mlx5_task->src,  length, &remaining,
477 				       task->src_domain, task->src_domain_ctx);
478 	if (spdk_unlikely(rc <= 0)) {
479 		if (rc == 0) {
480 			rc = -EINVAL;
481 		}
482 		SPDK_ERRLOG("failed set src sge, rc %d\n", rc);
483 		return rc;
484 	}
485 	sge->src_sge_count = rc;
486 	if (spdk_unlikely(remaining)) {
487 		uint32_t new_len = length - remaining;
488 		uint32_t aligned_len, updated_num_blocks;
489 
490 		SPDK_DEBUGLOG(accel_mlx5, "Incorrect src iovs, handled %u out of %u bytes\n", new_len, length);
491 		if (new_len < block_size) {
492 			/* We need to process at least 1 block. If buffer is too fragmented, we can't do
493 			 * anything */
494 			return -ERANGE;
495 		}
496 
497 		/* Regular integer division, we need to round down to prev block size */
498 		updated_num_blocks = new_len / block_size;
499 		assert(updated_num_blocks);
500 		assert(updated_num_blocks < num_blocks);
501 		aligned_len = updated_num_blocks * block_size;
502 
503 		if (aligned_len < new_len) {
504 			uint32_t dt = new_len - aligned_len;
505 
506 			/* We can't process part of block, need to unwind src iov_sgl and sge to the
507 			 * prev block boundary */
508 			SPDK_DEBUGLOG(accel_mlx5, "task %p, unwind src sge for %u bytes\n", task, dt);
509 			accel_mlx5_iov_sgl_unwind(&mlx5_task->src, task->s.iovcnt, dt);
510 			sge->src_sge_count = accel_mlx5_sge_unwind(sge->src_sge, sge->src_sge_count, dt);
511 			if (!sge->src_sge_count) {
512 				return -ERANGE;
513 			}
514 		}
515 		SPDK_DEBUGLOG(accel_mlx5, "task %p, UMR len %u -> %u\n", task, length, aligned_len);
516 		length = aligned_len;
517 		num_blocks = updated_num_blocks;
518 	}
519 
520 	cattr.xts_iv = task->iv + mlx5_task->num_processed_blocks;
521 	cattr.keytag = 0;
522 	cattr.dek_obj_id = dek_data->dek_obj_id;
523 	cattr.tweak_mode = dek_data->tweak_mode;
524 	cattr.enc_order = mlx5_task->enc_order;
525 	cattr.bs_selector = bs_to_bs_selector(mlx5_task->base.block_size);
526 	if (spdk_unlikely(cattr.bs_selector == SPDK_MLX5_BLOCK_SIZE_SELECTOR_RESERVED)) {
527 		SPDK_ERRLOG("unsupported block size %u\n", mlx5_task->base.block_size);
528 		return -EINVAL;
529 	}
530 	umr_attr.mkey = mkey;
531 	umr_attr.sge = sge->src_sge;
532 
533 	if (!mlx5_task->inplace) {
534 		SPDK_DEBUGLOG(accel_mlx5, "task %p, dst sge, domain %p, len %u\n", task, task->dst_domain, length);
535 		rc = accel_mlx5_fill_block_sge(dev, sge->dst_sge, &mlx5_task->dst, length, &remaining,
536 					       task->dst_domain, task->dst_domain_ctx);
537 		if (spdk_unlikely(rc <= 0)) {
538 			if (rc == 0) {
539 				rc = -EINVAL;
540 			}
541 			SPDK_ERRLOG("failed set dst sge, rc %d\n", rc);
542 			return rc;
543 		}
544 		sge->dst_sge_count = rc;
545 		if (spdk_unlikely(remaining)) {
546 			uint32_t new_len = length - remaining;
547 			uint32_t aligned_len, updated_num_blocks, dt;
548 
549 			SPDK_DEBUGLOG(accel_mlx5, "Incorrect dst iovs, handled %u out of %u bytes\n", new_len, length);
550 			if (new_len < block_size) {
551 				/* We need to process at least 1 block. If buffer is too fragmented, we can't do
552 				 * anything */
553 				return -ERANGE;
554 			}
555 
556 			/* Regular integer division, we need to round down to prev block size */
557 			updated_num_blocks = new_len / block_size;
558 			assert(updated_num_blocks);
559 			assert(updated_num_blocks < num_blocks);
560 			aligned_len = updated_num_blocks * block_size;
561 
562 			if (aligned_len < new_len) {
563 				dt = new_len - aligned_len;
564 				assert(dt > 0 && dt < length);
565 				/* We can't process part of block, need to unwind src and dst iov_sgl and sge to the
566 				 * prev block boundary */
567 				SPDK_DEBUGLOG(accel_mlx5, "task %p, unwind dst sge for %u bytes\n", task, dt);
568 				accel_mlx5_iov_sgl_unwind(&mlx5_task->dst, task->d.iovcnt, dt);
569 				sge->dst_sge_count = accel_mlx5_sge_unwind(sge->dst_sge, sge->dst_sge_count, dt);
570 				assert(sge->dst_sge_count > 0 && sge->dst_sge_count <= ACCEL_MLX5_MAX_SGE);
571 				if (!sge->dst_sge_count) {
572 					return -ERANGE;
573 				}
574 			}
575 			assert(length > aligned_len);
576 			dt = length - aligned_len;
577 			SPDK_DEBUGLOG(accel_mlx5, "task %p, unwind src sge for %u bytes\n", task, dt);
578 			/* The same for src iov_sgl and sge. In worst case we can unwind SRC 2 times */
579 			accel_mlx5_iov_sgl_unwind(&mlx5_task->src, task->s.iovcnt, dt);
580 			sge->src_sge_count = accel_mlx5_sge_unwind(sge->src_sge, sge->src_sge_count, dt);
581 			assert(sge->src_sge_count > 0 && sge->src_sge_count <= ACCEL_MLX5_MAX_SGE);
582 			if (!sge->src_sge_count) {
583 				return -ERANGE;
584 			}
585 			SPDK_DEBUGLOG(accel_mlx5, "task %p, UMR len %u -> %u\n", task, length, aligned_len);
586 			length = aligned_len;
587 			num_blocks = updated_num_blocks;
588 		}
589 	}
590 
591 	SPDK_DEBUGLOG(accel_mlx5,
592 		      "task %p: bs %u, iv %"PRIu64", enc_on_tx %d, tweak_mode %d, len %u, mkey %x, blocks %u\n",
593 		      mlx5_task, task->block_size, cattr.xts_iv, mlx5_task->enc_order, cattr.tweak_mode, length, mkey,
594 		      num_blocks);
595 
596 	umr_attr.sge_count = sge->src_sge_count;
597 	umr_attr.umr_len = length;
598 	assert((uint32_t)mlx5_task->num_processed_blocks + num_blocks <= UINT16_MAX);
599 	mlx5_task->num_processed_blocks += num_blocks;
600 
601 	rc = spdk_mlx5_umr_configure_crypto(qp->qp, &umr_attr, &cattr, 0, 0);
602 
603 	return rc;
604 }
605 
606 static inline int
607 accel_mlx5_crypto_task_process(struct accel_mlx5_task *mlx5_task)
608 {
609 	struct accel_mlx5_sge sges[ACCEL_MLX5_MAX_MKEYS_IN_TASK];
610 	struct spdk_mlx5_crypto_dek_data dek_data;
611 	struct accel_mlx5_qp *qp = mlx5_task->qp;
612 	struct accel_mlx5_dev *dev = qp->dev;
613 	/* First RDMA after UMR must have a SMALL_FENCE */
614 	uint32_t first_rdma_fence = SPDK_MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
615 	uint16_t num_blocks;
616 	uint16_t num_ops = spdk_min(mlx5_task->num_reqs - mlx5_task->num_completed_reqs,
617 				    mlx5_task->num_ops);
618 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
619 	uint16_t i;
620 	int rc;
621 
622 	assert(qp_slot > 1);
623 	num_ops = spdk_min(num_ops, qp_slot >> 1);
624 	if (spdk_unlikely(!num_ops)) {
625 		return -EINVAL;
626 	}
627 
628 	rc = spdk_mlx5_crypto_get_dek_data(mlx5_task->base.crypto_key->priv, dev->dev_ctx->pd, &dek_data);
629 	if (spdk_unlikely(rc)) {
630 		return rc;
631 	}
632 
633 	mlx5_task->num_wrs = 0;
634 	SPDK_DEBUGLOG(accel_mlx5, "begin, task, %p, reqs: total %u, submitted %u, completed %u\n",
635 		      mlx5_task, mlx5_task->num_reqs, mlx5_task->num_submitted_reqs, mlx5_task->num_completed_reqs);
636 	for (i = 0; i < num_ops; i++) {
637 		if (mlx5_task->num_submitted_reqs + i + 1 == mlx5_task->num_reqs) {
638 			/* Last request may consume less than calculated if crypto_multi_block is true */
639 			assert(mlx5_task->num_blocks > mlx5_task->num_submitted_reqs);
640 			num_blocks = mlx5_task->num_blocks - mlx5_task->num_processed_blocks;
641 		} else {
642 			num_blocks = mlx5_task->blocks_per_req;
643 		}
644 
645 		rc = accel_mlx5_configure_crypto_umr(mlx5_task, &sges[i], mlx5_task->mkeys[i]->mkey, num_blocks,
646 						     &dek_data);
647 		if (spdk_unlikely(rc)) {
648 			SPDK_ERRLOG("UMR configure failed with %d\n", rc);
649 			return rc;
650 		}
651 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
652 		dev->stats.crypto_umrs++;
653 	}
654 
655 	/* Loop `num_ops - 1` for easy flags handling */
656 	for (i = 0; i < num_ops - 1; i++) {
657 		/* UMR is used as a destination for RDMA_READ - from UMR to sge */
658 		if (mlx5_task->inplace) {
659 			rc = spdk_mlx5_qp_rdma_read(qp->qp, sges[i].src_sge, sges[i].src_sge_count, 0,
660 						    mlx5_task->mkeys[i]->mkey, 0, first_rdma_fence);
661 		} else {
662 			rc = spdk_mlx5_qp_rdma_read(qp->qp, sges[i].dst_sge, sges[i].dst_sge_count, 0,
663 						    mlx5_task->mkeys[i]->mkey, 0, first_rdma_fence);
664 		}
665 		if (spdk_unlikely(rc)) {
666 			SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
667 			return rc;
668 		}
669 
670 		first_rdma_fence = 0;
671 		assert(mlx5_task->num_submitted_reqs < mlx5_task->num_reqs);
672 		assert(mlx5_task->num_submitted_reqs < UINT16_MAX);
673 		mlx5_task->num_submitted_reqs++;
674 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
675 		dev->stats.rdma_reads++;
676 	}
677 
678 	if (mlx5_task->inplace) {
679 		rc = spdk_mlx5_qp_rdma_read(qp->qp, sges[i].src_sge, sges[i].src_sge_count, 0,
680 					    mlx5_task->mkeys[i]->mkey, (uint64_t)mlx5_task, first_rdma_fence | SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
681 	} else {
682 		rc = spdk_mlx5_qp_rdma_read(qp->qp, sges[i].dst_sge, sges[i].dst_sge_count, 0,
683 					    mlx5_task->mkeys[i]->mkey, (uint64_t)mlx5_task, first_rdma_fence | SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
684 	}
685 	if (spdk_unlikely(rc)) {
686 		SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
687 		return rc;
688 	}
689 
690 	assert(mlx5_task->num_submitted_reqs < mlx5_task->num_reqs);
691 	assert(mlx5_task->num_submitted_reqs < UINT16_MAX);
692 	mlx5_task->num_submitted_reqs++;
693 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, mlx5_task);
694 	dev->stats.rdma_reads++;
695 	STAILQ_INSERT_TAIL(&qp->in_hw, mlx5_task, link);
696 
697 	if (spdk_unlikely(mlx5_task->num_submitted_reqs == mlx5_task->num_reqs &&
698 			  mlx5_task->num_blocks > mlx5_task->num_processed_blocks)) {
699 		/* We hit "out of sge
700 		 * entries" case with highly fragmented payload. In that case
701 		 * accel_mlx5_configure_crypto_umr function handled fewer data blocks than expected
702 		 * That means we need at least 1 more request to complete this task, this request will be
703 		 * executed once all submitted ones are completed */
704 		SPDK_DEBUGLOG(accel_mlx5, "task %p, processed %u/%u blocks, add extra req\n", mlx5_task,
705 			      mlx5_task->num_processed_blocks, mlx5_task->num_blocks);
706 		mlx5_task->num_reqs++;
707 	}
708 
709 	SPDK_DEBUGLOG(accel_mlx5, "end, task, %p, reqs: total %u, submitted %u, completed %u\n", mlx5_task,
710 		      mlx5_task->num_reqs, mlx5_task->num_submitted_reqs, mlx5_task->num_completed_reqs);
711 
712 	return 0;
713 }
714 
715 static inline int
716 accel_mlx5_crypto_task_continue(struct accel_mlx5_task *task)
717 {
718 	struct accel_mlx5_qp *qp = task->qp;
719 	struct accel_mlx5_dev *dev = qp->dev;
720 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
721 
722 	assert(task->num_reqs > task->num_completed_reqs);
723 	if (task->num_ops == 0) {
724 		/* No mkeys allocated, try to allocate now */
725 		if (spdk_unlikely(!accel_mlx5_task_alloc_mkeys(task, dev->crypto_mkeys))) {
726 			/* Pool is empty, queue this task */
727 			STAILQ_INSERT_TAIL(&dev->nomem, task, link);
728 			dev->stats.nomem_mkey++;
729 			return -ENOMEM;
730 		}
731 	}
732 	/* We need to post at least 1 UMR and 1 RDMA operation */
733 	if (spdk_unlikely(qp_slot < 2)) {
734 		/* QP is full, queue this task */
735 		STAILQ_INSERT_TAIL(&dev->nomem, task, link);
736 		task->qp->dev->stats.nomem_qdepth++;
737 		return -ENOMEM;
738 	}
739 
740 	return accel_mlx5_crypto_task_process(task);
741 }
742 
743 static inline int
744 accel_mlx5_crypto_task_init(struct accel_mlx5_task *mlx5_task)
745 {
746 	struct spdk_accel_task *task = &mlx5_task->base;
747 	struct accel_mlx5_dev *dev = mlx5_task->qp->dev;
748 	uint64_t src_nbytes = task->nbytes;
749 #ifdef DEBUG
750 	uint64_t dst_nbytes;
751 	uint32_t i;
752 #endif
753 	bool crypto_key_ok;
754 
755 	crypto_key_ok = (task->crypto_key && task->crypto_key->module_if == &g_accel_mlx5.module &&
756 			 task->crypto_key->priv);
757 	if (spdk_unlikely((task->nbytes % mlx5_task->base.block_size != 0) || !crypto_key_ok)) {
758 		if (crypto_key_ok) {
759 			SPDK_ERRLOG("src length %"PRIu64" is not a multiple of the block size %u\n", task->nbytes,
760 				    mlx5_task->base.block_size);
761 		} else {
762 			SPDK_ERRLOG("Wrong crypto key provided\n");
763 		}
764 		return -EINVAL;
765 	}
766 
767 	assert(src_nbytes / mlx5_task->base.block_size <= UINT16_MAX);
768 	mlx5_task->num_blocks = src_nbytes / mlx5_task->base.block_size;
769 	accel_mlx5_iov_sgl_init(&mlx5_task->src, task->s.iovs, task->s.iovcnt);
770 	if (task->d.iovcnt == 0 || (task->d.iovcnt == task->s.iovcnt &&
771 				    accel_mlx5_compare_iovs(task->d.iovs, task->s.iovs, task->s.iovcnt))) {
772 		mlx5_task->inplace = 1;
773 	} else {
774 #ifdef DEBUG
775 		dst_nbytes = 0;
776 		for (i = 0; i < task->d.iovcnt; i++) {
777 			dst_nbytes += task->d.iovs[i].iov_len;
778 		}
779 
780 		if (spdk_unlikely(src_nbytes != dst_nbytes)) {
781 			return -EINVAL;
782 		}
783 #endif
784 		mlx5_task->inplace = 0;
785 		accel_mlx5_iov_sgl_init(&mlx5_task->dst, task->d.iovs, task->d.iovcnt);
786 	}
787 
788 	if (dev->crypto_multi_block) {
789 		if (dev->crypto_split_blocks) {
790 			assert(SPDK_CEIL_DIV(mlx5_task->num_blocks, dev->crypto_split_blocks) <= UINT16_MAX);
791 			mlx5_task->num_reqs = SPDK_CEIL_DIV(mlx5_task->num_blocks, dev->crypto_split_blocks);
792 			/* Last req may consume less blocks */
793 			mlx5_task->blocks_per_req = spdk_min(mlx5_task->num_blocks, dev->crypto_split_blocks);
794 		} else {
795 			if (task->s.iovcnt > ACCEL_MLX5_MAX_SGE || task->d.iovcnt > ACCEL_MLX5_MAX_SGE) {
796 				uint32_t max_sge_count = spdk_max(task->s.iovcnt, task->d.iovcnt);
797 
798 				assert(SPDK_CEIL_DIV(max_sge_count, ACCEL_MLX5_MAX_SGE) <= UINT16_MAX);
799 				mlx5_task->num_reqs = SPDK_CEIL_DIV(max_sge_count, ACCEL_MLX5_MAX_SGE);
800 				mlx5_task->blocks_per_req = SPDK_CEIL_DIV(mlx5_task->num_blocks, mlx5_task->num_reqs);
801 			} else {
802 				mlx5_task->num_reqs = 1;
803 				mlx5_task->blocks_per_req = mlx5_task->num_blocks;
804 			}
805 		}
806 	} else {
807 		mlx5_task->num_reqs = mlx5_task->num_blocks;
808 		mlx5_task->blocks_per_req = 1;
809 	}
810 
811 	if (spdk_unlikely(!accel_mlx5_task_alloc_mkeys(mlx5_task, dev->crypto_mkeys))) {
812 		/* Pool is empty, queue this task */
813 		SPDK_DEBUGLOG(accel_mlx5, "no reqs in pool, dev %s\n", dev->dev_ctx->context->device->name);
814 		dev->stats.nomem_mkey++;
815 		return -ENOMEM;
816 	}
817 	if (spdk_unlikely(accel_mlx5_dev_get_available_slots(dev, &dev->qp) < 2)) {
818 		/* Queue is full, queue this task */
819 		SPDK_DEBUGLOG(accel_mlx5, "dev %s qp %p is full\n", dev->dev_ctx->context->device->name,
820 			      mlx5_task->qp);
821 		dev->stats.nomem_qdepth++;
822 		return -ENOMEM;
823 	}
824 
825 	SPDK_DEBUGLOG(accel_mlx5, "task %p, src_iovs %u, dst_iovs %u, num_reqs %u, "
826 		      "blocks/req %u, blocks %u, inplace %d\n", task, task->s.iovcnt, task->d.iovcnt,
827 		      mlx5_task->num_reqs, mlx5_task->blocks_per_req, mlx5_task->num_blocks, mlx5_task->inplace);
828 
829 	return 0;
830 }
831 
832 static inline void
833 accel_mlx5_copy_task_complete(struct accel_mlx5_task *mlx5_task)
834 {
835 	spdk_accel_task_complete(&mlx5_task->base, 0);
836 }
837 
838 static inline int
839 accel_mlx5_copy_task_process_one(struct accel_mlx5_task *mlx5_task, struct accel_mlx5_qp *qp,
840 				 uint64_t wrid, uint32_t fence)
841 {
842 	struct spdk_accel_task *task = &mlx5_task->base;
843 	struct accel_mlx5_sge sge;
844 	uint32_t remaining = 0;
845 	uint32_t dst_len;
846 	int rc;
847 
848 	/* Limit one RDMA_WRITE by length of dst buffer. Not all src buffers may fit into one dst buffer due to
849 	 * limitation on ACCEL_MLX5_MAX_SGE. If this is the case then remaining is not zero */
850 	assert(mlx5_task->dst.iov->iov_len > mlx5_task->dst.iov_offset);
851 	dst_len = mlx5_task->dst.iov->iov_len - mlx5_task->dst.iov_offset;
852 	rc = accel_mlx5_fill_block_sge(qp->dev, sge.src_sge, &mlx5_task->src, dst_len, &remaining,
853 				       task->src_domain, task->src_domain_ctx);
854 	if (spdk_unlikely(rc <= 0)) {
855 		if (rc == 0) {
856 			rc = -EINVAL;
857 		}
858 		SPDK_ERRLOG("failed set src sge, rc %d\n", rc);
859 		return rc;
860 	}
861 	sge.src_sge_count = rc;
862 	assert(dst_len > remaining);
863 	dst_len -= remaining;
864 
865 	rc = accel_mlx5_fill_block_sge(qp->dev, sge.dst_sge, &mlx5_task->dst, dst_len,  &remaining,
866 				       task->dst_domain, task->dst_domain_ctx);
867 	if (spdk_unlikely(rc != 1)) {
868 		/* We use single dst entry, any result other than 1 is an error */
869 		if (rc == 0) {
870 			rc = -EINVAL;
871 		}
872 		SPDK_ERRLOG("failed set dst sge, rc %d\n", rc);
873 		return rc;
874 	}
875 	if (spdk_unlikely(remaining)) {
876 		SPDK_ERRLOG("Incorrect dst length, remaining %u\n", remaining);
877 		assert(0);
878 		return -EINVAL;
879 	}
880 
881 	rc = spdk_mlx5_qp_rdma_write(mlx5_task->qp->qp, sge.src_sge, sge.src_sge_count,
882 				     sge.dst_sge[0].addr, sge.dst_sge[0].lkey, wrid, fence);
883 	if (spdk_unlikely(rc)) {
884 		SPDK_ERRLOG("new RDMA WRITE failed with %d\n", rc);
885 		return rc;
886 	}
887 	qp->dev->stats.rdma_writes++;
888 
889 	return 0;
890 }
891 
892 static inline int
893 accel_mlx5_copy_task_process(struct accel_mlx5_task *mlx5_task)
894 {
895 
896 	struct accel_mlx5_qp *qp = mlx5_task->qp;
897 	struct accel_mlx5_dev *dev = qp->dev;
898 	uint16_t i;
899 	int rc;
900 
901 	mlx5_task->num_wrs = 0;
902 	assert(mlx5_task->num_reqs > 0);
903 	assert(mlx5_task->num_ops > 0);
904 
905 	/* Handle n-1 reqs in order to simplify wrid and fence handling */
906 	for (i = 0; i < mlx5_task->num_ops - 1; i++) {
907 		rc = accel_mlx5_copy_task_process_one(mlx5_task, qp, 0, 0);
908 		if (spdk_unlikely(rc)) {
909 			return rc;
910 		}
911 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
912 		mlx5_task->num_submitted_reqs++;
913 	}
914 
915 	rc = accel_mlx5_copy_task_process_one(mlx5_task, qp, (uint64_t)mlx5_task,
916 					      SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
917 	if (spdk_unlikely(rc)) {
918 		return rc;
919 	}
920 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, mlx5_task);
921 	mlx5_task->num_submitted_reqs++;
922 	STAILQ_INSERT_TAIL(&qp->in_hw, mlx5_task, link);
923 
924 	SPDK_DEBUGLOG(accel_mlx5, "end, copy task, %p\n", mlx5_task);
925 
926 	return 0;
927 }
928 
929 static inline int
930 accel_mlx5_copy_task_continue(struct accel_mlx5_task *task)
931 {
932 	struct accel_mlx5_qp *qp = task->qp;
933 	struct accel_mlx5_dev *dev = qp->dev;
934 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
935 
936 	task->num_ops = spdk_min(qp_slot, task->num_reqs - task->num_completed_reqs);
937 	if (spdk_unlikely(task->num_ops == 0)) {
938 		STAILQ_INSERT_TAIL(&dev->nomem, task, link);
939 		dev->stats.nomem_qdepth++;
940 		return -ENOMEM;
941 	}
942 	return accel_mlx5_copy_task_process(task);
943 }
944 
945 static inline uint32_t
946 accel_mlx5_get_copy_task_count(struct iovec *src_iov, uint32_t src_iovcnt,
947 			       struct iovec *dst_iov, uint32_t dst_iovcnt)
948 {
949 	uint32_t src = 0;
950 	uint32_t dst = 0;
951 	uint64_t src_offset = 0;
952 	uint64_t dst_offset = 0;
953 	uint32_t num_ops = 0;
954 	uint32_t src_sge_count = 0;
955 
956 	while (src < src_iovcnt && dst < dst_iovcnt) {
957 		uint64_t src_len = src_iov[src].iov_len - src_offset;
958 		uint64_t dst_len = dst_iov[dst].iov_len - dst_offset;
959 
960 		if (dst_len < src_len) {
961 			dst_offset = 0;
962 			src_offset += dst_len;
963 			dst++;
964 			num_ops++;
965 			src_sge_count = 0;
966 		} else if (src_len < dst_len) {
967 			dst_offset += src_len;
968 			src_offset = 0;
969 			src++;
970 			if (++src_sge_count >= ACCEL_MLX5_MAX_SGE) {
971 				num_ops++;
972 				src_sge_count = 0;
973 			}
974 		} else {
975 			dst_offset = 0;
976 			src_offset = 0;
977 			dst++;
978 			src++;
979 			num_ops++;
980 			src_sge_count = 0;
981 		}
982 	}
983 
984 	assert(src == src_iovcnt);
985 	assert(dst == dst_iovcnt);
986 	assert(src_offset == 0);
987 	assert(dst_offset == 0);
988 	return num_ops;
989 }
990 
991 static inline int
992 accel_mlx5_copy_task_init(struct accel_mlx5_task *mlx5_task)
993 {
994 	struct spdk_accel_task *task = &mlx5_task->base;
995 	struct accel_mlx5_qp *qp = mlx5_task->qp;
996 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(qp->dev, qp);
997 
998 	if (spdk_likely(task->s.iovcnt <= ACCEL_MLX5_MAX_SGE)) {
999 		mlx5_task->num_reqs = task->d.iovcnt;
1000 	} else if (task->d.iovcnt == 1) {
1001 		mlx5_task->num_reqs = SPDK_CEIL_DIV(task->s.iovcnt, ACCEL_MLX5_MAX_SGE);
1002 	} else {
1003 		mlx5_task->num_reqs = accel_mlx5_get_copy_task_count(task->s.iovs, task->s.iovcnt,
1004 				      task->d.iovs, task->d.iovcnt);
1005 	}
1006 	mlx5_task->inplace = 0;
1007 	accel_mlx5_iov_sgl_init(&mlx5_task->src, task->s.iovs, task->s.iovcnt);
1008 	accel_mlx5_iov_sgl_init(&mlx5_task->dst, task->d.iovs, task->d.iovcnt);
1009 	mlx5_task->num_ops = spdk_min(qp_slot, mlx5_task->num_reqs);
1010 	if (spdk_unlikely(!mlx5_task->num_ops)) {
1011 		qp->dev->stats.nomem_qdepth++;
1012 		return -ENOMEM;
1013 	}
1014 	SPDK_DEBUGLOG(accel_mlx5, "copy task num_reqs %u, num_ops %u\n", mlx5_task->num_reqs,
1015 		      mlx5_task->num_ops);
1016 
1017 	return 0;
1018 }
1019 
/*
 * Consume up to *len bytes from an iovec array, starting *iov_offset bytes into iov[0].
 *
 * iov         array to walk
 * iovcnt      maximum number of entries that may be visited
 * iov_offset  in: offset inside iov[0]; out: offset inside the entry where the walk stopped
 *             (0 when the walk stopped on an entry boundary)
 * len         in: number of bytes to consume; out: number of bytes that could not be consumed
 *             within iovcnt entries (0 when everything was consumed)
 *
 * Returns the number of entries that were consumed completely.
 */
static inline uint32_t
accel_mlx5_advance_iovec(struct iovec *iov, uint32_t iovcnt, size_t *iov_offset, size_t *len)
{
	uint32_t consumed = 0;

	while (*len != 0 && consumed < iovcnt) {
		const size_t avail = iov[consumed].iov_len - *iov_offset;

		if (avail > *len) {
			/* The request ends inside this entry, which stays partially used. */
			*iov_offset += *len;
			*len = 0;
			break;
		}

		/* The whole entry is used up, continue from the start of the next one. */
		*iov_offset = 0;
		*len -= avail;
		consumed++;
	}

	return consumed;
}
1046 
1047 static inline void
1048 accel_mlx5_crc_task_complete(struct accel_mlx5_task *mlx5_task)
1049 {
1050 	struct accel_mlx5_dev *dev = mlx5_task->qp->dev;
1051 
1052 	*mlx5_task->base.crc_dst = mlx5_task->psv->crc ^ UINT32_MAX;
1053 	/* Normal task completion without allocated mkeys is not possible */
1054 	assert(mlx5_task->num_ops);
1055 	spdk_mlx5_mkey_pool_put_bulk(dev->sig_mkeys, mlx5_task->mkeys, mlx5_task->num_ops);
1056 	spdk_mempool_put(dev->dev_ctx->psv_pool, mlx5_task->psv);
1057 	spdk_accel_task_complete(&mlx5_task->base, 0);
1058 }
1059 
1060 static inline int
1061 accel_mlx5_crc_task_configure_umr(struct accel_mlx5_task *mlx5_task, struct ibv_sge *sge,
1062 				  uint32_t sge_count, struct spdk_mlx5_mkey_pool_obj *mkey,
1063 				  enum spdk_mlx5_umr_sig_domain sig_domain, uint32_t umr_len,
1064 				  bool sig_init, bool sig_check_gen)
1065 {
1066 	struct spdk_mlx5_umr_sig_attr sattr = {
1067 		.seed = mlx5_task->base.seed ^ UINT32_MAX,
1068 		.psv_index = mlx5_task->psv->psv_index,
1069 		.domain = sig_domain,
1070 		.sigerr_count = mkey->sig.sigerr_count,
1071 		.raw_data_size = umr_len,
1072 		.init = sig_init,
1073 		.check_gen = sig_check_gen,
1074 	};
1075 	struct spdk_mlx5_umr_attr umr_attr = {
1076 		.mkey = mkey->mkey,
1077 		.umr_len = umr_len,
1078 		.sge_count = sge_count,
1079 		.sge = sge,
1080 	};
1081 
1082 	return spdk_mlx5_umr_configure_sig(mlx5_task->qp->qp, &umr_attr, &sattr, 0, 0);
1083 }
1084 
1085 static inline int
1086 accel_mlx5_crc_task_fill_sge(struct accel_mlx5_task *mlx5_task, struct accel_mlx5_sge *sge)
1087 {
1088 	struct spdk_accel_task *task = &mlx5_task->base;
1089 	struct accel_mlx5_qp *qp = mlx5_task->qp;
1090 	struct accel_mlx5_dev *dev = qp->dev;
1091 	uint32_t remaining;
1092 	int rc;
1093 
1094 	rc = accel_mlx5_fill_block_sge(dev, sge->src_sge, &mlx5_task->src, task->nbytes, &remaining,
1095 				       task->src_domain, task->src_domain_ctx);
1096 	if (spdk_unlikely(rc <= 0)) {
1097 		if (rc == 0) {
1098 			rc = -EINVAL;
1099 		}
1100 		SPDK_ERRLOG("failed set src sge, rc %d\n", rc);
1101 		return rc;
1102 	}
1103 	assert(remaining == 0);
1104 	sge->src_sge_count = rc;
1105 
1106 	if (!mlx5_task->inplace) {
1107 		rc = accel_mlx5_fill_block_sge(dev, sge->dst_sge, &mlx5_task->dst, task->nbytes, &remaining,
1108 					       task->dst_domain, task->dst_domain_ctx);
1109 		if (spdk_unlikely(rc <= 0)) {
1110 			if (rc == 0) {
1111 				rc = -EINVAL;
1112 			}
1113 			SPDK_ERRLOG("failed set dst sge, rc %d\n", rc);
1114 			return rc;
1115 		}
1116 		assert(remaining == 0);
1117 		sge->dst_sge_count = rc;
1118 	}
1119 
1120 	return 0;
1121 }
1122 
1123 static inline int
1124 accel_mlx5_crc_task_process_one_req(struct accel_mlx5_task *mlx5_task)
1125 {
1126 	struct accel_mlx5_sge sges;
1127 	struct accel_mlx5_qp *qp = mlx5_task->qp;
1128 	struct accel_mlx5_dev *dev = qp->dev;
1129 	uint32_t num_ops = spdk_min(mlx5_task->num_reqs - mlx5_task->num_completed_reqs,
1130 				    mlx5_task->num_ops);
1131 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
1132 	uint32_t rdma_fence = SPDK_MLX5_WQE_CTRL_STRONG_ORDERING;
1133 	struct ibv_sge *sge;
1134 	int rc;
1135 	uint16_t sge_count;
1136 
1137 	num_ops = spdk_min(num_ops, qp_slot >> 1);
1138 	if (spdk_unlikely(!num_ops)) {
1139 		return -EINVAL;
1140 	}
1141 
1142 	mlx5_task->num_wrs = 0;
1143 	/* At this moment we have as many requests as can be submitted to a qp */
1144 	rc = accel_mlx5_crc_task_fill_sge(mlx5_task, &sges);
1145 	if (spdk_unlikely(rc)) {
1146 		return rc;
1147 	}
1148 	rc = accel_mlx5_crc_task_configure_umr(mlx5_task, sges.src_sge, sges.src_sge_count,
1149 					       mlx5_task->mkeys[0], SPDK_MLX5_UMR_SIG_DOMAIN_WIRE, mlx5_task->base.nbytes, true, true);
1150 	if (spdk_unlikely(rc)) {
1151 		SPDK_ERRLOG("UMR configure failed with %d\n", rc);
1152 		return rc;
1153 	}
1154 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1155 	dev->stats.sig_umrs++;
1156 
1157 	if (mlx5_task->inplace) {
1158 		sge = sges.src_sge;
1159 		sge_count = sges.src_sge_count;
1160 	} else {
1161 		sge = sges.dst_sge;
1162 		sge_count = sges.dst_sge_count;
1163 	}
1164 
1165 	/*
1166 	 * Add the crc destination to the end of sges. A free entry must be available for CRC
1167 	 * because the task init function reserved it.
1168 	 */
1169 	assert(sge_count < ACCEL_MLX5_MAX_SGE);
1170 	sge[sge_count].lkey = mlx5_task->psv->crc_lkey;
1171 	sge[sge_count].addr = (uintptr_t)&mlx5_task->psv->crc;
1172 	sge[sge_count++].length = sizeof(uint32_t);
1173 
1174 	if (spdk_unlikely(mlx5_task->psv->bits.error)) {
1175 		rc = spdk_mlx5_qp_set_psv(qp->qp, mlx5_task->psv->psv_index, *mlx5_task->base.crc_dst, 0, 0);
1176 		if (spdk_unlikely(rc)) {
1177 			SPDK_ERRLOG("SET_PSV failed with %d\n", rc);
1178 			return rc;
1179 		}
1180 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1181 	}
1182 
1183 	rc = spdk_mlx5_qp_rdma_read(qp->qp, sge, sge_count, 0, mlx5_task->mkeys[0]->mkey,
1184 				    (uint64_t)mlx5_task, rdma_fence | SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE);
1185 	if (spdk_unlikely(rc)) {
1186 		SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
1187 		return rc;
1188 	}
1189 	mlx5_task->num_submitted_reqs++;
1190 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, mlx5_task);
1191 	dev->stats.rdma_reads++;
1192 
1193 	return 0;
1194 }
1195 
1196 static inline int
1197 accel_mlx5_crc_task_fill_umr_sge(struct accel_mlx5_qp *qp, struct ibv_sge *sge,
1198 				 struct accel_mlx5_iov_sgl *umr_iovs, struct spdk_memory_domain *domain,
1199 				 void *domain_ctx, struct accel_mlx5_iov_sgl *rdma_iovs, size_t *len)
1200 {
1201 	int umr_idx = 0;
1202 	int rdma_idx = 0;
1203 	int umr_iovcnt = spdk_min(umr_iovs->iovcnt, (int)ACCEL_MLX5_MAX_SGE);
1204 	int rdma_iovcnt = spdk_min(rdma_iovs->iovcnt, (int)ACCEL_MLX5_MAX_SGE);
1205 	size_t umr_iov_offset;
1206 	size_t rdma_iov_offset;
1207 	size_t umr_len = 0;
1208 	void *sge_addr;
1209 	size_t sge_len;
1210 	size_t umr_sge_len;
1211 	size_t rdma_sge_len;
1212 	int rc;
1213 
1214 	umr_iov_offset = umr_iovs->iov_offset;
1215 	rdma_iov_offset = rdma_iovs->iov_offset;
1216 
1217 	while (umr_idx < umr_iovcnt && rdma_idx < rdma_iovcnt) {
1218 		umr_sge_len = umr_iovs->iov[umr_idx].iov_len - umr_iov_offset;
1219 		rdma_sge_len = rdma_iovs->iov[rdma_idx].iov_len - rdma_iov_offset;
1220 		sge_addr = umr_iovs->iov[umr_idx].iov_base + umr_iov_offset;
1221 
1222 		if (umr_sge_len == rdma_sge_len) {
1223 			rdma_idx++;
1224 			umr_iov_offset = 0;
1225 			rdma_iov_offset = 0;
1226 			sge_len = umr_sge_len;
1227 		} else if (umr_sge_len < rdma_sge_len) {
1228 			umr_iov_offset = 0;
1229 			rdma_iov_offset += umr_sge_len;
1230 			sge_len = umr_sge_len;
1231 		} else {
1232 			size_t remaining;
1233 
1234 			remaining = umr_sge_len - rdma_sge_len;
1235 			while (remaining) {
1236 				rdma_idx++;
1237 				if (rdma_idx == (int)ACCEL_MLX5_MAX_SGE) {
1238 					break;
1239 				}
1240 				rdma_sge_len = rdma_iovs->iov[rdma_idx].iov_len;
1241 				if (remaining == rdma_sge_len) {
1242 					rdma_idx++;
1243 					rdma_iov_offset = 0;
1244 					umr_iov_offset = 0;
1245 					remaining = 0;
1246 					break;
1247 				}
1248 				if (remaining < rdma_sge_len) {
1249 					rdma_iov_offset = remaining;
1250 					umr_iov_offset = 0;
1251 					remaining = 0;
1252 					break;
1253 				}
1254 				remaining -= rdma_sge_len;
1255 			}
1256 			sge_len = umr_sge_len - remaining;
1257 		}
1258 		rc = accel_mlx5_translate_addr(sge_addr, sge_len, domain, domain_ctx, qp->dev, &sge[umr_idx]);
1259 		if (spdk_unlikely(rc)) {
1260 			return -EINVAL;
1261 		}
1262 		SPDK_DEBUGLOG(accel_mlx5, "\t sge[%d] lkey %u, addr %p, len %u\n", umr_idx, sge[umr_idx].lkey,
1263 			      (void *)sge[umr_idx].addr, sge[umr_idx].length);
1264 		umr_len += sge_len;
1265 		umr_idx++;
1266 	}
1267 	accel_mlx5_iov_sgl_advance(umr_iovs, umr_len);
1268 	accel_mlx5_iov_sgl_advance(rdma_iovs, umr_len);
1269 	*len = umr_len;
1270 
1271 	return umr_idx;
1272 }
1273 
1274 static inline int
1275 accel_mlx5_crc_task_process_multi_req(struct accel_mlx5_task *mlx5_task)
1276 {
1277 	size_t umr_len[ACCEL_MLX5_MAX_MKEYS_IN_TASK];
1278 	struct ibv_sge sges[ACCEL_MLX5_MAX_SGE];
1279 	struct spdk_accel_task *task = &mlx5_task->base;
1280 	struct accel_mlx5_qp *qp = mlx5_task->qp;
1281 	struct accel_mlx5_dev *dev = qp->dev;
1282 	struct accel_mlx5_iov_sgl umr_sgl;
1283 	struct accel_mlx5_iov_sgl *umr_sgl_ptr;
1284 	struct accel_mlx5_iov_sgl rdma_sgl;
1285 	uint64_t umr_offset;
1286 	uint32_t rdma_fence = SPDK_MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
1287 	int sge_count;
1288 	uint32_t remaining;
1289 	int rc;
1290 	uint16_t i;
1291 	uint16_t num_ops = spdk_min(mlx5_task->num_reqs - mlx5_task->num_completed_reqs,
1292 				    mlx5_task->num_ops);
1293 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
1294 	bool sig_init, sig_check_gen = false;
1295 
1296 	num_ops = spdk_min(num_ops, qp_slot >> 1);
1297 	if (spdk_unlikely(!num_ops)) {
1298 		return -EINVAL;
1299 	}
1300 	/* Init signature on the first UMR */
1301 	sig_init = !mlx5_task->num_submitted_reqs;
1302 
1303 	/*
1304 	 * accel_mlx5_crc_task_fill_umr_sge() and accel_mlx5_fill_block_sge() advance an IOV during iteration
1305 	 * on it. We must copy accel_mlx5_iov_sgl to iterate twice or more on the same IOV.
1306 	 *
1307 	 * In the in-place case, we iterate on the source IOV three times. That's why we need two copies of
1308 	 * the source accel_mlx5_iov_sgl.
1309 	 *
1310 	 * In the out-of-place case, we iterate on the source IOV once and on the destination IOV two times.
1311 	 * So, we need one copy of the destination accel_mlx5_iov_sgl.
1312 	 */
1313 	if (mlx5_task->inplace) {
1314 		accel_mlx5_iov_sgl_init(&umr_sgl, mlx5_task->src.iov, mlx5_task->src.iovcnt);
1315 		umr_sgl_ptr = &umr_sgl;
1316 		accel_mlx5_iov_sgl_init(&rdma_sgl, mlx5_task->src.iov, mlx5_task->src.iovcnt);
1317 	} else {
1318 		umr_sgl_ptr = &mlx5_task->src;
1319 		accel_mlx5_iov_sgl_init(&rdma_sgl, mlx5_task->dst.iov, mlx5_task->dst.iovcnt);
1320 	}
1321 	mlx5_task->num_wrs = 0;
1322 	for (i = 0; i < num_ops; i++) {
1323 		/*
1324 		 * The last request may have only CRC. Skip UMR in this case because the MKey from
1325 		 * the previous request is used.
1326 		 */
1327 		if (umr_sgl_ptr->iovcnt == 0) {
1328 			assert((mlx5_task->num_completed_reqs + i + 1) == mlx5_task->num_reqs);
1329 			break;
1330 		}
1331 		sge_count = accel_mlx5_crc_task_fill_umr_sge(qp, sges, umr_sgl_ptr, task->src_domain,
1332 				task->src_domain_ctx, &rdma_sgl, &umr_len[i]);
1333 		if (spdk_unlikely(sge_count <= 0)) {
1334 			rc = (sge_count == 0) ? -EINVAL : sge_count;
1335 			SPDK_ERRLOG("failed set UMR sge, rc %d\n", rc);
1336 			return rc;
1337 		}
1338 		if (umr_sgl_ptr->iovcnt == 0) {
1339 			/*
1340 			 * We post RDMA without UMR if the last request has only CRC. We use an MKey from
1341 			 * the last UMR in this case. Since the last request can be postponed to the next
1342 			 * call of this function, we must save the MKey to the task structure.
1343 			 */
1344 			mlx5_task->last_umr_len = umr_len[i];
1345 			mlx5_task->last_mkey_idx = i;
1346 			sig_check_gen = true;
1347 		}
1348 		rc = accel_mlx5_crc_task_configure_umr(mlx5_task, sges, sge_count, mlx5_task->mkeys[i],
1349 						       SPDK_MLX5_UMR_SIG_DOMAIN_WIRE, umr_len[i], sig_init,
1350 						       sig_check_gen);
1351 		if (spdk_unlikely(rc)) {
1352 			SPDK_ERRLOG("UMR configure failed with %d\n", rc);
1353 			return rc;
1354 		}
1355 		sig_init = false;
1356 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1357 		dev->stats.sig_umrs++;
1358 	}
1359 
1360 	if (spdk_unlikely(mlx5_task->psv->bits.error)) {
1361 		rc = spdk_mlx5_qp_set_psv(qp->qp, mlx5_task->psv->psv_index, *mlx5_task->base.crc_dst, 0, 0);
1362 		if (spdk_unlikely(rc)) {
1363 			SPDK_ERRLOG("SET_PSV failed with %d\n", rc);
1364 			return rc;
1365 		}
1366 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1367 	}
1368 
1369 	for (i = 0; i < num_ops - 1; i++) {
1370 		if (mlx5_task->inplace) {
1371 			sge_count = accel_mlx5_fill_block_sge(dev, sges, &mlx5_task->src, umr_len[i], &remaining,
1372 							      task->src_domain, task->src_domain_ctx);
1373 		} else {
1374 			sge_count = accel_mlx5_fill_block_sge(dev, sges, &mlx5_task->dst, umr_len[i], &remaining,
1375 							      task->dst_domain, task->dst_domain_ctx);
1376 		}
1377 		if (spdk_unlikely(sge_count <= 0)) {
1378 			rc = (sge_count == 0) ? -EINVAL : sge_count;
1379 			SPDK_ERRLOG("failed set RDMA sge, rc %d\n", rc);
1380 			return rc;
1381 		}
1382 		rc = spdk_mlx5_qp_rdma_read(qp->qp, sges, sge_count, 0, mlx5_task->mkeys[i]->mkey,
1383 					    0, rdma_fence);
1384 		if (spdk_unlikely(rc)) {
1385 			SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
1386 			return rc;
1387 		}
1388 		mlx5_task->num_submitted_reqs++;
1389 		ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED(qp, mlx5_task);
1390 		dev->stats.rdma_reads++;
1391 		rdma_fence = SPDK_MLX5_WQE_CTRL_STRONG_ORDERING;
1392 	}
1393 	if ((mlx5_task->inplace && mlx5_task->src.iovcnt == 0) || (!mlx5_task->inplace &&
1394 			mlx5_task->dst.iovcnt == 0)) {
1395 		/*
1396 		 * The last RDMA does not have any data, only CRC. It also does not have a paired Mkey.
1397 		 * The CRC is handled in the previous MKey in this case.
1398 		 */
1399 		sge_count = 0;
1400 		umr_offset = mlx5_task->last_umr_len;
1401 	} else {
1402 		umr_offset = 0;
1403 		mlx5_task->last_mkey_idx = i;
1404 		if (mlx5_task->inplace) {
1405 			sge_count = accel_mlx5_fill_block_sge(dev, sges, &mlx5_task->src, umr_len[i], &remaining,
1406 							      task->src_domain, task->src_domain_ctx);
1407 		} else {
1408 			sge_count = accel_mlx5_fill_block_sge(dev, sges, &mlx5_task->dst, umr_len[i], &remaining,
1409 							      task->dst_domain, task->dst_domain_ctx);
1410 		}
1411 		if (spdk_unlikely(sge_count <= 0)) {
1412 			rc = (sge_count == 0) ? -EINVAL : sge_count;
1413 			SPDK_ERRLOG("failed set RDMA sge, rc %d\n", rc);
1414 			return rc;
1415 		}
1416 		assert(remaining == 0);
1417 	}
1418 	if ((mlx5_task->num_completed_reqs + i + 1) == mlx5_task->num_reqs) {
1419 		/* Ensure that there is a free sge for the CRC destination. */
1420 		assert(sge_count < (int)ACCEL_MLX5_MAX_SGE);
1421 		/* Add the crc destination to the end of sges. */
1422 		sges[sge_count].lkey = mlx5_task->psv->crc_lkey;
1423 		sges[sge_count].addr = (uintptr_t)&mlx5_task->psv->crc;
1424 		sges[sge_count++].length = sizeof(uint32_t);
1425 	}
1426 	rdma_fence |= SPDK_MLX5_WQE_CTRL_CE_CQ_UPDATE;
1427 	rc = spdk_mlx5_qp_rdma_read(qp->qp, sges, sge_count, umr_offset,
1428 				    mlx5_task->mkeys[mlx5_task->last_mkey_idx]->mkey,
1429 				    (uint64_t)mlx5_task, rdma_fence);
1430 	if (spdk_unlikely(rc)) {
1431 		SPDK_ERRLOG("RDMA READ/WRITE failed with %d\n", rc);
1432 		return rc;
1433 	}
1434 	mlx5_task->num_submitted_reqs++;
1435 	ACCEL_MLX5_UPDATE_ON_WR_SUBMITTED_SIGNALED(dev, qp, mlx5_task);
1436 	dev->stats.rdma_reads++;
1437 
1438 	return 0;
1439 }
1440 
1441 static inline int
1442 accel_mlx5_crc_task_process(struct accel_mlx5_task *mlx5_task)
1443 {
1444 	int rc;
1445 
1446 	assert(mlx5_task->mlx5_opcode == ACCEL_MLX5_OPC_CRC32C);
1447 
1448 	SPDK_DEBUGLOG(accel_mlx5, "begin, crc task, %p, reqs: total %u, submitted %u, completed %u\n",
1449 		      mlx5_task, mlx5_task->num_reqs, mlx5_task->num_submitted_reqs, mlx5_task->num_completed_reqs);
1450 
1451 	if (mlx5_task->num_reqs == 1) {
1452 		rc = accel_mlx5_crc_task_process_one_req(mlx5_task);
1453 	} else {
1454 		rc = accel_mlx5_crc_task_process_multi_req(mlx5_task);
1455 	}
1456 
1457 	if (rc == 0) {
1458 		STAILQ_INSERT_TAIL(&mlx5_task->qp->in_hw, mlx5_task, link);
1459 		SPDK_DEBUGLOG(accel_mlx5, "end, crc task, %p, reqs: total %u, submitted %u, completed %u\n",
1460 			      mlx5_task, mlx5_task->num_reqs, mlx5_task->num_submitted_reqs,
1461 			      mlx5_task->num_completed_reqs);
1462 	}
1463 
1464 	return rc;
1465 }
1466 
1467 static inline int
1468 accel_mlx5_task_alloc_crc_ctx(struct accel_mlx5_task *task, uint32_t qp_slot)
1469 {
1470 	struct accel_mlx5_qp *qp = task->qp;
1471 	struct accel_mlx5_dev *dev = qp->dev;
1472 
1473 	if (spdk_unlikely(!accel_mlx5_task_alloc_mkeys(task, dev->sig_mkeys))) {
1474 		SPDK_DEBUGLOG(accel_mlx5, "no mkeys in signature mkey pool, dev %s\n",
1475 			      dev->dev_ctx->context->device->name);
1476 		dev->stats.nomem_mkey++;
1477 		return -ENOMEM;
1478 	}
1479 	task->psv = spdk_mempool_get(dev->dev_ctx->psv_pool);
1480 	if (spdk_unlikely(!task->psv)) {
1481 		SPDK_DEBUGLOG(accel_mlx5, "no reqs in psv pool, dev %s\n", dev->dev_ctx->context->device->name);
1482 		spdk_mlx5_mkey_pool_put_bulk(dev->sig_mkeys, task->mkeys, task->num_ops);
1483 		task->num_ops = 0;
1484 		dev->stats.nomem_mkey++;
1485 		return -ENOMEM;
1486 	}
1487 	/* One extra slot is needed for SET_PSV WQE to reset the error state in PSV. */
1488 	if (spdk_unlikely(task->psv->bits.error)) {
1489 		uint32_t n_slots = task->num_ops * 2 + 1;
1490 
1491 		if (qp_slot < n_slots) {
1492 			spdk_mempool_put(dev->dev_ctx->psv_pool, task->psv);
1493 			spdk_mlx5_mkey_pool_put_bulk(dev->sig_mkeys, task->mkeys, task->num_ops);
1494 			dev->stats.nomem_qdepth++;
1495 			task->num_ops = 0;
1496 			return -ENOMEM;
1497 		}
1498 	}
1499 
1500 	return 0;
1501 }
1502 
1503 static inline int
1504 accel_mlx5_crc_task_continue(struct accel_mlx5_task *task)
1505 {
1506 	struct accel_mlx5_qp *qp = task->qp;
1507 	struct accel_mlx5_dev *dev = qp->dev;
1508 	uint16_t qp_slot = accel_mlx5_dev_get_available_slots(dev, qp);
1509 	int rc;
1510 
1511 	assert(task->num_reqs > task->num_completed_reqs);
1512 	if (task->num_ops == 0) {
1513 		/* No mkeys allocated, try to allocate now. */
1514 		rc = accel_mlx5_task_alloc_crc_ctx(task, qp_slot);
1515 		if (spdk_unlikely(rc)) {
1516 			STAILQ_INSERT_TAIL(&dev->nomem, task, link);
1517 			return -ENOMEM;
1518 		}
1519 	}
1520 	/* We need to post at least 1 UMR and 1 RDMA operation */
1521 	if (spdk_unlikely(qp_slot < 2)) {
1522 		STAILQ_INSERT_TAIL(&dev->nomem, task, link);
1523 		dev->stats.nomem_qdepth++;
1524 		return -ENOMEM;
1525 	}
1526 
1527 	return accel_mlx5_crc_task_process(task);
1528 }
1529 
1530 static inline uint32_t
1531 accel_mlx5_get_crc_task_count(struct iovec *src_iov, uint32_t src_iovcnt, struct iovec *dst_iov,
1532 			      uint32_t dst_iovcnt)
1533 {
1534 	uint32_t src_idx = 0;
1535 	uint32_t dst_idx = 0;
1536 	uint32_t num_ops = 1;
1537 	uint32_t num_src_sge = 1;
1538 	uint32_t num_dst_sge = 1;
1539 	size_t src_offset = 0;
1540 	size_t dst_offset = 0;
1541 	uint32_t num_sge;
1542 	size_t src_len;
1543 	size_t dst_len;
1544 
1545 	/* One operation is enough if both iovs fit into ACCEL_MLX5_MAX_SGE. One SGE is reserved for CRC on dst_iov. */
1546 	if (src_iovcnt <= ACCEL_MLX5_MAX_SGE && (dst_iovcnt + 1) <= ACCEL_MLX5_MAX_SGE) {
1547 		return 1;
1548 	}
1549 
1550 	while (src_idx < src_iovcnt && dst_idx < dst_iovcnt) {
1551 		if (num_src_sge > ACCEL_MLX5_MAX_SGE || num_dst_sge > ACCEL_MLX5_MAX_SGE) {
1552 			num_ops++;
1553 			num_src_sge = 1;
1554 			num_dst_sge = 1;
1555 		}
1556 		src_len = src_iov[src_idx].iov_len - src_offset;
1557 		dst_len = dst_iov[dst_idx].iov_len - dst_offset;
1558 
1559 		if (src_len == dst_len) {
1560 			num_src_sge++;
1561 			num_dst_sge++;
1562 			src_offset = 0;
1563 			dst_offset = 0;
1564 			src_idx++;
1565 			dst_idx++;
1566 			continue;
1567 		}
1568 		if (src_len < dst_len) {
1569 			/* Advance src_iov to reach the point that corresponds to the end of the current dst_iov. */
1570 			num_sge = accel_mlx5_advance_iovec(&src_iov[src_idx],
1571 							   spdk_min(ACCEL_MLX5_MAX_SGE + 1 - num_src_sge,
1572 									   src_iovcnt - src_idx),
1573 							   &src_offset, &dst_len);
1574 			src_idx += num_sge;
1575 			num_src_sge += num_sge;
1576 			if (dst_len != 0) {
1577 				/*
1578 				 * ACCEL_MLX5_MAX_SGE is reached on src_iov, and dst_len bytes
1579 				 * are left on the current dst_iov.
1580 				 */
1581 				dst_offset = dst_iov[dst_idx].iov_len - dst_len;
1582 			} else {
1583 				/* The src_iov advance is completed, shift to the next dst_iov. */
1584 				dst_idx++;
1585 				num_dst_sge++;
1586 				dst_offset = 0;
1587 			}
1588 		} else { /* src_len > dst_len */
1589 			/* Advance dst_iov to reach the point that corresponds to the end of the current src_iov. */
1590 			num_sge = accel_mlx5_advance_iovec(&dst_iov[dst_idx],
1591 							   spdk_min(ACCEL_MLX5_MAX_SGE + 1 - num_dst_sge,
1592 									   dst_iovcnt - dst_idx),
1593 							   &dst_offset, &src_len);
1594 			dst_idx += num_sge;
1595 			num_dst_sge += num_sge;
1596 			if (src_len != 0) {
1597 				/*
1598 				 * ACCEL_MLX5_MAX_SGE is reached on dst_iov, and src_len bytes
1599 				 * are left on the current src_iov.
1600 				 */
1601 				src_offset = src_iov[src_idx].iov_len - src_len;
1602 			} else {
1603 				/* The dst_iov advance is completed, shift to the next src_iov. */
1604 				src_idx++;
1605 				num_src_sge++;
1606 				src_offset = 0;
1607 			}
1608 		}
1609 	}
1610 	/* An extra operation is needed if no space is left on dst_iov because CRC takes one SGE. */
1611 	if (num_dst_sge > ACCEL_MLX5_MAX_SGE) {
1612 		num_ops++;
1613 	}
1614 
1615 	/* The above loop must reach the end of both iovs simultaneously because their size is the same. */
1616 	assert(src_idx == src_iovcnt);
1617 	assert(dst_idx == dst_iovcnt);
1618 	assert(src_offset == 0);
1619 	assert(dst_offset == 0);
1620 
1621 	return num_ops;
1622 }
1623 
1624 static inline int
1625 accel_mlx5_crc_task_init(struct accel_mlx5_task *mlx5_task)
1626 {
1627 	struct spdk_accel_task *task = &mlx5_task->base;
1628 	struct accel_mlx5_qp *qp = mlx5_task->qp;
1629 	uint32_t qp_slot = accel_mlx5_dev_get_available_slots(qp->dev, qp);
1630 	int rc;
1631 
1632 	accel_mlx5_iov_sgl_init(&mlx5_task->src, task->s.iovs, task->s.iovcnt);
1633 	if (mlx5_task->inplace) {
1634 		/* One entry is reserved for CRC */
1635 		mlx5_task->num_reqs = SPDK_CEIL_DIV(mlx5_task->src.iovcnt + 1, ACCEL_MLX5_MAX_SGE);
1636 	} else {
1637 		accel_mlx5_iov_sgl_init(&mlx5_task->dst, task->d.iovs, task->d.iovcnt);
1638 		mlx5_task->num_reqs = accel_mlx5_get_crc_task_count(mlx5_task->src.iov, mlx5_task->src.iovcnt,
1639 				      mlx5_task->dst.iov, mlx5_task->dst.iovcnt);
1640 	}
1641 
1642 	rc = accel_mlx5_task_alloc_crc_ctx(mlx5_task, qp_slot);
1643 	if (spdk_unlikely(rc)) {
1644 		return rc;
1645 	}
1646 
1647 	if (spdk_unlikely(qp_slot < 2)) {
1648 		/* Queue is full, queue this task */
1649 		SPDK_DEBUGLOG(accel_mlx5, "dev %s qp %p is full\n", qp->dev->dev_ctx->context->device->name,
1650 			      mlx5_task->qp);
1651 		qp->dev->stats.nomem_qdepth++;
1652 		return -ENOMEM;
1653 	}
1654 	return 0;
1655 }
1656 
/*
 * Placeholder for int-returning task operations that must never be invoked. It logs an
 * error and then hits SPDK_UNREACHABLE(), so no value is returned.
 */
static int
accel_mlx5_task_op_not_implemented(struct accel_mlx5_task *mlx5_task)
{
	/* Reaching this point indicates a dispatch error in the caller. */
	SPDK_ERRLOG("wrong function called\n");
	SPDK_UNREACHABLE();
}
1663 
/*
 * Placeholder for void task operations that must never be invoked. It logs an error and
 * then hits SPDK_UNREACHABLE().
 */
static void
accel_mlx5_task_op_not_implemented_v(struct accel_mlx5_task *mlx5_task)
{
	/* Reaching this point indicates a dispatch error in the caller. */
	SPDK_ERRLOG("wrong function called\n");
	SPDK_UNREACHABLE();
}
1670 
1671 static int
1672 accel_mlx5_task_op_not_supported(struct accel_mlx5_task *mlx5_task)
1673 {
1674 	SPDK_ERRLOG("Unsupported opcode %d\n", mlx5_task->base.op_code);
1675 
1676 	return -ENOTSUP;
1677 }
1678 
1679 static struct accel_mlx5_task_operations g_accel_mlx5_tasks_ops[] = {
1680 	[ACCEL_MLX5_OPC_COPY] = {
1681 		.init = accel_mlx5_copy_task_init,
1682 		.process = accel_mlx5_copy_task_process,
1683 		.cont = accel_mlx5_copy_task_continue,
1684 		.complete = accel_mlx5_copy_task_complete,
1685 	},
1686 	[ACCEL_MLX5_OPC_CRYPTO] = {
1687 		.init = accel_mlx5_crypto_task_init,
1688 		.process = accel_mlx5_crypto_task_process,
1689 		.cont = accel_mlx5_crypto_task_continue,
1690 		.complete = accel_mlx5_crypto_task_complete,
1691 	},
1692 	[ACCEL_MLX5_OPC_CRC32C] = {
1693 		.init = accel_mlx5_crc_task_init,
1694 		.process = accel_mlx5_crc_task_process,
1695 		.cont = accel_mlx5_crc_task_continue,
1696 		.complete = accel_mlx5_crc_task_complete,
1697 	},
1698 	[ACCEL_MLX5_OPC_LAST] = {
1699 		.init = accel_mlx5_task_op_not_supported,
1700 		.process = accel_mlx5_task_op_not_implemented,
1701 		.cont = accel_mlx5_task_op_not_implemented,
1702 		.complete = accel_mlx5_task_op_not_implemented_v
1703 	},
1704 };
1705 
1706 static inline void
1707 accel_mlx5_task_complete(struct accel_mlx5_task *task)
1708 {
1709 	assert(task->num_reqs == task->num_completed_reqs);
1710 	SPDK_DEBUGLOG(accel_mlx5, "Complete task %p, opc %d\n", task, task->base.op_code);
1711 
1712 	g_accel_mlx5_tasks_ops[task->mlx5_opcode].complete(task);
1713 }
1714 
1715 static inline int
1716 accel_mlx5_task_continue(struct accel_mlx5_task *task)
1717 {
1718 	struct accel_mlx5_qp *qp = task->qp;
1719 	struct accel_mlx5_dev *dev = qp->dev;
1720 
1721 	if (spdk_unlikely(qp->recovering)) {
1722 		STAILQ_INSERT_TAIL(&dev->nomem, task, link);
1723 		return 0;
1724 	}
1725 
1726 	return g_accel_mlx5_tasks_ops[task->mlx5_opcode].cont(task);
1727 }
1728 static inline void
1729 accel_mlx5_task_init_opcode(struct accel_mlx5_task *mlx5_task)
1730 {
1731 	uint8_t base_opcode = mlx5_task->base.op_code;
1732 
1733 	switch (base_opcode) {
1734 	case SPDK_ACCEL_OPC_COPY:
1735 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_COPY;
1736 		break;
1737 	case SPDK_ACCEL_OPC_ENCRYPT:
1738 		assert(g_accel_mlx5.crypto_supported);
1739 		mlx5_task->enc_order = SPDK_MLX5_ENCRYPTION_ORDER_ENCRYPTED_RAW_WIRE;
1740 		mlx5_task->mlx5_opcode =  ACCEL_MLX5_OPC_CRYPTO;
1741 		break;
1742 	case SPDK_ACCEL_OPC_DECRYPT:
1743 		assert(g_accel_mlx5.crypto_supported);
1744 		mlx5_task->enc_order = SPDK_MLX5_ENCRYPTION_ORDER_ENCRYPTED_RAW_MEMORY;
1745 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_CRYPTO;
1746 		break;
1747 	case SPDK_ACCEL_OPC_CRC32C:
1748 		mlx5_task->inplace = 1;
1749 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_CRC32C;
1750 		break;
1751 	case SPDK_ACCEL_OPC_COPY_CRC32C:
1752 		mlx5_task->inplace = 0;
1753 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_CRC32C;
1754 		break;
1755 	default:
1756 		SPDK_ERRLOG("wrong opcode %d\n", base_opcode);
1757 		mlx5_task->mlx5_opcode = ACCEL_MLX5_OPC_LAST;
1758 	}
1759 }
1760 
1761 static inline int
1762 _accel_mlx5_submit_tasks(struct accel_mlx5_io_channel *accel_ch, struct spdk_accel_task *task)
1763 {
1764 	struct accel_mlx5_task *mlx5_task = SPDK_CONTAINEROF(task, struct accel_mlx5_task, base);
1765 	struct accel_mlx5_dev *dev = mlx5_task->qp->dev;
1766 	int rc;
1767 
1768 	/* We should not receive any tasks if the module was not enabled */
1769 	assert(g_accel_mlx5.enabled);
1770 
1771 	dev->stats.opcodes[mlx5_task->mlx5_opcode]++;
1772 	rc = g_accel_mlx5_tasks_ops[mlx5_task->mlx5_opcode].init(mlx5_task);
1773 	if (spdk_unlikely(rc)) {
1774 		if (rc == -ENOMEM) {
1775 			SPDK_DEBUGLOG(accel_mlx5, "no reqs to handle new task %p (required %u), put to queue\n", mlx5_task,
1776 				      mlx5_task->num_reqs);
1777 			STAILQ_INSERT_TAIL(&dev->nomem, mlx5_task, link);
1778 			return 0;
1779 		}
1780 		SPDK_ERRLOG("Task opc %d init failed, rc %d\n", task->op_code, rc);
1781 		return rc;
1782 	}
1783 
1784 	if (spdk_unlikely(mlx5_task->qp->recovering)) {
1785 		STAILQ_INSERT_TAIL(&dev->nomem, mlx5_task, link);
1786 		return 0;
1787 	}
1788 
1789 	return g_accel_mlx5_tasks_ops[mlx5_task->mlx5_opcode].process(mlx5_task);
1790 }
1791 
1792 static inline void
1793 accel_mlx5_task_assign_qp(struct accel_mlx5_task *mlx5_task, struct accel_mlx5_io_channel *accel_ch)
1794 {
1795 	struct accel_mlx5_dev *dev;
1796 
1797 	dev = &accel_ch->devs[accel_ch->dev_idx];
1798 	accel_ch->dev_idx++;
1799 	if (accel_ch->dev_idx == accel_ch->num_devs) {
1800 		accel_ch->dev_idx = 0;
1801 	}
1802 
1803 	mlx5_task->qp = &dev->qp;
1804 }
1805 
1806 static inline void
1807 accel_mlx5_task_reset(struct accel_mlx5_task *mlx5_task)
1808 {
1809 	mlx5_task->num_completed_reqs = 0;
1810 	mlx5_task->num_submitted_reqs = 0;
1811 	mlx5_task->num_ops = 0;
1812 	mlx5_task->num_processed_blocks = 0;
1813 	mlx5_task->raw = 0;
1814 }
1815 
1816 static int
1817 accel_mlx5_submit_tasks(struct spdk_io_channel *ch, struct spdk_accel_task *task)
1818 {
1819 	struct accel_mlx5_task *mlx5_task = SPDK_CONTAINEROF(task, struct accel_mlx5_task, base);
1820 	struct accel_mlx5_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
1821 
1822 	accel_mlx5_task_assign_qp(mlx5_task, accel_ch);
1823 	accel_mlx5_task_reset(mlx5_task);
1824 	accel_mlx5_task_init_opcode(mlx5_task);
1825 
1826 	return _accel_mlx5_submit_tasks(accel_ch, task);
1827 }
1828 
1829 static void accel_mlx5_recover_qp(struct accel_mlx5_qp *qp);
1830 
1831 static int
1832 accel_mlx5_recover_qp_poller(void *arg)
1833 {
1834 	struct accel_mlx5_qp *qp = arg;
1835 
1836 	spdk_poller_unregister(&qp->recover_poller);
1837 	accel_mlx5_recover_qp(qp);
1838 	return SPDK_POLLER_BUSY;
1839 }
1840 
1841 static void
1842 accel_mlx5_recover_qp(struct accel_mlx5_qp *qp)
1843 {
1844 	struct accel_mlx5_dev *dev = qp->dev;
1845 	struct spdk_mlx5_qp_attr mlx5_qp_attr = {};
1846 	int rc;
1847 
1848 	SPDK_NOTICELOG("Recovering qp %p, core %u\n", qp, spdk_env_get_current_core());
1849 	if (qp->qp) {
1850 		spdk_mlx5_qp_destroy(qp->qp);
1851 		qp->qp = NULL;
1852 	}
1853 
1854 	mlx5_qp_attr.cap.max_send_wr = g_accel_mlx5.attr.qp_size;
1855 	mlx5_qp_attr.cap.max_recv_wr = 0;
1856 	mlx5_qp_attr.cap.max_send_sge = ACCEL_MLX5_MAX_SGE;
1857 	mlx5_qp_attr.cap.max_inline_data = sizeof(struct ibv_sge) * ACCEL_MLX5_MAX_SGE;
1858 
1859 	rc = spdk_mlx5_qp_create(dev->dev_ctx->pd, dev->cq, &mlx5_qp_attr, &qp->qp);
1860 	if (rc) {
1861 		SPDK_ERRLOG("Failed to create mlx5 dma QP, rc %d. Retry in %d usec\n",
1862 			    rc, ACCEL_MLX5_RECOVER_POLLER_PERIOD_US);
1863 		qp->recover_poller = SPDK_POLLER_REGISTER(accel_mlx5_recover_qp_poller, qp,
1864 				     ACCEL_MLX5_RECOVER_POLLER_PERIOD_US);
1865 		return;
1866 	}
1867 
1868 	qp->recovering = false;
1869 }
1870 
1871 static inline void
1872 accel_mlx5_process_error_cpl(struct spdk_mlx5_cq_completion *wc, struct accel_mlx5_task *task)
1873 {
1874 	struct accel_mlx5_qp *qp = task->qp;
1875 
1876 	if (wc->status != IBV_WC_WR_FLUSH_ERR) {
1877 		SPDK_WARNLOG("RDMA: qp %p, task %p, WC status %d, core %u\n",
1878 			     qp, task, wc->status, spdk_env_get_current_core());
1879 	} else {
1880 		SPDK_DEBUGLOG(accel_mlx5,
1881 			      "RDMA: qp %p, task %p, WC status %d, core %u\n",
1882 			      qp, task, wc->status, spdk_env_get_current_core());
1883 	}
1884 
1885 	qp->recovering = true;
1886 	assert(task->num_completed_reqs <= task->num_submitted_reqs);
1887 	if (task->num_completed_reqs == task->num_submitted_reqs) {
1888 		STAILQ_REMOVE_HEAD(&qp->in_hw, link);
1889 		accel_mlx5_task_fail(task, -EIO);
1890 	}
1891 }
1892 
1893 static inline int64_t
1894 accel_mlx5_poll_cq(struct accel_mlx5_dev *dev)
1895 {
1896 	struct spdk_mlx5_cq_completion wc[ACCEL_MLX5_MAX_WC];
1897 	struct accel_mlx5_task *task;
1898 	struct accel_mlx5_qp *qp;
1899 	int reaped, i, rc;
1900 	uint16_t completed;
1901 
1902 	dev->stats.polls++;
1903 	reaped = spdk_mlx5_cq_poll_completions(dev->cq, wc, ACCEL_MLX5_MAX_WC);
1904 	if (spdk_unlikely(reaped < 0)) {
1905 		SPDK_ERRLOG("Error polling CQ! (%d): %s\n", errno, spdk_strerror(errno));
1906 		return reaped;
1907 	} else if (reaped == 0) {
1908 		dev->stats.idle_polls++;
1909 		return 0;
1910 	}
1911 	dev->stats.completions += reaped;
1912 
1913 	SPDK_DEBUGLOG(accel_mlx5, "Reaped %d cpls on dev %s\n", reaped,
1914 		      dev->dev_ctx->context->device->name);
1915 
1916 	for (i = 0; i < reaped; i++) {
1917 		if (spdk_unlikely(!wc[i].wr_id)) {
1918 			/* Unsignaled completion with error, ignore */
1919 			continue;
1920 		}
1921 		task = (struct accel_mlx5_task *)wc[i].wr_id;
1922 		qp = task->qp;
1923 		assert(task == STAILQ_FIRST(&qp->in_hw) && "submission mismatch");
1924 		assert(task->num_submitted_reqs > task->num_completed_reqs);
1925 		completed = task->num_submitted_reqs - task->num_completed_reqs;
1926 		assert((uint32_t)task->num_completed_reqs + completed <= UINT16_MAX);
1927 		task->num_completed_reqs += completed;
1928 		assert(qp->wrs_submitted >= task->num_wrs);
1929 		qp->wrs_submitted -= task->num_wrs;
1930 		assert(dev->wrs_in_cq > 0);
1931 		dev->wrs_in_cq--;
1932 
1933 		if (spdk_unlikely(wc[i].status)) {
1934 			accel_mlx5_process_error_cpl(&wc[i], task);
1935 			if (qp->wrs_submitted == 0) {
1936 				assert(STAILQ_EMPTY(&qp->in_hw));
1937 				accel_mlx5_recover_qp(qp);
1938 			}
1939 			continue;
1940 		}
1941 
1942 		SPDK_DEBUGLOG(accel_mlx5, "task %p, remaining %u\n", task,
1943 			      task->num_reqs - task->num_completed_reqs);
1944 		if (task->num_completed_reqs == task->num_reqs) {
1945 			STAILQ_REMOVE_HEAD(&qp->in_hw, link);
1946 			accel_mlx5_task_complete(task);
1947 		} else {
1948 			assert(task->num_submitted_reqs < task->num_reqs);
1949 			assert(task->num_completed_reqs == task->num_submitted_reqs);
1950 			STAILQ_REMOVE_HEAD(&qp->in_hw, link);
1951 			rc = accel_mlx5_task_continue(task);
1952 			if (spdk_unlikely(rc)) {
1953 				if (rc != -ENOMEM) {
1954 					accel_mlx5_task_fail(task, rc);
1955 				}
1956 			}
1957 		}
1958 	}
1959 
1960 	return reaped;
1961 }
1962 
1963 static inline void
1964 accel_mlx5_resubmit_nomem_tasks(struct accel_mlx5_dev *dev)
1965 {
1966 	struct accel_mlx5_task *task, *tmp, *last;
1967 	int rc;
1968 
1969 	last = STAILQ_LAST(&dev->nomem, accel_mlx5_task, link);
1970 	STAILQ_FOREACH_SAFE(task, &dev->nomem, link, tmp) {
1971 		STAILQ_REMOVE_HEAD(&dev->nomem, link);
1972 		rc = accel_mlx5_task_continue(task);
1973 		if (spdk_unlikely(rc)) {
1974 			if (rc != -ENOMEM) {
1975 				accel_mlx5_task_fail(task, rc);
1976 			}
1977 			break;
1978 		}
1979 		/* If qpair is recovering, task is added back to the nomem list and 0 is returned. In that case we
1980 		 * need a special condition to iterate the list once and stop this FOREACH loop */
1981 		if (task == last) {
1982 			break;
1983 		}
1984 	}
1985 }
1986 
1987 static int
1988 accel_mlx5_poller(void *ctx)
1989 {
1990 	struct accel_mlx5_io_channel *ch = ctx;
1991 	struct accel_mlx5_dev *dev;
1992 
1993 	int64_t completions = 0, rc;
1994 	uint32_t i;
1995 
1996 	for (i = 0; i < ch->num_devs; i++) {
1997 		dev = &ch->devs[i];
1998 		if (dev->wrs_in_cq) {
1999 			rc = accel_mlx5_poll_cq(dev);
2000 			if (spdk_unlikely(rc < 0)) {
2001 				SPDK_ERRLOG("Error %"PRId64" on CQ, dev %s\n", rc, dev->dev_ctx->context->device->name);
2002 			}
2003 			completions += rc;
2004 			if (dev->qp.wrs_submitted) {
2005 				spdk_mlx5_qp_complete_send(dev->qp.qp);
2006 			}
2007 		}
2008 		if (!STAILQ_EMPTY(&dev->nomem)) {
2009 			accel_mlx5_resubmit_nomem_tasks(dev);
2010 		}
2011 	}
2012 
2013 	return !!completions;
2014 }
2015 
2016 static bool
2017 accel_mlx5_supports_opcode(enum spdk_accel_opcode opc)
2018 {
2019 	assert(g_accel_mlx5.enabled);
2020 
2021 	switch (opc) {
2022 	case SPDK_ACCEL_OPC_COPY:
2023 		return true;
2024 	case SPDK_ACCEL_OPC_ENCRYPT:
2025 	case SPDK_ACCEL_OPC_DECRYPT:
2026 		return g_accel_mlx5.crypto_supported;
2027 	case SPDK_ACCEL_OPC_CRC32C:
2028 	case SPDK_ACCEL_OPC_COPY_CRC32C:
2029 		return g_accel_mlx5.crc32c_supported;
2030 	default:
2031 		return false;
2032 	}
2033 }
2034 
2035 static struct spdk_io_channel *
2036 accel_mlx5_get_io_channel(void)
2037 {
2038 	assert(g_accel_mlx5.enabled);
2039 	return spdk_get_io_channel(&g_accel_mlx5);
2040 }
2041 
2042 static int
2043 accel_mlx5_create_qp(struct accel_mlx5_dev *dev, struct accel_mlx5_qp *qp)
2044 {
2045 	struct spdk_mlx5_qp_attr mlx5_qp_attr = {};
2046 	int rc;
2047 
2048 	mlx5_qp_attr.cap.max_send_wr = g_accel_mlx5.attr.qp_size;
2049 	mlx5_qp_attr.cap.max_recv_wr = 0;
2050 	mlx5_qp_attr.cap.max_send_sge = ACCEL_MLX5_MAX_SGE;
2051 	mlx5_qp_attr.cap.max_inline_data = sizeof(struct ibv_sge) * ACCEL_MLX5_MAX_SGE;
2052 
2053 	rc = spdk_mlx5_qp_create(dev->dev_ctx->pd, dev->cq, &mlx5_qp_attr, &qp->qp);
2054 	if (rc) {
2055 		return rc;
2056 	}
2057 
2058 	STAILQ_INIT(&qp->in_hw);
2059 	qp->dev = dev;
2060 	qp->verbs_qp = spdk_mlx5_qp_get_verbs_qp(qp->qp);
2061 	assert(qp->verbs_qp);
2062 	qp->wrs_max = g_accel_mlx5.attr.qp_size;
2063 
2064 	return 0;
2065 }
2066 
2067 static void
2068 accel_mlx5_add_stats(struct accel_mlx5_stats *stats, const struct accel_mlx5_stats *to_add)
2069 {
2070 	int i;
2071 
2072 	stats->crypto_umrs += to_add->crypto_umrs;
2073 	stats->sig_umrs += to_add->sig_umrs;
2074 	stats->rdma_reads += to_add->rdma_reads;
2075 	stats->rdma_writes += to_add->rdma_writes;
2076 	stats->polls += to_add->polls;
2077 	stats->idle_polls += to_add->idle_polls;
2078 	stats->completions += to_add->completions;
2079 	stats->nomem_qdepth += to_add->nomem_qdepth;
2080 	stats->nomem_mkey += to_add->nomem_mkey;
2081 	for (i = 0; i < ACCEL_MLX5_OPC_LAST; i++) {
2082 		stats->opcodes[i] += to_add->opcodes[i];
2083 	}
2084 }
2085 
2086 static void
2087 accel_mlx5_destroy_cb(void *io_device, void *ctx_buf)
2088 {
2089 	struct accel_mlx5_io_channel *ch = ctx_buf;
2090 	struct accel_mlx5_dev *dev;
2091 	uint32_t i;
2092 
2093 	spdk_poller_unregister(&ch->poller);
2094 	for (i = 0; i < ch->num_devs; i++) {
2095 		dev = &ch->devs[i];
2096 		spdk_mlx5_qp_destroy(dev->qp.qp);
2097 		if (dev->cq) {
2098 			spdk_mlx5_cq_destroy(dev->cq);
2099 		}
2100 		spdk_poller_unregister(&dev->qp.recover_poller);
2101 		if (dev->crypto_mkeys) {
2102 			spdk_mlx5_mkey_pool_put_ref(dev->crypto_mkeys);
2103 		}
2104 		if (dev->sig_mkeys) {
2105 			spdk_mlx5_mkey_pool_put_ref(dev->sig_mkeys);
2106 		}
2107 		spdk_rdma_utils_free_mem_map(&dev->mmap);
2108 		spdk_spin_lock(&g_accel_mlx5.lock);
2109 		accel_mlx5_add_stats(&g_accel_mlx5.stats, &dev->stats);
2110 		spdk_spin_unlock(&g_accel_mlx5.lock);
2111 	}
2112 	free(ch->devs);
2113 }
2114 
2115 static int
2116 accel_mlx5_create_cb(void *io_device, void *ctx_buf)
2117 {
2118 	struct spdk_mlx5_cq_attr cq_attr = {};
2119 	struct accel_mlx5_io_channel *ch = ctx_buf;
2120 	struct accel_mlx5_dev_ctx *dev_ctx;
2121 	struct accel_mlx5_dev *dev;
2122 	uint32_t i;
2123 	int rc;
2124 
2125 	ch->devs = calloc(g_accel_mlx5.num_ctxs, sizeof(*ch->devs));
2126 	if (!ch->devs) {
2127 		SPDK_ERRLOG("Memory allocation failed\n");
2128 		return -ENOMEM;
2129 	}
2130 
2131 	for (i = 0; i < g_accel_mlx5.num_ctxs; i++) {
2132 		dev_ctx = &g_accel_mlx5.dev_ctxs[i];
2133 		dev = &ch->devs[i];
2134 		dev->dev_ctx = dev_ctx;
2135 
2136 		if (dev_ctx->crypto_mkeys) {
2137 			dev->crypto_mkeys = spdk_mlx5_mkey_pool_get_ref(dev_ctx->pd, SPDK_MLX5_MKEY_POOL_FLAG_CRYPTO);
2138 			if (!dev->crypto_mkeys) {
2139 				SPDK_ERRLOG("Failed to get crypto mkey pool channel, dev %s\n", dev_ctx->context->device->name);
2140 				/* Should not happen since mkey pool is created on accel_mlx5 initialization.
2141 				 * We should not be here if pool creation failed */
2142 				assert(0);
2143 				goto err_out;
2144 			}
2145 		}
2146 		if (dev_ctx->sig_mkeys) {
2147 			dev->sig_mkeys = spdk_mlx5_mkey_pool_get_ref(dev_ctx->pd, SPDK_MLX5_MKEY_POOL_FLAG_SIGNATURE);
2148 			if (!dev->sig_mkeys) {
2149 				SPDK_ERRLOG("Failed to get sig mkey pool channel, dev %s\n", dev_ctx->context->device->name);
2150 				/* Should not happen since mkey pool is created on accel_mlx5 initialization.
2151 				 * We should not be here if pool creation failed */
2152 				assert(0);
2153 				goto err_out;
2154 			}
2155 		}
2156 
2157 		memset(&cq_attr, 0, sizeof(cq_attr));
2158 		cq_attr.cqe_cnt = g_accel_mlx5.attr.qp_size;
2159 		cq_attr.cqe_size = 64;
2160 		cq_attr.cq_context = dev;
2161 
2162 		ch->num_devs++;
2163 		rc = spdk_mlx5_cq_create(dev_ctx->pd, &cq_attr, &dev->cq);
2164 		if (rc) {
2165 			SPDK_ERRLOG("Failed to create mlx5 CQ, rc %d\n", rc);
2166 			goto err_out;
2167 		}
2168 
2169 		rc = accel_mlx5_create_qp(dev, &dev->qp);
2170 		if (rc) {
2171 			SPDK_ERRLOG("Failed to create mlx5 QP, rc %d\n", rc);
2172 			goto err_out;
2173 		}
2174 
2175 		dev->mmap = spdk_rdma_utils_create_mem_map(dev_ctx->pd, NULL,
2176 				IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE);
2177 		if (!dev->mmap) {
2178 			SPDK_ERRLOG("Failed to create memory map\n");
2179 			rc = -ENOMEM;
2180 			goto err_out;
2181 		}
2182 		dev->crypto_multi_block = dev_ctx->crypto_multi_block;
2183 		dev->crypto_split_blocks = dev_ctx->crypto_multi_block ? g_accel_mlx5.attr.crypto_split_blocks : 0;
2184 		dev->wrs_in_cq_max = g_accel_mlx5.attr.qp_size;
2185 		STAILQ_INIT(&dev->nomem);
2186 	}
2187 
2188 	ch->poller = SPDK_POLLER_REGISTER(accel_mlx5_poller, ch, 0);
2189 
2190 	return 0;
2191 
2192 err_out:
2193 	accel_mlx5_destroy_cb(&g_accel_mlx5, ctx_buf);
2194 	return rc;
2195 }
2196 
2197 void
2198 accel_mlx5_get_default_attr(struct accel_mlx5_attr *attr)
2199 {
2200 	assert(attr);
2201 
2202 	attr->qp_size = ACCEL_MLX5_QP_SIZE;
2203 	attr->num_requests = ACCEL_MLX5_NUM_REQUESTS;
2204 	attr->allowed_devs = NULL;
2205 	attr->crypto_split_blocks = 0;
2206 }
2207 
2208 static void
2209 accel_mlx5_allowed_devs_free(void)
2210 {
2211 	size_t i;
2212 
2213 	if (!g_accel_mlx5.allowed_devs) {
2214 		return;
2215 	}
2216 
2217 	for (i = 0; i < g_accel_mlx5.allowed_devs_count; i++) {
2218 		free(g_accel_mlx5.allowed_devs[i]);
2219 	}
2220 	free(g_accel_mlx5.attr.allowed_devs);
2221 	free(g_accel_mlx5.allowed_devs);
2222 	g_accel_mlx5.attr.allowed_devs = NULL;
2223 	g_accel_mlx5.allowed_devs = NULL;
2224 	g_accel_mlx5.allowed_devs_count = 0;
2225 }
2226 
2227 static int
2228 accel_mlx5_allowed_devs_parse(const char *allowed_devs)
2229 {
2230 	char *str, *tmp, *tok;
2231 	size_t devs_count = 0;
2232 
2233 	str = strdup(allowed_devs);
2234 	if (!str) {
2235 		return -ENOMEM;
2236 	}
2237 
2238 	accel_mlx5_allowed_devs_free();
2239 
2240 	tmp = str;
2241 	while ((tmp = strchr(tmp, ',')) != NULL) {
2242 		tmp++;
2243 		devs_count++;
2244 	}
2245 	devs_count++;
2246 
2247 	g_accel_mlx5.allowed_devs = calloc(devs_count, sizeof(char *));
2248 	if (!g_accel_mlx5.allowed_devs) {
2249 		free(str);
2250 		return -ENOMEM;
2251 	}
2252 
2253 	devs_count = 0;
2254 	tok = strtok(str, ",");
2255 	while (tok) {
2256 		g_accel_mlx5.allowed_devs[devs_count] = strdup(tok);
2257 		if (!g_accel_mlx5.allowed_devs[devs_count]) {
2258 			free(str);
2259 			accel_mlx5_allowed_devs_free();
2260 			return -ENOMEM;
2261 		}
2262 		tok = strtok(NULL, ",");
2263 		devs_count++;
2264 		g_accel_mlx5.allowed_devs_count++;
2265 	}
2266 
2267 	free(str);
2268 
2269 	return 0;
2270 }
2271 
2272 int
2273 accel_mlx5_enable(struct accel_mlx5_attr *attr)
2274 {
2275 	int rc;
2276 
2277 	if (g_accel_mlx5.enabled) {
2278 		return -EEXIST;
2279 	}
2280 	if (attr) {
2281 		if (attr->num_requests / spdk_env_get_core_count() < ACCEL_MLX5_MAX_MKEYS_IN_TASK) {
2282 			SPDK_ERRLOG("num requests per core must not be less than %u, current value %u\n",
2283 				    ACCEL_MLX5_MAX_MKEYS_IN_TASK, attr->num_requests / spdk_env_get_core_count());
2284 			return -EINVAL;
2285 		}
2286 		if (attr->qp_size < 8) {
2287 			SPDK_ERRLOG("qp_size must be at least 8\n");
2288 			return -EINVAL;
2289 		}
2290 		g_accel_mlx5.attr = *attr;
2291 		g_accel_mlx5.attr.allowed_devs = NULL;
2292 
2293 		if (attr->allowed_devs) {
2294 			/* Contains a copy of user's string */
2295 			g_accel_mlx5.attr.allowed_devs = strndup(attr->allowed_devs, ACCEL_MLX5_ALLOWED_DEVS_MAX_LEN);
2296 			if (!g_accel_mlx5.attr.allowed_devs) {
2297 				return -ENOMEM;
2298 			}
2299 			rc = accel_mlx5_allowed_devs_parse(g_accel_mlx5.attr.allowed_devs);
2300 			if (rc) {
2301 				return rc;
2302 			}
2303 			rc = spdk_mlx5_crypto_devs_allow((const char *const *)g_accel_mlx5.allowed_devs,
2304 							 g_accel_mlx5.allowed_devs_count);
2305 			if (rc) {
2306 				accel_mlx5_allowed_devs_free();
2307 				return rc;
2308 			}
2309 		}
2310 	} else {
2311 		accel_mlx5_get_default_attr(&g_accel_mlx5.attr);
2312 	}
2313 
2314 	g_accel_mlx5.enabled = true;
2315 	spdk_accel_module_list_add(&g_accel_mlx5.module);
2316 
2317 	return 0;
2318 }
2319 
2320 static void
2321 accel_mlx5_psvs_release(struct accel_mlx5_dev_ctx *dev_ctx)
2322 {
2323 	uint32_t i, num_psvs, num_psvs_in_pool;
2324 
2325 	if (!dev_ctx->psvs) {
2326 		return;
2327 	}
2328 
2329 	num_psvs = g_accel_mlx5.attr.num_requests;
2330 
2331 	for (i = 0; i < num_psvs; i++) {
2332 		if (dev_ctx->psvs[i]) {
2333 			spdk_mlx5_destroy_psv(dev_ctx->psvs[i]);
2334 			dev_ctx->psvs[i] = NULL;
2335 		}
2336 	}
2337 	free(dev_ctx->psvs);
2338 
2339 	if (!dev_ctx->psv_pool) {
2340 		return;
2341 	}
2342 	num_psvs_in_pool = spdk_mempool_count(dev_ctx->psv_pool);
2343 	if (num_psvs_in_pool != num_psvs) {
2344 		SPDK_ERRLOG("Expected %u reqs in the pool, but got only %u\n", num_psvs, num_psvs_in_pool);
2345 	}
2346 	spdk_mempool_free(dev_ctx->psv_pool);
2347 }
2348 
2349 static void
2350 accel_mlx5_free_resources(void)
2351 {
2352 	struct accel_mlx5_dev_ctx *dev_ctx;
2353 	uint32_t i;
2354 
2355 	for (i = 0; i < g_accel_mlx5.num_ctxs; i++) {
2356 		dev_ctx = &g_accel_mlx5.dev_ctxs[i];
2357 		accel_mlx5_psvs_release(dev_ctx);
2358 		if (dev_ctx->pd) {
2359 			if (dev_ctx->crypto_mkeys) {
2360 				spdk_mlx5_mkey_pool_destroy(SPDK_MLX5_MKEY_POOL_FLAG_CRYPTO, dev_ctx->pd);
2361 			}
2362 			if (dev_ctx->sig_mkeys) {
2363 				spdk_mlx5_mkey_pool_destroy(SPDK_MLX5_MKEY_POOL_FLAG_SIGNATURE, dev_ctx->pd);
2364 			}
2365 			spdk_rdma_utils_put_pd(dev_ctx->pd);
2366 		}
2367 		if (dev_ctx->domain) {
2368 			spdk_rdma_utils_put_memory_domain(dev_ctx->domain);
2369 		}
2370 	}
2371 
2372 	free(g_accel_mlx5.dev_ctxs);
2373 	g_accel_mlx5.dev_ctxs = NULL;
2374 	g_accel_mlx5.initialized = false;
2375 }
2376 
2377 static void
2378 accel_mlx5_deinit_cb(void *ctx)
2379 {
2380 	accel_mlx5_free_resources();
2381 	spdk_spin_destroy(&g_accel_mlx5.lock);
2382 	spdk_accel_module_finish();
2383 }
2384 
2385 static void
2386 accel_mlx5_deinit(void *ctx)
2387 {
2388 	if (g_accel_mlx5.allowed_devs) {
2389 		accel_mlx5_allowed_devs_free();
2390 	}
2391 	spdk_mlx5_crypto_devs_allow(NULL, 0);
2392 	if (g_accel_mlx5.initialized) {
2393 		spdk_io_device_unregister(&g_accel_mlx5, accel_mlx5_deinit_cb);
2394 	} else {
2395 		spdk_accel_module_finish();
2396 	}
2397 }
2398 
2399 static int
2400 accel_mlx5_mkeys_create(struct ibv_pd *pd, uint32_t num_mkeys, uint32_t flags)
2401 {
2402 	struct spdk_mlx5_mkey_pool_param pool_param = {};
2403 
2404 	pool_param.mkey_count = num_mkeys;
2405 	pool_param.cache_per_thread = num_mkeys * 3 / 4 / spdk_env_get_core_count();
2406 	pool_param.flags = flags;
2407 
2408 	return spdk_mlx5_mkey_pool_init(&pool_param, pd);
2409 }
2410 
2411 static void
2412 accel_mlx5_set_psv_in_pool(struct spdk_mempool *mp, void *cb_arg, void *_psv, unsigned obj_idx)
2413 {
2414 	struct spdk_rdma_utils_memory_translation translation = {};
2415 	struct accel_mlx5_psv_pool_iter_cb_args *args = cb_arg;
2416 	struct accel_mlx5_psv_wrapper *wrapper = _psv;
2417 	struct accel_mlx5_dev_ctx *dev_ctx = args->dev;
2418 	int rc;
2419 
2420 	if (args->rc) {
2421 		return;
2422 	}
2423 	assert(obj_idx < g_accel_mlx5.attr.num_requests);
2424 	assert(dev_ctx->psvs[obj_idx] != NULL);
2425 	memset(wrapper, 0, sizeof(*wrapper));
2426 	wrapper->psv_index = dev_ctx->psvs[obj_idx]->index;
2427 
2428 	rc = spdk_rdma_utils_get_translation(args->map, &wrapper->crc, sizeof(uint32_t), &translation);
2429 	if (rc) {
2430 		SPDK_ERRLOG("Memory translation failed, addr %p, length %zu\n", &wrapper->crc, sizeof(uint32_t));
2431 		args->rc = -EINVAL;
2432 	} else {
2433 		wrapper->crc_lkey = spdk_rdma_utils_memory_translation_get_lkey(&translation);
2434 	}
2435 }
2436 
2437 static int
2438 accel_mlx5_psvs_create(struct accel_mlx5_dev_ctx *dev_ctx)
2439 {
2440 	struct accel_mlx5_psv_pool_iter_cb_args args = {
2441 		.dev = dev_ctx
2442 	};
2443 	char pool_name[32];
2444 	uint32_t i;
2445 	uint32_t num_psvs = g_accel_mlx5.attr.num_requests;
2446 	uint32_t cache_size;
2447 	int rc;
2448 
2449 	dev_ctx->psvs = calloc(num_psvs, (sizeof(struct spdk_mlx5_psv *)));
2450 	if (!dev_ctx->psvs) {
2451 		SPDK_ERRLOG("Failed to alloc PSVs array\n");
2452 		return -ENOMEM;
2453 	}
2454 	for (i = 0; i < num_psvs; i++) {
2455 		dev_ctx->psvs[i] = spdk_mlx5_create_psv(dev_ctx->pd);
2456 		if (!dev_ctx->psvs[i]) {
2457 			SPDK_ERRLOG("Failed to create PSV on dev %s\n", dev_ctx->context->device->name);
2458 			return -EINVAL;
2459 		}
2460 	}
2461 
2462 	rc = snprintf(pool_name, sizeof(pool_name), "accel_psv_%s", dev_ctx->context->device->name);
2463 	if (rc < 0) {
2464 		assert(0);
2465 		return -EINVAL;
2466 	}
2467 	cache_size = num_psvs * 3 / 4 / spdk_env_get_core_count();
2468 	args.map = spdk_rdma_utils_create_mem_map(dev_ctx->pd, NULL,
2469 			IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE);
2470 	if (!args.map) {
2471 		return -ENOMEM;
2472 	}
2473 	dev_ctx->psv_pool = spdk_mempool_create_ctor(pool_name, num_psvs,
2474 			    sizeof(struct accel_mlx5_psv_wrapper),
2475 			    cache_size, SPDK_ENV_SOCKET_ID_ANY,
2476 			    accel_mlx5_set_psv_in_pool, &args);
2477 	spdk_rdma_utils_free_mem_map(&args.map);
2478 	if (!dev_ctx->psv_pool) {
2479 		SPDK_ERRLOG("Failed to create PSV memory pool\n");
2480 		return -ENOMEM;
2481 	}
2482 	if (args.rc) {
2483 		SPDK_ERRLOG("Failed to init PSV memory pool objects, rc %d\n", args.rc);
2484 		return args.rc;
2485 	}
2486 
2487 	return 0;
2488 }
2489 
2490 
2491 static int
2492 accel_mlx5_dev_ctx_init(struct accel_mlx5_dev_ctx *dev_ctx, struct ibv_context *dev,
2493 			struct spdk_mlx5_device_caps *caps)
2494 {
2495 	struct ibv_pd *pd;
2496 	int rc;
2497 
2498 	pd = spdk_rdma_utils_get_pd(dev);
2499 	if (!pd) {
2500 		SPDK_ERRLOG("Failed to get PD for context %p, dev %s\n", dev, dev->device->name);
2501 		return -EINVAL;
2502 	}
2503 	dev_ctx->context = dev;
2504 	dev_ctx->pd = pd;
2505 	dev_ctx->domain = spdk_rdma_utils_get_memory_domain(pd);
2506 	if (!dev_ctx->domain) {
2507 		return -ENOMEM;
2508 	}
2509 
2510 	if (g_accel_mlx5.crypto_supported) {
2511 		dev_ctx->crypto_multi_block = caps->crypto.multi_block_be_tweak;
2512 		if (!dev_ctx->crypto_multi_block && g_accel_mlx5.attr.crypto_split_blocks) {
2513 			SPDK_WARNLOG("\"crypto_split_blocks\" is set but dev %s doesn't support multi block crypto\n",
2514 				     dev->device->name);
2515 		}
2516 		rc = accel_mlx5_mkeys_create(pd, g_accel_mlx5.attr.num_requests, SPDK_MLX5_MKEY_POOL_FLAG_CRYPTO);
2517 		if (rc) {
2518 			SPDK_ERRLOG("Failed to create crypto mkeys pool, rc %d, dev %s\n", rc, dev->device->name);
2519 			return rc;
2520 		}
2521 		dev_ctx->crypto_mkeys = true;
2522 	}
2523 	if (g_accel_mlx5.crc32c_supported) {
2524 		rc = accel_mlx5_mkeys_create(pd, g_accel_mlx5.attr.num_requests,
2525 					     SPDK_MLX5_MKEY_POOL_FLAG_SIGNATURE);
2526 		if (rc) {
2527 			SPDK_ERRLOG("Failed to create signature mkeys pool, rc %d, dev %s\n", rc, dev->device->name);
2528 			return rc;
2529 		}
2530 		dev_ctx->sig_mkeys = true;
2531 		rc = accel_mlx5_psvs_create(dev_ctx);
2532 		if (rc) {
2533 			SPDK_ERRLOG("Failed to create PSVs pool, rc %d, dev %s\n", rc, dev->device->name);
2534 			return rc;
2535 		}
2536 	}
2537 
2538 	return 0;
2539 }
2540 
2541 static struct ibv_context **
2542 accel_mlx5_get_devices(int *_num_devs)
2543 {
2544 	struct ibv_context **rdma_devs, **rdma_devs_out = NULL, *dev;
2545 	struct ibv_device_attr dev_attr;
2546 	size_t j;
2547 	int num_devs = 0, i, rc;
2548 	int num_devs_out = 0;
2549 	bool dev_allowed;
2550 
2551 	rdma_devs = rdma_get_devices(&num_devs);
2552 	if (!rdma_devs || !num_devs) {
2553 		*_num_devs = 0;
2554 		return NULL;
2555 	}
2556 
2557 	rdma_devs_out = calloc(num_devs + 1, sizeof(struct ibv_context *));
2558 	if (!rdma_devs_out) {
2559 		SPDK_ERRLOG("Memory allocation failed\n");
2560 		rdma_free_devices(rdma_devs);
2561 		*_num_devs = 0;
2562 		return NULL;
2563 	}
2564 
2565 	for (i = 0; i < num_devs; i++) {
2566 		dev = rdma_devs[i];
2567 		rc = ibv_query_device(dev, &dev_attr);
2568 		if (rc) {
2569 			SPDK_ERRLOG("Failed to query dev %s, skipping\n", dev->device->name);
2570 			continue;
2571 		}
2572 		if (dev_attr.vendor_id != SPDK_MLX5_VENDOR_ID_MELLANOX) {
2573 			SPDK_DEBUGLOG(accel_mlx5, "dev %s is not Mellanox device, skipping\n", dev->device->name);
2574 			continue;
2575 		}
2576 
2577 		if (g_accel_mlx5.allowed_devs_count) {
2578 			dev_allowed = false;
2579 			for (j = 0; j < g_accel_mlx5.allowed_devs_count; j++) {
2580 				if (strcmp(g_accel_mlx5.allowed_devs[j], dev->device->name) == 0) {
2581 					dev_allowed = true;
2582 					break;
2583 				}
2584 			}
2585 			if (!dev_allowed) {
2586 				continue;
2587 			}
2588 		}
2589 
2590 		rdma_devs_out[num_devs_out] = dev;
2591 		num_devs_out++;
2592 	}
2593 
2594 	rdma_free_devices(rdma_devs);
2595 	*_num_devs = num_devs_out;
2596 
2597 	return rdma_devs_out;
2598 }
2599 
2600 static inline bool
2601 accel_mlx5_dev_supports_crypto(struct spdk_mlx5_device_caps *caps)
2602 {
2603 	return caps->crypto_supported && !caps->crypto.wrapped_import_method_aes_xts &&
2604 	       (caps->crypto.single_block_le_tweak ||
2605 		caps->crypto.multi_block_le_tweak || caps->crypto.multi_block_be_tweak);
2606 }
2607 
2608 static int
2609 accel_mlx5_init(void)
2610 {
2611 	struct spdk_mlx5_device_caps *caps;
2612 	struct ibv_context **rdma_devs, *dev;
2613 	int num_devs = 0,  rc = 0, i;
2614 	int best_dev = -1, first_dev = 0;
2615 	int best_dev_stat = 0, dev_stat;
2616 	bool supports_crypto;
2617 	bool find_best_dev = g_accel_mlx5.allowed_devs_count == 0;
2618 
2619 	if (!g_accel_mlx5.enabled) {
2620 		return -EINVAL;
2621 	}
2622 
2623 	spdk_spin_init(&g_accel_mlx5.lock);
2624 	rdma_devs = accel_mlx5_get_devices(&num_devs);
2625 	if (!rdma_devs || !num_devs) {
2626 		return -ENODEV;
2627 	}
2628 	caps = calloc(num_devs, sizeof(*caps));
2629 	if (!caps) {
2630 		rc = -ENOMEM;
2631 		goto cleanup;
2632 	}
2633 
2634 	g_accel_mlx5.crypto_supported = true;
2635 	g_accel_mlx5.crc32c_supported = true;
2636 	g_accel_mlx5.num_ctxs = 0;
2637 
2638 	/* Iterate devices. We support an offload if all devices support it */
2639 	for (i = 0; i < num_devs; i++) {
2640 		dev = rdma_devs[i];
2641 
2642 		rc = spdk_mlx5_device_query_caps(dev, &caps[i]);
2643 		if (rc) {
2644 			SPDK_ERRLOG("Failed to get crypto caps, dev %s\n", dev->device->name);
2645 			goto cleanup;
2646 		}
2647 		supports_crypto = accel_mlx5_dev_supports_crypto(&caps[i]);
2648 		if (!supports_crypto) {
2649 			SPDK_DEBUGLOG(accel_mlx5, "Disable crypto support because dev %s doesn't support it\n",
2650 				      rdma_devs[i]->device->name);
2651 			g_accel_mlx5.crypto_supported = false;
2652 		}
2653 		if (!caps[i].crc32c_supported) {
2654 			SPDK_DEBUGLOG(accel_mlx5, "Disable crc32c support because dev %s doesn't support it\n",
2655 				      rdma_devs[i]->device->name);
2656 			g_accel_mlx5.crc32c_supported = false;
2657 		}
2658 		if (find_best_dev) {
2659 			/* Find device which supports max number of offloads */
2660 			dev_stat = (int)supports_crypto + (int)caps[i].crc32c_supported;
2661 			if (dev_stat > best_dev_stat) {
2662 				best_dev_stat = dev_stat;
2663 				best_dev = i;
2664 			}
2665 		}
2666 	}
2667 
2668 	/* User didn't specify devices to use, try to select the best one */
2669 	if (find_best_dev) {
2670 		if (best_dev == -1) {
2671 			best_dev = 0;
2672 		}
2673 		g_accel_mlx5.crypto_supported = accel_mlx5_dev_supports_crypto(&caps[best_dev]);
2674 		g_accel_mlx5.crc32c_supported = caps[best_dev].crc32c_supported;
2675 		SPDK_NOTICELOG("Select dev %s, crypto %d, crc32c %d\n", rdma_devs[best_dev]->device->name,
2676 			       g_accel_mlx5.crypto_supported, g_accel_mlx5.crc32c_supported);
2677 		first_dev = best_dev;
2678 		num_devs = 1;
2679 		if (g_accel_mlx5.crypto_supported) {
2680 			const char *const dev_name[] = { rdma_devs[best_dev]->device->name };
2681 			/* Let mlx5 library know which device to use */
2682 			spdk_mlx5_crypto_devs_allow(dev_name, 1);
2683 		}
2684 	} else {
2685 		SPDK_NOTICELOG("Found %d devices, crypto %d\n", num_devs, g_accel_mlx5.crypto_supported);
2686 	}
2687 
2688 	g_accel_mlx5.dev_ctxs = calloc(num_devs, sizeof(*g_accel_mlx5.dev_ctxs));
2689 	if (!g_accel_mlx5.dev_ctxs) {
2690 		SPDK_ERRLOG("Memory allocation failed\n");
2691 		rc = -ENOMEM;
2692 		goto cleanup;
2693 	}
2694 
2695 	for (i = first_dev; i < first_dev + num_devs; i++) {
2696 		rc = accel_mlx5_dev_ctx_init(&g_accel_mlx5.dev_ctxs[g_accel_mlx5.num_ctxs++],
2697 					     rdma_devs[i], &caps[i]);
2698 		if (rc) {
2699 			goto cleanup;
2700 		}
2701 	}
2702 
2703 	SPDK_NOTICELOG("Accel framework mlx5 initialized, found %d devices.\n", num_devs);
2704 	spdk_io_device_register(&g_accel_mlx5, accel_mlx5_create_cb, accel_mlx5_destroy_cb,
2705 				sizeof(struct accel_mlx5_io_channel), "accel_mlx5");
2706 	g_accel_mlx5.initialized = true;
2707 	free(rdma_devs);
2708 	free(caps);
2709 
2710 	return 0;
2711 
2712 cleanup:
2713 	free(rdma_devs);
2714 	free(caps);
2715 	accel_mlx5_free_resources();
2716 	spdk_spin_destroy(&g_accel_mlx5.lock);
2717 
2718 	return rc;
2719 }
2720 
2721 static void
2722 accel_mlx5_write_config_json(struct spdk_json_write_ctx *w)
2723 {
2724 	if (g_accel_mlx5.enabled) {
2725 		spdk_json_write_object_begin(w);
2726 		spdk_json_write_named_string(w, "method", "mlx5_scan_accel_module");
2727 		spdk_json_write_named_object_begin(w, "params");
2728 		spdk_json_write_named_uint16(w, "qp_size", g_accel_mlx5.attr.qp_size);
2729 		spdk_json_write_named_uint32(w, "num_requests", g_accel_mlx5.attr.num_requests);
2730 		if (g_accel_mlx5.attr.allowed_devs) {
2731 			spdk_json_write_named_string(w, "allowed_devs", g_accel_mlx5.attr.allowed_devs);
2732 		}
2733 		spdk_json_write_named_uint16(w, "crypto_split_blocks", g_accel_mlx5.attr.crypto_split_blocks);
2734 		spdk_json_write_object_end(w);
2735 		spdk_json_write_object_end(w);
2736 	}
2737 }
2738 
2739 static size_t
2740 accel_mlx5_get_ctx_size(void)
2741 {
2742 	return sizeof(struct accel_mlx5_task);
2743 }
2744 
2745 static int
2746 accel_mlx5_crypto_key_init(struct spdk_accel_crypto_key *key)
2747 {
2748 	struct spdk_mlx5_crypto_dek_create_attr attr = {};
2749 	struct spdk_mlx5_crypto_keytag *keytag;
2750 	int rc;
2751 
2752 	if (!key || !key->key || !key->key2 || !key->key_size || !key->key2_size) {
2753 		return -EINVAL;
2754 	}
2755 
2756 	attr.dek = calloc(1, key->key_size + key->key2_size);
2757 	if (!attr.dek) {
2758 		return -ENOMEM;
2759 	}
2760 
2761 	memcpy(attr.dek, key->key, key->key_size);
2762 	memcpy(attr.dek + key->key_size, key->key2, key->key2_size);
2763 	attr.dek_len = key->key_size + key->key2_size;
2764 
2765 	rc = spdk_mlx5_crypto_keytag_create(&attr, &keytag);
2766 	spdk_memset_s(attr.dek, attr.dek_len, 0, attr.dek_len);
2767 	free(attr.dek);
2768 	if (rc) {
2769 		SPDK_ERRLOG("Failed to create a keytag, rc %d\n", rc);
2770 		return rc;
2771 	}
2772 
2773 	key->priv = keytag;
2774 
2775 	return 0;
2776 }
2777 
2778 static void
2779 accel_mlx5_crypto_key_deinit(struct spdk_accel_crypto_key *key)
2780 {
2781 	if (!key || key->module_if != &g_accel_mlx5.module || !key->priv) {
2782 		return;
2783 	}
2784 
2785 	spdk_mlx5_crypto_keytag_destroy(key->priv);
2786 }
2787 
2788 static void
2789 accel_mlx5_dump_stats_json(struct spdk_json_write_ctx *w, const char *header,
2790 			   const struct accel_mlx5_stats *stats)
2791 {
2792 	double idle_polls_percentage = 0;
2793 	double cpls_per_poll = 0;
2794 	uint64_t total_tasks = 0;
2795 	int i;
2796 
2797 	if (stats->polls) {
2798 		idle_polls_percentage = (double) stats->idle_polls * 100 / stats->polls;
2799 	}
2800 	if (stats->polls > stats->idle_polls) {
2801 		cpls_per_poll = (double) stats->completions / (stats->polls - stats->idle_polls);
2802 	}
2803 	for (i = 0; i < ACCEL_MLX5_OPC_LAST; i++) {
2804 		total_tasks += stats->opcodes[i];
2805 	}
2806 
2807 	spdk_json_write_named_object_begin(w, header);
2808 
2809 	spdk_json_write_named_object_begin(w, "umrs");
2810 	spdk_json_write_named_uint64(w, "crypto_umrs", stats->crypto_umrs);
2811 	spdk_json_write_named_uint64(w, "sig_umrs", stats->sig_umrs);
2812 	spdk_json_write_named_uint64(w, "total", stats->crypto_umrs + stats->sig_umrs);
2813 	spdk_json_write_object_end(w);
2814 
2815 	spdk_json_write_named_object_begin(w, "rdma");
2816 	spdk_json_write_named_uint64(w, "read", stats->rdma_reads);
2817 	spdk_json_write_named_uint64(w, "write", stats->rdma_writes);
2818 	spdk_json_write_named_uint64(w, "total", stats->rdma_reads + stats->rdma_writes);
2819 	spdk_json_write_object_end(w);
2820 
2821 	spdk_json_write_named_object_begin(w, "polling");
2822 	spdk_json_write_named_uint64(w, "polls", stats->polls);
2823 	spdk_json_write_named_uint64(w, "idle_polls", stats->idle_polls);
2824 	spdk_json_write_named_uint64(w, "completions", stats->completions);
2825 	spdk_json_write_named_double(w, "idle_polls_percentage", idle_polls_percentage);
2826 	spdk_json_write_named_double(w, "cpls_per_poll", cpls_per_poll);
2827 	spdk_json_write_named_uint64(w, "nomem_qdepth", stats->nomem_qdepth);
2828 	spdk_json_write_named_uint64(w, "nomem_mkey", stats->nomem_mkey);
2829 	spdk_json_write_object_end(w);
2830 
2831 	spdk_json_write_named_object_begin(w, "tasks");
2832 	spdk_json_write_named_uint64(w, "copy", stats->opcodes[ACCEL_MLX5_OPC_COPY]);
2833 	spdk_json_write_named_uint64(w, "crypto", stats->opcodes[ACCEL_MLX5_OPC_CRYPTO]);
2834 	spdk_json_write_named_uint64(w, "crc32c", stats->opcodes[ACCEL_MLX5_OPC_CRC32C]);
2835 	spdk_json_write_named_uint64(w, "total", total_tasks);
2836 	spdk_json_write_object_end(w);
2837 
2838 	spdk_json_write_object_end(w);
2839 }
2840 
2841 static void
2842 accel_mlx5_dump_channel_stat(struct spdk_io_channel_iter *i)
2843 {
2844 	struct accel_mlx5_stats ch_stat = {};
2845 	struct accel_mlx5_dump_stats_ctx *ctx;
2846 	struct spdk_io_channel *_ch;
2847 	struct accel_mlx5_io_channel *ch;
2848 	struct accel_mlx5_dev *dev;
2849 	uint32_t j;
2850 
2851 	ctx = spdk_io_channel_iter_get_ctx(i);
2852 	_ch = spdk_io_channel_iter_get_channel(i);
2853 	ch = spdk_io_channel_get_ctx(_ch);
2854 
2855 	if (ctx->level != ACCEL_MLX5_DUMP_STAT_LEVEL_TOTAL) {
2856 		spdk_json_write_object_begin(ctx->w);
2857 		spdk_json_write_named_object_begin(ctx->w, spdk_thread_get_name(spdk_get_thread()));
2858 	}
2859 	if (ctx->level == ACCEL_MLX5_DUMP_STAT_LEVEL_DEV) {
2860 		spdk_json_write_named_array_begin(ctx->w, "devices");
2861 	}
2862 
2863 	for (j = 0; j < ch->num_devs; j++) {
2864 		dev = &ch->devs[j];
2865 		/* Save grand total and channel stats */
2866 		accel_mlx5_add_stats(&ctx->total, &dev->stats);
2867 		accel_mlx5_add_stats(&ch_stat, &dev->stats);
2868 		if (ctx->level == ACCEL_MLX5_DUMP_STAT_LEVEL_DEV) {
2869 			spdk_json_write_object_begin(ctx->w);
2870 			accel_mlx5_dump_stats_json(ctx->w, dev->dev_ctx->context->device->name, &dev->stats);
2871 			spdk_json_write_object_end(ctx->w);
2872 		}
2873 	}
2874 
2875 	if (ctx->level == ACCEL_MLX5_DUMP_STAT_LEVEL_DEV) {
2876 		spdk_json_write_array_end(ctx->w);
2877 	}
2878 	if (ctx->level != ACCEL_MLX5_DUMP_STAT_LEVEL_TOTAL) {
2879 		accel_mlx5_dump_stats_json(ctx->w, "channel_total", &ch_stat);
2880 		spdk_json_write_object_end(ctx->w);
2881 		spdk_json_write_object_end(ctx->w);
2882 	}
2883 
2884 	spdk_for_each_channel_continue(i, 0);
2885 }
2886 
2887 static void
2888 accel_mlx5_dump_channel_stat_done(struct spdk_io_channel_iter *i, int status)
2889 {
2890 	struct accel_mlx5_dump_stats_ctx *ctx;
2891 
2892 	ctx = spdk_io_channel_iter_get_ctx(i);
2893 
2894 	spdk_spin_lock(&g_accel_mlx5.lock);
2895 	/* Add statistics from destroyed channels */
2896 	accel_mlx5_add_stats(&ctx->total, &g_accel_mlx5.stats);
2897 	spdk_spin_unlock(&g_accel_mlx5.lock);
2898 
2899 	if (ctx->level != ACCEL_MLX5_DUMP_STAT_LEVEL_TOTAL) {
2900 		/* channels[] */
2901 		spdk_json_write_array_end(ctx->w);
2902 	}
2903 
2904 	accel_mlx5_dump_stats_json(ctx->w, "total", &ctx->total);
2905 
2906 	/* Ends the whole response which was begun in accel_mlx5_dump_stats */
2907 	spdk_json_write_object_end(ctx->w);
2908 
2909 	ctx->cb(ctx->ctx, 0);
2910 	free(ctx);
2911 }
2912 
2913 int
2914 accel_mlx5_dump_stats(struct spdk_json_write_ctx *w, enum accel_mlx5_dump_state_level level,
2915 		      accel_mlx5_dump_stat_done_cb cb, void *ctx)
2916 {
2917 	struct accel_mlx5_dump_stats_ctx *stat_ctx;
2918 
2919 	if (!w || !cb) {
2920 		return -EINVAL;
2921 	}
2922 	if (!g_accel_mlx5.initialized) {
2923 		return -ENODEV;
2924 	}
2925 
2926 	stat_ctx = calloc(1, sizeof(*stat_ctx));
2927 	if (!stat_ctx) {
2928 		return -ENOMEM;
2929 	}
2930 	stat_ctx->cb = cb;
2931 	stat_ctx->ctx = ctx;
2932 	stat_ctx->level = level;
2933 	stat_ctx->w = w;
2934 
2935 	spdk_json_write_object_begin(w);
2936 
2937 	if (level != ACCEL_MLX5_DUMP_STAT_LEVEL_TOTAL) {
2938 		spdk_json_write_named_array_begin(w, "channels");
2939 	}
2940 
2941 	spdk_for_each_channel(&g_accel_mlx5, accel_mlx5_dump_channel_stat, stat_ctx,
2942 			      accel_mlx5_dump_channel_stat_done);
2943 
2944 	return 0;
2945 }
2946 
2947 static bool
2948 accel_mlx5_crypto_supports_cipher(enum spdk_accel_cipher cipher, size_t key_size)
2949 {
2950 	switch (cipher) {
2951 	case SPDK_ACCEL_CIPHER_AES_XTS:
2952 		return key_size == SPDK_ACCEL_AES_XTS_128_KEY_SIZE || key_size == SPDK_ACCEL_AES_XTS_256_KEY_SIZE;
2953 	default:
2954 		return false;
2955 	}
2956 }
2957 
2958 static int
2959 accel_mlx5_get_memory_domains(struct spdk_memory_domain **domains, int array_size)
2960 {
2961 	int i, size;
2962 
2963 	if (!domains || !array_size) {
2964 		return (int)g_accel_mlx5.num_ctxs;
2965 	}
2966 
2967 	size = spdk_min(array_size, (int)g_accel_mlx5.num_ctxs);
2968 
2969 	for (i = 0; i < size; i++) {
2970 		domains[i] = g_accel_mlx5.dev_ctxs[i].domain;
2971 	}
2972 
2973 	return (int)g_accel_mlx5.num_ctxs;
2974 }
2975 
2976 static struct accel_mlx5_module g_accel_mlx5 = {
2977 	.module = {
2978 		.module_init		= accel_mlx5_init,
2979 		.module_fini		= accel_mlx5_deinit,
2980 		.write_config_json	= accel_mlx5_write_config_json,
2981 		.get_ctx_size		= accel_mlx5_get_ctx_size,
2982 		.name			= "mlx5",
2983 		.supports_opcode	= accel_mlx5_supports_opcode,
2984 		.get_io_channel		= accel_mlx5_get_io_channel,
2985 		.submit_tasks		= accel_mlx5_submit_tasks,
2986 		.crypto_key_init	= accel_mlx5_crypto_key_init,
2987 		.crypto_key_deinit	= accel_mlx5_crypto_key_deinit,
2988 		.crypto_supports_cipher	= accel_mlx5_crypto_supports_cipher,
2989 		.get_memory_domains	= accel_mlx5_get_memory_domains,
2990 	}
2991 };
2992 
/*
 * Log component declaration for "accel_mlx5".
 * NOTE(review): presumably this is what makes the component selectable in the
 * SPDK logging facility - confirm against spdk/log.h.
 */
SPDK_LOG_REGISTER_COMPONENT(accel_mlx5)
2994