xref: /spdk/module/bdev/xnvme/bdev_xnvme.c (revision 2e1d23f4b70ea8940db7624b3bb974a4a8658ec7)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2022 Intel Corporation.
 *   Copyright (c) Samsung Electronics Co., Ltd.
 *   All rights reserved.
 */

#include "libxnvme.h"

#include "bdev_xnvme.h"

#include "spdk/stdinc.h"

#include "spdk/barrier.h"
#include "spdk/bdev.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk/log.h"

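/*
 * Per-I/O-channel state: one xNVMe command queue plus the SPDK poller that
 * reaps its completions.
 */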
struct bdev_xnvme_io_channel {
	struct xnvme_queue	*queue;
	struct spdk_poller	*poller;
};

struct bdev_xnvme_task {
	struct bdev_xnvme_io_channel *ch;
	TAILQ_ENTRY(bdev_xnvme_task) link;
};

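/*
 * One xNVMe-backed bdev: the exported spdk_bdev, the device handle returned by
 * xnvme_dev_open(), and the creation parameters that bdev_xnvme_config_json()
 * re-emits when the configuration is saved.
 */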
struct bdev_xnvme {
	struct spdk_bdev	bdev;
	char			*filename;
	char			*io_mechanism;
	struct xnvme_dev	*dev;
	uint32_t		nsid;
	bool			conserve_cpu;

	TAILQ_ENTRY(bdev_xnvme) link;
};

static int bdev_xnvme_init(void);
static void bdev_xnvme_fini(void);
static void bdev_xnvme_free(struct bdev_xnvme *xnvme);
static TAILQ_HEAD(, bdev_xnvme) g_xnvme_bdev_head = TAILQ_HEAD_INITIALIZER(g_xnvme_bdev_head);

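/* Size of the per-I/O driver context that the bdev layer reserves in bdev_io->driver_ctx. */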
static int
bdev_xnvme_get_ctx_size(void)
{
	return sizeof(struct bdev_xnvme_task);
}

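/*
 * Emit one bdev_xnvme_create RPC call per registered bdev so that the current
 * configuration can be saved and replayed.
 */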
static int
bdev_xnvme_config_json(struct spdk_json_write_ctx *w)
{
	struct bdev_xnvme *xnvme;

	TAILQ_FOREACH(xnvme, &g_xnvme_bdev_head, link) {
		spdk_json_write_object_begin(w);

		spdk_json_write_named_string(w, "method", "bdev_xnvme_create");

		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", xnvme->bdev.name);
		spdk_json_write_named_string(w, "filename", xnvme->filename);
		spdk_json_write_named_string(w, "io_mechanism", xnvme->io_mechanism);
		spdk_json_write_named_bool(w, "conserve_cpu", xnvme->conserve_cpu);
		spdk_json_write_object_end(w);

		spdk_json_write_object_end(w);
	}

	return 0;
}

static struct spdk_bdev_module xnvme_if = {
	.name		= "xnvme",
	.module_init	= bdev_xnvme_init,
	.module_fini	= bdev_xnvme_fini,
	.get_ctx_size	= bdev_xnvme_get_ctx_size,
	.config_json	= bdev_xnvme_config_json,
};

SPDK_BDEV_MODULE_REGISTER(xnvme, &xnvme_if)

static struct spdk_io_channel *
bdev_xnvme_get_io_channel(void *ctx)
{
	struct bdev_xnvme *xnvme = ctx;

	return spdk_get_io_channel(xnvme);
}

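/*
 * Reads and writes are always supported; write-zeroes and unmap are supported
 * only when the io_uring_cmd mechanism drives an NVM command-set namespace.
 */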
static bool
bdev_xnvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct bdev_xnvme *xnvme = ctx;

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
	case SPDK_BDEV_IO_TYPE_UNMAP:
		/* libaio and io_uring only support read and write */
		return !strcmp(xnvme->io_mechanism, "io_uring_cmd") &&
		       xnvme_dev_get_csi(xnvme->dev) == XNVME_SPEC_CSI_NVM;
	default:
		return false;
	}
}

static void
bdev_xnvme_destruct_cb(void *io_device)
{
	struct bdev_xnvme *xnvme = io_device;

	TAILQ_REMOVE(&g_xnvme_bdev_head, xnvme, link);
	bdev_xnvme_free(xnvme);
}

static int
bdev_xnvme_destruct(void *ctx)
{
	struct bdev_xnvme *xnvme = ctx;

	spdk_io_device_unregister(xnvme, bdev_xnvme_destruct_cb);

	return 0;
}

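/*
 * Build a Dataset Management (deallocate) command: split the request into at
 * most max_unmap_segments ranges of up to max_unmap blocks each, written into
 * the buffer that the bdev layer supplied for this I/O.
 */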
static int
bdev_xnvme_unmap(struct spdk_bdev_io *bdev_io, struct xnvme_cmd_ctx *ctx, struct bdev_xnvme *xnvme)
{
	struct spdk_nvme_dsm_range *range;
	uint64_t offset, remaining;
	uint64_t num_ranges_u64, num_blocks, offset_blocks;
	uint16_t num_ranges;

	num_blocks = bdev_io->u.bdev.num_blocks;
	offset_blocks = bdev_io->u.bdev.offset_blocks;

	num_ranges_u64 = spdk_divide_round_up(num_blocks, xnvme->bdev.max_unmap);
	if (num_ranges_u64 > xnvme->bdev.max_unmap_segments) {
		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
		return -EINVAL;
	}
	num_ranges = (uint16_t)num_ranges_u64;

	offset = offset_blocks;
	remaining = num_blocks;

	assert(bdev_io->u.bdev.iovcnt == 1);
	range = (struct spdk_nvme_dsm_range *) bdev_io->u.bdev.iovs->iov_base;

	/* Fill max-size ranges until the remaining blocks fit into one range */
	while (remaining > xnvme->bdev.max_unmap) {
		range->attributes.raw = 0;
		range->length = xnvme->bdev.max_unmap;
		range->starting_lba = offset;

		offset += xnvme->bdev.max_unmap;
		remaining -= xnvme->bdev.max_unmap;
		range++;
	}

	/* Final range describes the remaining blocks */
	range->attributes.raw = 0;
	range->length = remaining;
	range->starting_lba = offset;

	ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_DATASET_MANAGEMENT;
	ctx->cmd.common.nsid = xnvme->nsid;
	ctx->cmd.nvm.nlb = num_blocks - 1;
	ctx->cmd.nvm.slba = offset_blocks;
	ctx->cmd.dsm.nr = num_ranges - 1;
	ctx->cmd.dsm.ad = true;

	return 0;
}

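/*
 * Translate the bdev I/O into an NVMe command and submit it on this channel's
 * xNVMe queue; bdev_xnvme_cmd_cb() completes the bdev_io when the command finishes.
 */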
static void
_xnvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct bdev_xnvme_task *xnvme_task = (struct bdev_xnvme_task *)bdev_io->driver_ctx;
	struct bdev_xnvme *xnvme = (struct bdev_xnvme *)bdev_io->bdev->ctxt;
	struct bdev_xnvme_io_channel *xnvme_ch = spdk_io_channel_get_ctx(ch);
	struct xnvme_cmd_ctx *ctx = xnvme_queue_get_cmd_ctx(xnvme_ch->queue);
	int err;

	SPDK_DEBUGLOG(xnvme, "bdev_io : %p, iov_cnt : %d, bdev_xnvme_task : %p\n",
		      bdev_io, bdev_io->u.bdev.iovcnt, (struct bdev_xnvme_task *)bdev_io->driver_ctx);

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ;
		ctx->cmd.common.nsid = xnvme->nsid;
		ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1;
		ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE;
		ctx->cmd.common.nsid = xnvme->nsid;
		ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1;
		ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE_ZEROES;
		ctx->cmd.common.nsid = xnvme->nsid;
		ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1;
		ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks;
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		if (bdev_xnvme_unmap(bdev_io, ctx, xnvme)) {
			xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx);
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
		break;
	default:
		SPDK_ERRLOG("Unsupported I/O type\n");

		xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	xnvme_task->ch = xnvme_ch;
	ctx->async.cb_arg = xnvme_task;

	err = xnvme_cmd_passv(ctx, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.num_blocks * xnvme->bdev.blocklen, NULL, 0, 0);

	switch (err) {
	/* Submission success! */
	case 0:
		SPDK_DEBUGLOG(xnvme, "io_channel : %p, iovcnt:%d, nblks: %lu off: %#lx\n",
			      xnvme_ch, bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.offset_blocks);
		return;

	/* Submission failed: queue is full or no memory => Queue the I/O in bdev layer */
	case -EBUSY:
	case -EAGAIN:
	case -ENOMEM:
		SPDK_WARNLOG("xnvme queue is full or out of memory, deferring I/O to the bdev layer\n");

		xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		return;

	/* Submission failed: unexpected error, put the command-context back in the queue */
	default:
		SPDK_ERRLOG("xnvme_cmd_passv(): submission failed with unexpected error: %d\n", err);

		xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}
}

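/* Buffer-allocation callback: fail the I/O if no aligned buffer could be obtained, otherwise submit it. */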
static void
bdev_xnvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	struct bdev_xnvme_io_channel *xnvme_ch = spdk_io_channel_get_ctx(ch);

	if (!success) {
		xnvme_queue_put_cmd_ctx(xnvme_ch->queue, xnvme_queue_get_cmd_ctx(xnvme_ch->queue));
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	_xnvme_submit_request(ch, bdev_io);
}

static void
bdev_xnvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	/* Read and write operations must be performed on buffers aligned to
	 * bdev->required_alignment. If user specified unaligned buffers,
	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		spdk_bdev_io_get_buf(bdev_io, bdev_xnvme_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		/* The max number of segments defined by spec is 256 and an
		 * spdk_nvme_dsm_range structure is 16 bytes */
		spdk_bdev_io_get_buf(bdev_io, bdev_xnvme_get_buf_cb, 256 * 16);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		_xnvme_submit_request(ch, bdev_io);
		break;

	default:
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		break;
	}
}

static const struct spdk_bdev_fn_table xnvme_fn_table = {
	.destruct		= bdev_xnvme_destruct,
	.submit_request		= bdev_xnvme_submit_request,
	.io_type_supported	= bdev_xnvme_io_type_supported,
	.get_io_channel		= bdev_xnvme_get_io_channel,
};

static void
bdev_xnvme_free(struct bdev_xnvme *xnvme)
{
	assert(xnvme != NULL);

	xnvme_dev_close(xnvme->dev);
	free(xnvme->io_mechanism);
	free(xnvme->filename);
	free(xnvme->bdev.name);
	free(xnvme);
}

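/*
 * Completion callback invoked while processing the queue: map the xNVMe
 * completion status onto the bdev I/O status and recycle the command context.
 */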
static void
bdev_xnvme_cmd_cb(struct xnvme_cmd_ctx *ctx, void *cb_arg)
{
	struct bdev_xnvme_task *xnvme_task = ctx->async.cb_arg;
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;

	SPDK_DEBUGLOG(xnvme, "xnvme_task : %p\n", xnvme_task);

	if (xnvme_cmd_ctx_cpl_status(ctx)) {
		SPDK_ERRLOG("xNVMe I/O Failed\n");
		xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF);
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(xnvme_task), status);

	/* Completed: Put the command-context back in the queue */
	xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
}

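/* Channel poller: process any pending completions on the channel's xNVMe queue without blocking. */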
static int
bdev_xnvme_poll(void *arg)
{
	struct bdev_xnvme_io_channel *ch = arg;
	int rc;

	rc = xnvme_queue_poke(ch->queue, 0);
	if (rc < 0) {
		SPDK_ERRLOG("xnvme_queue_poke failure rc : %d\n", rc);
		return SPDK_POLLER_BUSY;
	}

	return xnvme_queue_get_outstanding(ch->queue) ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

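/* Create the per-channel xNVMe queue (fixed depth of 512) and register its completion poller. */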
static int
bdev_xnvme_queue_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_xnvme *xnvme = io_device;
	struct bdev_xnvme_io_channel *ch = ctx_buf;
	int rc;
	int qd = 512;

	rc = xnvme_queue_init(xnvme->dev, qd, 0, &ch->queue);
	if (rc) {
		SPDK_ERRLOG("xnvme_queue_init failure: %d\n", rc);
		return 1;
	}

	xnvme_queue_set_cb(ch->queue, bdev_xnvme_cmd_cb, ch);

	ch->poller = SPDK_POLLER_REGISTER(bdev_xnvme_poll, ch, 0);

	return 0;
}

static void
bdev_xnvme_queue_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_xnvme_io_channel *ch = ctx_buf;

	spdk_poller_unregister(&ch->poller);

	xnvme_queue_term(ch->queue);
}

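/*
 * Open the xNVMe device, size the bdev from the device geometry, and register
 * it with the bdev layer.  Typically driven by the bdev_xnvme_create RPC; a
 * sketch of the corresponding JSON-RPC parameters (the values below are only
 * illustrative, the parameter names match bdev_xnvme_config_json() above):
 *
 *   {
 *     "method": "bdev_xnvme_create",
 *     "params": {
 *       "name": "xnvme0",
 *       "filename": "/dev/ng0n1",
 *       "io_mechanism": "io_uring_cmd",
 *       "conserve_cpu": false
 *     }
 *   }
 */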
struct spdk_bdev *
create_xnvme_bdev(const char *name, const char *filename, const char *io_mechanism,
		  bool conserve_cpu)
{
	struct bdev_xnvme *xnvme;
	const struct xnvme_spec_nvm_idfy_ctrlr *ctrlr;
	uint32_t block_size;
	uint64_t bdev_size;
	int rc;
	struct xnvme_opts opts = xnvme_opts_default();

	xnvme = calloc(1, sizeof(*xnvme));
	if (!xnvme) {
		SPDK_ERRLOG("Unable to allocate enough memory for xNVMe backend\n");
		return NULL;
	}

	opts.direct = 1;
	opts.async = io_mechanism;
	if (!opts.async) {
		goto error_return;
	}
	xnvme->io_mechanism = strdup(io_mechanism);
	if (!xnvme->io_mechanism) {
		goto error_return;
	}

	xnvme->conserve_cpu = conserve_cpu;
	if (!xnvme->conserve_cpu) {
		if (!strcmp(xnvme->io_mechanism, "libaio")) {
			opts.poll_io = 1;
		} else if (!strcmp(xnvme->io_mechanism, "io_uring")) {
			opts.poll_io = 1;
		} else if (!strcmp(xnvme->io_mechanism, "io_uring_cmd")) {
			opts.poll_io = 1;
		}
	}

	xnvme->filename = strdup(filename);
	if (!xnvme->filename) {
		goto error_return;
	}

	xnvme->dev = xnvme_dev_open(xnvme->filename, &opts);
	if (!xnvme->dev) {
		SPDK_ERRLOG("Unable to open xNVMe device %s\n", filename);
		goto error_return;
	}

	xnvme->nsid = xnvme_dev_get_nsid(xnvme->dev);

	bdev_size = xnvme_dev_get_geo(xnvme->dev)->tbytes;
	block_size = xnvme_dev_get_geo(xnvme->dev)->nbytes;

	xnvme->bdev.name = strdup(name);
	if (!xnvme->bdev.name) {
		goto error_return;
	}

	xnvme->bdev.product_name = "xNVMe bdev";
	xnvme->bdev.module = &xnvme_if;

	xnvme->bdev.write_cache = 0;
	xnvme->bdev.max_write_zeroes = UINT16_MAX + 1;

	if (xnvme_dev_get_csi(xnvme->dev) == XNVME_SPEC_CSI_NVM) {
		ctrlr = (struct xnvme_spec_nvm_idfy_ctrlr *) xnvme_dev_get_ctrlr_css(xnvme->dev);
		xnvme->bdev.max_unmap = ctrlr->dmrsl ? ctrlr->dmrsl : SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		xnvme->bdev.max_unmap_segments = ctrlr->dmrl ? ctrlr->dmrl :
						 SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES;
	}

	if (block_size == 0) {
		SPDK_ERRLOG("Block size could not be auto-detected\n");
		goto error_return;
	}

	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2).\n", block_size);
		goto error_return;
	}

	SPDK_DEBUGLOG(xnvme, "bdev_name : %s, bdev_size : %lu, block_size : %d\n",
		      xnvme->bdev.name, bdev_size, block_size);

	xnvme->bdev.blocklen = block_size;
	xnvme->bdev.required_alignment = spdk_u32log2(block_size);

	if (bdev_size % xnvme->bdev.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    bdev_size, xnvme->bdev.blocklen);
		goto error_return;
	}

	xnvme->bdev.blockcnt = bdev_size / xnvme->bdev.blocklen;
	xnvme->bdev.ctxt = xnvme;

	xnvme->bdev.fn_table = &xnvme_fn_table;

	spdk_io_device_register(xnvme, bdev_xnvme_queue_create_cb, bdev_xnvme_queue_destroy_cb,
				sizeof(struct bdev_xnvme_io_channel),
				xnvme->bdev.name);
	rc = spdk_bdev_register(&xnvme->bdev);
	if (rc) {
		spdk_io_device_unregister(xnvme, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_xnvme_bdev_head, xnvme, link);

	return &xnvme->bdev;

error_return:
	bdev_xnvme_free(xnvme);
	return NULL;
}

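/* Unregister the named bdev; if the lookup fails, report the error through cb_fn immediately. */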
void
delete_xnvme_bdev(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	int rc;

	rc = spdk_bdev_unregister_by_name(name, &xnvme_if, cb_fn, cb_arg);
	if (rc != 0) {
		cb_fn(cb_arg, rc);
	}
}

static int
bdev_xnvme_module_create_cb(void *io_device, void *ctx_buf)
{
	return 0;
}

static void
bdev_xnvme_module_destroy_cb(void *io_device, void *ctx_buf)
{
}

static int
bdev_xnvme_init(void)
{
	spdk_io_device_register(&xnvme_if, bdev_xnvme_module_create_cb, bdev_xnvme_module_destroy_cb,
				0, "xnvme_module");

	return 0;
}

static void
bdev_xnvme_fini(void)
{
	spdk_io_device_unregister(&xnvme_if, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(xnvme)