/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2022 Intel Corporation.
 * Copyright (c) Samsung Electronics Co., Ltd.
 * All rights reserved.
 */

#include "libxnvme.h"

#include "bdev_xnvme.h"

#include "spdk/stdinc.h"

#include "spdk/barrier.h"
#include "spdk/bdev.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk/log.h"

struct bdev_xnvme_io_channel {
	struct xnvme_queue	*queue;
	struct spdk_poller	*poller;
};

struct bdev_xnvme_task {
	struct bdev_xnvme_io_channel *ch;
	TAILQ_ENTRY(bdev_xnvme_task) link;
};

struct bdev_xnvme {
	struct spdk_bdev	bdev;
	char			*filename;
	char			*io_mechanism;
	struct xnvme_dev	*dev;
	uint32_t		nsid;
	bool			conserve_cpu;

	TAILQ_ENTRY(bdev_xnvme) link;
};

static int bdev_xnvme_init(void);
static void bdev_xnvme_fini(void);
static void bdev_xnvme_free(struct bdev_xnvme *xnvme);
static TAILQ_HEAD(, bdev_xnvme) g_xnvme_bdev_head = TAILQ_HEAD_INITIALIZER(g_xnvme_bdev_head);

static int
bdev_xnvme_get_ctx_size(void)
{
	return sizeof(struct bdev_xnvme_task);
}

static int
bdev_xnvme_config_json(struct spdk_json_write_ctx *w)
{
	struct bdev_xnvme *xnvme;

	TAILQ_FOREACH(xnvme, &g_xnvme_bdev_head, link) {
		spdk_json_write_object_begin(w);

		spdk_json_write_named_string(w, "method", "bdev_xnvme_create");

		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", xnvme->bdev.name);
		spdk_json_write_named_string(w, "filename", xnvme->filename);
		spdk_json_write_named_string(w, "io_mechanism", xnvme->io_mechanism);
		spdk_json_write_named_bool(w, "conserve_cpu", xnvme->conserve_cpu);
		spdk_json_write_object_end(w);

		spdk_json_write_object_end(w);
	}

	return 0;
}

static struct spdk_bdev_module xnvme_if = {
	.name		= "xnvme",
	.module_init	= bdev_xnvme_init,
	.module_fini	= bdev_xnvme_fini,
	.get_ctx_size	= bdev_xnvme_get_ctx_size,
	.config_json	= bdev_xnvme_config_json,
};

SPDK_BDEV_MODULE_REGISTER(xnvme, &xnvme_if)

static struct spdk_io_channel *
bdev_xnvme_get_io_channel(void *ctx)
{
	struct bdev_xnvme *xnvme = ctx;

	return spdk_get_io_channel(xnvme);
}

static bool
bdev_xnvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct bdev_xnvme *xnvme = ctx;

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
	case SPDK_BDEV_IO_TYPE_UNMAP:
		/* libaio and io_uring only support read and write */
		return !strcmp(xnvme->io_mechanism, "io_uring_cmd") &&
		       xnvme_dev_get_csi(xnvme->dev) == XNVME_SPEC_CSI_NVM;
	default:
		return false;
	}
}

static void
bdev_xnvme_destruct_cb(void *io_device)
{
	struct bdev_xnvme *xnvme = io_device;

	TAILQ_REMOVE(&g_xnvme_bdev_head, xnvme, link);
	bdev_xnvme_free(xnvme);
}

static int
bdev_xnvme_destruct(void *ctx)
{
	struct bdev_xnvme *xnvme = ctx;

	spdk_io_device_unregister(xnvme, bdev_xnvme_destruct_cb);

	return 0;
}
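
/*
 * Translate an SPDK unmap request into an NVMe Dataset Management (Deallocate)
 * command. The buffer obtained from the bdev layer is filled with
 * spdk_nvme_dsm_range entries: full bdev.max_unmap-sized ranges first, then a
 * final range covering the remainder. Fails with -EINVAL when the request
 * would need more ranges than bdev.max_unmap_segments allows.
 */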
static int
bdev_xnvme_unmap(struct spdk_bdev_io *bdev_io, struct xnvme_cmd_ctx *ctx, struct bdev_xnvme *xnvme)
{
	struct spdk_nvme_dsm_range *range;
	uint64_t offset, remaining;
	uint64_t num_ranges_u64, num_blocks, offset_blocks;
	uint16_t num_ranges;

	num_blocks = bdev_io->u.bdev.num_blocks;
	offset_blocks = bdev_io->u.bdev.offset_blocks;

	num_ranges_u64 = spdk_divide_round_up(num_blocks, xnvme->bdev.max_unmap);
	if (num_ranges_u64 > xnvme->bdev.max_unmap_segments) {
		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
		return -EINVAL;
	}
	num_ranges = (uint16_t)num_ranges_u64;

	offset = offset_blocks;
	remaining = num_blocks;

	assert(bdev_io->u.bdev.iovcnt == 1);
	range = (struct spdk_nvme_dsm_range *) bdev_io->u.bdev.iovs->iov_base;

	/* Fill max-size ranges until the remaining blocks fit into one range */
	while (remaining > xnvme->bdev.max_unmap) {
		range->attributes.raw = 0;
		range->length = xnvme->bdev.max_unmap;
		range->starting_lba = offset;

		offset += xnvme->bdev.max_unmap;
		remaining -= xnvme->bdev.max_unmap;
		range++;
	}

	/* Final range describes the remaining blocks */
	range->attributes.raw = 0;
	range->length = remaining;
	range->starting_lba = offset;

	ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_DATASET_MANAGEMENT;
	ctx->cmd.common.nsid = xnvme->nsid;
	ctx->cmd.nvm.nlb = num_blocks - 1;
	ctx->cmd.nvm.slba = offset_blocks;
	ctx->cmd.dsm.nr = num_ranges - 1;
	ctx->cmd.dsm.ad = true;

	return 0;
}
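
/*
 * Map the bdev I/O onto an NVMe command in a command context taken from the
 * per-channel xNVMe queue and submit it with xnvme_cmd_passv(). On -EBUSY,
 * -EAGAIN or -ENOMEM the context is returned to the queue and the I/O is
 * completed with SPDK_BDEV_IO_STATUS_NOMEM so the bdev layer retries it later;
 * any other submission error fails the I/O.
 */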
static void
_xnvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct bdev_xnvme_task *xnvme_task = (struct bdev_xnvme_task *)bdev_io->driver_ctx;
	struct bdev_xnvme *xnvme = (struct bdev_xnvme *)bdev_io->bdev->ctxt;
	struct bdev_xnvme_io_channel *xnvme_ch = spdk_io_channel_get_ctx(ch);
	struct xnvme_cmd_ctx *ctx = xnvme_queue_get_cmd_ctx(xnvme_ch->queue);
	int err;

	SPDK_DEBUGLOG(xnvme, "bdev_io : %p, iov_cnt : %d, bdev_xnvme_task : %p\n",
		      bdev_io, bdev_io->u.bdev.iovcnt, (struct bdev_xnvme_task *)bdev_io->driver_ctx);

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ;
		ctx->cmd.common.nsid = xnvme->nsid;
		ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1;
		ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE;
		ctx->cmd.common.nsid = xnvme->nsid;
		ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1;
		ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks;
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE_ZEROES;
		ctx->cmd.common.nsid = xnvme->nsid;
		ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1;
		ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks;
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		if (bdev_xnvme_unmap(bdev_io, ctx, xnvme)) {
			xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx);
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
		break;
	default:
		SPDK_ERRLOG("Wrong io type\n");

		xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	xnvme_task->ch = xnvme_ch;
	ctx->async.cb_arg = xnvme_task;

	err = xnvme_cmd_passv(ctx, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.num_blocks * xnvme->bdev.blocklen, NULL, 0, 0);

	switch (err) {
	/* Submission success! */
	case 0:
		SPDK_DEBUGLOG(xnvme, "io_channel : %p, iovcnt:%d, nblks: %lu off: %#lx\n",
			      xnvme_ch, bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.offset_blocks);
		return;

	/* Submission failed: queue is full or no memory => Queue the I/O in bdev layer */
	case -EBUSY:
	case -EAGAIN:
	case -ENOMEM:
		SPDK_WARNLOG("Start to queue I/O for xnvme bdev\n");

		xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		return;

	/* Submission failed: unexpected error, put the command-context back in the queue */
	default:
		SPDK_ERRLOG("bdev_xnvme_cmd_passv : Submission failed: unexpected error\n");

		xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}
}

static void
bdev_xnvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	struct bdev_xnvme_io_channel *xnvme_ch = spdk_io_channel_get_ctx(ch);

	if (!success) {
		xnvme_queue_put_cmd_ctx(xnvme_ch->queue, xnvme_queue_get_cmd_ctx(xnvme_ch->queue));
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	_xnvme_submit_request(ch, bdev_io);
}

static void
bdev_xnvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	/* Read and write operations must be performed on buffers aligned to
	 * bdev->required_alignment. If user specified unaligned buffers,
	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		spdk_bdev_io_get_buf(bdev_io, bdev_xnvme_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		/* The max number of segments defined by spec is 256 and an
		 * spdk_nvme_dsm_range structure is 16 bytes */
		spdk_bdev_io_get_buf(bdev_io, bdev_xnvme_get_buf_cb, 256 * 16);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		_xnvme_submit_request(ch, bdev_io);
		break;

	default:
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		break;
	}
}

static const struct spdk_bdev_fn_table xnvme_fn_table = {
	.destruct		= bdev_xnvme_destruct,
	.submit_request		= bdev_xnvme_submit_request,
	.io_type_supported	= bdev_xnvme_io_type_supported,
	.get_io_channel		= bdev_xnvme_get_io_channel,
};

static void
bdev_xnvme_free(struct bdev_xnvme *xnvme)
{
	assert(xnvme != NULL);

	xnvme_dev_close(xnvme->dev);
	free(xnvme->io_mechanism);
	free(xnvme->filename);
	free(xnvme->bdev.name);
	free(xnvme);
}

static void
bdev_xnvme_cmd_cb(struct xnvme_cmd_ctx *ctx, void *cb_arg)
{
	struct bdev_xnvme_task *xnvme_task = ctx->async.cb_arg;
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;

	SPDK_DEBUGLOG(xnvme, "xnvme_task : %p\n", xnvme_task);

	if (xnvme_cmd_ctx_cpl_status(ctx)) {
		SPDK_ERRLOG("xNVMe I/O Failed\n");
		xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF);
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(xnvme_task), status);

	/* Completed: Put the command-context back in the queue */
	xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
}
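
/*
 * Per-channel poller: reap completions from the xNVMe queue with
 * xnvme_queue_poke(); completed commands are delivered to bdev_xnvme_cmd_cb().
 * Returns SPDK_POLLER_BUSY while commands are still outstanding (or when the
 * poke fails), otherwise SPDK_POLLER_IDLE.
 */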
static int
bdev_xnvme_poll(void *arg)
{
	struct bdev_xnvme_io_channel *ch = arg;
	int rc;

	rc = xnvme_queue_poke(ch->queue, 0);
	if (rc < 0) {
		SPDK_ERRLOG("xnvme_queue_poke failure rc : %d\n", rc);
		return SPDK_POLLER_BUSY;
	}

	return xnvme_queue_get_outstanding(ch->queue) ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
bdev_xnvme_queue_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_xnvme *xnvme = io_device;
	struct bdev_xnvme_io_channel *ch = ctx_buf;
	int rc;
	int qd = 512;

	rc = xnvme_queue_init(xnvme->dev, qd, 0, &ch->queue);
	if (rc) {
		SPDK_ERRLOG("xnvme_queue_init failure: %d\n", rc);
		return 1;
	}

	xnvme_queue_set_cb(ch->queue, bdev_xnvme_cmd_cb, ch);

	ch->poller = SPDK_POLLER_REGISTER(bdev_xnvme_poll, ch, 0);

	return 0;
}

static void
bdev_xnvme_queue_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_xnvme_io_channel *ch = ctx_buf;

	spdk_poller_unregister(&ch->poller);

	xnvme_queue_term(ch->queue);
}
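
/*
 * Create an xNVMe bdev: open the device through xNVMe with the requested I/O
 * mechanism (libaio, io_uring, io_uring_cmd, ...), derive block size and
 * capacity from the device geometry, and register the bdev with the bdev
 * layer. A configuration entry written back by bdev_xnvme_config_json() has
 * the following shape (values are illustrative):
 *
 *     {
 *       "method": "bdev_xnvme_create",
 *       "params": {
 *         "name": "xnvme0",
 *         "filename": "/dev/nvme0n1",
 *         "io_mechanism": "io_uring_cmd",
 *         "conserve_cpu": false
 *       }
 *     }
 */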
struct spdk_bdev *
create_xnvme_bdev(const char *name, const char *filename, const char *io_mechanism,
		  bool conserve_cpu)
{
	struct bdev_xnvme *xnvme;
	const struct xnvme_spec_nvm_idfy_ctrlr *ctrlr;
	uint32_t block_size;
	uint64_t bdev_size;
	int rc;
	struct xnvme_opts opts = xnvme_opts_default();

	xnvme = calloc(1, sizeof(*xnvme));
	if (!xnvme) {
		SPDK_ERRLOG("Unable to allocate enough memory for xNVMe backend\n");
		return NULL;
	}

	opts.direct = 1;
	opts.async = io_mechanism;
	if (!opts.async) {
		goto error_return;
	}
	xnvme->io_mechanism = strdup(io_mechanism);
	if (!xnvme->io_mechanism) {
		goto error_return;
	}

	xnvme->conserve_cpu = conserve_cpu;
	if (!xnvme->conserve_cpu) {
		if (!strcmp(xnvme->io_mechanism, "libaio")) {
			opts.poll_io = 1;
		} else if (!strcmp(xnvme->io_mechanism, "io_uring")) {
			opts.poll_io = 1;
		} else if (!strcmp(xnvme->io_mechanism, "io_uring_cmd")) {
			opts.poll_io = 1;
		}
	}

	xnvme->filename = strdup(filename);
	if (!xnvme->filename) {
		goto error_return;
	}

	xnvme->dev = xnvme_dev_open(xnvme->filename, &opts);
	if (!xnvme->dev) {
		SPDK_ERRLOG("Unable to open xNVMe device %s\n", filename);
		goto error_return;
	}

	xnvme->nsid = xnvme_dev_get_nsid(xnvme->dev);

	bdev_size = xnvme_dev_get_geo(xnvme->dev)->tbytes;
	block_size = xnvme_dev_get_geo(xnvme->dev)->nbytes;

	xnvme->bdev.name = strdup(name);
	if (!xnvme->bdev.name) {
		goto error_return;
	}

	xnvme->bdev.product_name = "xNVMe bdev";
	xnvme->bdev.module = &xnvme_if;

	xnvme->bdev.write_cache = 0;
	xnvme->bdev.max_write_zeroes = UINT16_MAX + 1;

	if (xnvme_dev_get_csi(xnvme->dev) == XNVME_SPEC_CSI_NVM) {
		ctrlr = (struct xnvme_spec_nvm_idfy_ctrlr *) xnvme_dev_get_ctrlr_css(xnvme->dev);
		xnvme->bdev.max_unmap = ctrlr->dmrsl ? ctrlr->dmrsl :
					SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		xnvme->bdev.max_unmap_segments = ctrlr->dmrl ? ctrlr->dmrl :
						 SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES;
	}

	if (block_size == 0) {
		SPDK_ERRLOG("Block size could not be auto-detected\n");
		goto error_return;
	}

	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
		goto error_return;
	}

	SPDK_DEBUGLOG(xnvme, "bdev_name : %s, bdev_size : %lu, block_size : %d\n",
		      xnvme->bdev.name, bdev_size, block_size);

	xnvme->bdev.blocklen = block_size;
	xnvme->bdev.required_alignment = spdk_u32log2(block_size);

	if (bdev_size % xnvme->bdev.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    bdev_size, xnvme->bdev.blocklen);
		goto error_return;
	}

	xnvme->bdev.blockcnt = bdev_size / xnvme->bdev.blocklen;
	xnvme->bdev.ctxt = xnvme;

	xnvme->bdev.fn_table = &xnvme_fn_table;

	spdk_io_device_register(xnvme, bdev_xnvme_queue_create_cb, bdev_xnvme_queue_destroy_cb,
				sizeof(struct bdev_xnvme_io_channel),
				xnvme->bdev.name);
	rc = spdk_bdev_register(&xnvme->bdev);
	if (rc) {
		spdk_io_device_unregister(xnvme, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_xnvme_bdev_head, xnvme, link);

	return &xnvme->bdev;

error_return:
	bdev_xnvme_free(xnvme);
	return NULL;
}

void
delete_xnvme_bdev(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	int rc;

	rc = spdk_bdev_unregister_by_name(name, &xnvme_if, cb_fn, cb_arg);
	if (rc != 0) {
		cb_fn(cb_arg, rc);
	}
}

static int
bdev_xnvme_module_create_cb(void *io_device, void *ctx_buf)
{
	return 0;
}

static void
bdev_xnvme_module_destroy_cb(void *io_device, void *ctx_buf)
{
}

static int
bdev_xnvme_init(void)
{
	spdk_io_device_register(&xnvme_if, bdev_xnvme_module_create_cb, bdev_xnvme_module_destroy_cb,
				0, "xnvme_module");

	return 0;
}

static void
bdev_xnvme_fini(void)
{
	spdk_io_device_unregister(&xnvme_if, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(xnvme)