/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_malloc.h"
#include "spdk/endian.h"
#include "spdk/env.h"
#include "spdk/accel.h"
#include "spdk/dma.h"
#include "spdk/likely.h"
#include "spdk/string.h"

#include "spdk/log.h"

struct malloc_disk {
	struct spdk_bdev		disk;
	void				*malloc_buf;
	void				*malloc_md_buf;
	TAILQ_ENTRY(malloc_disk)	link;
};

struct malloc_task {
	struct iovec			iov;
	int				num_outstanding;
	enum spdk_bdev_io_status	status;
	TAILQ_ENTRY(malloc_task)	tailq;
};

struct malloc_channel {
	struct spdk_io_channel		*accel_channel;
	struct spdk_poller		*completion_poller;
	TAILQ_HEAD(, malloc_task)	completed_tasks;
};

static int
malloc_verify_pi(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_error err_blk;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	assert(bdev_io->u.bdev.memory_domain == NULL);
	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       bdev_io->u.bdev.dif_check_flags,
			       bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF,
			       0xFFFF, 0, 0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to initialize DIF/DIX context\n");
		return rc;
	}

	if (spdk_bdev_is_md_interleaved(bdev)) {
		rc = spdk_dif_verify(bdev_io->u.bdev.iovs,
				     bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.num_blocks,
				     &dif_ctx,
				     &err_blk);
	} else {
		struct iovec md_iov = {
			.iov_base	= bdev_io->u.bdev.md_buf,
			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
		};

		if (bdev_io->u.bdev.md_buf == NULL) {
			return 0;
		}

		rc = spdk_dix_verify(bdev_io->u.bdev.iovs,
				     bdev_io->u.bdev.iovcnt,
				     &md_iov,
				     bdev_io->u.bdev.num_blocks,
				     &dif_ctx,
				     &err_blk);
	}

	if (rc != 0) {
		SPDK_ERRLOG("DIF/DIX verify failed: lba %" PRIu64 ", num_blocks %" PRIu64 ", "
			    "err_type %u, expected %lu, actual %lu, err_offset %u\n",
			    bdev_io->u.bdev.offset_blocks,
			    bdev_io->u.bdev.num_blocks,
			    err_blk.err_type,
			    err_blk.expected,
			    err_blk.actual,
			    err_blk.err_offset);
	}

	return rc;
}
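
/*
 * After an unmap or write-zeroes, the backing buffers hold zeroes, so any
 * previously written protection information no longer matches the data.
 * Regenerate PI over the affected range (with the reference and application
 * tags set to their IGNORE values) so that subsequent reads still verify.
 */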
static int
malloc_unmap_write_zeroes_generate_pi(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct malloc_disk *mdisk = bdev_io->bdev->ctxt;
	uint32_t block_size = bdev_io->bdev->blocklen;
	uint32_t dif_check_flags;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_ctx_init_ext_opts dif_opts;
	int rc;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	dif_check_flags = bdev->dif_check_flags | SPDK_DIF_FLAGS_REFTAG_CHECK |
			  SPDK_DIF_FLAGS_APPTAG_CHECK;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       dif_check_flags,
			       SPDK_DIF_REFTAG_IGNORE,
			       0xFFFF, SPDK_DIF_APPTAG_IGNORE,
			       0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Initialization of DIF/DIX context failed\n");
		return rc;
	}

	if (bdev->md_interleave) {
		struct iovec iov = {
			.iov_base	= mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size,
			.iov_len	= bdev_io->u.bdev.num_blocks * block_size,
		};

		rc = spdk_dif_generate(&iov, 1, bdev_io->u.bdev.num_blocks, &dif_ctx);
	} else {
		struct iovec iov = {
			.iov_base	= mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size,
			.iov_len	= bdev_io->u.bdev.num_blocks * block_size,
		};

		struct iovec md_iov = {
			.iov_base	= mdisk->malloc_md_buf + bdev_io->u.bdev.offset_blocks * bdev->md_len,
			.iov_len	= bdev_io->u.bdev.num_blocks * bdev->md_len,
		};

		rc = spdk_dix_generate(&iov, 1, &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx);
	}

	if (rc != 0) {
		SPDK_ERRLOG("Formatting by DIF/DIX failed\n");
	}

	return rc;
}
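
/*
 * Completion callback shared by all accel operations issued for a task.
 * num_outstanding counts the operations still in flight; the bdev_io is
 * completed only when the last one finishes. -ENOMEM is tracked separately
 * so the bdev layer can retry the request, and PI is verified (reads) or
 * regenerated (unmap/write-zeroes) once all operations have succeeded.
 */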
static void
malloc_done(void *ref, int status)
{
	struct malloc_task *task = (struct malloc_task *)ref;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);
	int rc;

	if (status != 0) {
		if (status == -ENOMEM) {
			if (task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
				task->status = SPDK_BDEV_IO_STATUS_NOMEM;
			}
		} else {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	if (--task->num_outstanding != 0) {
		return;
	}

	if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE &&
	    bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		rc = malloc_verify_pi(bdev_io);
		if (rc != 0) {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE &&
	    (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP || bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
	    task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		rc = malloc_unmap_write_zeroes_generate_pi(bdev_io);
		if (rc != 0) {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	assert(!bdev_io->u.bdev.accel_sequence || task->status == SPDK_BDEV_IO_STATUS_NOMEM);
	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
}

static void
malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch,
		     enum spdk_bdev_io_status status)
{
	task->status = status;
	TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq);
}

static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks);

int malloc_disk_count = 0;

static int bdev_malloc_initialize(void);
static void bdev_malloc_deinitialize(void);

static int
bdev_malloc_get_ctx_size(void)
{
	return sizeof(struct malloc_task);
}

static struct spdk_bdev_module malloc_if = {
	.name = "malloc",
	.module_init = bdev_malloc_initialize,
	.module_fini = bdev_malloc_deinitialize,
	.get_ctx_size = bdev_malloc_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if)

static void
malloc_disk_free(struct malloc_disk *malloc_disk)
{
	if (!malloc_disk) {
		return;
	}

	free(malloc_disk->disk.name);
	spdk_free(malloc_disk->malloc_buf);
	spdk_free(malloc_disk->malloc_md_buf);
	free(malloc_disk);
}

static int
bdev_malloc_destruct(void *ctx)
{
	struct malloc_disk *malloc_disk = ctx;

	TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link);
	malloc_disk_free(malloc_disk);
	return 0;
}

/* Returns 0 if the iovecs cover at least nbytes, nonzero if they are too short. */
static int
bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		if (nbytes < iovs[i].iov_len) {
			return 0;
		}

		nbytes -= iovs[i].iov_len;
	}

	return nbytes != 0;
}

static size_t
malloc_get_md_len(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->u.bdev.num_blocks * bdev_io->bdev->md_len;
}

static uint64_t
malloc_get_md_offset(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->u.bdev.offset_blocks * bdev_io->bdev->md_len;
}

static void *
malloc_get_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct malloc_disk *mdisk = SPDK_CONTAINEROF(bdev_io->bdev, struct malloc_disk, disk);

	assert(spdk_bdev_is_md_separate(bdev_io->bdev));

	return (char *)mdisk->malloc_md_buf + malloc_get_md_offset(bdev_io);
}
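
/*
 * Reads and writes are executed through accel sequences: the copy between
 * the request's iovecs and the malloc buffer is appended to the bdev_io's
 * sequence and executed by spdk_accel_sequence_finish(). On failure, the
 * sequence must be aborted unless the error is -ENOMEM, in which case the
 * bdev layer retries the whole request with the sequence intact.
 */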
static void
malloc_sequence_fail(struct malloc_task *task, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);

	/* For ENOMEM, the IO will be retried by the bdev layer, so we don't abort the sequence */
	if (status != -ENOMEM) {
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	malloc_done(task, status);
}

static void
malloc_sequence_done(void *ctx, int status)
{
	struct malloc_task *task = ctx;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);

	bdev_io->u.bdev.accel_sequence = NULL;
	/* Prevent bdev layer from retrying the request if the sequence failed with ENOMEM */
	malloc_done(task, status != -ENOMEM ? status : -EFAULT);
}

static void
bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		  struct malloc_task *task, struct spdk_bdev_io *bdev_io)
{
	uint64_t len, offset;
	int res = 0;

	len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;

	if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;
	task->iov.iov_base = mdisk->malloc_buf + offset;
	task->iov.iov_len = len;

	SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, bdev_io->u.bdev.iovcnt);

	task->num_outstanding++;
	res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch,
				     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.memory_domain,
				     bdev_io->u.bdev.memory_domain_ctx,
				     &task->iov, 1, NULL, NULL, NULL, NULL);
	if (spdk_unlikely(res != 0)) {
		malloc_sequence_fail(task, res);
		return;
	}

	spdk_accel_sequence_reverse(bdev_io->u.bdev.accel_sequence);
	spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task);

	if (bdev_io->u.bdev.md_buf == NULL) {
		return;
	}

	SPDK_DEBUGLOG(bdev_malloc, "read metadata %zu bytes from offset %#" PRIx64 "\n",
		      malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io));

	task->num_outstanding++;
	res = spdk_accel_submit_copy(ch, bdev_io->u.bdev.md_buf, malloc_get_md_buf(bdev_io),
				     malloc_get_md_len(bdev_io), malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

static void
bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		   struct malloc_task *task, struct spdk_bdev_io *bdev_io)
{
	uint64_t len, offset;
	int res = 0;

	len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;

	if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;
	task->iov.iov_base = mdisk->malloc_buf + offset;
	task->iov.iov_len = len;

	SPDK_DEBUGLOG(bdev_malloc, "write %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, bdev_io->u.bdev.iovcnt);

	task->num_outstanding++;
	res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch, &task->iov, 1, NULL, NULL,
				     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.memory_domain,
				     bdev_io->u.bdev.memory_domain_ctx, NULL, NULL);
	if (spdk_unlikely(res != 0)) {
		malloc_sequence_fail(task, res);
		return;
	}

	spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task);

	if (bdev_io->u.bdev.md_buf == NULL) {
		return;
	}

	SPDK_DEBUGLOG(bdev_malloc, "write metadata %zu bytes to offset %#" PRIx64 "\n",
		      malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io));

	task->num_outstanding++;
	res = spdk_accel_submit_copy(ch, malloc_get_md_buf(bdev_io), bdev_io->u.bdev.md_buf,
				     malloc_get_md_len(bdev_io), malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

static int
bdev_malloc_unmap(struct malloc_disk *mdisk,
		  struct spdk_io_channel *ch,
		  struct malloc_task *task,
		  uint64_t offset,
		  uint64_t byte_count)
{
	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0,
				      byte_count, malloc_done, task);
}

static void
bdev_malloc_copy(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		 struct malloc_task *task,
		 uint64_t dst_offset, uint64_t src_offset, size_t len)
{
	int res = 0;
	void *dst = mdisk->malloc_buf + dst_offset;
	void *src = mdisk->malloc_buf + src_offset;

	SPDK_DEBUGLOG(bdev_malloc, "Copy %zu bytes from offset %#" PRIx64 " to offset %#" PRIx64 "\n",
		      len, src_offset, dst_offset);

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	res = spdk_accel_submit_copy(ch, dst, src, len, malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}
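
/*
 * Central dispatch for all IO types. Reads with no buffer provided are
 * served zero-copy by pointing the bdev_io directly at the malloc buffer;
 * everything else is either routed to the accel framework or completed
 * inline via the channel's deferred completion list.
 */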
static int
_bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_task *task = (struct malloc_task *)bdev_io->driver_ctx;
	struct malloc_disk *disk = bdev_io->bdev->ctxt;
	uint32_t block_size = bdev_io->bdev->blocklen;
	int rc;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
			assert(bdev_io->u.bdev.iovcnt == 1);
			assert(bdev_io->u.bdev.memory_domain == NULL);
			bdev_io->u.bdev.iovs[0].iov_base =
				disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size;
			bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size;
			if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
				spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io),
							malloc_get_md_len(bdev_io));
			}
			malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		bdev_malloc_readv(disk, mch->accel_channel, task, bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE:
		if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE) {
			rc = malloc_verify_pi(bdev_io);
			if (rc != 0) {
				malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
				return 0;
			}
		}

		bdev_malloc_writev(disk, mch->accel_channel, task, bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return bdev_malloc_unmap(disk, mch->accel_channel, task,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		/* bdev_malloc_unmap is implemented as an accel fill with a value of
		 * zero, so it zeroes out all of the requested bytes. */
		return bdev_malloc_unmap(disk, mch->accel_channel, task,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			void *buf;
			size_t len;

			buf = disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size;
			len = bdev_io->u.bdev.num_blocks * block_size;
			spdk_bdev_io_set_buf(bdev_io, buf, len);
			if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
				spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io),
							malloc_get_md_len(bdev_io));
			}
		}
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_ABORT:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
		return 0;

	case SPDK_BDEV_IO_TYPE_COPY:
		bdev_malloc_copy(disk, mch->accel_channel, task,
				 bdev_io->u.bdev.offset_blocks * block_size,
				 bdev_io->u.bdev.copy.src_offset_blocks * block_size,
				 bdev_io->u.bdev.num_blocks * block_size);
		return 0;

	default:
		return -1;
	}
	return 0;
}

static void
bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_channel *mch = spdk_io_channel_get_ctx(ch);

	if (_bdev_malloc_submit_request(mch, bdev_io) != 0) {
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
	case SPDK_BDEV_IO_TYPE_ZCOPY:
	case SPDK_BDEV_IO_TYPE_ABORT:
	case SPDK_BDEV_IO_TYPE_COPY:
		return true;

	default:
		return false;
	}
}

static struct spdk_io_channel *
bdev_malloc_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(&g_malloc_disks);
}

static void
bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_malloc_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	spdk_json_write_named_uint32(w, "physical_block_size", bdev->phys_blocklen);
	spdk_json_write_named_uuid(w, "uuid", &bdev->uuid);
	spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary);
	spdk_json_write_named_uint32(w, "md_size", bdev->md_len);
	spdk_json_write_named_uint32(w, "dif_type", bdev->dif_type);
	spdk_json_write_named_bool(w, "dif_is_head_of_md", bdev->dif_is_head_of_md);
	spdk_json_write_named_uint32(w, "dif_pi_format", bdev->dif_pi_format);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
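
/*
 * With DIF disabled, the malloc bdev never touches the data buffers itself
 * (the accel framework performs the copies, including any transfers from
 * remote memory), so it can claim support for every registered memory
 * domain. With DIF enabled it must read the buffers to verify PI, so no
 * domains are reported.
 */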
static int
bdev_malloc_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct malloc_disk *malloc_disk = ctx;
	struct spdk_memory_domain *domain;
	int num_domains = 0;

	if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) {
		return 0;
	}

	/* Report support for every memory domain */
	for (domain = spdk_memory_domain_get_first(NULL); domain != NULL;
	     domain = spdk_memory_domain_get_next(domain, NULL)) {
		if (domains != NULL && num_domains < array_size) {
			domains[num_domains] = domain;
		}
		num_domains++;
	}

	return num_domains;
}

static bool
bdev_malloc_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type)
{
	struct malloc_disk *malloc_disk = ctx;

	if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) {
		return false;
	}

	switch (type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	default:
		return false;
	}
}

static const struct spdk_bdev_fn_table malloc_fn_table = {
	.destruct			= bdev_malloc_destruct,
	.submit_request			= bdev_malloc_submit_request,
	.io_type_supported		= bdev_malloc_io_type_supported,
	.get_io_channel			= bdev_malloc_get_io_channel,
	.write_config_json		= bdev_malloc_write_json_config,
	.get_memory_domains		= bdev_malloc_get_memory_domains,
	.accel_sequence_supported	= bdev_malloc_accel_sequence_supported,
};
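
/*
 * Format the freshly allocated (zeroed) buffers with valid protection
 * information so that reads issued before any write still pass PI
 * verification. The tags are generated with their IGNORE values since the
 * bdev layer does not supply application tag information.
 */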
static int
malloc_disk_setup_pi(struct malloc_disk *mdisk)
{
	struct spdk_bdev *bdev = &mdisk->disk;
	struct spdk_dif_ctx dif_ctx;
	struct iovec iov, md_iov;
	uint32_t dif_check_flags;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	/* Set the APPTAG and REFTAG PI fields to their IGNORE values after creation of the malloc bdev */
	dif_check_flags = bdev->dif_check_flags | SPDK_DIF_FLAGS_REFTAG_CHECK |
			  SPDK_DIF_FLAGS_APPTAG_CHECK;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       dif_check_flags,
			       SPDK_DIF_REFTAG_IGNORE,
			       0xFFFF, SPDK_DIF_APPTAG_IGNORE,
			       0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Initialization of DIF/DIX context failed\n");
		return rc;
	}

	iov.iov_base = mdisk->malloc_buf;
	iov.iov_len = bdev->blockcnt * bdev->blocklen;

	if (mdisk->disk.md_interleave) {
		rc = spdk_dif_generate(&iov, 1, bdev->blockcnt, &dif_ctx);
	} else {
		md_iov.iov_base = mdisk->malloc_md_buf;
		md_iov.iov_len = bdev->blockcnt * bdev->md_len;

		rc = spdk_dix_generate(&iov, 1, &md_iov, bdev->blockcnt, &dif_ctx);
	}

	if (rc != 0) {
		SPDK_ERRLOG("Formatting by DIF/DIX failed\n");
	}

	return rc;
}

int
create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
{
	struct malloc_disk *mdisk;
	uint32_t block_size;
	int rc;

	assert(opts != NULL);

	if (opts->num_blocks == 0) {
		SPDK_ERRLOG("Disk num_blocks must be greater than 0\n");
		return -EINVAL;
	}

	if (opts->block_size % 512) {
		SPDK_ERRLOG("Data block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	if (opts->physical_block_size % 512) {
		SPDK_ERRLOG("Physical block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	switch (opts->md_size) {
	case 0:
	case 8:
	case 16:
	case 32:
	case 64:
	case 128:
		break;
	default:
		SPDK_ERRLOG("metadata size %u is not supported\n", opts->md_size);
		return -EINVAL;
	}

	if (opts->md_interleave) {
		block_size = opts->block_size + opts->md_size;
	} else {
		block_size = opts->block_size;
	}

	mdisk = calloc(1, sizeof(*mdisk));
	if (!mdisk) {
		SPDK_ERRLOG("mdisk calloc() failed\n");
		return -ENOMEM;
	}

	/*
	 * Allocate the large backend memory buffer from pinned memory.
	 *
	 * TODO: need to pass a hint so we know which socket to allocate
	 * from on multi-socket systems.
	 */
	mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL,
					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!mdisk->malloc_buf) {
		SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}

	if (!opts->md_interleave && opts->md_size != 0) {
		mdisk->malloc_md_buf = spdk_zmalloc(opts->num_blocks * opts->md_size, 2 * 1024 * 1024, NULL,
						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
		if (!mdisk->malloc_md_buf) {
			SPDK_ERRLOG("malloc_md_buf spdk_zmalloc() failed\n");
			malloc_disk_free(mdisk);
			return -ENOMEM;
		}
	}

	if (opts->name) {
		mdisk->disk.name = strdup(opts->name);
	} else {
		/* Auto-generate a name */
		mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count);
		malloc_disk_count++;
	}
	if (!mdisk->disk.name) {
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}
	mdisk->disk.product_name = "Malloc disk";

	mdisk->disk.write_cache = 1;
	mdisk->disk.blocklen = block_size;
	mdisk->disk.phys_blocklen = opts->physical_block_size;
	mdisk->disk.blockcnt = opts->num_blocks;
	mdisk->disk.md_len = opts->md_size;
	mdisk->disk.md_interleave = opts->md_interleave;
	mdisk->disk.dif_type = opts->dif_type;
	mdisk->disk.dif_is_head_of_md = opts->dif_is_head_of_md;
	/* The current block device layer API does not propagate
	 * any DIF related information from the user, so we cannot
	 * generate or verify the Application Tag.
	 */
	switch (opts->dif_type) {
	case SPDK_DIF_TYPE1:
	case SPDK_DIF_TYPE2:
		mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK |
					      SPDK_DIF_FLAGS_REFTAG_CHECK;
		break;
	case SPDK_DIF_TYPE3:
		mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK;
		break;
	case SPDK_DIF_DISABLE:
		break;
	}
	mdisk->disk.dif_pi_format = opts->dif_pi_format;

	if (opts->dif_type != SPDK_DIF_DISABLE) {
		rc = malloc_disk_setup_pi(mdisk);
		if (rc) {
			SPDK_ERRLOG("Failed to set up protection information.\n");
			malloc_disk_free(mdisk);
			return rc;
		}
	}

	if (opts->optimal_io_boundary) {
		mdisk->disk.optimal_io_boundary = opts->optimal_io_boundary;
		mdisk->disk.split_on_optimal_io_boundary = true;
	}
	if (!spdk_uuid_is_null(&opts->uuid)) {
		spdk_uuid_copy(&mdisk->disk.uuid, &opts->uuid);
	}

	mdisk->disk.max_copy = 0;
	mdisk->disk.ctxt = mdisk;
	mdisk->disk.fn_table = &malloc_fn_table;
	mdisk->disk.module = &malloc_if;

	rc = spdk_bdev_register(&mdisk->disk);
	if (rc) {
		malloc_disk_free(mdisk);
		return rc;
	}

	*bdev = &(mdisk->disk);

	TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);

	return rc;
}

void
delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg)
{
	int rc;

	rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg);
	if (rc != 0) {
		cb_fn(cb_arg, rc);
	}
}
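
/*
 * Tasks completed inline during submission are queued on the channel via
 * malloc_complete_task() and finished here, so that spdk_bdev_io_complete()
 * is never invoked from within the submit path itself.
 */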
static int
malloc_completion_poller(void *ctx)
{
	struct malloc_channel *ch = ctx;
	struct malloc_task *task;
	TAILQ_HEAD(, malloc_task) completed_tasks;
	uint32_t num_completions = 0;

	TAILQ_INIT(&completed_tasks);
	TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq);

	while (!TAILQ_EMPTY(&completed_tasks)) {
		task = TAILQ_FIRST(&completed_tasks);
		TAILQ_REMOVE(&completed_tasks, task, tailq);
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
		num_completions++;
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
malloc_create_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	ch->accel_channel = spdk_accel_get_io_channel();
	if (!ch->accel_channel) {
		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
		return -ENOMEM;
	}

	ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0);
	if (!ch->completion_poller) {
		SPDK_ERRLOG("Failed to register malloc completion poller\n");
		spdk_put_io_channel(ch->accel_channel);
		return -ENOMEM;
	}

	TAILQ_INIT(&ch->completed_tasks);

	return 0;
}

static void
malloc_destroy_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	assert(TAILQ_EMPTY(&ch->completed_tasks));

	spdk_put_io_channel(ch->accel_channel);
	spdk_poller_unregister(&ch->completion_poller);
}

static int
bdev_malloc_initialize(void)
{
	/* This needs to be reset for each reinitialization of submodules.
	 * Otherwise, after enough devices or reinitializations, the value gets too high.
	 * TODO: Make the malloc bdev name mandatory and remove this counter.
	 */
	malloc_disk_count = 0;

	spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb,
				malloc_destroy_channel_cb, sizeof(struct malloc_channel),
				"bdev_malloc");

	return 0;
}

static void
bdev_malloc_deinitialize(void)
{
	spdk_io_device_unregister(&g_malloc_disks, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_malloc)