/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_malloc.h"
#include "spdk/endian.h"
#include "spdk/env.h"
#include "spdk/accel.h"
#include "spdk/dma.h"
#include "spdk/likely.h"
#include "spdk/string.h"

#include "spdk/log.h"

struct malloc_disk {
	struct spdk_bdev disk;
	void *malloc_buf;
	void *malloc_md_buf;
	TAILQ_ENTRY(malloc_disk) link;
};

struct malloc_task {
	struct iovec iov;
	int num_outstanding;
	enum spdk_bdev_io_status status;
	TAILQ_ENTRY(malloc_task) tailq;
};

struct malloc_channel {
	struct spdk_io_channel *accel_channel;
	struct spdk_poller *completion_poller;
	TAILQ_HEAD(, malloc_task) completed_tasks;
};

static int
_malloc_verify_pi(struct spdk_bdev_io *bdev_io, struct iovec *iovs, int iovcnt,
		  void *md_buf)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_error err_blk;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	assert(bdev_io->u.bdev.memory_domain == NULL);
	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       bdev_io->u.bdev.dif_check_flags,
			       bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF,
			       0xFFFF, 0, 0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to initialize DIF/DIX context\n");
		return rc;
	}

	if (spdk_bdev_is_md_interleaved(bdev)) {
		rc = spdk_dif_verify(iovs,
				     iovcnt,
				     bdev_io->u.bdev.num_blocks,
				     &dif_ctx,
				     &err_blk);
	} else {
		struct iovec md_iov = {
			.iov_base = md_buf,
			.iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
		};

		if (bdev_io->u.bdev.md_buf == NULL) {
			return 0;
		}

		rc = spdk_dix_verify(iovs,
				     iovcnt,
				     &md_iov,
				     bdev_io->u.bdev.num_blocks,
				     &dif_ctx,
				     &err_blk);
	}

	if (rc != 0) {
		SPDK_ERRLOG("DIF/DIX verify failed: lba %" PRIu64 ", num_blocks %" PRIu64 ", "
			    "err_type %u, expected %lu, actual %lu, err_offset %u\n",
			    bdev_io->u.bdev.offset_blocks,
			    bdev_io->u.bdev.num_blocks,
			    err_blk.err_type,
			    err_blk.expected,
			    err_blk.actual,
			    err_blk.err_offset);
	}

	return rc;
}

static int
malloc_verify_pi(struct spdk_bdev_io *bdev_io)
{
	return _malloc_verify_pi(bdev_io,
				 bdev_io->u.bdev.iovs,
				 bdev_io->u.bdev.iovcnt,
				 bdev_io->u.bdev.md_buf);
}

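/* Regenerate protection information for a block range that was just unmapped
 * or zeroed, so that later DIF/DIX-enabled reads of the range still verify.
 * REFTAG/APPTAG checking is forced on, but the IGNORE values passed to
 * spdk_dif_ctx_init() tell the DIF library to write tags that match any
 * reference/application tag on verification.
 */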
static int
malloc_unmap_write_zeroes_generate_pi(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct malloc_disk *mdisk = bdev_io->bdev->ctxt;
	uint32_t block_size = bdev_io->bdev->blocklen;
	uint32_t dif_check_flags;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_ctx_init_ext_opts dif_opts;
	int rc;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	dif_check_flags = bdev->dif_check_flags | SPDK_DIF_FLAGS_REFTAG_CHECK |
			  SPDK_DIF_FLAGS_APPTAG_CHECK;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       dif_check_flags,
			       SPDK_DIF_REFTAG_IGNORE,
			       0xFFFF, SPDK_DIF_APPTAG_IGNORE,
			       0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Initialization of DIF/DIX context failed\n");
		return rc;
	}

	if (bdev->md_interleave) {
		struct iovec iov = {
			.iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size,
			.iov_len = bdev_io->u.bdev.num_blocks * block_size,
		};

		rc = spdk_dif_generate(&iov, 1, bdev_io->u.bdev.num_blocks, &dif_ctx);
	} else {
		struct iovec iov = {
			.iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size,
			.iov_len = bdev_io->u.bdev.num_blocks * block_size,
		};

		struct iovec md_iov = {
			.iov_base = mdisk->malloc_md_buf + bdev_io->u.bdev.offset_blocks * bdev->md_len,
			.iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
		};

		rc = spdk_dix_generate(&iov, 1, &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx);
	}

	if (rc != 0) {
		SPDK_ERRLOG("Formatting by DIF/DIX failed\n");
	}

	return rc;
}

static void
malloc_done(void *ref, int status)
{
	struct malloc_task *task = (struct malloc_task *)ref;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);
	int rc;

	if (status != 0) {
		if (status == -ENOMEM) {
			if (task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
				task->status = SPDK_BDEV_IO_STATUS_NOMEM;
			}
		} else {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	if (--task->num_outstanding != 0) {
		return;
	}

	if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE &&
	    task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_READ:
			rc = malloc_verify_pi(bdev_io);
			break;
		case SPDK_BDEV_IO_TYPE_UNMAP:
		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
			rc = malloc_unmap_write_zeroes_generate_pi(bdev_io);
			break;
		default:
			rc = 0;
			break;
		}

		if (rc != 0) {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	assert(!bdev_io->u.bdev.accel_sequence || task->status == SPDK_BDEV_IO_STATUS_NOMEM);
	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
}

static void
malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch,
		     enum spdk_bdev_io_status status)
{
	task->status = status;
	TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq);
}

static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks);

int malloc_disk_count = 0;

static int bdev_malloc_initialize(void);
static void bdev_malloc_deinitialize(void);

static int
bdev_malloc_get_ctx_size(void)
{
	return sizeof(struct malloc_task);
}

static struct spdk_bdev_module malloc_if = {
	.name = "malloc",
	.module_init = bdev_malloc_initialize,
	.module_fini = bdev_malloc_deinitialize,
	.get_ctx_size = bdev_malloc_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if)

static void
malloc_disk_free(struct malloc_disk *malloc_disk)
{
	if (!malloc_disk) {
		return;
	}

	free(malloc_disk->disk.name);
	spdk_free(malloc_disk->malloc_buf);
	spdk_free(malloc_disk->malloc_md_buf);
	free(malloc_disk);
}

static int
bdev_malloc_destruct(void *ctx)
{
	struct malloc_disk *malloc_disk = ctx;

	TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link);
	malloc_disk_free(malloc_disk);
	return 0;
}

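/* Verify that the iovec array is large enough to hold nbytes; a non-zero
 * return means the buffers cannot cover the requested transfer.
 */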
static int
bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		if (nbytes < iovs[i].iov_len) {
			return 0;
		}

		nbytes -= iovs[i].iov_len;
	}

	return nbytes != 0;
}

static size_t
malloc_get_md_len(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->u.bdev.num_blocks * bdev_io->bdev->md_len;
}

static uint64_t
malloc_get_md_offset(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->u.bdev.offset_blocks * bdev_io->bdev->md_len;
}

static void *
malloc_get_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct malloc_disk *mdisk = SPDK_CONTAINEROF(bdev_io->bdev, struct malloc_disk, disk);

	assert(spdk_bdev_is_md_separate(bdev_io->bdev));

	return (char *)mdisk->malloc_md_buf + malloc_get_md_offset(bdev_io);
}

static void
malloc_sequence_fail(struct malloc_task *task, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);

	/* For ENOMEM, the IO will be retried by the bdev layer, so we don't abort the sequence */
	if (status != -ENOMEM) {
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	malloc_done(task, status);
}

static void
malloc_sequence_done(void *ctx, int status)
{
	struct malloc_task *task = ctx;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);

	bdev_io->u.bdev.accel_sequence = NULL;
	/* Prevent bdev layer from retrying the request if the sequence failed with ENOMEM */
	malloc_done(task, status != -ENOMEM ? status : -EFAULT);
}

static void
bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		  struct malloc_task *task, struct spdk_bdev_io *bdev_io)
{
	uint64_t len, offset;
	int res = 0;

	len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;

	if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;
	task->iov.iov_base = mdisk->malloc_buf + offset;
	task->iov.iov_len = len;

	SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, bdev_io->u.bdev.iovcnt);

	task->num_outstanding++;
	res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch,
				     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.memory_domain,
				     bdev_io->u.bdev.memory_domain_ctx,
				     &task->iov, 1, NULL, NULL, NULL, NULL);
	if (spdk_unlikely(res != 0)) {
		malloc_sequence_fail(task, res);
		return;
	}

	spdk_accel_sequence_reverse(bdev_io->u.bdev.accel_sequence);
	spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task);

	if (bdev_io->u.bdev.md_buf == NULL) {
		return;
	}

	SPDK_DEBUGLOG(bdev_malloc, "read metadata %zu bytes from offset %#" PRIx64 "\n",
		      malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io));

	task->num_outstanding++;
	res = spdk_accel_submit_copy(ch, bdev_io->u.bdev.md_buf, malloc_get_md_buf(bdev_io),
				     malloc_get_md_len(bdev_io), malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

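/* Queue a write as a copy step appended to the IO's accel sequence, copying
 * from the source iovecs (possibly in a remote memory domain) into the
 * backing buffer. Separate metadata, if present, is copied with a one-shot
 * accel copy tracked by the same task.
 */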
static void
bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		   struct malloc_task *task, struct spdk_bdev_io *bdev_io)
{
	uint64_t len, offset;
	int res = 0;

	len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;

	if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;
	task->iov.iov_base = mdisk->malloc_buf + offset;
	task->iov.iov_len = len;

	SPDK_DEBUGLOG(bdev_malloc, "write %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, bdev_io->u.bdev.iovcnt);

	task->num_outstanding++;
	res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch, &task->iov, 1, NULL, NULL,
				     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.memory_domain,
				     bdev_io->u.bdev.memory_domain_ctx, NULL, NULL);
	if (spdk_unlikely(res != 0)) {
		malloc_sequence_fail(task, res);
		return;
	}

	spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task);

	if (bdev_io->u.bdev.md_buf == NULL) {
		return;
	}

	SPDK_DEBUGLOG(bdev_malloc, "write metadata %zu bytes to offset %#" PRIx64 "\n",
		      malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io));

	task->num_outstanding++;
	res = spdk_accel_submit_copy(ch, malloc_get_md_buf(bdev_io), bdev_io->u.bdev.md_buf,
				     malloc_get_md_len(bdev_io), malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

static int
bdev_malloc_unmap(struct malloc_disk *mdisk,
		  struct spdk_io_channel *ch,
		  struct malloc_task *task,
		  uint64_t offset,
		  uint64_t byte_count)
{
	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0,
				      byte_count, malloc_done, task);
}

static void
bdev_malloc_copy(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		 struct malloc_task *task,
		 uint64_t dst_offset, uint64_t src_offset, size_t len)
{
	int res = 0;
	void *dst = mdisk->malloc_buf + dst_offset;
	void *src = mdisk->malloc_buf + src_offset;

	SPDK_DEBUGLOG(bdev_malloc, "Copy %zu bytes from offset %#" PRIx64 " to offset %#" PRIx64 "\n",
		      len, src_offset, dst_offset);

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	res = spdk_accel_submit_copy(ch, dst, src, len, malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

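/* Dispatch a bdev_io to the matching handler. Returns 0 when the request has
 * been accepted (completion is reported asynchronously); a non-zero return,
 * from an unsupported IO type or a failed submission, makes the caller
 * complete the IO with failure.
 */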
static int
_bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_task *task = (struct malloc_task *)bdev_io->driver_ctx;
	struct malloc_disk *disk = bdev_io->bdev->ctxt;
	uint32_t block_size = bdev_io->bdev->blocklen;
	int rc;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
			assert(bdev_io->u.bdev.iovcnt == 1);
			assert(bdev_io->u.bdev.memory_domain == NULL);
			bdev_io->u.bdev.iovs[0].iov_base =
				disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size;
			bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size;
			if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
				spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io),
							malloc_get_md_len(bdev_io));
			}
			malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		bdev_malloc_readv(disk, mch->accel_channel, task, bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE:
		if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE) {
			rc = malloc_verify_pi(bdev_io);
			if (rc != 0) {
				malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
				return 0;
			}
		}

		bdev_malloc_writev(disk, mch->accel_channel, task, bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		/* Both unmap and write-zeroes are implemented by zero-filling the
		 * range with spdk_accel_submit_fill(). */
		return bdev_malloc_unmap(disk, mch->accel_channel, task,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			void *buf;
			size_t len;

			buf = disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size;
			len = bdev_io->u.bdev.num_blocks * block_size;
			spdk_bdev_io_set_buf(bdev_io, buf, len);
			if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
				spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io),
							malloc_get_md_len(bdev_io));
			}
		}
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_ABORT:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
		return 0;

	case SPDK_BDEV_IO_TYPE_COPY:
		bdev_malloc_copy(disk, mch->accel_channel, task,
				 bdev_io->u.bdev.offset_blocks * block_size,
				 bdev_io->u.bdev.copy.src_offset_blocks * block_size,
				 bdev_io->u.bdev.num_blocks * block_size);
		return 0;

	default:
		return -1;
	}
	return 0;
}

static void
bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_channel *mch = spdk_io_channel_get_ctx(ch);

	if (_bdev_malloc_submit_request(mch, bdev_io) != 0) {
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
	case SPDK_BDEV_IO_TYPE_ZCOPY:
	case SPDK_BDEV_IO_TYPE_ABORT:
	case SPDK_BDEV_IO_TYPE_COPY:
		return true;

	default:
		return false;
	}
}

static struct spdk_io_channel *
bdev_malloc_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(&g_malloc_disks);
}

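/* Emit the bdev_malloc_create RPC parameters needed to recreate this disk
 * when the running JSON configuration is saved.
 */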
static void
bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_malloc_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	spdk_json_write_named_uint32(w, "physical_block_size", bdev->phys_blocklen);
	spdk_json_write_named_uuid(w, "uuid", &bdev->uuid);
	spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary);
	spdk_json_write_named_uint32(w, "md_size", bdev->md_len);
	spdk_json_write_named_uint32(w, "dif_type", bdev->dif_type);
	spdk_json_write_named_bool(w, "dif_is_head_of_md", bdev->dif_is_head_of_md);
	spdk_json_write_named_uint32(w, "dif_pi_format", bdev->dif_pi_format);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_malloc_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct malloc_disk *malloc_disk = ctx;
	struct spdk_memory_domain *domain;
	int num_domains = 0;

	if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) {
		return 0;
	}

	/* Report support for every memory domain */
	for (domain = spdk_memory_domain_get_first(NULL); domain != NULL;
	     domain = spdk_memory_domain_get_next(domain, NULL)) {
		if (domains != NULL && num_domains < array_size) {
			domains[num_domains] = domain;
		}
		num_domains++;
	}

	return num_domains;
}

static bool
bdev_malloc_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type)
{
	struct malloc_disk *malloc_disk = ctx;

	if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) {
		return false;
	}

	switch (type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	default:
		return false;
	}
}

static const struct spdk_bdev_fn_table malloc_fn_table = {
	.destruct = bdev_malloc_destruct,
	.submit_request = bdev_malloc_submit_request,
	.io_type_supported = bdev_malloc_io_type_supported,
	.get_io_channel = bdev_malloc_get_io_channel,
	.write_config_json = bdev_malloc_write_json_config,
	.get_memory_domains = bdev_malloc_get_memory_domains,
	.accel_sequence_supported = bdev_malloc_accel_sequence_supported,
};

static int
malloc_disk_setup_pi(struct malloc_disk *mdisk)
{
	struct spdk_bdev *bdev = &mdisk->disk;
	struct spdk_dif_ctx dif_ctx;
	struct iovec iov, md_iov;
	uint32_t dif_check_flags;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	/* Set APPTAG|REFTAG_IGNORE to PI fields after creation of malloc bdev */
	dif_check_flags = bdev->dif_check_flags | SPDK_DIF_FLAGS_REFTAG_CHECK |
			  SPDK_DIF_FLAGS_APPTAG_CHECK;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       dif_check_flags,
			       SPDK_DIF_REFTAG_IGNORE,
			       0xFFFF, SPDK_DIF_APPTAG_IGNORE,
			       0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Initialization of DIF/DIX context failed\n");
		return rc;
	}

	iov.iov_base = mdisk->malloc_buf;
	iov.iov_len = bdev->blockcnt * bdev->blocklen;

	if (mdisk->disk.md_interleave) {
		rc = spdk_dif_generate(&iov, 1, bdev->blockcnt, &dif_ctx);
	} else {
		md_iov.iov_base = mdisk->malloc_md_buf;
		md_iov.iov_len = bdev->blockcnt * bdev->md_len;

		rc = spdk_dix_generate(&iov, 1, &md_iov, bdev->blockcnt, &dif_ctx);
	}

	if (rc != 0) {
		SPDK_ERRLOG("Formatting by DIF/DIX failed\n");
	}

	return rc;
}

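/* Create a RAM-backed bdev, exposed to users through the bdev_malloc_create
 * RPC. The data buffer (and, for separate-metadata configurations, the
 * metadata buffer) is allocated from pinned, DMA-safe memory so that accel
 * engines can operate on it directly.
 */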
int
create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
{
	struct malloc_disk *mdisk;
	uint32_t block_size;
	int rc;

	assert(opts != NULL);

	if (opts->num_blocks == 0) {
		SPDK_ERRLOG("Disk num_blocks must be greater than 0\n");
		return -EINVAL;
	}

	if (opts->block_size % 512) {
		SPDK_ERRLOG("Data block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	if (opts->physical_block_size % 512) {
		SPDK_ERRLOG("Physical block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	switch (opts->md_size) {
	case 0:
	case 8:
	case 16:
	case 32:
	case 64:
	case 128:
		break;
	default:
		SPDK_ERRLOG("metadata size %u is not supported\n", opts->md_size);
		return -EINVAL;
	}

	if (opts->md_interleave) {
		block_size = opts->block_size + opts->md_size;
	} else {
		block_size = opts->block_size;
	}

	mdisk = calloc(1, sizeof(*mdisk));
	if (!mdisk) {
		SPDK_ERRLOG("mdisk calloc() failed\n");
		return -ENOMEM;
	}

	/*
	 * Allocate the large backend memory buffer from pinned memory.
	 *
	 * TODO: need to pass a hint so we know which socket to allocate
	 * from on multi-socket systems.
	 */
	mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL,
					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!mdisk->malloc_buf) {
		SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}

	if (!opts->md_interleave && opts->md_size != 0) {
		mdisk->malloc_md_buf = spdk_zmalloc(opts->num_blocks * opts->md_size, 2 * 1024 * 1024, NULL,
						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
		if (!mdisk->malloc_md_buf) {
			SPDK_ERRLOG("malloc_md_buf spdk_zmalloc() failed\n");
			malloc_disk_free(mdisk);
			return -ENOMEM;
		}
	}

	if (opts->name) {
		mdisk->disk.name = strdup(opts->name);
	} else {
		/* Auto-generate a name */
		mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count);
		malloc_disk_count++;
	}
	if (!mdisk->disk.name) {
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}
	mdisk->disk.product_name = "Malloc disk";

	mdisk->disk.write_cache = 1;
	mdisk->disk.blocklen = block_size;
	mdisk->disk.phys_blocklen = opts->physical_block_size;
	mdisk->disk.blockcnt = opts->num_blocks;
	mdisk->disk.md_len = opts->md_size;
	mdisk->disk.md_interleave = opts->md_interleave;
	mdisk->disk.dif_type = opts->dif_type;
	mdisk->disk.dif_is_head_of_md = opts->dif_is_head_of_md;
	/* Current block device layer API does not propagate
	 * any DIF related information from user. So, we can
	 * not generate or verify Application Tag.
	 */
	switch (opts->dif_type) {
	case SPDK_DIF_TYPE1:
	case SPDK_DIF_TYPE2:
		mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK |
					      SPDK_DIF_FLAGS_REFTAG_CHECK;
		break;
	case SPDK_DIF_TYPE3:
		mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK;
		break;
	case SPDK_DIF_DISABLE:
		break;
	}
	mdisk->disk.dif_pi_format = opts->dif_pi_format;

	if (opts->dif_type != SPDK_DIF_DISABLE) {
		rc = malloc_disk_setup_pi(mdisk);
		if (rc) {
			SPDK_ERRLOG("Failed to set up protection information.\n");
			malloc_disk_free(mdisk);
			return rc;
		}
	}

	if (opts->optimal_io_boundary) {
		mdisk->disk.optimal_io_boundary = opts->optimal_io_boundary;
		mdisk->disk.split_on_optimal_io_boundary = true;
	}
	if (!spdk_uuid_is_null(&opts->uuid)) {
		spdk_uuid_copy(&mdisk->disk.uuid, &opts->uuid);
	}

	mdisk->disk.max_copy = 0;
	mdisk->disk.ctxt = mdisk;
	mdisk->disk.fn_table = &malloc_fn_table;
	mdisk->disk.module = &malloc_if;

	rc = spdk_bdev_register(&mdisk->disk);
	if (rc) {
		malloc_disk_free(mdisk);
		return rc;
	}

	*bdev = &(mdisk->disk);

	TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);

	return rc;
}

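/* Unregister the named malloc bdev. cb_fn is always invoked: by the bdev
 * layer once unregistration completes, or directly here if the lookup fails.
 */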
void
delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg)
{
	int rc;

	rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg);
	if (rc != 0) {
		cb_fn(cb_arg, rc);
	}
}

static int
malloc_completion_poller(void *ctx)
{
	struct malloc_channel *ch = ctx;
	struct malloc_task *task;
	TAILQ_HEAD(, malloc_task) completed_tasks;
	uint32_t num_completions = 0;

	TAILQ_INIT(&completed_tasks);
	TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq);

	while (!TAILQ_EMPTY(&completed_tasks)) {
		task = TAILQ_FIRST(&completed_tasks);
		TAILQ_REMOVE(&completed_tasks, task, tailq);
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
		num_completions++;
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
malloc_create_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	ch->accel_channel = spdk_accel_get_io_channel();
	if (!ch->accel_channel) {
		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
		return -ENOMEM;
	}

	ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0);
	if (!ch->completion_poller) {
		SPDK_ERRLOG("Failed to register malloc completion poller\n");
		spdk_put_io_channel(ch->accel_channel);
		return -ENOMEM;
	}

	TAILQ_INIT(&ch->completed_tasks);

	return 0;
}

static void
malloc_destroy_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	assert(TAILQ_EMPTY(&ch->completed_tasks));

	spdk_put_io_channel(ch->accel_channel);
	spdk_poller_unregister(&ch->completion_poller);
}

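/* Register the io_device backing the per-thread malloc channels; each channel
 * holds an accel IO channel plus the poller that drains completed_tasks, so
 * even immediately-completed IOs are reported to the bdev layer
 * asynchronously.
 */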
static int
bdev_malloc_initialize(void)
{
	/* This needs to be reset for each reinitialization of submodules.
	 * Otherwise after enough devices or reinitializations the value gets too high.
	 * TODO: Make malloc bdev name mandatory and remove this counter.
	 */
	malloc_disk_count = 0;

	spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb,
				malloc_destroy_channel_cb, sizeof(struct malloc_channel),
				"bdev_malloc");

	return 0;
}

static void
bdev_malloc_deinitialize(void)
{
	spdk_io_device_unregister(&g_malloc_disks, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_malloc)