/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_malloc.h"
#include "spdk/endian.h"
#include "spdk/env.h"
#include "spdk/accel.h"
#include "spdk/dma.h"
#include "spdk/likely.h"
#include "spdk/string.h"

#include "spdk/log.h"

struct malloc_disk {
	struct spdk_bdev		disk;
	void				*malloc_buf;
	void				*malloc_md_buf;
	TAILQ_ENTRY(malloc_disk)	link;
};

struct malloc_task {
	struct iovec			iov;
	int				num_outstanding;
	enum spdk_bdev_io_status	status;
	TAILQ_ENTRY(malloc_task)	tailq;
};

struct malloc_channel {
	struct spdk_io_channel		*accel_channel;
	struct spdk_poller		*completion_poller;
	TAILQ_HEAD(, malloc_task)	completed_tasks;
};

static int
_malloc_verify_pi(struct spdk_bdev_io *bdev_io, struct iovec *iovs, int iovcnt,
		  void *md_buf)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_error err_blk;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	assert(bdev_io->u.bdev.memory_domain == NULL);
	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       bdev_io->u.bdev.dif_check_flags,
			       bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF,
			       0xFFFF, 0, 0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to initialize DIF/DIX context\n");
		return rc;
	}

	if (spdk_bdev_is_md_interleaved(bdev)) {
		rc = spdk_dif_verify(iovs,
				     iovcnt,
				     bdev_io->u.bdev.num_blocks,
				     &dif_ctx,
				     &err_blk);
	} else {
		struct iovec md_iov = {
			.iov_base = md_buf,
			.iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
		};

		if (md_iov.iov_base == NULL) {
			return 0;
		}

		rc = spdk_dix_verify(iovs,
				     iovcnt,
				     &md_iov,
				     bdev_io->u.bdev.num_blocks,
				     &dif_ctx,
				     &err_blk);
	}

	if (rc != 0) {
		SPDK_ERRLOG("DIF/DIX verify failed: lba %" PRIu64 ", num_blocks %" PRIu64 ", "
			    "err_type %u, expected %lu, actual %lu, err_offset %u\n",
			    bdev_io->u.bdev.offset_blocks,
			    bdev_io->u.bdev.num_blocks,
			    err_blk.err_type,
			    err_blk.expected,
			    err_blk.actual,
			    err_blk.err_offset);
	}

	return rc;
}

static int
malloc_verify_pi_io_buf(struct spdk_bdev_io *bdev_io)
{
	return _malloc_verify_pi(bdev_io,
				 bdev_io->u.bdev.iovs,
				 bdev_io->u.bdev.iovcnt,
				 bdev_io->u.bdev.md_buf);
}

static int
malloc_verify_pi_malloc_buf(struct spdk_bdev_io *bdev_io)
{
	struct iovec iov;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct malloc_disk *mdisk = bdev->ctxt;
	uint64_t len, offset;

	len = bdev_io->u.bdev.num_blocks * bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev->blocklen;

	iov.iov_base = mdisk->malloc_buf + offset;
	iov.iov_len = len;

	return _malloc_verify_pi(bdev_io, &iov, 1, NULL);
}

static int
malloc_unmap_write_zeroes_generate_pi(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct malloc_disk *mdisk = bdev_io->bdev->ctxt;
	uint32_t block_size = bdev_io->bdev->blocklen;
	uint32_t dif_check_flags;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_ctx_init_ext_opts dif_opts;
	int rc;
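
	/* Unmap and write-zeroes leave the data region zeroed, so regenerate
	 * valid protection information for the blocks they touch. Reference and
	 * application tags are generated as "ignore" values, since a zeroed
	 * payload carries no meaningful tag values.
	 */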
	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	dif_check_flags = bdev->dif_check_flags | SPDK_DIF_FLAGS_REFTAG_CHECK |
			  SPDK_DIF_FLAGS_APPTAG_CHECK;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       dif_check_flags,
			       SPDK_DIF_REFTAG_IGNORE,
			       0xFFFF, SPDK_DIF_APPTAG_IGNORE,
			       0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Initialization of DIF/DIX context failed\n");
		return rc;
	}

	if (bdev->md_interleave) {
		struct iovec iov = {
			.iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size,
			.iov_len = bdev_io->u.bdev.num_blocks * block_size,
		};

		rc = spdk_dif_generate(&iov, 1, bdev_io->u.bdev.num_blocks, &dif_ctx);
	} else {
		struct iovec iov = {
			.iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size,
			.iov_len = bdev_io->u.bdev.num_blocks * block_size,
		};

		struct iovec md_iov = {
			.iov_base = mdisk->malloc_md_buf + bdev_io->u.bdev.offset_blocks * bdev->md_len,
			.iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
		};

		rc = spdk_dix_generate(&iov, 1, &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx);
	}

	if (rc != 0) {
		SPDK_ERRLOG("Formatting by DIF/DIX failed\n");
	}

	return rc;
}

static void
malloc_done(void *ref, int status)
{
	struct malloc_task *task = (struct malloc_task *)ref;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);
	int rc;

	if (status != 0) {
		if (status == -ENOMEM) {
			if (task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
				task->status = SPDK_BDEV_IO_STATUS_NOMEM;
			}
		} else {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	if (--task->num_outstanding != 0) {
		return;
	}

	if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE &&
	    task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_READ:
			if (!spdk_bdev_io_hide_metadata(bdev_io)) {
				rc = malloc_verify_pi_io_buf(bdev_io);
			} else {
				rc = 0;
			}
			break;
		case SPDK_BDEV_IO_TYPE_WRITE:
			if (!spdk_bdev_io_hide_metadata(bdev_io)) {
				rc = 0;
			} else {
				rc = malloc_verify_pi_malloc_buf(bdev_io);
			}
			break;
		case SPDK_BDEV_IO_TYPE_UNMAP:
		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
			rc = malloc_unmap_write_zeroes_generate_pi(bdev_io);
			break;
		default:
			rc = 0;
			break;
		}

		if (rc != 0) {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	assert(!bdev_io->u.bdev.accel_sequence || task->status == SPDK_BDEV_IO_STATUS_NOMEM);
	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
}

static void
malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch,
		     enum spdk_bdev_io_status status)
{
	task->status = status;
	TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq);
}

static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks);

int malloc_disk_count = 0;

static int bdev_malloc_initialize(void);
static void bdev_malloc_deinitialize(void);

static int
bdev_malloc_get_ctx_size(void)
{
	return sizeof(struct malloc_task);
}

static struct spdk_bdev_module malloc_if = {
	.name = "malloc",
	.module_init = bdev_malloc_initialize,
	.module_fini = bdev_malloc_deinitialize,
	.get_ctx_size = bdev_malloc_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if)

static void
malloc_disk_free(struct malloc_disk *malloc_disk)
{
	if (!malloc_disk) {
		return;
	}

	free(malloc_disk->disk.name);
	spdk_free(malloc_disk->malloc_buf);
	spdk_free(malloc_disk->malloc_md_buf);
	free(malloc_disk);
}

static int
bdev_malloc_destruct(void *ctx)
{
	struct malloc_disk *malloc_disk = ctx;

	TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link);
	malloc_disk_free(malloc_disk);
	return 0;
}

static int
bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		if (nbytes < iovs[i].iov_len) {
			return 0;
		}

		nbytes -= iovs[i].iov_len;
	}

	return nbytes != 0;
}

static size_t
malloc_get_md_len(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->u.bdev.num_blocks * bdev_io->bdev->md_len;
}

static uint64_t
malloc_get_md_offset(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->u.bdev.offset_blocks * bdev_io->bdev->md_len;
}

static void *
malloc_get_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct malloc_disk *mdisk = SPDK_CONTAINEROF(bdev_io->bdev, struct malloc_disk, disk);

	assert(spdk_bdev_is_md_separate(bdev_io->bdev));

	return (char *)mdisk->malloc_md_buf + malloc_get_md_offset(bdev_io);
}

static void
malloc_sequence_fail(struct malloc_task *task, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);

	/* For ENOMEM, the IO will be retried by the bdev layer, so we don't abort the sequence */
	if (status != -ENOMEM) {
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	malloc_done(task, status);
}

static void
malloc_sequence_done(void *ctx, int status)
{
	struct malloc_task *task = ctx;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);

	bdev_io->u.bdev.accel_sequence = NULL;
	/* Prevent bdev layer from retrying the request if the sequence failed with ENOMEM */
	malloc_done(task, status != -ENOMEM ? status : -EFAULT);
}

static void
bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		  struct malloc_task *task, struct spdk_bdev_io *bdev_io)
{
	uint64_t len, offset;
	int res = 0;

	len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;

	if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;
	task->iov.iov_base = mdisk->malloc_buf + offset;
	task->iov.iov_len = len;

	SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, bdev_io->u.bdev.iovcnt);

	task->num_outstanding++;
	res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch,
				     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.memory_domain,
				     bdev_io->u.bdev.memory_domain_ctx,
				     &task->iov, 1, NULL, NULL, NULL, NULL);
	if (spdk_unlikely(res != 0)) {
		malloc_sequence_fail(task, res);
		return;
	}

	spdk_accel_sequence_reverse(bdev_io->u.bdev.accel_sequence);
	spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task);

	if (bdev_io->u.bdev.md_buf == NULL) {
		return;
	}

	SPDK_DEBUGLOG(bdev_malloc, "read metadata %zu bytes from offset %#" PRIx64 "\n",
		      malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io));

	task->num_outstanding++;
	res = spdk_accel_submit_copy(ch, bdev_io->u.bdev.md_buf, malloc_get_md_buf(bdev_io),
				     malloc_get_md_len(bdev_io), malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

static void
bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		   struct malloc_task *task, struct spdk_bdev_io *bdev_io)
{
	uint64_t len, offset;
	int res = 0;

	len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;

	if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;
	task->iov.iov_base = mdisk->malloc_buf + offset;
	task->iov.iov_len = len;

	SPDK_DEBUGLOG(bdev_malloc, "write %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, bdev_io->u.bdev.iovcnt);

	task->num_outstanding++;
	res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch, &task->iov, 1, NULL, NULL,
				     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.memory_domain,
				     bdev_io->u.bdev.memory_domain_ctx, NULL, NULL);
	if (spdk_unlikely(res != 0)) {
		malloc_sequence_fail(task, res);
		return;
	}

	spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task);

	if (bdev_io->u.bdev.md_buf == NULL) {
		return;
	}

	SPDK_DEBUGLOG(bdev_malloc, "write metadata %zu bytes to offset %#" PRIx64 "\n",
		      malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io));

	task->num_outstanding++;
	res = spdk_accel_submit_copy(ch, malloc_get_md_buf(bdev_io), bdev_io->u.bdev.md_buf,
				     malloc_get_md_len(bdev_io), malloc_done, task);
	if (res != 0) {
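		/* The metadata copy could not be submitted; report the error through
		 * malloc_done() so the task completes once all outstanding operations finish. */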
		malloc_done(task, res);
	}
}

static int
bdev_malloc_unmap(struct malloc_disk *mdisk,
		  struct spdk_io_channel *ch,
		  struct malloc_task *task,
		  uint64_t offset,
		  uint64_t byte_count)
{
	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0,
				      byte_count, malloc_done, task);
}

static void
bdev_malloc_copy(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		 struct malloc_task *task,
		 uint64_t dst_offset, uint64_t src_offset, size_t len)
{
	int64_t res = 0;
	void *dst = mdisk->malloc_buf + dst_offset;
	void *src = mdisk->malloc_buf + src_offset;

	SPDK_DEBUGLOG(bdev_malloc, "Copy %zu bytes from offset %#" PRIx64 " to offset %#" PRIx64 "\n",
		      len, src_offset, dst_offset);

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	res = spdk_accel_submit_copy(ch, dst, src, len, malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

static int
_bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_task *task = (struct malloc_task *)bdev_io->driver_ctx;
	struct malloc_disk *disk = bdev_io->bdev->ctxt;
	uint32_t block_size = bdev_io->bdev->blocklen;
	int rc;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
			assert(bdev_io->u.bdev.iovcnt == 1);
			assert(bdev_io->u.bdev.memory_domain == NULL);
			bdev_io->u.bdev.iovs[0].iov_base =
				disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size;
			bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size;
			if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
				spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io),
							malloc_get_md_len(bdev_io));
			}
			malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE &&
		    spdk_bdev_io_hide_metadata(bdev_io)) {
			rc = malloc_verify_pi_malloc_buf(bdev_io);
			if (rc != 0) {
				malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
				return 0;
			}
		}

		bdev_malloc_readv(disk, mch->accel_channel, task, bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE:
		if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE &&
		    !spdk_bdev_io_hide_metadata(bdev_io)) {
			rc = malloc_verify_pi_io_buf(bdev_io);
			if (rc != 0) {
				malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
				return 0;
			}
		}

		bdev_malloc_writev(disk, mch->accel_channel, task, bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return bdev_malloc_unmap(disk, mch->accel_channel, task,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		/* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which
		 * zeroes out all of the requested bytes.
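		 * WRITE_ZEROES can therefore share the unmap code path unchanged.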
		 */
		return bdev_malloc_unmap(disk, mch->accel_channel, task,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			void *buf;
			size_t len;

			buf = disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size;
			len = bdev_io->u.bdev.num_blocks * block_size;
			spdk_bdev_io_set_buf(bdev_io, buf, len);
			if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
				spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io),
							malloc_get_md_len(bdev_io));
			}
		}
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_IO_TYPE_ABORT:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
		return 0;
	case SPDK_BDEV_IO_TYPE_COPY:
		bdev_malloc_copy(disk, mch->accel_channel, task,
				 bdev_io->u.bdev.offset_blocks * block_size,
				 bdev_io->u.bdev.copy.src_offset_blocks * block_size,
				 bdev_io->u.bdev.num_blocks * block_size);
		return 0;

	default:
		return -1;
	}
	return 0;
}

static void
bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_channel *mch = spdk_io_channel_get_ctx(ch);

	if (_bdev_malloc_submit_request(mch, bdev_io) != 0) {
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
	case SPDK_BDEV_IO_TYPE_ZCOPY:
	case SPDK_BDEV_IO_TYPE_ABORT:
	case SPDK_BDEV_IO_TYPE_COPY:
		return true;

	default:
		return false;
	}
}

static struct spdk_io_channel *
bdev_malloc_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(&g_malloc_disks);
}

static void
bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_malloc_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	spdk_json_write_named_uint32(w, "physical_block_size", bdev->phys_blocklen);
	spdk_json_write_named_uuid(w, "uuid", &bdev->uuid);
	spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary);
	spdk_json_write_named_uint32(w, "md_size", bdev->md_len);
	spdk_json_write_named_uint32(w, "dif_type", bdev->dif_type);
	spdk_json_write_named_bool(w, "dif_is_head_of_md", bdev->dif_is_head_of_md);
	spdk_json_write_named_uint32(w, "dif_pi_format", bdev->dif_pi_format);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_malloc_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct malloc_disk *malloc_disk = ctx;
	struct spdk_memory_domain *domain;
	int num_domains = 0;

	if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) {
		return 0;
	}

	/* Report support for every memory domain */
	for (domain = spdk_memory_domain_get_first(NULL); domain != NULL;
	     domain = spdk_memory_domain_get_next(domain, NULL)) {
		if (domains != NULL && num_domains < array_size) {
			domains[num_domains] = domain;
		}
		num_domains++;
	}

	return num_domains;
}

static bool
bdev_malloc_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type)
{
	switch (type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	default:
		return false;
	}
}

static const struct spdk_bdev_fn_table malloc_fn_table = {
	.destruct			= bdev_malloc_destruct,
	.submit_request			= bdev_malloc_submit_request,
	.io_type_supported		= bdev_malloc_io_type_supported,
	.get_io_channel			= bdev_malloc_get_io_channel,
	.write_config_json		= bdev_malloc_write_json_config,
	.get_memory_domains		= bdev_malloc_get_memory_domains,
	.accel_sequence_supported	= bdev_malloc_accel_sequence_supported,
};

static int
malloc_disk_setup_pi(struct malloc_disk *mdisk)
{
	struct spdk_bdev *bdev = &mdisk->disk;
	struct spdk_dif_ctx dif_ctx;
	struct iovec iov, md_iov;
	uint32_t dif_check_flags;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = bdev->dif_pi_format;
	/* Set APPTAG|REFTAG_IGNORE to PI fields after creation of malloc bdev */
	dif_check_flags = bdev->dif_check_flags | SPDK_DIF_FLAGS_REFTAG_CHECK |
			  SPDK_DIF_FLAGS_APPTAG_CHECK;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       dif_check_flags,
			       SPDK_DIF_REFTAG_IGNORE,
			       0xFFFF, SPDK_DIF_APPTAG_IGNORE,
			       0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Initialization of DIF/DIX context failed\n");
		return rc;
	}

	iov.iov_base = mdisk->malloc_buf;
	iov.iov_len = bdev->blockcnt * bdev->blocklen;

	if (mdisk->disk.md_interleave) {
		rc = spdk_dif_generate(&iov, 1, bdev->blockcnt, &dif_ctx);
	} else {
		md_iov.iov_base = mdisk->malloc_md_buf;
		md_iov.iov_len = bdev->blockcnt * bdev->md_len;

		rc = spdk_dix_generate(&iov, 1, &md_iov, bdev->blockcnt, &dif_ctx);
	}

	if (rc != 0) {
		SPDK_ERRLOG("Formatting by DIF/DIX failed\n");
	}

	return rc;
}

int
create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
{
	struct malloc_disk *mdisk;
	uint32_t block_size;
	int rc;

	assert(opts != NULL);

	if (opts->num_blocks == 0) {
		SPDK_ERRLOG("Disk num_blocks must be greater than 0\n");
		return -EINVAL;
	}

	if (opts->block_size % 512) {
		SPDK_ERRLOG("Data block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	if (opts->physical_block_size % 512) {
		SPDK_ERRLOG("Physical block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	switch (opts->md_size) {
	case 0:
	case 8:
	case 16:
	case 32:
	case 64:
	case 128:
		break;
	default:
		SPDK_ERRLOG("metadata size %u is not supported\n", opts->md_size);
		return -EINVAL;
	}

	if (opts->md_interleave) {
		block_size = opts->block_size + opts->md_size;
	} else {
		block_size = opts->block_size;
	}

	mdisk = calloc(1, sizeof(*mdisk));
	if (!mdisk) {
		SPDK_ERRLOG("mdisk calloc() failed\n");
		return -ENOMEM;
	}

	/*
	 * Allocate the large backend memory buffer from pinned memory.
	 *
	 * TODO: need to pass a hint so we know which socket to allocate
	 * from on multi-socket systems.
	 */
	mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL,
					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!mdisk->malloc_buf) {
		SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}

	if (!opts->md_interleave && opts->md_size != 0) {
		mdisk->malloc_md_buf = spdk_zmalloc(opts->num_blocks * opts->md_size, 2 * 1024 * 1024, NULL,
						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
		if (!mdisk->malloc_md_buf) {
			SPDK_ERRLOG("malloc_md_buf spdk_zmalloc() failed\n");
			malloc_disk_free(mdisk);
			return -ENOMEM;
		}
	}

	if (opts->name) {
		mdisk->disk.name = strdup(opts->name);
	} else {
		/* Auto-generate a name */
		mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count);
		malloc_disk_count++;
	}
	if (!mdisk->disk.name) {
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}
	mdisk->disk.product_name = "Malloc disk";

	mdisk->disk.write_cache = 1;
	mdisk->disk.blocklen = block_size;
	mdisk->disk.phys_blocklen = opts->physical_block_size;
	mdisk->disk.blockcnt = opts->num_blocks;
	mdisk->disk.md_len = opts->md_size;
	mdisk->disk.md_interleave = opts->md_interleave;
	mdisk->disk.dif_type = opts->dif_type;
	mdisk->disk.dif_is_head_of_md = opts->dif_is_head_of_md;
	/* Current block device layer API does not propagate
	 * any DIF related information from user. So, we can
	 * not generate or verify Application Tag.
	 */
	switch (opts->dif_type) {
	case SPDK_DIF_TYPE1:
	case SPDK_DIF_TYPE2:
		mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK |
					      SPDK_DIF_FLAGS_REFTAG_CHECK;
		break;
	case SPDK_DIF_TYPE3:
		mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK;
		break;
	case SPDK_DIF_DISABLE:
		break;
	}
	mdisk->disk.dif_pi_format = opts->dif_pi_format;

	if (opts->dif_type != SPDK_DIF_DISABLE) {
		rc = malloc_disk_setup_pi(mdisk);
		if (rc) {
			SPDK_ERRLOG("Failed to set up protection information.\n");
			malloc_disk_free(mdisk);
			return rc;
		}
	}

	if (opts->optimal_io_boundary) {
		mdisk->disk.optimal_io_boundary = opts->optimal_io_boundary;
		mdisk->disk.split_on_optimal_io_boundary = true;
	}
	if (!spdk_uuid_is_null(&opts->uuid)) {
		spdk_uuid_copy(&mdisk->disk.uuid, &opts->uuid);
	}

	mdisk->disk.max_copy = 0;
	mdisk->disk.ctxt = mdisk;
	mdisk->disk.fn_table = &malloc_fn_table;
	mdisk->disk.module = &malloc_if;

	rc = spdk_bdev_register(&mdisk->disk);
	if (rc) {
		malloc_disk_free(mdisk);
		return rc;
	}

	*bdev = &(mdisk->disk);

	TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);

	return rc;
}

void
delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg)
{
	int rc;

	rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg);
	if (rc != 0) {
		cb_fn(cb_arg, rc);
	}
}

static int
malloc_completion_poller(void *ctx)
{
	struct malloc_channel *ch = ctx;
	struct malloc_task *task;
	TAILQ_HEAD(, malloc_task) completed_tasks;
	uint32_t num_completions = 0;

	TAILQ_INIT(&completed_tasks);
	TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq);

	while (!TAILQ_EMPTY(&completed_tasks)) {
		task = TAILQ_FIRST(&completed_tasks);
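		/* Detach the task before completing it; the bdev layer may reuse
		 * the I/O as soon as spdk_bdev_io_complete() returns. */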
		TAILQ_REMOVE(&completed_tasks, task, tailq);
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
		num_completions++;
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
malloc_create_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	ch->accel_channel = spdk_accel_get_io_channel();
	if (!ch->accel_channel) {
		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
		return -ENOMEM;
	}

	ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0);
	if (!ch->completion_poller) {
		SPDK_ERRLOG("Failed to register malloc completion poller\n");
		spdk_put_io_channel(ch->accel_channel);
		return -ENOMEM;
	}

	TAILQ_INIT(&ch->completed_tasks);

	return 0;
}

static void
malloc_destroy_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	assert(TAILQ_EMPTY(&ch->completed_tasks));

	spdk_put_io_channel(ch->accel_channel);
	spdk_poller_unregister(&ch->completion_poller);
}

static int
bdev_malloc_initialize(void)
{
	/* This needs to be reset for each reinitialization of submodules.
	 * Otherwise after enough devices or reinitializations the value gets too high.
	 * TODO: Make malloc bdev name mandatory and remove this counter. */
	malloc_disk_count = 0;

	spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb,
				malloc_destroy_channel_cb, sizeof(struct malloc_channel),
				"bdev_malloc");

	return 0;
}

static void
bdev_malloc_deinitialize(void)
{
	spdk_io_device_unregister(&g_malloc_disks, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_malloc)