/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_malloc.h"
#include "spdk/endian.h"
#include "spdk/env.h"
#include "spdk/accel.h"
#include "spdk/dma.h"
#include "spdk/likely.h"
#include "spdk/string.h"

#include "spdk/log.h"

struct malloc_disk {
	struct spdk_bdev disk;
	void *malloc_buf;
	void *malloc_md_buf;
	TAILQ_ENTRY(malloc_disk) link;
};

struct malloc_task {
	struct iovec iov;
	int num_outstanding;
	enum spdk_bdev_io_status status;
	TAILQ_ENTRY(malloc_task) tailq;
};

struct malloc_channel {
	struct spdk_io_channel *accel_channel;
	struct spdk_poller *completion_poller;
	TAILQ_HEAD(, malloc_task) completed_tasks;
};

static int
malloc_verify_pi(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_error err_blk;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	assert(bdev_io->u.bdev.memory_domain == NULL);
	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       bdev->dif_check_flags,
			       bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF,
			       0xFFFF, 0, 0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to initialize DIF/DIX context\n");
		return rc;
	}

	if (spdk_bdev_is_md_interleaved(bdev)) {
		rc = spdk_dif_verify(bdev_io->u.bdev.iovs,
				     bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.num_blocks,
				     &dif_ctx,
				     &err_blk);
	} else {
		struct iovec md_iov = {
			.iov_base = bdev_io->u.bdev.md_buf,
			.iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
		};

		if (bdev_io->u.bdev.md_buf == NULL) {
			return 0;
		}

		rc = spdk_dix_verify(bdev_io->u.bdev.iovs,
				     bdev_io->u.bdev.iovcnt,
				     &md_iov,
				     bdev_io->u.bdev.num_blocks,
				     &dif_ctx,
				     &err_blk);
	}

	if (rc != 0) {
		SPDK_ERRLOG("DIF/DIX verify failed: lba %" PRIu64 ", num_blocks %" PRIu64 ", "
			    "err_type %u, expected %" PRIu64 ", actual %" PRIu64 ", err_offset %u\n",
			    bdev_io->u.bdev.offset_blocks,
			    bdev_io->u.bdev.num_blocks,
			    err_blk.err_type,
			    err_blk.expected,
			    err_blk.actual,
			    err_blk.err_offset);
	}

	return rc;
}

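/*
 * Regenerate protection information for a range that was just rewritten by an
 * unmap or write-zeroes operation. Reference and application tags are set to
 * the "ignore" values, since the zeroed blocks carry no user data to protect.
 */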
static int
malloc_unmap_write_zeroes_generate_pi(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct malloc_disk *mdisk = bdev_io->bdev->ctxt;
	uint32_t block_size = bdev_io->bdev->blocklen;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_ctx_init_ext_opts dif_opts;
	int rc;
	struct iovec iov = {
		.iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size,
		.iov_len = bdev_io->u.bdev.num_blocks * block_size,
	};

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       bdev->dif_check_flags,
			       SPDK_DIF_REFTAG_IGNORE,
			       0xFFFF, SPDK_DIF_APPTAG_IGNORE,
			       0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Initialization of DIF/DIX context failed\n");
		return rc;
	}

	if (bdev->md_interleave) {
		rc = spdk_dif_generate(&iov, 1, bdev_io->u.bdev.num_blocks, &dif_ctx);
	} else {
		struct iovec md_iov = {
			.iov_base = mdisk->malloc_md_buf + bdev_io->u.bdev.offset_blocks * bdev->md_len,
			.iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
		};

		rc = spdk_dix_generate(&iov, 1, &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx);
	}

	if (rc != 0) {
		SPDK_ERRLOG("Formatting by DIF/DIX failed\n");
	}

	return rc;
}

/*
 * Completion callback shared by all accel operations issued for a task. A task
 * may fan out into multiple operations (e.g. data plus metadata copies); the
 * bdev I/O completes only when the last outstanding operation finishes.
 */
static void
malloc_done(void *ref, int status)
{
	struct malloc_task *task = (struct malloc_task *)ref;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);
	int rc;

	if (status != 0) {
		if (status == -ENOMEM) {
			if (task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
				task->status = SPDK_BDEV_IO_STATUS_NOMEM;
			}
		} else {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	if (--task->num_outstanding != 0) {
		return;
	}

	if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE &&
	    bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		rc = malloc_verify_pi(bdev_io);
		if (rc != 0) {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE &&
	    (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP || bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
	    task->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		rc = malloc_unmap_write_zeroes_generate_pi(bdev_io);
		if (rc != 0) {
			task->status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	assert(!bdev_io->u.bdev.accel_sequence || task->status == SPDK_BDEV_IO_STATUS_NOMEM);
	spdk_bdev_io_complete(bdev_io, task->status);
}

static void
malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch,
		     enum spdk_bdev_io_status status)
{
	task->status = status;
	TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq);
}

static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks);

int malloc_disk_count = 0;

static int bdev_malloc_initialize(void);
static void bdev_malloc_deinitialize(void);

static int
bdev_malloc_get_ctx_size(void)
{
	return sizeof(struct malloc_task);
}

static struct spdk_bdev_module malloc_if = {
	.name = "malloc",
	.module_init = bdev_malloc_initialize,
	.module_fini = bdev_malloc_deinitialize,
	.get_ctx_size = bdev_malloc_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if)

static void
malloc_disk_free(struct malloc_disk *malloc_disk)
{
	if (!malloc_disk) {
		return;
	}

	free(malloc_disk->disk.name);
	spdk_free(malloc_disk->malloc_buf);
	spdk_free(malloc_disk->malloc_md_buf);
	free(malloc_disk);
}

static int
bdev_malloc_destruct(void *ctx)
{
	struct malloc_disk *malloc_disk = ctx;

	TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link);
	malloc_disk_free(malloc_disk);
	return 0;
}

/* Returns nonzero if the iovecs provide less than nbytes of buffer space. */
static int
bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		if (nbytes < iovs[i].iov_len) {
			return 0;
		}

		nbytes -= iovs[i].iov_len;
	}

	return nbytes != 0;
}

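/*
 * Helpers for accel-sequence error handling. A failure while building the
 * sequence keeps it alive on -ENOMEM (the bdev layer will retry the request);
 * a failure reported after execution must not be retried, so -ENOMEM is
 * mapped to -EFAULT in that path.
 */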
static void
malloc_sequence_fail(struct malloc_task *task, int status)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);

	/* For ENOMEM, the IO will be retried by the bdev layer, so we don't abort the sequence */
	if (status != -ENOMEM) {
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	malloc_done(task, status);
}

static void
malloc_sequence_done(void *ctx, int status)
{
	struct malloc_task *task = ctx;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task);

	bdev_io->u.bdev.accel_sequence = NULL;
	/* Prevent bdev layer from retrying the request if the sequence failed with ENOMEM */
	malloc_done(task, status != -ENOMEM ? status : -EFAULT);
}

/*
 * Read path: append a copy from the malloc buffer into the destination iovecs
 * to the I/O's accel sequence, reverse the sequence (the copy out of the
 * buffer must run first), and execute it. Separate metadata, if present, is
 * copied with a standalone accel copy.
 */
static void
bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		  struct malloc_task *task, struct spdk_bdev_io *bdev_io)
{
	uint64_t len, offset, md_offset;
	int res = 0;
	size_t md_len;

	len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;

	if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;
	task->iov.iov_base = mdisk->malloc_buf + offset;
	task->iov.iov_len = len;

	SPDK_DEBUGLOG(bdev_malloc, "read %" PRIu64 " bytes from offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, bdev_io->u.bdev.iovcnt);

	task->num_outstanding++;
	res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch,
				     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.memory_domain,
				     bdev_io->u.bdev.memory_domain_ctx,
				     &task->iov, 1, NULL, NULL, 0, NULL, NULL);
	if (spdk_unlikely(res != 0)) {
		malloc_sequence_fail(task, res);
		return;
	}

	spdk_accel_sequence_reverse(bdev_io->u.bdev.accel_sequence);
	spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task);

	if (bdev_io->u.bdev.md_buf == NULL) {
		return;
	}

	md_len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->md_len;
	md_offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->md_len;

	SPDK_DEBUGLOG(bdev_malloc, "read metadata %zu bytes from offset %#" PRIx64 "\n",
		      md_len, md_offset);

	task->num_outstanding++;
	res = spdk_accel_submit_copy(ch, bdev_io->u.bdev.md_buf, mdisk->malloc_md_buf + md_offset,
				     md_len, 0, malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

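/*
 * Write path: append a copy from the source iovecs into the malloc buffer to
 * the tail of the I/O's accel sequence, so any operations already staged on
 * the sequence execute first. Separate metadata, if present, is copied with a
 * standalone accel copy.
 */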
static void
bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		   struct malloc_task *task, struct spdk_bdev_io *bdev_io)
{
	uint64_t len, offset, md_offset;
	int res = 0;
	size_t md_len;

	len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
	offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;

	if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 0;
	task->iov.iov_base = mdisk->malloc_buf + offset;
	task->iov.iov_len = len;

	SPDK_DEBUGLOG(bdev_malloc, "write %" PRIu64 " bytes to offset %#" PRIx64 ", iovcnt=%d\n",
		      len, offset, bdev_io->u.bdev.iovcnt);

	task->num_outstanding++;
	res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch, &task->iov, 1, NULL, NULL,
				     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
				     bdev_io->u.bdev.memory_domain,
				     bdev_io->u.bdev.memory_domain_ctx, 0, NULL, NULL);
	if (spdk_unlikely(res != 0)) {
		malloc_sequence_fail(task, res);
		return;
	}

	spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task);

	if (bdev_io->u.bdev.md_buf == NULL) {
		return;
	}

	md_len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->md_len;
	md_offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->md_len;

	SPDK_DEBUGLOG(bdev_malloc, "write metadata %zu bytes to offset %#" PRIx64 "\n",
		      md_len, md_offset);

	task->num_outstanding++;
	res = spdk_accel_submit_copy(ch, mdisk->malloc_md_buf + md_offset, bdev_io->u.bdev.md_buf,
				     md_len, 0, malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

static int
bdev_malloc_unmap(struct malloc_disk *mdisk,
		  struct spdk_io_channel *ch,
		  struct malloc_task *task,
		  uint64_t offset,
		  uint64_t byte_count)
{
	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0,
				      byte_count, 0, malloc_done, task);
}

static void
bdev_malloc_copy(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		 struct malloc_task *task,
		 uint64_t dst_offset, uint64_t src_offset, size_t len)
{
	int res = 0;
	void *dst = mdisk->malloc_buf + dst_offset;
	void *src = mdisk->malloc_buf + src_offset;

	SPDK_DEBUGLOG(bdev_malloc, "Copy %zu bytes from offset %#" PRIx64 " to offset %#" PRIx64 "\n",
		      len, src_offset, dst_offset);

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	task->num_outstanding = 1;

	res = spdk_accel_submit_copy(ch, dst, src, len, 0, malloc_done, task);
	if (res != 0) {
		malloc_done(task, res);
	}
}

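/*
 * Dispatch a bdev I/O to the handler for its type. Returns nonzero only for
 * unsupported I/O types; per-I/O errors are reported through task completion.
 */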
static int
_bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_task *task = (struct malloc_task *)bdev_io->driver_ctx;
	struct malloc_disk *disk = bdev_io->bdev->ctxt;
	uint32_t block_size = bdev_io->bdev->blocklen;
	int rc;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
			assert(bdev_io->u.bdev.iovcnt == 1);
			assert(bdev_io->u.bdev.memory_domain == NULL);
			bdev_io->u.bdev.iovs[0].iov_base =
				disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size;
			bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size;
			malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		bdev_malloc_readv(disk, mch->accel_channel, task, bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE:
		if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE) {
			rc = malloc_verify_pi(bdev_io);
			if (rc != 0) {
				malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
				return 0;
			}
		}

		bdev_malloc_writev(disk, mch->accel_channel, task, bdev_io);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_FLUSH:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return bdev_malloc_unmap(disk, mch->accel_channel, task,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		/* bdev_malloc_unmap is implemented as an accel fill with a zero pattern,
		 * so it zeroes out all of the requested bytes. */
		return bdev_malloc_unmap(disk, mch->accel_channel, task,
					 bdev_io->u.bdev.offset_blocks * block_size,
					 bdev_io->u.bdev.num_blocks * block_size);

	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			void *buf;
			size_t len;

			buf = disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size;
			len = bdev_io->u.bdev.num_blocks * block_size;
			spdk_bdev_io_set_buf(bdev_io, buf, len);
		}
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;

	case SPDK_BDEV_IO_TYPE_ABORT:
		malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED);
		return 0;

	case SPDK_BDEV_IO_TYPE_COPY:
		bdev_malloc_copy(disk, mch->accel_channel, task,
				 bdev_io->u.bdev.offset_blocks * block_size,
				 bdev_io->u.bdev.copy.src_offset_blocks * block_size,
				 bdev_io->u.bdev.num_blocks * block_size);
		return 0;

	default:
		return -1;
	}
}

static void
bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct malloc_channel *mch = spdk_io_channel_get_ctx(ch);

	if (_bdev_malloc_submit_request(mch, bdev_io) != 0) {
		malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch,
				     SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_UNMAP:
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
	case SPDK_BDEV_IO_TYPE_ZCOPY:
	case SPDK_BDEV_IO_TYPE_ABORT:
	case SPDK_BDEV_IO_TYPE_COPY:
		return true;

	default:
		return false;
	}
}

static struct spdk_io_channel *
bdev_malloc_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(&g_malloc_disks);
}

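/*
 * Emit the bdev_malloc_create RPC call that would recreate this bdev when the
 * configuration is saved and replayed.
 */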
static void
bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_malloc_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	spdk_json_write_named_uint32(w, "physical_block_size", bdev->phys_blocklen);
	spdk_json_write_named_uuid(w, "uuid", &bdev->uuid);
	spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_malloc_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct malloc_disk *malloc_disk = ctx;
	struct spdk_memory_domain *domain;
	int num_domains = 0;

	if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) {
		return 0;
	}

	/* Report support for every memory domain */
	for (domain = spdk_memory_domain_get_first(NULL); domain != NULL;
	     domain = spdk_memory_domain_get_next(domain, NULL)) {
		if (domains != NULL && num_domains < array_size) {
			domains[num_domains] = domain;
		}
		num_domains++;
	}

	return num_domains;
}

static bool
bdev_malloc_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type)
{
	struct malloc_disk *malloc_disk = ctx;

	if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) {
		return false;
	}

	switch (type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	default:
		return false;
	}
}

static const struct spdk_bdev_fn_table malloc_fn_table = {
	.destruct = bdev_malloc_destruct,
	.submit_request = bdev_malloc_submit_request,
	.io_type_supported = bdev_malloc_io_type_supported,
	.get_io_channel = bdev_malloc_get_io_channel,
	.write_config_json = bdev_malloc_write_json_config,
	.get_memory_domains = bdev_malloc_get_memory_domains,
	.accel_sequence_supported = bdev_malloc_accel_sequence_supported,
};

static int
malloc_disk_setup_pi(struct malloc_disk *mdisk)
{
	struct spdk_bdev *bdev = &mdisk->disk;
	struct spdk_dif_ctx dif_ctx;
	struct iovec iov, md_iov;
	int rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	/* Set APPTAG|REFTAG_IGNORE to PI fields after creation of malloc bdev */
	rc = spdk_dif_ctx_init(&dif_ctx,
			       bdev->blocklen,
			       bdev->md_len,
			       bdev->md_interleave,
			       bdev->dif_is_head_of_md,
			       bdev->dif_type,
			       bdev->dif_check_flags,
			       SPDK_DIF_REFTAG_IGNORE,
			       0xFFFF, SPDK_DIF_APPTAG_IGNORE,
			       0, 0, &dif_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Initialization of DIF/DIX context failed\n");
		return rc;
	}

	iov.iov_base = mdisk->malloc_buf;
	iov.iov_len = bdev->blockcnt * bdev->blocklen;

	if (mdisk->disk.md_interleave) {
		rc = spdk_dif_generate(&iov, 1, bdev->blockcnt, &dif_ctx);
	} else {
		md_iov.iov_base = mdisk->malloc_md_buf;
		md_iov.iov_len = bdev->blockcnt * bdev->md_len;

		rc = spdk_dix_generate(&iov, 1, &md_iov, bdev->blockcnt, &dif_ctx);
	}

	if (rc != 0) {
		SPDK_ERRLOG("Formatting by DIF/DIX failed\n");
	}

	return rc;
}

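/*
 * Create a malloc bdev: validate the options, allocate pinned data (and
 * optional separate metadata) buffers, format protection information when
 * DIF/DIX is enabled, and register the bdev. A minimal caller sketch follows;
 * the values are illustrative only and error handling is elided:
 *
 *	struct malloc_bdev_opts opts = {};
 *	struct spdk_bdev *bdev = NULL;
 *	int rc;
 *
 *	opts.name = "Malloc0";        // optional; auto-generated if NULL
 *	opts.num_blocks = 32768;      // 16 MiB at 512-byte blocks
 *	opts.block_size = 512;        // must be a multiple of 512
 *	rc = create_malloc_disk(&bdev, &opts);
 */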
int
create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts)
{
	struct malloc_disk *mdisk;
	uint32_t block_size;
	int rc;

	assert(opts != NULL);

	if (opts->num_blocks == 0) {
		SPDK_ERRLOG("Disk num_blocks must be greater than 0\n");
		return -EINVAL;
	}

	if (opts->block_size % 512) {
		SPDK_ERRLOG("Data block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	if (opts->physical_block_size % 512) {
		SPDK_ERRLOG("Physical block size must be a multiple of 512 bytes\n");
		return -EINVAL;
	}

	switch (opts->md_size) {
	case 0:
	case 8:
	case 16:
	case 32:
	case 64:
	case 128:
		break;
	default:
		SPDK_ERRLOG("metadata size %u is not supported\n", opts->md_size);
		return -EINVAL;
	}

	if (opts->md_interleave) {
		block_size = opts->block_size + opts->md_size;
	} else {
		block_size = opts->block_size;
	}

	if (opts->dif_type < SPDK_DIF_DISABLE || opts->dif_type > SPDK_DIF_TYPE3) {
		SPDK_ERRLOG("DIF type is invalid\n");
		return -EINVAL;
	}

	if (opts->dif_type != SPDK_DIF_DISABLE && opts->md_size == 0) {
		SPDK_ERRLOG("Metadata size should not be zero if DIF is enabled\n");
		return -EINVAL;
	}

	mdisk = calloc(1, sizeof(*mdisk));
	if (!mdisk) {
		SPDK_ERRLOG("mdisk calloc() failed\n");
		return -ENOMEM;
	}

	/*
	 * Allocate the large backend memory buffer from pinned memory.
	 *
	 * TODO: need to pass a hint so we know which socket to allocate
	 * from on multi-socket systems.
	 */
	mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL,
					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!mdisk->malloc_buf) {
		SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}

	if (!opts->md_interleave && opts->md_size != 0) {
		mdisk->malloc_md_buf = spdk_zmalloc(opts->num_blocks * opts->md_size, 2 * 1024 * 1024, NULL,
						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
		if (!mdisk->malloc_md_buf) {
			SPDK_ERRLOG("malloc_md_buf spdk_zmalloc() failed\n");
			malloc_disk_free(mdisk);
			return -ENOMEM;
		}
	}

	if (opts->name) {
		mdisk->disk.name = strdup(opts->name);
	} else {
		/* Auto-generate a name */
		mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count);
		malloc_disk_count++;
	}
	if (!mdisk->disk.name) {
		malloc_disk_free(mdisk);
		return -ENOMEM;
	}
	mdisk->disk.product_name = "Malloc disk";

	mdisk->disk.write_cache = 1;
	mdisk->disk.blocklen = block_size;
	mdisk->disk.phys_blocklen = opts->physical_block_size;
	mdisk->disk.blockcnt = opts->num_blocks;
	mdisk->disk.md_len = opts->md_size;
	mdisk->disk.md_interleave = opts->md_interleave;
	mdisk->disk.dif_type = opts->dif_type;
	mdisk->disk.dif_is_head_of_md = opts->dif_is_head_of_md;
	/* The current block device layer API does not propagate
	 * any DIF-related information from the user, so we cannot
	 * generate or verify the Application Tag.
	 */
	switch (opts->dif_type) {
	case SPDK_DIF_TYPE1:
	case SPDK_DIF_TYPE2:
		mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK |
					      SPDK_DIF_FLAGS_REFTAG_CHECK;
		break;
	case SPDK_DIF_TYPE3:
		mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK;
		break;
	case SPDK_DIF_DISABLE:
		break;
	}

	if (opts->dif_type != SPDK_DIF_DISABLE) {
		rc = malloc_disk_setup_pi(mdisk);
		if (rc) {
			SPDK_ERRLOG("Failed to set up protection information.\n");
			malloc_disk_free(mdisk);
			return rc;
		}
	}

	if (opts->optimal_io_boundary) {
		mdisk->disk.optimal_io_boundary = opts->optimal_io_boundary;
		mdisk->disk.split_on_optimal_io_boundary = true;
	}
	if (!spdk_uuid_is_null(&opts->uuid)) {
		spdk_uuid_copy(&mdisk->disk.uuid, &opts->uuid);
	}

	mdisk->disk.max_copy = 0;
	mdisk->disk.ctxt = mdisk;
	mdisk->disk.fn_table = &malloc_fn_table;
	mdisk->disk.module = &malloc_if;

	rc = spdk_bdev_register(&mdisk->disk);
	if (rc) {
		malloc_disk_free(mdisk);
		return rc;
	}

	*bdev = &(mdisk->disk);

	TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);

	return rc;
}

void
delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg)
{
	int rc;

	rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg);
	if (rc != 0) {
		cb_fn(cb_arg, rc);
	}
}

/* Completions queued by malloc_complete_task() are finished here, outside the
 * submission path. */
static int
malloc_completion_poller(void *ctx)
{
	struct malloc_channel *ch = ctx;
	struct malloc_task *task;
	TAILQ_HEAD(, malloc_task) completed_tasks;
	uint32_t num_completions = 0;

	TAILQ_INIT(&completed_tasks);
	TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq);

	while (!TAILQ_EMPTY(&completed_tasks)) {
		task = TAILQ_FIRST(&completed_tasks);
		TAILQ_REMOVE(&completed_tasks, task, tailq);
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
		num_completions++;
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
malloc_create_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	ch->accel_channel = spdk_accel_get_io_channel();
	if (!ch->accel_channel) {
		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
		return -ENOMEM;
	}

	ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0);
	if (!ch->completion_poller) {
		SPDK_ERRLOG("Failed to register malloc completion poller\n");
		spdk_put_io_channel(ch->accel_channel);
		return -ENOMEM;
	}

	TAILQ_INIT(&ch->completed_tasks);

	return 0;
}

static void
malloc_destroy_channel_cb(void *io_device, void *ctx)
{
	struct malloc_channel *ch = ctx;

	assert(TAILQ_EMPTY(&ch->completed_tasks));

	spdk_put_io_channel(ch->accel_channel);
	spdk_poller_unregister(&ch->completion_poller);
}

/* Module init registers g_malloc_disks as an io_device, so per-thread
 * malloc_channels can be obtained through spdk_get_io_channel(). */
static int
bdev_malloc_initialize(void)
{
	/* This needs to be reset for each reinitialization of submodules.
	 * Otherwise after enough devices or reinitializations the value gets too high.
	 * TODO: Make malloc bdev name mandatory and remove this counter. */
	malloc_disk_count = 0;

	spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb,
				malloc_destroy_channel_cb, sizeof(struct malloc_channel),
				"bdev_malloc");

	return 0;
}

static void
bdev_malloc_deinitialize(void)
{
	spdk_io_device_unregister(&g_malloc_disks, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_malloc)
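/*
 * For reference, bdev_malloc_write_json_config() above emits one object of the
 * following shape per bdev when the configuration is saved (values are
 * illustrative):
 *
 *	{
 *	  "method": "bdev_malloc_create",
 *	  "params": {
 *	    "name": "Malloc0",
 *	    "num_blocks": 32768,
 *	    "block_size": 512,
 *	    "physical_block_size": 512,
 *	    "uuid": "...",
 *	    "optimal_io_boundary": 0
 *	  }
 *	}
 */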