1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "bdev_malloc.h" 10 #include "spdk/endian.h" 11 #include "spdk/env.h" 12 #include "spdk/accel.h" 13 #include "spdk/dma.h" 14 #include "spdk/likely.h" 15 #include "spdk/string.h" 16 17 #include "spdk/log.h" 18 19 struct malloc_disk { 20 struct spdk_bdev disk; 21 void *malloc_buf; 22 void *malloc_md_buf; 23 TAILQ_ENTRY(malloc_disk) link; 24 }; 25 26 struct malloc_task { 27 struct iovec iov; 28 int num_outstanding; 29 enum spdk_bdev_io_status status; 30 TAILQ_ENTRY(malloc_task) tailq; 31 }; 32 33 struct malloc_channel { 34 struct spdk_io_channel *accel_channel; 35 struct spdk_poller *completion_poller; 36 TAILQ_HEAD(, malloc_task) completed_tasks; 37 }; 38 39 static int 40 malloc_verify_pi(struct spdk_bdev_io *bdev_io) 41 { 42 struct spdk_bdev *bdev = bdev_io->bdev; 43 struct spdk_dif_ctx dif_ctx; 44 struct spdk_dif_error err_blk; 45 int rc; 46 struct spdk_dif_ctx_init_ext_opts dif_opts; 47 48 assert(bdev_io->u.bdev.memory_domain == NULL); 49 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 50 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 51 rc = spdk_dif_ctx_init(&dif_ctx, 52 bdev->blocklen, 53 bdev->md_len, 54 bdev->md_interleave, 55 bdev->dif_is_head_of_md, 56 bdev->dif_type, 57 bdev_io->u.bdev.dif_check_flags, 58 bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF, 59 0xFFFF, 0, 0, 0, &dif_opts); 60 if (rc != 0) { 61 SPDK_ERRLOG("Failed to initialize DIF/DIX context\n"); 62 return rc; 63 } 64 65 if (spdk_bdev_is_md_interleaved(bdev)) { 66 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, 67 bdev_io->u.bdev.iovcnt, 68 bdev_io->u.bdev.num_blocks, 69 &dif_ctx, 70 &err_blk); 71 } else { 72 struct iovec md_iov = { 73 .iov_base = bdev_io->u.bdev.md_buf, 74 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 75 }; 76 77 if (bdev_io->u.bdev.md_buf == NULL) { 78 return 0; 79 } 80 81 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, 82 bdev_io->u.bdev.iovcnt, 83 &md_iov, 84 bdev_io->u.bdev.num_blocks, 85 &dif_ctx, 86 &err_blk); 87 } 88 89 if (rc != 0) { 90 SPDK_ERRLOG("DIF/DIX verify failed: lba %" PRIu64 ", num_blocks %" PRIu64 ", " 91 "err_type %u, expected %lu, actual %lu, err_offset %u\n", 92 bdev_io->u.bdev.offset_blocks, 93 bdev_io->u.bdev.num_blocks, 94 err_blk.err_type, 95 err_blk.expected, 96 err_blk.actual, 97 err_blk.err_offset); 98 } 99 100 return rc; 101 } 102 103 static int 104 malloc_unmap_write_zeroes_generate_pi(struct spdk_bdev_io *bdev_io) 105 { 106 struct spdk_bdev *bdev = bdev_io->bdev; 107 struct malloc_disk *mdisk = bdev_io->bdev->ctxt; 108 uint32_t block_size = bdev_io->bdev->blocklen; 109 struct spdk_dif_ctx dif_ctx; 110 struct spdk_dif_ctx_init_ext_opts dif_opts; 111 int rc; 112 113 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 114 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 115 rc = spdk_dif_ctx_init(&dif_ctx, 116 bdev->blocklen, 117 bdev->md_len, 118 bdev->md_interleave, 119 bdev->dif_is_head_of_md, 120 bdev->dif_type, 121 bdev->dif_check_flags, 122 SPDK_DIF_REFTAG_IGNORE, 123 0xFFFF, SPDK_DIF_APPTAG_IGNORE, 124 0, 0, &dif_opts); 125 if (rc != 0) { 126 SPDK_ERRLOG("Initialization of DIF/DIX context failed\n"); 127 return rc; 128 } 129 130 if (bdev->md_interleave) { 131 struct iovec iov = { 132 .iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size, 133 .iov_len = bdev_io->u.bdev.num_blocks * block_size, 134 }; 135 136 rc = spdk_dif_generate(&iov, 1, bdev_io->u.bdev.num_blocks, &dif_ctx); 137 } else { 138 struct iovec iov = { 139 .iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size, 140 .iov_len = bdev_io->u.bdev.num_blocks * block_size, 141 }; 142 143 struct iovec md_iov = { 144 .iov_base = mdisk->malloc_md_buf + bdev_io->u.bdev.offset_blocks * bdev->md_len, 145 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 146 }; 147 148 rc = spdk_dix_generate(&iov, 1, &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx); 149 } 150 151 if (rc != 0) { 152 SPDK_ERRLOG("Formatting by DIF/DIX failed\n"); 153 } 154 155 156 return rc; 157 } 158 159 static void 160 malloc_done(void *ref, int status) 161 { 162 struct malloc_task *task = (struct malloc_task *)ref; 163 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task); 164 int rc; 165 166 if (status != 0) { 167 if (status == -ENOMEM) { 168 if (task->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 169 task->status = SPDK_BDEV_IO_STATUS_NOMEM; 170 } 171 } else { 172 task->status = SPDK_BDEV_IO_STATUS_FAILED; 173 } 174 } 175 176 if (--task->num_outstanding != 0) { 177 return; 178 } 179 180 if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE && 181 bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 182 task->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 183 rc = malloc_verify_pi(bdev_io); 184 if (rc != 0) { 185 task->status = SPDK_BDEV_IO_STATUS_FAILED; 186 } 187 } 188 189 if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE && 190 (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP || bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 191 task->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 192 rc = malloc_unmap_write_zeroes_generate_pi(bdev_io); 193 if (rc != 0) { 194 task->status = SPDK_BDEV_IO_STATUS_FAILED; 195 } 196 } 197 198 assert(!bdev_io->u.bdev.accel_sequence || task->status == SPDK_BDEV_IO_STATUS_NOMEM); 199 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); 200 } 201 202 static void 203 malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch, 204 enum spdk_bdev_io_status status) 205 { 206 task->status = status; 207 TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq); 208 } 209 210 static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks); 211 212 int malloc_disk_count = 0; 213 214 static int bdev_malloc_initialize(void); 215 static void bdev_malloc_deinitialize(void); 216 217 static int 218 bdev_malloc_get_ctx_size(void) 219 { 220 return sizeof(struct malloc_task); 221 } 222 223 static struct spdk_bdev_module malloc_if = { 224 .name = "malloc", 225 .module_init = bdev_malloc_initialize, 226 .module_fini = bdev_malloc_deinitialize, 227 .get_ctx_size = bdev_malloc_get_ctx_size, 228 229 }; 230 231 SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if) 232 233 static void 234 malloc_disk_free(struct malloc_disk *malloc_disk) 235 { 236 if (!malloc_disk) { 237 return; 238 } 239 240 free(malloc_disk->disk.name); 241 spdk_free(malloc_disk->malloc_buf); 242 spdk_free(malloc_disk->malloc_md_buf); 243 free(malloc_disk); 244 } 245 246 static int 247 bdev_malloc_destruct(void *ctx) 248 { 249 struct malloc_disk *malloc_disk = ctx; 250 251 TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link); 252 malloc_disk_free(malloc_disk); 253 return 0; 254 } 255 256 static int 257 bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes) 258 { 259 int i; 260 261 for (i = 0; i < iovcnt; i++) { 262 if (nbytes < iovs[i].iov_len) { 263 return 0; 264 } 265 266 nbytes -= iovs[i].iov_len; 267 } 268 269 return nbytes != 0; 270 } 271 272 static size_t 273 malloc_get_md_len(struct spdk_bdev_io *bdev_io) 274 { 275 return bdev_io->u.bdev.num_blocks * bdev_io->bdev->md_len; 276 } 277 278 static uint64_t 279 malloc_get_md_offset(struct spdk_bdev_io *bdev_io) 280 { 281 return bdev_io->u.bdev.offset_blocks * bdev_io->bdev->md_len; 282 } 283 284 static void * 285 malloc_get_md_buf(struct spdk_bdev_io *bdev_io) 286 { 287 struct malloc_disk *mdisk = SPDK_CONTAINEROF(bdev_io->bdev, struct malloc_disk, disk); 288 289 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 290 291 return (char *)mdisk->malloc_md_buf + malloc_get_md_offset(bdev_io); 292 } 293 294 static void 295 malloc_sequence_fail(struct malloc_task *task, int status) 296 { 297 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task); 298 299 /* For ENOMEM, the IO will be retried by the bdev layer, so we don't abort the sequence */ 300 if (status != -ENOMEM) { 301 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 302 bdev_io->u.bdev.accel_sequence = NULL; 303 } 304 305 malloc_done(task, status); 306 } 307 308 static void 309 malloc_sequence_done(void *ctx, int status) 310 { 311 struct malloc_task *task = ctx; 312 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task); 313 314 bdev_io->u.bdev.accel_sequence = NULL; 315 /* Prevent bdev layer from retrying the request if the sequence failed with ENOMEM */ 316 malloc_done(task, status != -ENOMEM ? status : -EFAULT); 317 } 318 319 static void 320 bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 321 struct malloc_task *task, struct spdk_bdev_io *bdev_io) 322 { 323 uint64_t len, offset; 324 int res = 0; 325 326 len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 327 offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; 328 329 if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) { 330 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), 331 SPDK_BDEV_IO_STATUS_FAILED); 332 return; 333 } 334 335 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 336 task->num_outstanding = 0; 337 task->iov.iov_base = mdisk->malloc_buf + offset; 338 task->iov.iov_len = len; 339 340 SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n", 341 len, offset, bdev_io->u.bdev.iovcnt); 342 343 task->num_outstanding++; 344 res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch, 345 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 346 bdev_io->u.bdev.memory_domain, 347 bdev_io->u.bdev.memory_domain_ctx, 348 &task->iov, 1, NULL, NULL, 0, NULL, NULL); 349 if (spdk_unlikely(res != 0)) { 350 malloc_sequence_fail(task, res); 351 return; 352 } 353 354 spdk_accel_sequence_reverse(bdev_io->u.bdev.accel_sequence); 355 spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task); 356 357 if (bdev_io->u.bdev.md_buf == NULL) { 358 return; 359 } 360 361 SPDK_DEBUGLOG(bdev_malloc, "read metadata %zu bytes from offset%#" PRIx64 "\n", 362 malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io)); 363 364 task->num_outstanding++; 365 res = spdk_accel_submit_copy(ch, bdev_io->u.bdev.md_buf, malloc_get_md_buf(bdev_io), 366 malloc_get_md_len(bdev_io), 0, malloc_done, task); 367 if (res != 0) { 368 malloc_done(task, res); 369 } 370 } 371 372 static void 373 bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 374 struct malloc_task *task, struct spdk_bdev_io *bdev_io) 375 { 376 uint64_t len, offset; 377 int res = 0; 378 379 len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 380 offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; 381 382 if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) { 383 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), 384 SPDK_BDEV_IO_STATUS_FAILED); 385 return; 386 } 387 388 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 389 task->num_outstanding = 0; 390 task->iov.iov_base = mdisk->malloc_buf + offset; 391 task->iov.iov_len = len; 392 393 SPDK_DEBUGLOG(bdev_malloc, "write %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n", 394 len, offset, bdev_io->u.bdev.iovcnt); 395 396 task->num_outstanding++; 397 res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch, &task->iov, 1, NULL, NULL, 398 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 399 bdev_io->u.bdev.memory_domain, 400 bdev_io->u.bdev.memory_domain_ctx, 0, NULL, NULL); 401 if (spdk_unlikely(res != 0)) { 402 malloc_sequence_fail(task, res); 403 return; 404 } 405 406 spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task); 407 408 if (bdev_io->u.bdev.md_buf == NULL) { 409 return; 410 } 411 412 SPDK_DEBUGLOG(bdev_malloc, "write metadata %zu bytes to offset %#" PRIx64 "\n", 413 malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io)); 414 415 task->num_outstanding++; 416 res = spdk_accel_submit_copy(ch, malloc_get_md_buf(bdev_io), bdev_io->u.bdev.md_buf, 417 malloc_get_md_len(bdev_io), 0, malloc_done, task); 418 if (res != 0) { 419 malloc_done(task, res); 420 } 421 } 422 423 static int 424 bdev_malloc_unmap(struct malloc_disk *mdisk, 425 struct spdk_io_channel *ch, 426 struct malloc_task *task, 427 uint64_t offset, 428 uint64_t byte_count) 429 { 430 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 431 task->num_outstanding = 1; 432 433 return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0, 434 byte_count, 0, malloc_done, task); 435 } 436 437 static void 438 bdev_malloc_copy(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 439 struct malloc_task *task, 440 uint64_t dst_offset, uint64_t src_offset, size_t len) 441 { 442 int64_t res = 0; 443 void *dst = mdisk->malloc_buf + dst_offset; 444 void *src = mdisk->malloc_buf + src_offset; 445 446 SPDK_DEBUGLOG(bdev_malloc, "Copy %zu bytes from offset %#" PRIx64 " to offset %#" PRIx64 "\n", 447 len, src_offset, dst_offset); 448 449 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 450 task->num_outstanding = 1; 451 452 res = spdk_accel_submit_copy(ch, dst, src, len, 0, malloc_done, task); 453 if (res != 0) { 454 malloc_done(task, res); 455 } 456 } 457 458 static int 459 _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io) 460 { 461 struct malloc_task *task = (struct malloc_task *)bdev_io->driver_ctx; 462 struct malloc_disk *disk = bdev_io->bdev->ctxt; 463 uint32_t block_size = bdev_io->bdev->blocklen; 464 int rc; 465 466 switch (bdev_io->type) { 467 case SPDK_BDEV_IO_TYPE_READ: 468 if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { 469 assert(bdev_io->u.bdev.iovcnt == 1); 470 assert(bdev_io->u.bdev.memory_domain == NULL); 471 bdev_io->u.bdev.iovs[0].iov_base = 472 disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size; 473 bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size; 474 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 475 spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io), 476 malloc_get_md_len(bdev_io)); 477 } 478 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS); 479 return 0; 480 } 481 482 bdev_malloc_readv(disk, mch->accel_channel, task, bdev_io); 483 return 0; 484 485 case SPDK_BDEV_IO_TYPE_WRITE: 486 if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE) { 487 rc = malloc_verify_pi(bdev_io); 488 if (rc != 0) { 489 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED); 490 return 0; 491 } 492 } 493 494 bdev_malloc_writev(disk, mch->accel_channel, task, bdev_io); 495 return 0; 496 497 case SPDK_BDEV_IO_TYPE_RESET: 498 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS); 499 return 0; 500 501 case SPDK_BDEV_IO_TYPE_FLUSH: 502 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS); 503 return 0; 504 505 case SPDK_BDEV_IO_TYPE_UNMAP: 506 return bdev_malloc_unmap(disk, mch->accel_channel, task, 507 bdev_io->u.bdev.offset_blocks * block_size, 508 bdev_io->u.bdev.num_blocks * block_size); 509 510 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 511 /* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which zeroes out all of the requested bytes. */ 512 return bdev_malloc_unmap(disk, mch->accel_channel, task, 513 bdev_io->u.bdev.offset_blocks * block_size, 514 bdev_io->u.bdev.num_blocks * block_size); 515 516 case SPDK_BDEV_IO_TYPE_ZCOPY: 517 if (bdev_io->u.bdev.zcopy.start) { 518 void *buf; 519 size_t len; 520 521 buf = disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size; 522 len = bdev_io->u.bdev.num_blocks * block_size; 523 spdk_bdev_io_set_buf(bdev_io, buf, len); 524 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 525 spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io), 526 malloc_get_md_len(bdev_io)); 527 } 528 } 529 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS); 530 return 0; 531 case SPDK_BDEV_IO_TYPE_ABORT: 532 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED); 533 return 0; 534 case SPDK_BDEV_IO_TYPE_COPY: 535 bdev_malloc_copy(disk, mch->accel_channel, task, 536 bdev_io->u.bdev.offset_blocks * block_size, 537 bdev_io->u.bdev.copy.src_offset_blocks * block_size, 538 bdev_io->u.bdev.num_blocks * block_size); 539 return 0; 540 541 default: 542 return -1; 543 } 544 return 0; 545 } 546 547 static void 548 bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 549 { 550 struct malloc_channel *mch = spdk_io_channel_get_ctx(ch); 551 552 if (_bdev_malloc_submit_request(mch, bdev_io) != 0) { 553 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 554 SPDK_BDEV_IO_STATUS_FAILED); 555 } 556 } 557 558 static bool 559 bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 560 { 561 switch (io_type) { 562 case SPDK_BDEV_IO_TYPE_READ: 563 case SPDK_BDEV_IO_TYPE_WRITE: 564 case SPDK_BDEV_IO_TYPE_FLUSH: 565 case SPDK_BDEV_IO_TYPE_RESET: 566 case SPDK_BDEV_IO_TYPE_UNMAP: 567 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 568 case SPDK_BDEV_IO_TYPE_ZCOPY: 569 case SPDK_BDEV_IO_TYPE_ABORT: 570 case SPDK_BDEV_IO_TYPE_COPY: 571 return true; 572 573 default: 574 return false; 575 } 576 } 577 578 static struct spdk_io_channel * 579 bdev_malloc_get_io_channel(void *ctx) 580 { 581 return spdk_get_io_channel(&g_malloc_disks); 582 } 583 584 static void 585 bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 586 { 587 spdk_json_write_object_begin(w); 588 589 spdk_json_write_named_string(w, "method", "bdev_malloc_create"); 590 591 spdk_json_write_named_object_begin(w, "params"); 592 spdk_json_write_named_string(w, "name", bdev->name); 593 spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); 594 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 595 spdk_json_write_named_uint32(w, "physical_block_size", bdev->phys_blocklen); 596 spdk_json_write_named_uuid(w, "uuid", &bdev->uuid); 597 spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary); 598 599 spdk_json_write_object_end(w); 600 601 spdk_json_write_object_end(w); 602 } 603 604 static int 605 bdev_malloc_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 606 { 607 struct malloc_disk *malloc_disk = ctx; 608 struct spdk_memory_domain *domain; 609 int num_domains = 0; 610 611 if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) { 612 return 0; 613 } 614 615 /* Report support for every memory domain */ 616 for (domain = spdk_memory_domain_get_first(NULL); domain != NULL; 617 domain = spdk_memory_domain_get_next(domain, NULL)) { 618 if (domains != NULL && num_domains < array_size) { 619 domains[num_domains] = domain; 620 } 621 num_domains++; 622 } 623 624 return num_domains; 625 } 626 627 static bool 628 bdev_malloc_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 629 { 630 struct malloc_disk *malloc_disk = ctx; 631 632 if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) { 633 return false; 634 } 635 636 switch (type) { 637 case SPDK_BDEV_IO_TYPE_READ: 638 case SPDK_BDEV_IO_TYPE_WRITE: 639 return true; 640 default: 641 return false; 642 } 643 } 644 645 static const struct spdk_bdev_fn_table malloc_fn_table = { 646 .destruct = bdev_malloc_destruct, 647 .submit_request = bdev_malloc_submit_request, 648 .io_type_supported = bdev_malloc_io_type_supported, 649 .get_io_channel = bdev_malloc_get_io_channel, 650 .write_config_json = bdev_malloc_write_json_config, 651 .get_memory_domains = bdev_malloc_get_memory_domains, 652 .accel_sequence_supported = bdev_malloc_accel_sequence_supported, 653 }; 654 655 static int 656 malloc_disk_setup_pi(struct malloc_disk *mdisk) 657 { 658 struct spdk_bdev *bdev = &mdisk->disk; 659 struct spdk_dif_ctx dif_ctx; 660 struct iovec iov, md_iov; 661 int rc; 662 struct spdk_dif_ctx_init_ext_opts dif_opts; 663 664 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 665 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 666 /* Set APPTAG|REFTAG_IGNORE to PI fields after creation of malloc bdev */ 667 rc = spdk_dif_ctx_init(&dif_ctx, 668 bdev->blocklen, 669 bdev->md_len, 670 bdev->md_interleave, 671 bdev->dif_is_head_of_md, 672 bdev->dif_type, 673 bdev->dif_check_flags, 674 SPDK_DIF_REFTAG_IGNORE, 675 0xFFFF, SPDK_DIF_APPTAG_IGNORE, 676 0, 0, &dif_opts); 677 if (rc != 0) { 678 SPDK_ERRLOG("Initialization of DIF/DIX context failed\n"); 679 return rc; 680 } 681 682 iov.iov_base = mdisk->malloc_buf; 683 iov.iov_len = bdev->blockcnt * bdev->blocklen; 684 685 if (mdisk->disk.md_interleave) { 686 rc = spdk_dif_generate(&iov, 1, bdev->blockcnt, &dif_ctx); 687 } else { 688 md_iov.iov_base = mdisk->malloc_md_buf; 689 md_iov.iov_len = bdev->blockcnt * bdev->md_len; 690 691 rc = spdk_dix_generate(&iov, 1, &md_iov, bdev->blockcnt, &dif_ctx); 692 } 693 694 if (rc != 0) { 695 SPDK_ERRLOG("Formatting by DIF/DIX failed\n"); 696 } 697 698 return rc; 699 } 700 701 int 702 create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts) 703 { 704 struct malloc_disk *mdisk; 705 uint32_t block_size; 706 int rc; 707 708 assert(opts != NULL); 709 710 if (opts->num_blocks == 0) { 711 SPDK_ERRLOG("Disk num_blocks must be greater than 0"); 712 return -EINVAL; 713 } 714 715 if (opts->block_size % 512) { 716 SPDK_ERRLOG("Data block size must be 512 bytes aligned\n"); 717 return -EINVAL; 718 } 719 720 if (opts->physical_block_size % 512) { 721 SPDK_ERRLOG("Physical block must be 512 bytes aligned\n"); 722 return -EINVAL; 723 } 724 725 switch (opts->md_size) { 726 case 0: 727 case 8: 728 case 16: 729 case 32: 730 case 64: 731 case 128: 732 break; 733 default: 734 SPDK_ERRLOG("metadata size %u is not supported\n", opts->md_size); 735 return -EINVAL; 736 } 737 738 if (opts->md_interleave) { 739 block_size = opts->block_size + opts->md_size; 740 } else { 741 block_size = opts->block_size; 742 } 743 744 if (opts->dif_type < SPDK_DIF_DISABLE || opts->dif_type > SPDK_DIF_TYPE3) { 745 SPDK_ERRLOG("DIF type is invalid\n"); 746 return -EINVAL; 747 } 748 749 if (opts->dif_type != SPDK_DIF_DISABLE && opts->md_size == 0) { 750 SPDK_ERRLOG("Metadata size should not be zero if DIF is enabled\n"); 751 return -EINVAL; 752 } 753 754 mdisk = calloc(1, sizeof(*mdisk)); 755 if (!mdisk) { 756 SPDK_ERRLOG("mdisk calloc() failed\n"); 757 return -ENOMEM; 758 } 759 760 /* 761 * Allocate the large backend memory buffer from pinned memory. 762 * 763 * TODO: need to pass a hint so we know which socket to allocate 764 * from on multi-socket systems. 765 */ 766 mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL, 767 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 768 if (!mdisk->malloc_buf) { 769 SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n"); 770 malloc_disk_free(mdisk); 771 return -ENOMEM; 772 } 773 774 if (!opts->md_interleave && opts->md_size != 0) { 775 mdisk->malloc_md_buf = spdk_zmalloc(opts->num_blocks * opts->md_size, 2 * 1024 * 1024, NULL, 776 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 777 if (!mdisk->malloc_md_buf) { 778 SPDK_ERRLOG("malloc_md_buf spdk_zmalloc() failed\n"); 779 malloc_disk_free(mdisk); 780 return -ENOMEM; 781 } 782 } 783 784 if (opts->name) { 785 mdisk->disk.name = strdup(opts->name); 786 } else { 787 /* Auto-generate a name */ 788 mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count); 789 malloc_disk_count++; 790 } 791 if (!mdisk->disk.name) { 792 malloc_disk_free(mdisk); 793 return -ENOMEM; 794 } 795 mdisk->disk.product_name = "Malloc disk"; 796 797 mdisk->disk.write_cache = 1; 798 mdisk->disk.blocklen = block_size; 799 mdisk->disk.phys_blocklen = opts->physical_block_size; 800 mdisk->disk.blockcnt = opts->num_blocks; 801 mdisk->disk.md_len = opts->md_size; 802 mdisk->disk.md_interleave = opts->md_interleave; 803 mdisk->disk.dif_type = opts->dif_type; 804 mdisk->disk.dif_is_head_of_md = opts->dif_is_head_of_md; 805 /* Current block device layer API does not propagate 806 * any DIF related information from user. So, we can 807 * not generate or verify Application Tag. 808 */ 809 switch (opts->dif_type) { 810 case SPDK_DIF_TYPE1: 811 case SPDK_DIF_TYPE2: 812 mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK | 813 SPDK_DIF_FLAGS_REFTAG_CHECK; 814 break; 815 case SPDK_DIF_TYPE3: 816 mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK; 817 break; 818 case SPDK_DIF_DISABLE: 819 break; 820 } 821 822 if (opts->dif_type != SPDK_DIF_DISABLE) { 823 rc = malloc_disk_setup_pi(mdisk); 824 if (rc) { 825 SPDK_ERRLOG("Failed to set up protection information.\n"); 826 malloc_disk_free(mdisk); 827 return rc; 828 } 829 } 830 831 if (opts->optimal_io_boundary) { 832 mdisk->disk.optimal_io_boundary = opts->optimal_io_boundary; 833 mdisk->disk.split_on_optimal_io_boundary = true; 834 } 835 if (!spdk_uuid_is_null(&opts->uuid)) { 836 spdk_uuid_copy(&mdisk->disk.uuid, &opts->uuid); 837 } 838 839 mdisk->disk.max_copy = 0; 840 mdisk->disk.ctxt = mdisk; 841 mdisk->disk.fn_table = &malloc_fn_table; 842 mdisk->disk.module = &malloc_if; 843 844 rc = spdk_bdev_register(&mdisk->disk); 845 if (rc) { 846 malloc_disk_free(mdisk); 847 return rc; 848 } 849 850 *bdev = &(mdisk->disk); 851 852 TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link); 853 854 return rc; 855 } 856 857 void 858 delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg) 859 { 860 int rc; 861 862 rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg); 863 if (rc != 0) { 864 cb_fn(cb_arg, rc); 865 } 866 } 867 868 static int 869 malloc_completion_poller(void *ctx) 870 { 871 struct malloc_channel *ch = ctx; 872 struct malloc_task *task; 873 TAILQ_HEAD(, malloc_task) completed_tasks; 874 uint32_t num_completions = 0; 875 876 TAILQ_INIT(&completed_tasks); 877 TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq); 878 879 while (!TAILQ_EMPTY(&completed_tasks)) { 880 task = TAILQ_FIRST(&completed_tasks); 881 TAILQ_REMOVE(&completed_tasks, task, tailq); 882 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); 883 num_completions++; 884 } 885 886 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 887 } 888 889 static int 890 malloc_create_channel_cb(void *io_device, void *ctx) 891 { 892 struct malloc_channel *ch = ctx; 893 894 ch->accel_channel = spdk_accel_get_io_channel(); 895 if (!ch->accel_channel) { 896 SPDK_ERRLOG("Failed to get accel framework's IO channel\n"); 897 return -ENOMEM; 898 } 899 900 ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0); 901 if (!ch->completion_poller) { 902 SPDK_ERRLOG("Failed to register malloc completion poller\n"); 903 spdk_put_io_channel(ch->accel_channel); 904 return -ENOMEM; 905 } 906 907 TAILQ_INIT(&ch->completed_tasks); 908 909 return 0; 910 } 911 912 static void 913 malloc_destroy_channel_cb(void *io_device, void *ctx) 914 { 915 struct malloc_channel *ch = ctx; 916 917 assert(TAILQ_EMPTY(&ch->completed_tasks)); 918 919 spdk_put_io_channel(ch->accel_channel); 920 spdk_poller_unregister(&ch->completion_poller); 921 } 922 923 static int 924 bdev_malloc_initialize(void) 925 { 926 /* This needs to be reset for each reinitialization of submodules. 927 * Otherwise after enough devices or reinitializations the value gets too high. 928 * TODO: Make malloc bdev name mandatory and remove this counter. */ 929 malloc_disk_count = 0; 930 931 spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb, 932 malloc_destroy_channel_cb, sizeof(struct malloc_channel), 933 "bdev_malloc"); 934 935 return 0; 936 } 937 938 static void 939 bdev_malloc_deinitialize(void) 940 { 941 spdk_io_device_unregister(&g_malloc_disks, NULL); 942 } 943 944 SPDK_LOG_REGISTER_COMPONENT(bdev_malloc) 945