1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "bdev_malloc.h" 10 #include "spdk/endian.h" 11 #include "spdk/env.h" 12 #include "spdk/accel.h" 13 #include "spdk/dma.h" 14 #include "spdk/likely.h" 15 #include "spdk/string.h" 16 17 #include "spdk/log.h" 18 19 struct malloc_disk { 20 struct spdk_bdev disk; 21 void *malloc_buf; 22 void *malloc_md_buf; 23 TAILQ_ENTRY(malloc_disk) link; 24 }; 25 26 struct malloc_task { 27 struct iovec iov; 28 int num_outstanding; 29 enum spdk_bdev_io_status status; 30 TAILQ_ENTRY(malloc_task) tailq; 31 }; 32 33 struct malloc_channel { 34 struct spdk_io_channel *accel_channel; 35 struct spdk_poller *completion_poller; 36 TAILQ_HEAD(, malloc_task) completed_tasks; 37 }; 38 39 static int 40 malloc_verify_pi(struct spdk_bdev_io *bdev_io) 41 { 42 struct spdk_bdev *bdev = bdev_io->bdev; 43 struct spdk_dif_ctx dif_ctx; 44 struct spdk_dif_error err_blk; 45 int rc; 46 struct spdk_dif_ctx_init_ext_opts dif_opts; 47 48 assert(bdev_io->u.bdev.memory_domain == NULL); 49 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 50 dif_opts.dif_pi_format = bdev->dif_pi_format; 51 rc = spdk_dif_ctx_init(&dif_ctx, 52 bdev->blocklen, 53 bdev->md_len, 54 bdev->md_interleave, 55 bdev->dif_is_head_of_md, 56 bdev->dif_type, 57 bdev_io->u.bdev.dif_check_flags, 58 bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF, 59 0xFFFF, 0, 0, 0, &dif_opts); 60 if (rc != 0) { 61 SPDK_ERRLOG("Failed to initialize DIF/DIX context\n"); 62 return rc; 63 } 64 65 if (spdk_bdev_is_md_interleaved(bdev)) { 66 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, 67 bdev_io->u.bdev.iovcnt, 68 bdev_io->u.bdev.num_blocks, 69 &dif_ctx, 70 &err_blk); 71 } else { 72 struct iovec md_iov = { 73 .iov_base = bdev_io->u.bdev.md_buf, 74 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 75 }; 76 77 if (bdev_io->u.bdev.md_buf == NULL) { 78 return 0; 79 } 80 81 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, 82 bdev_io->u.bdev.iovcnt, 83 &md_iov, 84 bdev_io->u.bdev.num_blocks, 85 &dif_ctx, 86 &err_blk); 87 } 88 89 if (rc != 0) { 90 SPDK_ERRLOG("DIF/DIX verify failed: lba %" PRIu64 ", num_blocks %" PRIu64 ", " 91 "err_type %u, expected %lu, actual %lu, err_offset %u\n", 92 bdev_io->u.bdev.offset_blocks, 93 bdev_io->u.bdev.num_blocks, 94 err_blk.err_type, 95 err_blk.expected, 96 err_blk.actual, 97 err_blk.err_offset); 98 } 99 100 return rc; 101 } 102 103 static int 104 malloc_unmap_write_zeroes_generate_pi(struct spdk_bdev_io *bdev_io) 105 { 106 struct spdk_bdev *bdev = bdev_io->bdev; 107 struct malloc_disk *mdisk = bdev_io->bdev->ctxt; 108 uint32_t block_size = bdev_io->bdev->blocklen; 109 struct spdk_dif_ctx dif_ctx; 110 struct spdk_dif_ctx_init_ext_opts dif_opts; 111 int rc; 112 113 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 114 dif_opts.dif_pi_format = bdev->dif_pi_format; 115 rc = spdk_dif_ctx_init(&dif_ctx, 116 bdev->blocklen, 117 bdev->md_len, 118 bdev->md_interleave, 119 bdev->dif_is_head_of_md, 120 bdev->dif_type, 121 bdev->dif_check_flags, 122 SPDK_DIF_REFTAG_IGNORE, 123 0xFFFF, SPDK_DIF_APPTAG_IGNORE, 124 0, 0, &dif_opts); 125 if (rc != 0) { 126 SPDK_ERRLOG("Initialization of DIF/DIX context failed\n"); 127 return rc; 128 } 129 130 if (bdev->md_interleave) { 131 struct iovec iov = { 132 .iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size, 133 .iov_len = bdev_io->u.bdev.num_blocks * block_size, 134 }; 135 136 rc = spdk_dif_generate(&iov, 1, bdev_io->u.bdev.num_blocks, &dif_ctx); 137 } else { 138 struct iovec iov = { 139 .iov_base = mdisk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size, 140 .iov_len = bdev_io->u.bdev.num_blocks * block_size, 141 }; 142 143 struct iovec md_iov = { 144 .iov_base = mdisk->malloc_md_buf + bdev_io->u.bdev.offset_blocks * bdev->md_len, 145 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 146 }; 147 148 rc = spdk_dix_generate(&iov, 1, &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx); 149 } 150 151 if (rc != 0) { 152 SPDK_ERRLOG("Formatting by DIF/DIX failed\n"); 153 } 154 155 156 return rc; 157 } 158 159 static void 160 malloc_done(void *ref, int status) 161 { 162 struct malloc_task *task = (struct malloc_task *)ref; 163 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task); 164 int rc; 165 166 if (status != 0) { 167 if (status == -ENOMEM) { 168 if (task->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 169 task->status = SPDK_BDEV_IO_STATUS_NOMEM; 170 } 171 } else { 172 task->status = SPDK_BDEV_IO_STATUS_FAILED; 173 } 174 } 175 176 if (--task->num_outstanding != 0) { 177 return; 178 } 179 180 if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE && 181 bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 182 task->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 183 rc = malloc_verify_pi(bdev_io); 184 if (rc != 0) { 185 task->status = SPDK_BDEV_IO_STATUS_FAILED; 186 } 187 } 188 189 if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE && 190 (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP || bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 191 task->status == SPDK_BDEV_IO_STATUS_SUCCESS) { 192 rc = malloc_unmap_write_zeroes_generate_pi(bdev_io); 193 if (rc != 0) { 194 task->status = SPDK_BDEV_IO_STATUS_FAILED; 195 } 196 } 197 198 assert(!bdev_io->u.bdev.accel_sequence || task->status == SPDK_BDEV_IO_STATUS_NOMEM); 199 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); 200 } 201 202 static void 203 malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch, 204 enum spdk_bdev_io_status status) 205 { 206 task->status = status; 207 TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq); 208 } 209 210 static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks); 211 212 int malloc_disk_count = 0; 213 214 static int bdev_malloc_initialize(void); 215 static void bdev_malloc_deinitialize(void); 216 217 static int 218 bdev_malloc_get_ctx_size(void) 219 { 220 return sizeof(struct malloc_task); 221 } 222 223 static struct spdk_bdev_module malloc_if = { 224 .name = "malloc", 225 .module_init = bdev_malloc_initialize, 226 .module_fini = bdev_malloc_deinitialize, 227 .get_ctx_size = bdev_malloc_get_ctx_size, 228 229 }; 230 231 SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if) 232 233 static void 234 malloc_disk_free(struct malloc_disk *malloc_disk) 235 { 236 if (!malloc_disk) { 237 return; 238 } 239 240 free(malloc_disk->disk.name); 241 spdk_free(malloc_disk->malloc_buf); 242 spdk_free(malloc_disk->malloc_md_buf); 243 free(malloc_disk); 244 } 245 246 static int 247 bdev_malloc_destruct(void *ctx) 248 { 249 struct malloc_disk *malloc_disk = ctx; 250 251 TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link); 252 malloc_disk_free(malloc_disk); 253 return 0; 254 } 255 256 static int 257 bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes) 258 { 259 int i; 260 261 for (i = 0; i < iovcnt; i++) { 262 if (nbytes < iovs[i].iov_len) { 263 return 0; 264 } 265 266 nbytes -= iovs[i].iov_len; 267 } 268 269 return nbytes != 0; 270 } 271 272 static size_t 273 malloc_get_md_len(struct spdk_bdev_io *bdev_io) 274 { 275 return bdev_io->u.bdev.num_blocks * bdev_io->bdev->md_len; 276 } 277 278 static uint64_t 279 malloc_get_md_offset(struct spdk_bdev_io *bdev_io) 280 { 281 return bdev_io->u.bdev.offset_blocks * bdev_io->bdev->md_len; 282 } 283 284 static void * 285 malloc_get_md_buf(struct spdk_bdev_io *bdev_io) 286 { 287 struct malloc_disk *mdisk = SPDK_CONTAINEROF(bdev_io->bdev, struct malloc_disk, disk); 288 289 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 290 291 return (char *)mdisk->malloc_md_buf + malloc_get_md_offset(bdev_io); 292 } 293 294 static void 295 malloc_sequence_fail(struct malloc_task *task, int status) 296 { 297 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task); 298 299 /* For ENOMEM, the IO will be retried by the bdev layer, so we don't abort the sequence */ 300 if (status != -ENOMEM) { 301 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 302 bdev_io->u.bdev.accel_sequence = NULL; 303 } 304 305 malloc_done(task, status); 306 } 307 308 static void 309 malloc_sequence_done(void *ctx, int status) 310 { 311 struct malloc_task *task = ctx; 312 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task); 313 314 bdev_io->u.bdev.accel_sequence = NULL; 315 /* Prevent bdev layer from retrying the request if the sequence failed with ENOMEM */ 316 malloc_done(task, status != -ENOMEM ? status : -EFAULT); 317 } 318 319 static void 320 bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 321 struct malloc_task *task, struct spdk_bdev_io *bdev_io) 322 { 323 uint64_t len, offset; 324 int res = 0; 325 326 len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 327 offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; 328 329 if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) { 330 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), 331 SPDK_BDEV_IO_STATUS_FAILED); 332 return; 333 } 334 335 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 336 task->num_outstanding = 0; 337 task->iov.iov_base = mdisk->malloc_buf + offset; 338 task->iov.iov_len = len; 339 340 SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n", 341 len, offset, bdev_io->u.bdev.iovcnt); 342 343 task->num_outstanding++; 344 res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch, 345 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 346 bdev_io->u.bdev.memory_domain, 347 bdev_io->u.bdev.memory_domain_ctx, 348 &task->iov, 1, NULL, NULL, NULL, NULL); 349 if (spdk_unlikely(res != 0)) { 350 malloc_sequence_fail(task, res); 351 return; 352 } 353 354 spdk_accel_sequence_reverse(bdev_io->u.bdev.accel_sequence); 355 spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task); 356 357 if (bdev_io->u.bdev.md_buf == NULL) { 358 return; 359 } 360 361 SPDK_DEBUGLOG(bdev_malloc, "read metadata %zu bytes from offset%#" PRIx64 "\n", 362 malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io)); 363 364 task->num_outstanding++; 365 res = spdk_accel_submit_copy(ch, bdev_io->u.bdev.md_buf, malloc_get_md_buf(bdev_io), 366 malloc_get_md_len(bdev_io), malloc_done, task); 367 if (res != 0) { 368 malloc_done(task, res); 369 } 370 } 371 372 static void 373 bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 374 struct malloc_task *task, struct spdk_bdev_io *bdev_io) 375 { 376 uint64_t len, offset; 377 int res = 0; 378 379 len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 380 offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; 381 382 if (bdev_malloc_check_iov_len(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, len)) { 383 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), 384 SPDK_BDEV_IO_STATUS_FAILED); 385 return; 386 } 387 388 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 389 task->num_outstanding = 0; 390 task->iov.iov_base = mdisk->malloc_buf + offset; 391 task->iov.iov_len = len; 392 393 SPDK_DEBUGLOG(bdev_malloc, "write %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n", 394 len, offset, bdev_io->u.bdev.iovcnt); 395 396 task->num_outstanding++; 397 res = spdk_accel_append_copy(&bdev_io->u.bdev.accel_sequence, ch, &task->iov, 1, NULL, NULL, 398 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 399 bdev_io->u.bdev.memory_domain, 400 bdev_io->u.bdev.memory_domain_ctx, NULL, NULL); 401 if (spdk_unlikely(res != 0)) { 402 malloc_sequence_fail(task, res); 403 return; 404 } 405 406 spdk_accel_sequence_finish(bdev_io->u.bdev.accel_sequence, malloc_sequence_done, task); 407 408 if (bdev_io->u.bdev.md_buf == NULL) { 409 return; 410 } 411 412 SPDK_DEBUGLOG(bdev_malloc, "write metadata %zu bytes to offset %#" PRIx64 "\n", 413 malloc_get_md_len(bdev_io), malloc_get_md_offset(bdev_io)); 414 415 task->num_outstanding++; 416 res = spdk_accel_submit_copy(ch, malloc_get_md_buf(bdev_io), bdev_io->u.bdev.md_buf, 417 malloc_get_md_len(bdev_io), malloc_done, task); 418 if (res != 0) { 419 malloc_done(task, res); 420 } 421 } 422 423 static int 424 bdev_malloc_unmap(struct malloc_disk *mdisk, 425 struct spdk_io_channel *ch, 426 struct malloc_task *task, 427 uint64_t offset, 428 uint64_t byte_count) 429 { 430 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 431 task->num_outstanding = 1; 432 433 return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0, 434 byte_count, malloc_done, task); 435 } 436 437 static void 438 bdev_malloc_copy(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 439 struct malloc_task *task, 440 uint64_t dst_offset, uint64_t src_offset, size_t len) 441 { 442 int64_t res = 0; 443 void *dst = mdisk->malloc_buf + dst_offset; 444 void *src = mdisk->malloc_buf + src_offset; 445 446 SPDK_DEBUGLOG(bdev_malloc, "Copy %zu bytes from offset %#" PRIx64 " to offset %#" PRIx64 "\n", 447 len, src_offset, dst_offset); 448 449 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 450 task->num_outstanding = 1; 451 452 res = spdk_accel_submit_copy(ch, dst, src, len, malloc_done, task); 453 if (res != 0) { 454 malloc_done(task, res); 455 } 456 } 457 458 static int 459 _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io) 460 { 461 struct malloc_task *task = (struct malloc_task *)bdev_io->driver_ctx; 462 struct malloc_disk *disk = bdev_io->bdev->ctxt; 463 uint32_t block_size = bdev_io->bdev->blocklen; 464 int rc; 465 466 switch (bdev_io->type) { 467 case SPDK_BDEV_IO_TYPE_READ: 468 if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { 469 assert(bdev_io->u.bdev.iovcnt == 1); 470 assert(bdev_io->u.bdev.memory_domain == NULL); 471 bdev_io->u.bdev.iovs[0].iov_base = 472 disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size; 473 bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size; 474 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 475 spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io), 476 malloc_get_md_len(bdev_io)); 477 } 478 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS); 479 return 0; 480 } 481 482 bdev_malloc_readv(disk, mch->accel_channel, task, bdev_io); 483 return 0; 484 485 case SPDK_BDEV_IO_TYPE_WRITE: 486 if (bdev_io->bdev->dif_type != SPDK_DIF_DISABLE) { 487 rc = malloc_verify_pi(bdev_io); 488 if (rc != 0) { 489 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED); 490 return 0; 491 } 492 } 493 494 bdev_malloc_writev(disk, mch->accel_channel, task, bdev_io); 495 return 0; 496 497 case SPDK_BDEV_IO_TYPE_RESET: 498 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS); 499 return 0; 500 501 case SPDK_BDEV_IO_TYPE_FLUSH: 502 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS); 503 return 0; 504 505 case SPDK_BDEV_IO_TYPE_UNMAP: 506 return bdev_malloc_unmap(disk, mch->accel_channel, task, 507 bdev_io->u.bdev.offset_blocks * block_size, 508 bdev_io->u.bdev.num_blocks * block_size); 509 510 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 511 /* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which zeroes out all of the requested bytes. */ 512 return bdev_malloc_unmap(disk, mch->accel_channel, task, 513 bdev_io->u.bdev.offset_blocks * block_size, 514 bdev_io->u.bdev.num_blocks * block_size); 515 516 case SPDK_BDEV_IO_TYPE_ZCOPY: 517 if (bdev_io->u.bdev.zcopy.start) { 518 void *buf; 519 size_t len; 520 521 buf = disk->malloc_buf + bdev_io->u.bdev.offset_blocks * block_size; 522 len = bdev_io->u.bdev.num_blocks * block_size; 523 spdk_bdev_io_set_buf(bdev_io, buf, len); 524 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 525 spdk_bdev_io_set_md_buf(bdev_io, malloc_get_md_buf(bdev_io), 526 malloc_get_md_len(bdev_io)); 527 } 528 } 529 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_SUCCESS); 530 return 0; 531 case SPDK_BDEV_IO_TYPE_ABORT: 532 malloc_complete_task(task, mch, SPDK_BDEV_IO_STATUS_FAILED); 533 return 0; 534 case SPDK_BDEV_IO_TYPE_COPY: 535 bdev_malloc_copy(disk, mch->accel_channel, task, 536 bdev_io->u.bdev.offset_blocks * block_size, 537 bdev_io->u.bdev.copy.src_offset_blocks * block_size, 538 bdev_io->u.bdev.num_blocks * block_size); 539 return 0; 540 541 default: 542 return -1; 543 } 544 return 0; 545 } 546 547 static void 548 bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 549 { 550 struct malloc_channel *mch = spdk_io_channel_get_ctx(ch); 551 552 if (_bdev_malloc_submit_request(mch, bdev_io) != 0) { 553 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 554 SPDK_BDEV_IO_STATUS_FAILED); 555 } 556 } 557 558 static bool 559 bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 560 { 561 switch (io_type) { 562 case SPDK_BDEV_IO_TYPE_READ: 563 case SPDK_BDEV_IO_TYPE_WRITE: 564 case SPDK_BDEV_IO_TYPE_FLUSH: 565 case SPDK_BDEV_IO_TYPE_RESET: 566 case SPDK_BDEV_IO_TYPE_UNMAP: 567 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 568 case SPDK_BDEV_IO_TYPE_ZCOPY: 569 case SPDK_BDEV_IO_TYPE_ABORT: 570 case SPDK_BDEV_IO_TYPE_COPY: 571 return true; 572 573 default: 574 return false; 575 } 576 } 577 578 static struct spdk_io_channel * 579 bdev_malloc_get_io_channel(void *ctx) 580 { 581 return spdk_get_io_channel(&g_malloc_disks); 582 } 583 584 static void 585 bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 586 { 587 spdk_json_write_object_begin(w); 588 589 spdk_json_write_named_string(w, "method", "bdev_malloc_create"); 590 591 spdk_json_write_named_object_begin(w, "params"); 592 spdk_json_write_named_string(w, "name", bdev->name); 593 spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); 594 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 595 spdk_json_write_named_uint32(w, "physical_block_size", bdev->phys_blocklen); 596 spdk_json_write_named_uuid(w, "uuid", &bdev->uuid); 597 spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary); 598 spdk_json_write_named_uint32(w, "md_size", bdev->md_len); 599 spdk_json_write_named_uint32(w, "dif_type", bdev->dif_type); 600 spdk_json_write_named_bool(w, "dif_is_head_of_md", bdev->dif_is_head_of_md); 601 spdk_json_write_named_uint32(w, "dif_pi_format", bdev->dif_pi_format); 602 603 spdk_json_write_object_end(w); 604 605 spdk_json_write_object_end(w); 606 } 607 608 static int 609 bdev_malloc_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 610 { 611 struct malloc_disk *malloc_disk = ctx; 612 struct spdk_memory_domain *domain; 613 int num_domains = 0; 614 615 if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) { 616 return 0; 617 } 618 619 /* Report support for every memory domain */ 620 for (domain = spdk_memory_domain_get_first(NULL); domain != NULL; 621 domain = spdk_memory_domain_get_next(domain, NULL)) { 622 if (domains != NULL && num_domains < array_size) { 623 domains[num_domains] = domain; 624 } 625 num_domains++; 626 } 627 628 return num_domains; 629 } 630 631 static bool 632 bdev_malloc_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 633 { 634 struct malloc_disk *malloc_disk = ctx; 635 636 if (malloc_disk->disk.dif_type != SPDK_DIF_DISABLE) { 637 return false; 638 } 639 640 switch (type) { 641 case SPDK_BDEV_IO_TYPE_READ: 642 case SPDK_BDEV_IO_TYPE_WRITE: 643 return true; 644 default: 645 return false; 646 } 647 } 648 649 static const struct spdk_bdev_fn_table malloc_fn_table = { 650 .destruct = bdev_malloc_destruct, 651 .submit_request = bdev_malloc_submit_request, 652 .io_type_supported = bdev_malloc_io_type_supported, 653 .get_io_channel = bdev_malloc_get_io_channel, 654 .write_config_json = bdev_malloc_write_json_config, 655 .get_memory_domains = bdev_malloc_get_memory_domains, 656 .accel_sequence_supported = bdev_malloc_accel_sequence_supported, 657 }; 658 659 static int 660 malloc_disk_setup_pi(struct malloc_disk *mdisk) 661 { 662 struct spdk_bdev *bdev = &mdisk->disk; 663 struct spdk_dif_ctx dif_ctx; 664 struct iovec iov, md_iov; 665 int rc; 666 struct spdk_dif_ctx_init_ext_opts dif_opts; 667 668 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 669 dif_opts.dif_pi_format = bdev->dif_pi_format; 670 /* Set APPTAG|REFTAG_IGNORE to PI fields after creation of malloc bdev */ 671 rc = spdk_dif_ctx_init(&dif_ctx, 672 bdev->blocklen, 673 bdev->md_len, 674 bdev->md_interleave, 675 bdev->dif_is_head_of_md, 676 bdev->dif_type, 677 bdev->dif_check_flags, 678 SPDK_DIF_REFTAG_IGNORE, 679 0xFFFF, SPDK_DIF_APPTAG_IGNORE, 680 0, 0, &dif_opts); 681 if (rc != 0) { 682 SPDK_ERRLOG("Initialization of DIF/DIX context failed\n"); 683 return rc; 684 } 685 686 iov.iov_base = mdisk->malloc_buf; 687 iov.iov_len = bdev->blockcnt * bdev->blocklen; 688 689 if (mdisk->disk.md_interleave) { 690 rc = spdk_dif_generate(&iov, 1, bdev->blockcnt, &dif_ctx); 691 } else { 692 md_iov.iov_base = mdisk->malloc_md_buf; 693 md_iov.iov_len = bdev->blockcnt * bdev->md_len; 694 695 rc = spdk_dix_generate(&iov, 1, &md_iov, bdev->blockcnt, &dif_ctx); 696 } 697 698 if (rc != 0) { 699 SPDK_ERRLOG("Formatting by DIF/DIX failed\n"); 700 } 701 702 return rc; 703 } 704 705 int 706 create_malloc_disk(struct spdk_bdev **bdev, const struct malloc_bdev_opts *opts) 707 { 708 struct malloc_disk *mdisk; 709 uint32_t block_size; 710 int rc; 711 712 assert(opts != NULL); 713 714 if (opts->num_blocks == 0) { 715 SPDK_ERRLOG("Disk num_blocks must be greater than 0"); 716 return -EINVAL; 717 } 718 719 if (opts->block_size % 512) { 720 SPDK_ERRLOG("Data block size must be 512 bytes aligned\n"); 721 return -EINVAL; 722 } 723 724 if (opts->physical_block_size % 512) { 725 SPDK_ERRLOG("Physical block must be 512 bytes aligned\n"); 726 return -EINVAL; 727 } 728 729 switch (opts->md_size) { 730 case 0: 731 case 8: 732 case 16: 733 case 32: 734 case 64: 735 case 128: 736 break; 737 default: 738 SPDK_ERRLOG("metadata size %u is not supported\n", opts->md_size); 739 return -EINVAL; 740 } 741 742 if (opts->md_interleave) { 743 block_size = opts->block_size + opts->md_size; 744 } else { 745 block_size = opts->block_size; 746 } 747 748 mdisk = calloc(1, sizeof(*mdisk)); 749 if (!mdisk) { 750 SPDK_ERRLOG("mdisk calloc() failed\n"); 751 return -ENOMEM; 752 } 753 754 /* 755 * Allocate the large backend memory buffer from pinned memory. 756 * 757 * TODO: need to pass a hint so we know which socket to allocate 758 * from on multi-socket systems. 759 */ 760 mdisk->malloc_buf = spdk_zmalloc(opts->num_blocks * block_size, 2 * 1024 * 1024, NULL, 761 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 762 if (!mdisk->malloc_buf) { 763 SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n"); 764 malloc_disk_free(mdisk); 765 return -ENOMEM; 766 } 767 768 if (!opts->md_interleave && opts->md_size != 0) { 769 mdisk->malloc_md_buf = spdk_zmalloc(opts->num_blocks * opts->md_size, 2 * 1024 * 1024, NULL, 770 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 771 if (!mdisk->malloc_md_buf) { 772 SPDK_ERRLOG("malloc_md_buf spdk_zmalloc() failed\n"); 773 malloc_disk_free(mdisk); 774 return -ENOMEM; 775 } 776 } 777 778 if (opts->name) { 779 mdisk->disk.name = strdup(opts->name); 780 } else { 781 /* Auto-generate a name */ 782 mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count); 783 malloc_disk_count++; 784 } 785 if (!mdisk->disk.name) { 786 malloc_disk_free(mdisk); 787 return -ENOMEM; 788 } 789 mdisk->disk.product_name = "Malloc disk"; 790 791 mdisk->disk.write_cache = 1; 792 mdisk->disk.blocklen = block_size; 793 mdisk->disk.phys_blocklen = opts->physical_block_size; 794 mdisk->disk.blockcnt = opts->num_blocks; 795 mdisk->disk.md_len = opts->md_size; 796 mdisk->disk.md_interleave = opts->md_interleave; 797 mdisk->disk.dif_type = opts->dif_type; 798 mdisk->disk.dif_is_head_of_md = opts->dif_is_head_of_md; 799 /* Current block device layer API does not propagate 800 * any DIF related information from user. So, we can 801 * not generate or verify Application Tag. 802 */ 803 switch (opts->dif_type) { 804 case SPDK_DIF_TYPE1: 805 case SPDK_DIF_TYPE2: 806 mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK | 807 SPDK_DIF_FLAGS_REFTAG_CHECK; 808 break; 809 case SPDK_DIF_TYPE3: 810 mdisk->disk.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK; 811 break; 812 case SPDK_DIF_DISABLE: 813 break; 814 } 815 mdisk->disk.dif_pi_format = opts->dif_pi_format; 816 817 if (opts->dif_type != SPDK_DIF_DISABLE) { 818 rc = malloc_disk_setup_pi(mdisk); 819 if (rc) { 820 SPDK_ERRLOG("Failed to set up protection information.\n"); 821 malloc_disk_free(mdisk); 822 return rc; 823 } 824 } 825 826 if (opts->optimal_io_boundary) { 827 mdisk->disk.optimal_io_boundary = opts->optimal_io_boundary; 828 mdisk->disk.split_on_optimal_io_boundary = true; 829 } 830 if (!spdk_uuid_is_null(&opts->uuid)) { 831 spdk_uuid_copy(&mdisk->disk.uuid, &opts->uuid); 832 } 833 834 mdisk->disk.max_copy = 0; 835 mdisk->disk.ctxt = mdisk; 836 mdisk->disk.fn_table = &malloc_fn_table; 837 mdisk->disk.module = &malloc_if; 838 839 rc = spdk_bdev_register(&mdisk->disk); 840 if (rc) { 841 malloc_disk_free(mdisk); 842 return rc; 843 } 844 845 *bdev = &(mdisk->disk); 846 847 TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link); 848 849 return rc; 850 } 851 852 void 853 delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg) 854 { 855 int rc; 856 857 rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg); 858 if (rc != 0) { 859 cb_fn(cb_arg, rc); 860 } 861 } 862 863 static int 864 malloc_completion_poller(void *ctx) 865 { 866 struct malloc_channel *ch = ctx; 867 struct malloc_task *task; 868 TAILQ_HEAD(, malloc_task) completed_tasks; 869 uint32_t num_completions = 0; 870 871 TAILQ_INIT(&completed_tasks); 872 TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq); 873 874 while (!TAILQ_EMPTY(&completed_tasks)) { 875 task = TAILQ_FIRST(&completed_tasks); 876 TAILQ_REMOVE(&completed_tasks, task, tailq); 877 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); 878 num_completions++; 879 } 880 881 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 882 } 883 884 static int 885 malloc_create_channel_cb(void *io_device, void *ctx) 886 { 887 struct malloc_channel *ch = ctx; 888 889 ch->accel_channel = spdk_accel_get_io_channel(); 890 if (!ch->accel_channel) { 891 SPDK_ERRLOG("Failed to get accel framework's IO channel\n"); 892 return -ENOMEM; 893 } 894 895 ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0); 896 if (!ch->completion_poller) { 897 SPDK_ERRLOG("Failed to register malloc completion poller\n"); 898 spdk_put_io_channel(ch->accel_channel); 899 return -ENOMEM; 900 } 901 902 TAILQ_INIT(&ch->completed_tasks); 903 904 return 0; 905 } 906 907 static void 908 malloc_destroy_channel_cb(void *io_device, void *ctx) 909 { 910 struct malloc_channel *ch = ctx; 911 912 assert(TAILQ_EMPTY(&ch->completed_tasks)); 913 914 spdk_put_io_channel(ch->accel_channel); 915 spdk_poller_unregister(&ch->completion_poller); 916 } 917 918 static int 919 bdev_malloc_initialize(void) 920 { 921 /* This needs to be reset for each reinitialization of submodules. 922 * Otherwise after enough devices or reinitializations the value gets too high. 923 * TODO: Make malloc bdev name mandatory and remove this counter. */ 924 malloc_disk_count = 0; 925 926 spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb, 927 malloc_destroy_channel_cb, sizeof(struct malloc_channel), 928 "bdev_malloc"); 929 930 return 0; 931 } 932 933 static void 934 bdev_malloc_deinitialize(void) 935 { 936 spdk_io_device_unregister(&g_malloc_disks, NULL); 937 } 938 939 SPDK_LOG_REGISTER_COMPONENT(bdev_malloc) 940