1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * * Neither the name of Intel Corporation nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include "spdk/stdinc.h" 36 37 #include "bdev_malloc.h" 38 #include "spdk/bdev.h" 39 #include "spdk/endian.h" 40 #include "spdk/env.h" 41 #include "spdk/accel_engine.h" 42 #include "spdk/json.h" 43 #include "spdk/thread.h" 44 #include "spdk/queue.h" 45 #include "spdk/string.h" 46 47 #include "spdk/bdev_module.h" 48 #include "spdk/log.h" 49 50 struct malloc_disk { 51 struct spdk_bdev disk; 52 void *malloc_buf; 53 TAILQ_ENTRY(malloc_disk) link; 54 }; 55 56 struct malloc_task { 57 int num_outstanding; 58 enum spdk_bdev_io_status status; 59 TAILQ_ENTRY(malloc_task) tailq; 60 }; 61 62 struct malloc_channel { 63 struct spdk_io_channel *accel_channel; 64 struct spdk_poller *completion_poller; 65 TAILQ_HEAD(, malloc_task) completed_tasks; 66 }; 67 68 static void 69 malloc_done(void *ref, int status) 70 { 71 struct malloc_task *task = (struct malloc_task *)ref; 72 73 if (status != 0) { 74 if (status == -ENOMEM) { 75 task->status = SPDK_BDEV_IO_STATUS_NOMEM; 76 } else { 77 task->status = SPDK_BDEV_IO_STATUS_FAILED; 78 } 79 } 80 81 if (--task->num_outstanding == 0) { 82 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); 83 } 84 } 85 86 static void 87 malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch, 88 enum spdk_bdev_io_status status) 89 { 90 task->status = status; 91 TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq); 92 } 93 94 static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks); 95 96 int malloc_disk_count = 0; 97 98 static int bdev_malloc_initialize(void); 99 static void bdev_malloc_deinitialize(void); 100 101 static int 102 bdev_malloc_get_ctx_size(void) 103 { 104 return sizeof(struct malloc_task); 105 } 106 107 static struct spdk_bdev_module malloc_if = { 108 .name = "malloc", 109 .module_init = bdev_malloc_initialize, 110 .module_fini = bdev_malloc_deinitialize, 111 .get_ctx_size = bdev_malloc_get_ctx_size, 112 113 }; 114 115 SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if) 116 117 static void 118 malloc_disk_free(struct malloc_disk *malloc_disk) 119 { 120 if (!malloc_disk) { 121 return; 122 } 123 124 free(malloc_disk->disk.name); 125 spdk_free(malloc_disk->malloc_buf); 126 free(malloc_disk); 127 } 128 129 static int 130 bdev_malloc_destruct(void *ctx) 131 { 132 struct malloc_disk *malloc_disk = ctx; 133 134 TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link); 135 malloc_disk_free(malloc_disk); 136 return 0; 137 } 138 139 static int 140 bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes) 141 { 142 int i; 143 144 for (i = 0; i < iovcnt; i++) { 145 if (nbytes < iovs[i].iov_len) { 146 return 0; 147 } 148 149 nbytes -= iovs[i].iov_len; 150 } 151 152 return nbytes != 0; 153 } 154 155 static void 156 bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 157 struct malloc_task *task, 158 struct iovec *iov, int iovcnt, size_t len, uint64_t offset) 159 { 160 int64_t res = 0; 161 void *src = mdisk->malloc_buf + offset; 162 int i; 163 164 if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { 165 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), 166 SPDK_BDEV_IO_STATUS_FAILED); 167 return; 168 } 169 170 SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n", 171 len, offset, iovcnt); 172 173 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 174 task->num_outstanding = 0; 175 176 for (i = 0; i < iovcnt; i++) { 177 task->num_outstanding++; 178 res = spdk_accel_submit_copy(ch, iov[i].iov_base, 179 src, iov[i].iov_len, 0, malloc_done, task); 180 181 if (res != 0) { 182 malloc_done(task, res); 183 break; 184 } 185 186 src += iov[i].iov_len; 187 len -= iov[i].iov_len; 188 } 189 } 190 191 static void 192 bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 193 struct malloc_task *task, 194 struct iovec *iov, int iovcnt, size_t len, uint64_t offset) 195 { 196 int64_t res = 0; 197 void *dst = mdisk->malloc_buf + offset; 198 int i; 199 200 if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { 201 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), 202 SPDK_BDEV_IO_STATUS_FAILED); 203 return; 204 } 205 206 SPDK_DEBUGLOG(bdev_malloc, "wrote %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n", 207 len, offset, iovcnt); 208 209 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 210 task->num_outstanding = 0; 211 212 for (i = 0; i < iovcnt; i++) { 213 task->num_outstanding++; 214 res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base, 215 iov[i].iov_len, 0, malloc_done, task); 216 217 if (res != 0) { 218 malloc_done(task, res); 219 break; 220 } 221 222 dst += iov[i].iov_len; 223 } 224 } 225 226 static int 227 bdev_malloc_unmap(struct malloc_disk *mdisk, 228 struct spdk_io_channel *ch, 229 struct malloc_task *task, 230 uint64_t offset, 231 uint64_t byte_count) 232 { 233 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 234 task->num_outstanding = 1; 235 236 return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0, 237 byte_count, 0, malloc_done, task); 238 } 239 240 static int _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io) 241 { 242 uint32_t block_size = bdev_io->bdev->blocklen; 243 244 switch (bdev_io->type) { 245 case SPDK_BDEV_IO_TYPE_READ: 246 if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { 247 assert(bdev_io->u.bdev.iovcnt == 1); 248 bdev_io->u.bdev.iovs[0].iov_base = 249 ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf + 250 bdev_io->u.bdev.offset_blocks * block_size; 251 bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size; 252 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 253 SPDK_BDEV_IO_STATUS_SUCCESS); 254 return 0; 255 } 256 257 bdev_malloc_readv((struct malloc_disk *)bdev_io->bdev->ctxt, 258 mch->accel_channel, 259 (struct malloc_task *)bdev_io->driver_ctx, 260 bdev_io->u.bdev.iovs, 261 bdev_io->u.bdev.iovcnt, 262 bdev_io->u.bdev.num_blocks * block_size, 263 bdev_io->u.bdev.offset_blocks * block_size); 264 return 0; 265 266 case SPDK_BDEV_IO_TYPE_WRITE: 267 bdev_malloc_writev((struct malloc_disk *)bdev_io->bdev->ctxt, 268 mch->accel_channel, 269 (struct malloc_task *)bdev_io->driver_ctx, 270 bdev_io->u.bdev.iovs, 271 bdev_io->u.bdev.iovcnt, 272 bdev_io->u.bdev.num_blocks * block_size, 273 bdev_io->u.bdev.offset_blocks * block_size); 274 return 0; 275 276 case SPDK_BDEV_IO_TYPE_RESET: 277 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 278 SPDK_BDEV_IO_STATUS_SUCCESS); 279 return 0; 280 281 case SPDK_BDEV_IO_TYPE_FLUSH: 282 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 283 SPDK_BDEV_IO_STATUS_SUCCESS); 284 return 0; 285 286 case SPDK_BDEV_IO_TYPE_UNMAP: 287 return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt, 288 mch->accel_channel, 289 (struct malloc_task *)bdev_io->driver_ctx, 290 bdev_io->u.bdev.offset_blocks * block_size, 291 bdev_io->u.bdev.num_blocks * block_size); 292 293 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 294 /* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which zeroes out all of the requested bytes. */ 295 return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt, 296 mch->accel_channel, 297 (struct malloc_task *)bdev_io->driver_ctx, 298 bdev_io->u.bdev.offset_blocks * block_size, 299 bdev_io->u.bdev.num_blocks * block_size); 300 301 case SPDK_BDEV_IO_TYPE_ZCOPY: 302 if (bdev_io->u.bdev.zcopy.start) { 303 void *buf; 304 size_t len; 305 306 buf = ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf + 307 bdev_io->u.bdev.offset_blocks * block_size; 308 len = bdev_io->u.bdev.num_blocks * block_size; 309 spdk_bdev_io_set_buf(bdev_io, buf, len); 310 311 } 312 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 313 SPDK_BDEV_IO_STATUS_SUCCESS); 314 return 0; 315 case SPDK_BDEV_IO_TYPE_ABORT: 316 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 317 SPDK_BDEV_IO_STATUS_FAILED); 318 return 0; 319 default: 320 return -1; 321 } 322 return 0; 323 } 324 325 static void bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 326 { 327 struct malloc_channel *mch = spdk_io_channel_get_ctx(ch); 328 329 if (_bdev_malloc_submit_request(mch, bdev_io) != 0) { 330 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 331 SPDK_BDEV_IO_STATUS_FAILED); 332 } 333 } 334 335 static bool 336 bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 337 { 338 switch (io_type) { 339 case SPDK_BDEV_IO_TYPE_READ: 340 case SPDK_BDEV_IO_TYPE_WRITE: 341 case SPDK_BDEV_IO_TYPE_FLUSH: 342 case SPDK_BDEV_IO_TYPE_RESET: 343 case SPDK_BDEV_IO_TYPE_UNMAP: 344 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 345 case SPDK_BDEV_IO_TYPE_ZCOPY: 346 case SPDK_BDEV_IO_TYPE_ABORT: 347 return true; 348 349 default: 350 return false; 351 } 352 } 353 354 static struct spdk_io_channel * 355 bdev_malloc_get_io_channel(void *ctx) 356 { 357 return spdk_get_io_channel(&g_malloc_disks); 358 } 359 360 static void 361 bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 362 { 363 char uuid_str[SPDK_UUID_STRING_LEN]; 364 365 spdk_json_write_object_begin(w); 366 367 spdk_json_write_named_string(w, "method", "bdev_malloc_create"); 368 369 spdk_json_write_named_object_begin(w, "params"); 370 spdk_json_write_named_string(w, "name", bdev->name); 371 spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); 372 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 373 spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); 374 spdk_json_write_named_string(w, "uuid", uuid_str); 375 spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary); 376 377 spdk_json_write_object_end(w); 378 379 spdk_json_write_object_end(w); 380 } 381 382 static const struct spdk_bdev_fn_table malloc_fn_table = { 383 .destruct = bdev_malloc_destruct, 384 .submit_request = bdev_malloc_submit_request, 385 .io_type_supported = bdev_malloc_io_type_supported, 386 .get_io_channel = bdev_malloc_get_io_channel, 387 .write_config_json = bdev_malloc_write_json_config, 388 }; 389 390 int 391 create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid, 392 uint64_t num_blocks, uint32_t block_size, uint32_t optimal_io_boundary) 393 { 394 struct malloc_disk *mdisk; 395 int rc; 396 397 if (num_blocks == 0) { 398 SPDK_ERRLOG("Disk num_blocks must be greater than 0"); 399 return -EINVAL; 400 } 401 402 if (block_size % 512) { 403 SPDK_ERRLOG("block size must be 512 bytes aligned\n"); 404 return -EINVAL; 405 } 406 407 mdisk = calloc(1, sizeof(*mdisk)); 408 if (!mdisk) { 409 SPDK_ERRLOG("mdisk calloc() failed\n"); 410 return -ENOMEM; 411 } 412 413 /* 414 * Allocate the large backend memory buffer from pinned memory. 415 * 416 * TODO: need to pass a hint so we know which socket to allocate 417 * from on multi-socket systems. 418 */ 419 mdisk->malloc_buf = spdk_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL, 420 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 421 if (!mdisk->malloc_buf) { 422 SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n"); 423 malloc_disk_free(mdisk); 424 return -ENOMEM; 425 } 426 427 if (name) { 428 mdisk->disk.name = strdup(name); 429 } else { 430 /* Auto-generate a name */ 431 mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count); 432 malloc_disk_count++; 433 } 434 if (!mdisk->disk.name) { 435 malloc_disk_free(mdisk); 436 return -ENOMEM; 437 } 438 mdisk->disk.product_name = "Malloc disk"; 439 440 mdisk->disk.write_cache = 1; 441 mdisk->disk.blocklen = block_size; 442 mdisk->disk.blockcnt = num_blocks; 443 if (optimal_io_boundary) { 444 mdisk->disk.optimal_io_boundary = optimal_io_boundary; 445 mdisk->disk.split_on_optimal_io_boundary = true; 446 } 447 if (uuid) { 448 mdisk->disk.uuid = *uuid; 449 } else { 450 spdk_uuid_generate(&mdisk->disk.uuid); 451 } 452 453 mdisk->disk.ctxt = mdisk; 454 mdisk->disk.fn_table = &malloc_fn_table; 455 mdisk->disk.module = &malloc_if; 456 457 rc = spdk_bdev_register(&mdisk->disk); 458 if (rc) { 459 malloc_disk_free(mdisk); 460 return rc; 461 } 462 463 *bdev = &(mdisk->disk); 464 465 TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link); 466 467 return rc; 468 } 469 470 void 471 delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg) 472 { 473 int rc; 474 475 rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg); 476 if (rc != 0) { 477 cb_fn(cb_arg, rc); 478 } 479 } 480 481 static int 482 malloc_completion_poller(void *ctx) 483 { 484 struct malloc_channel *ch = ctx; 485 struct malloc_task *task; 486 TAILQ_HEAD(, malloc_task) completed_tasks; 487 uint32_t num_completions = 0; 488 489 TAILQ_INIT(&completed_tasks); 490 TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq); 491 492 while (!TAILQ_EMPTY(&completed_tasks)) { 493 task = TAILQ_FIRST(&completed_tasks); 494 TAILQ_REMOVE(&completed_tasks, task, tailq); 495 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); 496 num_completions++; 497 } 498 499 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 500 } 501 502 static int 503 malloc_create_channel_cb(void *io_device, void *ctx) 504 { 505 struct malloc_channel *ch = ctx; 506 507 ch->accel_channel = spdk_accel_engine_get_io_channel(); 508 if (!ch->accel_channel) { 509 SPDK_ERRLOG("Failed to get accel engine's IO channel\n"); 510 return -ENOMEM; 511 } 512 513 ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0); 514 if (!ch->completion_poller) { 515 SPDK_ERRLOG("Failed to register malloc completion poller\n"); 516 spdk_put_io_channel(ch->accel_channel); 517 return -ENOMEM; 518 } 519 520 TAILQ_INIT(&ch->completed_tasks); 521 522 return 0; 523 } 524 525 static void 526 malloc_destroy_channel_cb(void *io_device, void *ctx) 527 { 528 struct malloc_channel *ch = ctx; 529 530 assert(TAILQ_EMPTY(&ch->completed_tasks)); 531 532 spdk_put_io_channel(ch->accel_channel); 533 spdk_poller_unregister(&ch->completion_poller); 534 } 535 536 static int bdev_malloc_initialize(void) 537 { 538 /* This needs to be reset for each reinitialization of submodules. 539 * Otherwise after enough devices or reinitializations the value gets too high. 540 * TODO: Make malloc bdev name mandatory and remove this counter. */ 541 malloc_disk_count = 0; 542 543 spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb, 544 malloc_destroy_channel_cb, sizeof(struct malloc_channel), 545 "bdev_malloc"); 546 547 return 0; 548 } 549 550 static void 551 bdev_malloc_deinitialize(void) 552 { 553 spdk_io_device_unregister(&g_malloc_disks, NULL); 554 } 555 556 SPDK_LOG_REGISTER_COMPONENT(bdev_malloc) 557