1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "bdev_malloc.h" 10 #include "spdk/bdev.h" 11 #include "spdk/endian.h" 12 #include "spdk/env.h" 13 #include "spdk/accel.h" 14 #include "spdk/json.h" 15 #include "spdk/thread.h" 16 #include "spdk/queue.h" 17 #include "spdk/string.h" 18 19 #include "spdk/bdev_module.h" 20 #include "spdk/log.h" 21 22 struct malloc_disk { 23 struct spdk_bdev disk; 24 void *malloc_buf; 25 TAILQ_ENTRY(malloc_disk) link; 26 }; 27 28 struct malloc_task { 29 int num_outstanding; 30 enum spdk_bdev_io_status status; 31 TAILQ_ENTRY(malloc_task) tailq; 32 }; 33 34 struct malloc_channel { 35 struct spdk_io_channel *accel_channel; 36 struct spdk_poller *completion_poller; 37 TAILQ_HEAD(, malloc_task) completed_tasks; 38 }; 39 40 static void 41 malloc_done(void *ref, int status) 42 { 43 struct malloc_task *task = (struct malloc_task *)ref; 44 45 if (status != 0) { 46 if (status == -ENOMEM) { 47 task->status = SPDK_BDEV_IO_STATUS_NOMEM; 48 } else { 49 task->status = SPDK_BDEV_IO_STATUS_FAILED; 50 } 51 } 52 53 if (--task->num_outstanding == 0) { 54 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); 55 } 56 } 57 58 static void 59 malloc_complete_task(struct malloc_task *task, struct malloc_channel *mch, 60 enum spdk_bdev_io_status status) 61 { 62 task->status = status; 63 TAILQ_INSERT_TAIL(&mch->completed_tasks, task, tailq); 64 } 65 66 static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks); 67 68 int malloc_disk_count = 0; 69 70 static int bdev_malloc_initialize(void); 71 static void bdev_malloc_deinitialize(void); 72 73 static int 74 bdev_malloc_get_ctx_size(void) 75 { 76 return sizeof(struct malloc_task); 77 } 78 79 static struct spdk_bdev_module malloc_if = { 80 .name = "malloc", 81 .module_init = bdev_malloc_initialize, 82 .module_fini = bdev_malloc_deinitialize, 83 .get_ctx_size = bdev_malloc_get_ctx_size, 84 85 }; 86 87 SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if) 88 89 static void 90 malloc_disk_free(struct malloc_disk *malloc_disk) 91 { 92 if (!malloc_disk) { 93 return; 94 } 95 96 free(malloc_disk->disk.name); 97 spdk_free(malloc_disk->malloc_buf); 98 free(malloc_disk); 99 } 100 101 static int 102 bdev_malloc_destruct(void *ctx) 103 { 104 struct malloc_disk *malloc_disk = ctx; 105 106 TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link); 107 malloc_disk_free(malloc_disk); 108 return 0; 109 } 110 111 static int 112 bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes) 113 { 114 int i; 115 116 for (i = 0; i < iovcnt; i++) { 117 if (nbytes < iovs[i].iov_len) { 118 return 0; 119 } 120 121 nbytes -= iovs[i].iov_len; 122 } 123 124 return nbytes != 0; 125 } 126 127 static void 128 bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 129 struct malloc_task *task, 130 struct iovec *iov, int iovcnt, size_t len, uint64_t offset) 131 { 132 int64_t res = 0; 133 void *src = mdisk->malloc_buf + offset; 134 int i; 135 136 if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { 137 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), 138 SPDK_BDEV_IO_STATUS_FAILED); 139 return; 140 } 141 142 SPDK_DEBUGLOG(bdev_malloc, "read %zu bytes from offset %#" PRIx64 ", iovcnt=%d\n", 143 len, offset, iovcnt); 144 145 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 146 task->num_outstanding = 0; 147 148 for (i = 0; i < iovcnt; i++) { 149 task->num_outstanding++; 150 res = spdk_accel_submit_copy(ch, iov[i].iov_base, 151 src, iov[i].iov_len, 0, malloc_done, task); 152 153 if (res != 0) { 154 malloc_done(task, res); 155 break; 156 } 157 158 src += iov[i].iov_len; 159 len -= iov[i].iov_len; 160 } 161 } 162 163 static void 164 bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, 165 struct malloc_task *task, 166 struct iovec *iov, int iovcnt, size_t len, uint64_t offset) 167 { 168 int64_t res = 0; 169 void *dst = mdisk->malloc_buf + offset; 170 int i; 171 172 if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { 173 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), 174 SPDK_BDEV_IO_STATUS_FAILED); 175 return; 176 } 177 178 SPDK_DEBUGLOG(bdev_malloc, "wrote %zu bytes to offset %#" PRIx64 ", iovcnt=%d\n", 179 len, offset, iovcnt); 180 181 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 182 task->num_outstanding = 0; 183 184 for (i = 0; i < iovcnt; i++) { 185 task->num_outstanding++; 186 res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base, 187 iov[i].iov_len, 0, malloc_done, task); 188 189 if (res != 0) { 190 malloc_done(task, res); 191 break; 192 } 193 194 dst += iov[i].iov_len; 195 } 196 } 197 198 static int 199 bdev_malloc_unmap(struct malloc_disk *mdisk, 200 struct spdk_io_channel *ch, 201 struct malloc_task *task, 202 uint64_t offset, 203 uint64_t byte_count) 204 { 205 task->status = SPDK_BDEV_IO_STATUS_SUCCESS; 206 task->num_outstanding = 1; 207 208 return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0, 209 byte_count, 0, malloc_done, task); 210 } 211 212 static int 213 _bdev_malloc_submit_request(struct malloc_channel *mch, struct spdk_bdev_io *bdev_io) 214 { 215 uint32_t block_size = bdev_io->bdev->blocklen; 216 217 switch (bdev_io->type) { 218 case SPDK_BDEV_IO_TYPE_READ: 219 if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { 220 assert(bdev_io->u.bdev.iovcnt == 1); 221 bdev_io->u.bdev.iovs[0].iov_base = 222 ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf + 223 bdev_io->u.bdev.offset_blocks * block_size; 224 bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size; 225 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 226 SPDK_BDEV_IO_STATUS_SUCCESS); 227 return 0; 228 } 229 230 bdev_malloc_readv((struct malloc_disk *)bdev_io->bdev->ctxt, 231 mch->accel_channel, 232 (struct malloc_task *)bdev_io->driver_ctx, 233 bdev_io->u.bdev.iovs, 234 bdev_io->u.bdev.iovcnt, 235 bdev_io->u.bdev.num_blocks * block_size, 236 bdev_io->u.bdev.offset_blocks * block_size); 237 return 0; 238 239 case SPDK_BDEV_IO_TYPE_WRITE: 240 bdev_malloc_writev((struct malloc_disk *)bdev_io->bdev->ctxt, 241 mch->accel_channel, 242 (struct malloc_task *)bdev_io->driver_ctx, 243 bdev_io->u.bdev.iovs, 244 bdev_io->u.bdev.iovcnt, 245 bdev_io->u.bdev.num_blocks * block_size, 246 bdev_io->u.bdev.offset_blocks * block_size); 247 return 0; 248 249 case SPDK_BDEV_IO_TYPE_RESET: 250 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 251 SPDK_BDEV_IO_STATUS_SUCCESS); 252 return 0; 253 254 case SPDK_BDEV_IO_TYPE_FLUSH: 255 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 256 SPDK_BDEV_IO_STATUS_SUCCESS); 257 return 0; 258 259 case SPDK_BDEV_IO_TYPE_UNMAP: 260 return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt, 261 mch->accel_channel, 262 (struct malloc_task *)bdev_io->driver_ctx, 263 bdev_io->u.bdev.offset_blocks * block_size, 264 bdev_io->u.bdev.num_blocks * block_size); 265 266 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 267 /* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which zeroes out all of the requested bytes. */ 268 return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt, 269 mch->accel_channel, 270 (struct malloc_task *)bdev_io->driver_ctx, 271 bdev_io->u.bdev.offset_blocks * block_size, 272 bdev_io->u.bdev.num_blocks * block_size); 273 274 case SPDK_BDEV_IO_TYPE_ZCOPY: 275 if (bdev_io->u.bdev.zcopy.start) { 276 void *buf; 277 size_t len; 278 279 buf = ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf + 280 bdev_io->u.bdev.offset_blocks * block_size; 281 len = bdev_io->u.bdev.num_blocks * block_size; 282 spdk_bdev_io_set_buf(bdev_io, buf, len); 283 284 } 285 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 286 SPDK_BDEV_IO_STATUS_SUCCESS); 287 return 0; 288 case SPDK_BDEV_IO_TYPE_ABORT: 289 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 290 SPDK_BDEV_IO_STATUS_FAILED); 291 return 0; 292 default: 293 return -1; 294 } 295 return 0; 296 } 297 298 static void 299 bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 300 { 301 struct malloc_channel *mch = spdk_io_channel_get_ctx(ch); 302 303 if (_bdev_malloc_submit_request(mch, bdev_io) != 0) { 304 malloc_complete_task((struct malloc_task *)bdev_io->driver_ctx, mch, 305 SPDK_BDEV_IO_STATUS_FAILED); 306 } 307 } 308 309 static bool 310 bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 311 { 312 switch (io_type) { 313 case SPDK_BDEV_IO_TYPE_READ: 314 case SPDK_BDEV_IO_TYPE_WRITE: 315 case SPDK_BDEV_IO_TYPE_FLUSH: 316 case SPDK_BDEV_IO_TYPE_RESET: 317 case SPDK_BDEV_IO_TYPE_UNMAP: 318 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 319 case SPDK_BDEV_IO_TYPE_ZCOPY: 320 case SPDK_BDEV_IO_TYPE_ABORT: 321 return true; 322 323 default: 324 return false; 325 } 326 } 327 328 static struct spdk_io_channel * 329 bdev_malloc_get_io_channel(void *ctx) 330 { 331 return spdk_get_io_channel(&g_malloc_disks); 332 } 333 334 static void 335 bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 336 { 337 char uuid_str[SPDK_UUID_STRING_LEN]; 338 339 spdk_json_write_object_begin(w); 340 341 spdk_json_write_named_string(w, "method", "bdev_malloc_create"); 342 343 spdk_json_write_named_object_begin(w, "params"); 344 spdk_json_write_named_string(w, "name", bdev->name); 345 spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); 346 spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); 347 spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); 348 spdk_json_write_named_string(w, "uuid", uuid_str); 349 spdk_json_write_named_uint32(w, "optimal_io_boundary", bdev->optimal_io_boundary); 350 351 spdk_json_write_object_end(w); 352 353 spdk_json_write_object_end(w); 354 } 355 356 static const struct spdk_bdev_fn_table malloc_fn_table = { 357 .destruct = bdev_malloc_destruct, 358 .submit_request = bdev_malloc_submit_request, 359 .io_type_supported = bdev_malloc_io_type_supported, 360 .get_io_channel = bdev_malloc_get_io_channel, 361 .write_config_json = bdev_malloc_write_json_config, 362 }; 363 364 int 365 create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid, 366 uint64_t num_blocks, uint32_t block_size, uint32_t optimal_io_boundary) 367 { 368 struct malloc_disk *mdisk; 369 int rc; 370 371 if (num_blocks == 0) { 372 SPDK_ERRLOG("Disk num_blocks must be greater than 0"); 373 return -EINVAL; 374 } 375 376 if (block_size % 512) { 377 SPDK_ERRLOG("block size must be 512 bytes aligned\n"); 378 return -EINVAL; 379 } 380 381 mdisk = calloc(1, sizeof(*mdisk)); 382 if (!mdisk) { 383 SPDK_ERRLOG("mdisk calloc() failed\n"); 384 return -ENOMEM; 385 } 386 387 /* 388 * Allocate the large backend memory buffer from pinned memory. 389 * 390 * TODO: need to pass a hint so we know which socket to allocate 391 * from on multi-socket systems. 392 */ 393 mdisk->malloc_buf = spdk_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL, 394 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 395 if (!mdisk->malloc_buf) { 396 SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n"); 397 malloc_disk_free(mdisk); 398 return -ENOMEM; 399 } 400 401 if (name) { 402 mdisk->disk.name = strdup(name); 403 } else { 404 /* Auto-generate a name */ 405 mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count); 406 malloc_disk_count++; 407 } 408 if (!mdisk->disk.name) { 409 malloc_disk_free(mdisk); 410 return -ENOMEM; 411 } 412 mdisk->disk.product_name = "Malloc disk"; 413 414 mdisk->disk.write_cache = 1; 415 mdisk->disk.blocklen = block_size; 416 mdisk->disk.blockcnt = num_blocks; 417 if (optimal_io_boundary) { 418 mdisk->disk.optimal_io_boundary = optimal_io_boundary; 419 mdisk->disk.split_on_optimal_io_boundary = true; 420 } 421 if (uuid) { 422 mdisk->disk.uuid = *uuid; 423 } else { 424 spdk_uuid_generate(&mdisk->disk.uuid); 425 } 426 427 mdisk->disk.ctxt = mdisk; 428 mdisk->disk.fn_table = &malloc_fn_table; 429 mdisk->disk.module = &malloc_if; 430 431 rc = spdk_bdev_register(&mdisk->disk); 432 if (rc) { 433 malloc_disk_free(mdisk); 434 return rc; 435 } 436 437 *bdev = &(mdisk->disk); 438 439 TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link); 440 441 return rc; 442 } 443 444 void 445 delete_malloc_disk(const char *name, spdk_delete_malloc_complete cb_fn, void *cb_arg) 446 { 447 int rc; 448 449 rc = spdk_bdev_unregister_by_name(name, &malloc_if, cb_fn, cb_arg); 450 if (rc != 0) { 451 cb_fn(cb_arg, rc); 452 } 453 } 454 455 static int 456 malloc_completion_poller(void *ctx) 457 { 458 struct malloc_channel *ch = ctx; 459 struct malloc_task *task; 460 TAILQ_HEAD(, malloc_task) completed_tasks; 461 uint32_t num_completions = 0; 462 463 TAILQ_INIT(&completed_tasks); 464 TAILQ_SWAP(&completed_tasks, &ch->completed_tasks, malloc_task, tailq); 465 466 while (!TAILQ_EMPTY(&completed_tasks)) { 467 task = TAILQ_FIRST(&completed_tasks); 468 TAILQ_REMOVE(&completed_tasks, task, tailq); 469 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); 470 num_completions++; 471 } 472 473 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 474 } 475 476 static int 477 malloc_create_channel_cb(void *io_device, void *ctx) 478 { 479 struct malloc_channel *ch = ctx; 480 481 ch->accel_channel = spdk_accel_get_io_channel(); 482 if (!ch->accel_channel) { 483 SPDK_ERRLOG("Failed to get accel framework's IO channel\n"); 484 return -ENOMEM; 485 } 486 487 ch->completion_poller = SPDK_POLLER_REGISTER(malloc_completion_poller, ch, 0); 488 if (!ch->completion_poller) { 489 SPDK_ERRLOG("Failed to register malloc completion poller\n"); 490 spdk_put_io_channel(ch->accel_channel); 491 return -ENOMEM; 492 } 493 494 TAILQ_INIT(&ch->completed_tasks); 495 496 return 0; 497 } 498 499 static void 500 malloc_destroy_channel_cb(void *io_device, void *ctx) 501 { 502 struct malloc_channel *ch = ctx; 503 504 assert(TAILQ_EMPTY(&ch->completed_tasks)); 505 506 spdk_put_io_channel(ch->accel_channel); 507 spdk_poller_unregister(&ch->completion_poller); 508 } 509 510 static int 511 bdev_malloc_initialize(void) 512 { 513 /* This needs to be reset for each reinitialization of submodules. 514 * Otherwise after enough devices or reinitializations the value gets too high. 515 * TODO: Make malloc bdev name mandatory and remove this counter. */ 516 malloc_disk_count = 0; 517 518 spdk_io_device_register(&g_malloc_disks, malloc_create_channel_cb, 519 malloc_destroy_channel_cb, sizeof(struct malloc_channel), 520 "bdev_malloc"); 521 522 return 0; 523 } 524 525 static void 526 bdev_malloc_deinitialize(void) 527 { 528 spdk_io_device_unregister(&g_malloc_disks, NULL); 529 } 530 531 SPDK_LOG_REGISTER_COMPONENT(bdev_malloc) 532