/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/blob_bdev.h"
#include "spdk/blob.h"
#include "spdk/thread.h"
#include "spdk/log.h"
#include "spdk/endian.h"
#define __SPDK_BDEV_MODULE_ONLY
#include "spdk/bdev_module.h"

struct blob_bdev {
	struct spdk_bs_dev bs_dev;
	struct spdk_bdev *bdev;
	struct spdk_bdev_desc *desc;
	bool write;
	int32_t refs;
	struct spdk_spinlock lock;
};

struct blob_resubmit {
	struct spdk_bdev_io_wait_entry bdev_io_wait;
	enum spdk_bdev_io_type io_type;
	struct spdk_bs_dev *dev;
	struct spdk_io_channel *channel;
	void *payload;
	int iovcnt;
	uint64_t lba;
	uint64_t src_lba;
	uint32_t lba_count;
	struct spdk_bs_dev_cb_args *cb_args;
	struct spdk_blob_ext_io_opts *ext_io_opts;
};
static void bdev_blob_resubmit(void *);

static inline struct spdk_bdev_desc *
__get_desc(struct spdk_bs_dev *dev)
{
	return ((struct blob_bdev *)dev)->desc;
}

static inline struct spdk_bdev *
__get_bdev(struct spdk_bs_dev *dev)
{
	return ((struct blob_bdev *)dev)->bdev;
}

static void
bdev_blob_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *arg)
{
	struct spdk_bs_dev_cb_args *cb_args = arg;
	int bserrno;

	if (success) {
		bserrno = 0;
	} else {
		bserrno = -EIO;
	}
	cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno);
	spdk_bdev_free_io(bdev_io);
}

static void
bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
		   int iovcnt, uint64_t lba, uint64_t src_lba, uint32_t lba_count,
		   enum spdk_bdev_io_type io_type, struct spdk_bs_dev_cb_args *cb_args,
		   struct spdk_blob_ext_io_opts *ext_io_opts)
{
	int rc;
	struct spdk_bdev *bdev = __get_bdev(dev);
	struct blob_resubmit *ctx;

	ctx = calloc(1, sizeof(struct blob_resubmit));

	if (ctx == NULL) {
		SPDK_ERRLOG("Not enough memory to queue io\n");
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -ENOMEM);
		return;
	}

	ctx->io_type = io_type;
	ctx->dev = dev;
	ctx->channel = channel;
	ctx->payload = payload;
	ctx->iovcnt = iovcnt;
	ctx->lba = lba;
	ctx->src_lba = src_lba;
	ctx->lba_count = lba_count;
	ctx->cb_args = cb_args;
	ctx->bdev_io_wait.bdev = bdev;
	ctx->bdev_io_wait.cb_fn = bdev_blob_resubmit;
	ctx->bdev_io_wait.cb_arg = ctx;
	ctx->ext_io_opts = ext_io_opts;

	rc = spdk_bdev_queue_io_wait(bdev, channel, &ctx->bdev_io_wait);
	if (rc != 0) {
		SPDK_ERRLOG("Queue io failed, rc=%d\n", rc);
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
		free(ctx);
		assert(false);
	}
}

static void
bdev_blob_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
	       uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
{
	int rc;

	rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba,
				   lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, payload, 0, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}

static void
bdev_blob_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
		uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
{
	int rc;

	rc = spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba,
				    lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, payload, 0, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}

static void
bdev_blob_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
		struct iovec *iov, int iovcnt,
		uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
{
	int rc;

	rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba,
				    lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}

static void
bdev_blob_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
		 struct iovec *iov, int iovcnt,
		 uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
{
	int rc;

	rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba,
				     lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}

static inline void
blob_ext_io_opts_to_bdev_opts(struct spdk_bdev_ext_io_opts *dst, struct spdk_blob_ext_io_opts *src)
{
	memset(dst, 0, sizeof(*dst));
	dst->size = sizeof(*dst);
	dst->memory_domain = src->memory_domain;
	dst->memory_domain_ctx = src->memory_domain_ctx;
}

static void
bdev_blob_readv_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
		    struct iovec *iov, int iovcnt,
		    uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args,
		    struct spdk_blob_ext_io_opts *io_opts)
{
	struct spdk_bdev_ext_io_opts bdev_io_opts;
	int rc;

	blob_ext_io_opts_to_bdev_opts(&bdev_io_opts, io_opts);
	rc = spdk_bdev_readv_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count,
					bdev_blob_io_complete, cb_args, &bdev_io_opts);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args,
				   io_opts);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}

static void
bdev_blob_writev_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
		     struct iovec *iov, int iovcnt,
		     uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args,
		     struct spdk_blob_ext_io_opts *io_opts)
{
	struct spdk_bdev_ext_io_opts bdev_io_opts;
	int rc;

	blob_ext_io_opts_to_bdev_opts(&bdev_io_opts, io_opts);
	rc = spdk_bdev_writev_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count,
					 bdev_blob_io_complete, cb_args, &bdev_io_opts);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args,
				   io_opts);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}

static void
bdev_blob_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba,
		       uint64_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
{
	int rc;

	rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba,
					   lba_count, bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0,
				   lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}

static void
bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba,
		uint64_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
{
	struct blob_bdev *blob_bdev = (struct blob_bdev *)dev;
	int rc;

	if (spdk_bdev_io_type_supported(blob_bdev->bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
		rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count,
					    bdev_blob_io_complete, cb_args);
		if (rc == -ENOMEM) {
			bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0,
					   lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args, NULL);
		} else if (rc != 0) {
			cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
		}
	} else {
		/*
		 * If the device doesn't support unmap, immediately complete
		 * the request. Blobstore does not rely on unmap zeroing
		 * data.
		 */
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0);
	}
}

static void
bdev_blob_copy(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
	       uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count,
	       struct spdk_bs_dev_cb_args *cb_args)
{
	int rc;

	rc = spdk_bdev_copy_blocks(__get_desc(dev), channel,
				   dst_lba, src_lba, lba_count,
				   bdev_blob_io_complete, cb_args);
	if (rc == -ENOMEM) {
		bdev_blob_queue_io(dev, channel, NULL, 0, dst_lba, src_lba,
				   lba_count, SPDK_BDEV_IO_TYPE_COPY, cb_args, NULL);
	} else if (rc != 0) {
		cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
	}
}

static void
bdev_blob_resubmit(void *arg)
{
	struct blob_resubmit *ctx = (struct blob_resubmit *) arg;

	switch (ctx->io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (ctx->iovcnt > 0) {
			if (ctx->ext_io_opts) {
				bdev_blob_readv_ext(ctx->dev, ctx->channel, (struct iovec *) ctx->payload, ctx->iovcnt,
						    ctx->lba, ctx->lba_count, ctx->cb_args, ctx->ext_io_opts);
			} else {
				bdev_blob_readv(ctx->dev, ctx->channel, (struct iovec *) ctx->payload, ctx->iovcnt,
						ctx->lba, ctx->lba_count, ctx->cb_args);
			}
		} else {
			bdev_blob_read(ctx->dev, ctx->channel, ctx->payload,
				       ctx->lba, ctx->lba_count, ctx->cb_args);
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		if (ctx->iovcnt > 0) {
			if (ctx->ext_io_opts) {
				bdev_blob_writev_ext(ctx->dev, ctx->channel, (struct iovec *) ctx->payload, ctx->iovcnt,
						     ctx->lba, ctx->lba_count, ctx->cb_args, ctx->ext_io_opts);
			} else {
				bdev_blob_writev(ctx->dev, ctx->channel, (struct iovec *) ctx->payload, ctx->iovcnt,
						 ctx->lba, ctx->lba_count, ctx->cb_args);
			}
		} else {
			bdev_blob_write(ctx->dev, ctx->channel, ctx->payload,
					ctx->lba, ctx->lba_count, ctx->cb_args);
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		bdev_blob_unmap(ctx->dev, ctx->channel,
				ctx->lba, ctx->lba_count, ctx->cb_args);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		bdev_blob_write_zeroes(ctx->dev, ctx->channel,
				       ctx->lba, ctx->lba_count, ctx->cb_args);
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		bdev_blob_copy(ctx->dev, ctx->channel,
			       ctx->lba, ctx->src_lba, ctx->lba_count, ctx->cb_args);
		break;
	default:
		SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type);
		assert(false);
		break;
	}
	free(ctx);
}

int
spdk_bs_bdev_claim(struct spdk_bs_dev *bs_dev, struct spdk_bdev_module *module)
{
	struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev;
	struct spdk_bdev_desc *desc = blob_bdev->desc;
	enum spdk_bdev_claim_type claim_type;
	int rc;

	claim_type = blob_bdev->write ? SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE :
		     SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE;
	rc = spdk_bdev_module_claim_bdev_desc(desc, claim_type, NULL, module);
	if (rc != 0) {
		SPDK_ERRLOG("could not claim bs dev\n");
		return rc;
	}

	return rc;
}

static struct spdk_io_channel *
bdev_blob_create_channel(struct spdk_bs_dev *dev)
{
	struct blob_bdev *blob_bdev = (struct blob_bdev *)dev;
	struct spdk_io_channel *ch;

	ch = spdk_bdev_get_io_channel(blob_bdev->desc);
	if (ch != NULL) {
		spdk_spin_lock(&blob_bdev->lock);
		blob_bdev->refs++;
		spdk_spin_unlock(&blob_bdev->lock);
	}

	return ch;
}

static void
bdev_blob_free(struct blob_bdev *blob_bdev)
{
	assert(blob_bdev->refs == 0);

	spdk_spin_destroy(&blob_bdev->lock);
	free(blob_bdev);
}

static void
bdev_blob_destroy_channel(struct spdk_bs_dev *dev, struct spdk_io_channel *channel)
{
	struct blob_bdev *blob_bdev = (struct blob_bdev *)dev;
	int32_t refs;

	spdk_spin_lock(&blob_bdev->lock);

	assert(blob_bdev->refs > 0);
	blob_bdev->refs--;
	refs = blob_bdev->refs;

	spdk_spin_unlock(&blob_bdev->lock);

	spdk_put_io_channel(channel);

	/*
	 * If the value of blob_bdev->refs taken while holding blob_bdev->lock is zero, the blob
	 * and this channel have been destroyed. This means that dev->destroy() has been called and
	 * it would be an error (akin to use after free) if dev is dereferenced after destroying
	 * it. Thus, there should be no race with bdev_blob_create_channel().
	 *
	 * Because the value of blob_bdev->refs was taken while holding the lock here and the same
	 * is done in bdev_blob_destroy(), there is no race with bdev_blob_destroy().
	 */
	if (refs == 0) {
		bdev_blob_free(blob_bdev);
	}
}

static void
bdev_blob_destroy(struct spdk_bs_dev *bs_dev)
{
	struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev;
	struct spdk_bdev_desc *desc;
	int32_t refs;

	spdk_spin_lock(&blob_bdev->lock);

	desc = blob_bdev->desc;
	blob_bdev->desc = NULL;
	blob_bdev->refs--;
	refs = blob_bdev->refs;

	spdk_spin_unlock(&blob_bdev->lock);

	spdk_bdev_close(desc);

	/*
	 * If the value of blob_bdev->refs taken while holding blob_bdev->lock is zero,
	 * bs_dev->destroy() has been called and all the channels have been destroyed. It would be
	 * an error (akin to use after free) if bs_dev is dereferenced after destroying it. Thus,
	 * there should be no race with bdev_blob_create_channel().
	 *
	 * Because the value of blob_bdev->refs was taken while holding the lock here and the same
	 * is done in bdev_blob_destroy_channel(), there is no race with
	 * bdev_blob_destroy_channel().
	 */
	if (refs == 0) {
		bdev_blob_free(blob_bdev);
	}
}

static struct spdk_bdev *
bdev_blob_get_base_bdev(struct spdk_bs_dev *bs_dev)
{
	return __get_bdev(bs_dev);
}

static bool
bdev_blob_is_zeroes(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
{
	return false;
}

static bool
bdev_blob_is_range_valid(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count)
{
	struct spdk_bdev *bdev = __get_bdev(dev);

	/* The lba requested should be within the bounds of this bs_dev. */
	if (lba >= spdk_bdev_get_num_blocks(bdev)) {
		return false;
	} else if (lba + lba_count > spdk_bdev_get_num_blocks(bdev)) {
		/* bdevs used for esnaps must currently be an exact multiple of the
		 * blobstore cluster size (see spdk_lvol_create_esnap_clone()), but if that
		 * ever changes this code here needs to be updated to account for it. */
		SPDK_ERRLOG("Entire range must be within the bs_dev bounds for CoW.\n"
			    "lba(lba_count): %lu(%lu), num_blks: %lu\n", lba, lba_count,
			    spdk_bdev_get_num_blocks(bdev));
		assert(false);
		return false;
	}

	return true;
}

static bool
bdev_blob_translate_lba(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba)
{
	*base_lba = lba;
	return true;
}

static void
blob_bdev_init(struct blob_bdev *b, struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev;

	bdev = spdk_bdev_desc_get_bdev(desc);
	assert(bdev != NULL);

	b->bdev = bdev;
	b->desc = desc;
	b->bs_dev.blockcnt = spdk_bdev_get_num_blocks(bdev);
	b->bs_dev.blocklen = spdk_bdev_get_block_size(bdev);
	b->bs_dev.phys_blocklen = spdk_bdev_get_physical_block_size(bdev);
	b->bs_dev.create_channel = bdev_blob_create_channel;
	b->bs_dev.destroy_channel = bdev_blob_destroy_channel;
	b->bs_dev.destroy = bdev_blob_destroy;
	b->bs_dev.read = bdev_blob_read;
	b->bs_dev.write = bdev_blob_write;
	b->bs_dev.readv = bdev_blob_readv;
	b->bs_dev.writev = bdev_blob_writev;
	b->bs_dev.readv_ext = bdev_blob_readv_ext;
	b->bs_dev.writev_ext = bdev_blob_writev_ext;
	b->bs_dev.write_zeroes = bdev_blob_write_zeroes;
	b->bs_dev.unmap = bdev_blob_unmap;
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
		b->bs_dev.copy = bdev_blob_copy;
	}
	b->bs_dev.get_base_bdev = bdev_blob_get_base_bdev;
	b->bs_dev.is_zeroes = bdev_blob_is_zeroes;
	b->bs_dev.is_range_valid = bdev_blob_is_range_valid;
	b->bs_dev.translate_lba = bdev_blob_translate_lba;
}

void
spdk_bdev_update_bs_blockcnt(struct spdk_bs_dev *bs_dev)
{
	struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev;

	assert(bs_dev->blocklen == spdk_bdev_get_block_size(blob_bdev->bdev));
	bs_dev->blockcnt = spdk_bdev_get_num_blocks(blob_bdev->bdev);
}

int
spdk_bdev_create_bs_dev(const char *bdev_name, bool write,
			struct spdk_bdev_bs_dev_opts *opts, size_t opts_size,
			spdk_bdev_event_cb_t event_cb, void *event_ctx,
			struct spdk_bs_dev **bs_dev)
{
	struct blob_bdev *b;
	struct spdk_bdev_desc *desc;
	int rc;

	assert(spdk_get_thread() != NULL);

	if (opts != NULL && opts_size != sizeof(*opts)) {
		SPDK_ERRLOG("bdev name '%s': unsupported options\n", bdev_name);
		return -EINVAL;
	}

	b = calloc(1, sizeof(*b));

	if (b == NULL) {
		SPDK_ERRLOG("could not allocate blob_bdev\n");
		return -ENOMEM;
	}

	rc = spdk_bdev_open_ext(bdev_name, write, event_cb, event_ctx, &desc);
	if (rc != 0) {
		free(b);
		return rc;
	}

	blob_bdev_init(b, desc);

	*bs_dev = &b->bs_dev;
	b->write = write;
	b->refs = 1;
	spdk_spin_init(&b->lock);

	return 0;
}

int
spdk_bdev_create_bs_dev_ext(const char *bdev_name, spdk_bdev_event_cb_t event_cb,
			    void *event_ctx, struct spdk_bs_dev **bs_dev)
{
	return spdk_bdev_create_bs_dev(bdev_name, true, NULL, 0, event_cb, event_ctx, bs_dev);
}
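
/*
 * Illustrative usage sketch (editor's note, not part of this module): a blobstore consumer
 * typically wraps a bdev with spdk_bdev_create_bs_dev_ext() and hands the resulting bs_dev to
 * spdk_bs_load() (or spdk_bs_init()), which then drives I/O through the callbacks installed by
 * blob_bdev_init() above. The bdev name "Malloc0" and the helper names base_bdev_event_cb,
 * bs_load_done, and attach_blobstore below are hypothetical; only
 * spdk_bdev_create_bs_dev_ext() and spdk_bs_load() are real API.
 *
 *	static void
 *	base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *	{
 *		// Handle hot-remove, resize, etc. of the base bdev.
 *		SPDK_NOTICELOG("Unhandled bdev event: type %d\n", type);
 *	}
 *
 *	static void
 *	bs_load_done(void *cb_arg, struct spdk_blob_store *bs, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("blobstore load failed: %d\n", bserrno);
 *		}
 *	}
 *
 *	static int
 *	attach_blobstore(void)
 *	{
 *		struct spdk_bs_dev *bs_dev;
 *		int rc;
 *
 *		// Open the bdev read-write and wrap it as a blobstore block device.
 *		rc = spdk_bdev_create_bs_dev_ext("Malloc0", base_bdev_event_cb, NULL, &bs_dev);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *		// On failure, spdk_bs_load() calls bs_dev->destroy() itself.
 *		spdk_bs_load(bs_dev, NULL, bs_load_done, NULL);
 *		return 0;
 *	}
 */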