1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2018 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "bdev_raid.h" 8 #include "spdk/env.h" 9 #include "spdk/thread.h" 10 #include "spdk/log.h" 11 #include "spdk/string.h" 12 #include "spdk/util.h" 13 #include "spdk/json.h" 14 #include "spdk/likely.h" 15 16 #define RAID_OFFSET_BLOCKS_INVALID UINT64_MAX 17 #define RAID_BDEV_PROCESS_MAX_QD 16 18 19 #define RAID_BDEV_PROCESS_WINDOW_SIZE_KB_DEFAULT 1024 20 21 static bool g_shutdown_started = false; 22 23 /* List of all raid bdevs */ 24 struct raid_all_tailq g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list); 25 26 static TAILQ_HEAD(, raid_bdev_module) g_raid_modules = TAILQ_HEAD_INITIALIZER(g_raid_modules); 27 28 /* 29 * raid_bdev_io_channel is the context of spdk_io_channel for raid bdev device. It 30 * contains the relationship of raid bdev io channel with base bdev io channels. 31 */ 32 struct raid_bdev_io_channel { 33 /* Array of IO channels of base bdevs */ 34 struct spdk_io_channel **base_channel; 35 36 /* Private raid module IO channel */ 37 struct spdk_io_channel *module_channel; 38 39 /* Background process data */ 40 struct { 41 uint64_t offset; 42 struct spdk_io_channel *target_ch; 43 struct raid_bdev_io_channel *ch_processed; 44 } process; 45 }; 46 47 enum raid_bdev_process_state { 48 RAID_PROCESS_STATE_INIT, 49 RAID_PROCESS_STATE_RUNNING, 50 RAID_PROCESS_STATE_STOPPING, 51 RAID_PROCESS_STATE_STOPPED, 52 }; 53 54 struct raid_bdev_process { 55 struct raid_bdev *raid_bdev; 56 enum raid_process_type type; 57 enum raid_bdev_process_state state; 58 struct spdk_thread *thread; 59 struct raid_bdev_io_channel *raid_ch; 60 TAILQ_HEAD(, raid_bdev_process_request) requests; 61 uint64_t max_window_size; 62 uint64_t window_size; 63 uint64_t window_remaining; 64 int window_status; 65 uint64_t window_offset; 66 bool window_range_locked; 67 struct 
raid_base_bdev_info *target; 68 int status; 69 TAILQ_HEAD(, raid_process_finish_action) finish_actions; 70 }; 71 72 struct raid_process_finish_action { 73 spdk_msg_fn cb; 74 void *cb_ctx; 75 TAILQ_ENTRY(raid_process_finish_action) link; 76 }; 77 78 static struct spdk_raid_bdev_opts g_opts = { 79 .process_window_size_kb = RAID_BDEV_PROCESS_WINDOW_SIZE_KB_DEFAULT, 80 }; 81 82 void 83 raid_bdev_get_opts(struct spdk_raid_bdev_opts *opts) 84 { 85 *opts = g_opts; 86 } 87 88 int 89 raid_bdev_set_opts(const struct spdk_raid_bdev_opts *opts) 90 { 91 if (opts->process_window_size_kb == 0) { 92 return -EINVAL; 93 } 94 95 g_opts = *opts; 96 97 return 0; 98 } 99 100 static struct raid_bdev_module * 101 raid_bdev_module_find(enum raid_level level) 102 { 103 struct raid_bdev_module *raid_module; 104 105 TAILQ_FOREACH(raid_module, &g_raid_modules, link) { 106 if (raid_module->level == level) { 107 return raid_module; 108 } 109 } 110 111 return NULL; 112 } 113 114 void 115 raid_bdev_module_list_add(struct raid_bdev_module *raid_module) 116 { 117 if (raid_bdev_module_find(raid_module->level) != NULL) { 118 SPDK_ERRLOG("module for raid level '%s' already registered.\n", 119 raid_bdev_level_to_str(raid_module->level)); 120 assert(false); 121 } else { 122 TAILQ_INSERT_TAIL(&g_raid_modules, raid_module, link); 123 } 124 } 125 126 struct spdk_io_channel * 127 raid_bdev_channel_get_base_channel(struct raid_bdev_io_channel *raid_ch, uint8_t idx) 128 { 129 return raid_ch->base_channel[idx]; 130 } 131 132 void * 133 raid_bdev_channel_get_module_ctx(struct raid_bdev_io_channel *raid_ch) 134 { 135 assert(raid_ch->module_channel != NULL); 136 137 return spdk_io_channel_get_ctx(raid_ch->module_channel); 138 } 139 140 /* Function declarations */ 141 static void raid_bdev_examine(struct spdk_bdev *bdev); 142 static int raid_bdev_init(void); 143 static void raid_bdev_deconfigure(struct raid_bdev *raid_bdev, 144 raid_bdev_destruct_cb cb_fn, void *cb_arg); 145 146 static void 147 
raid_bdev_ch_process_cleanup(struct raid_bdev_io_channel *raid_ch) 148 { 149 raid_ch->process.offset = RAID_OFFSET_BLOCKS_INVALID; 150 151 if (raid_ch->process.target_ch != NULL) { 152 spdk_put_io_channel(raid_ch->process.target_ch); 153 raid_ch->process.target_ch = NULL; 154 } 155 156 if (raid_ch->process.ch_processed != NULL) { 157 free(raid_ch->process.ch_processed->base_channel); 158 free(raid_ch->process.ch_processed); 159 raid_ch->process.ch_processed = NULL; 160 } 161 } 162 163 static int 164 raid_bdev_ch_process_setup(struct raid_bdev_io_channel *raid_ch, struct raid_bdev_process *process) 165 { 166 struct raid_bdev *raid_bdev = process->raid_bdev; 167 struct raid_bdev_io_channel *raid_ch_processed; 168 struct raid_base_bdev_info *base_info; 169 170 raid_ch->process.offset = process->window_offset; 171 172 /* In the future we may have other types of processes which don't use a target bdev, 173 * like data scrubbing or strip size migration. Until then, expect that there always is 174 * a process target. 
*/ 175 assert(process->target != NULL); 176 177 raid_ch->process.target_ch = spdk_bdev_get_io_channel(process->target->desc); 178 if (raid_ch->process.target_ch == NULL) { 179 goto err; 180 } 181 182 raid_ch_processed = calloc(1, sizeof(*raid_ch_processed)); 183 if (raid_ch_processed == NULL) { 184 goto err; 185 } 186 raid_ch->process.ch_processed = raid_ch_processed; 187 188 raid_ch_processed->base_channel = calloc(raid_bdev->num_base_bdevs, 189 sizeof(*raid_ch_processed->base_channel)); 190 if (raid_ch_processed->base_channel == NULL) { 191 goto err; 192 } 193 194 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 195 uint8_t slot = raid_bdev_base_bdev_slot(base_info); 196 197 if (base_info != process->target) { 198 raid_ch_processed->base_channel[slot] = raid_ch->base_channel[slot]; 199 } else { 200 raid_ch_processed->base_channel[slot] = raid_ch->process.target_ch; 201 } 202 } 203 204 raid_ch_processed->module_channel = raid_ch->module_channel; 205 raid_ch_processed->process.offset = RAID_OFFSET_BLOCKS_INVALID; 206 207 return 0; 208 err: 209 raid_bdev_ch_process_cleanup(raid_ch); 210 return -ENOMEM; 211 } 212 213 /* 214 * brief: 215 * raid_bdev_create_cb function is a cb function for raid bdev which creates the 216 * hierarchy from raid bdev to base bdev io channels. 
It will be called per core 217 * params: 218 * io_device - pointer to raid bdev io device represented by raid_bdev 219 * ctx_buf - pointer to context buffer for raid bdev io channel 220 * returns: 221 * 0 - success 222 * non zero - failure 223 */ 224 static int 225 raid_bdev_create_cb(void *io_device, void *ctx_buf) 226 { 227 struct raid_bdev *raid_bdev = io_device; 228 struct raid_bdev_io_channel *raid_ch = ctx_buf; 229 uint8_t i; 230 int ret = -ENOMEM; 231 232 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_create_cb, %p\n", raid_ch); 233 234 assert(raid_bdev != NULL); 235 assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE); 236 237 raid_ch->base_channel = calloc(raid_bdev->num_base_bdevs, sizeof(struct spdk_io_channel *)); 238 if (!raid_ch->base_channel) { 239 SPDK_ERRLOG("Unable to allocate base bdevs io channel\n"); 240 return -ENOMEM; 241 } 242 243 spdk_spin_lock(&raid_bdev->base_bdev_lock); 244 for (i = 0; i < raid_bdev->num_base_bdevs; i++) { 245 /* 246 * Get the spdk_io_channel for all the base bdevs. This is used during 247 * split logic to send the respective child bdev ios to respective base 248 * bdev io channel. 249 * Skip missing base bdevs and the process target, which should also be treated as 250 * missing until the process completes. 
251 */ 252 if (raid_bdev->base_bdev_info[i].desc == NULL || 253 (raid_bdev->process != NULL && raid_bdev->process->target == &raid_bdev->base_bdev_info[i])) { 254 continue; 255 } 256 raid_ch->base_channel[i] = spdk_bdev_get_io_channel( 257 raid_bdev->base_bdev_info[i].desc); 258 if (!raid_ch->base_channel[i]) { 259 SPDK_ERRLOG("Unable to create io channel for base bdev\n"); 260 goto err; 261 } 262 } 263 264 if (raid_bdev->module->get_io_channel) { 265 raid_ch->module_channel = raid_bdev->module->get_io_channel(raid_bdev); 266 if (!raid_ch->module_channel) { 267 SPDK_ERRLOG("Unable to create io channel for raid module\n"); 268 goto err; 269 } 270 } 271 272 if (raid_bdev->process != NULL) { 273 ret = raid_bdev_ch_process_setup(raid_ch, raid_bdev->process); 274 if (ret != 0) { 275 SPDK_ERRLOG("Failed to setup process io channel\n"); 276 goto err; 277 } 278 } else { 279 raid_ch->process.offset = RAID_OFFSET_BLOCKS_INVALID; 280 } 281 spdk_spin_unlock(&raid_bdev->base_bdev_lock); 282 283 return 0; 284 err: 285 spdk_spin_unlock(&raid_bdev->base_bdev_lock); 286 for (i = 0; i < raid_bdev->num_base_bdevs; i++) { 287 if (raid_ch->base_channel[i] != NULL) { 288 spdk_put_io_channel(raid_ch->base_channel[i]); 289 } 290 } 291 free(raid_ch->base_channel); 292 293 raid_bdev_ch_process_cleanup(raid_ch); 294 295 return ret; 296 } 297 298 /* 299 * brief: 300 * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the 301 * hierarchy from raid bdev to base bdev io channels. 
It will be called per core
 * params:
 * io_device - pointer to raid bdev io device represented by raid_bdev
 * ctx_buf - pointer to context buffer for raid bdev io channel
 * returns:
 * none
 */
static void
raid_bdev_destroy_cb(void *io_device, void *ctx_buf)
{
	struct raid_bdev *raid_bdev = io_device;
	struct raid_bdev_io_channel *raid_ch = ctx_buf;
	uint8_t slot;

	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destroy_cb\n");

	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);

	/* The module channel is optional - not every raid module provides one */
	if (raid_ch->module_channel) {
		spdk_put_io_channel(raid_ch->module_channel);
	}

	/* Release the channel of every base bdev that had one; empty slots are skipped */
	for (slot = 0; slot < raid_bdev->num_base_bdevs; slot++) {
		struct spdk_io_channel *base_ch = raid_ch->base_channel[slot];

		if (base_ch == NULL) {
			continue;
		}
		spdk_put_io_channel(base_ch);
	}
	free(raid_ch->base_channel);
	raid_ch->base_channel = NULL;

	raid_bdev_ch_process_cleanup(raid_ch);
}

/*
 * brief:
 * raid_bdev_cleanup is used to cleanup raid_bdev related data
 * structures.
340 * params: 341 * raid_bdev - pointer to raid_bdev 342 * returns: 343 * none 344 */ 345 static void 346 raid_bdev_cleanup(struct raid_bdev *raid_bdev) 347 { 348 struct raid_base_bdev_info *base_info; 349 350 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_cleanup, %p name %s, state %s\n", 351 raid_bdev, raid_bdev->bdev.name, raid_bdev_state_to_str(raid_bdev->state)); 352 assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); 353 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 354 355 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 356 assert(base_info->desc == NULL); 357 free(base_info->name); 358 } 359 360 TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link); 361 } 362 363 static void 364 raid_bdev_free(struct raid_bdev *raid_bdev) 365 { 366 raid_bdev_free_superblock(raid_bdev); 367 spdk_spin_destroy(&raid_bdev->base_bdev_lock); 368 free(raid_bdev->base_bdev_info); 369 free(raid_bdev->bdev.name); 370 free(raid_bdev); 371 } 372 373 static void 374 raid_bdev_cleanup_and_free(struct raid_bdev *raid_bdev) 375 { 376 raid_bdev_cleanup(raid_bdev); 377 raid_bdev_free(raid_bdev); 378 } 379 380 /* 381 * brief: 382 * free resource of base bdev for raid bdev 383 * params: 384 * base_info - raid base bdev info 385 * returns: 386 * none 387 */ 388 static void 389 raid_bdev_free_base_bdev_resource(struct raid_base_bdev_info *base_info) 390 { 391 struct raid_bdev *raid_bdev = base_info->raid_bdev; 392 393 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 394 395 free(base_info->name); 396 base_info->name = NULL; 397 if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) { 398 spdk_uuid_set_null(&base_info->uuid); 399 } 400 401 if (base_info->desc == NULL) { 402 return; 403 } 404 405 spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(base_info->desc)); 406 spdk_bdev_close(base_info->desc); 407 base_info->desc = NULL; 408 spdk_put_io_channel(base_info->app_thread_ch); 409 base_info->app_thread_ch = NULL; 410 411 if (base_info->is_configured) { 412 
assert(raid_bdev->num_base_bdevs_discovered); 413 raid_bdev->num_base_bdevs_discovered--; 414 base_info->is_configured = false; 415 } 416 } 417 418 static void 419 raid_bdev_io_device_unregister_cb(void *io_device) 420 { 421 struct raid_bdev *raid_bdev = io_device; 422 423 if (raid_bdev->num_base_bdevs_discovered == 0) { 424 /* Free raid_bdev when there are no base bdevs left */ 425 SPDK_DEBUGLOG(bdev_raid, "raid bdev base bdevs is 0, going to free all in destruct\n"); 426 raid_bdev_cleanup(raid_bdev); 427 spdk_bdev_destruct_done(&raid_bdev->bdev, 0); 428 raid_bdev_free(raid_bdev); 429 } else { 430 spdk_bdev_destruct_done(&raid_bdev->bdev, 0); 431 } 432 } 433 434 void 435 raid_bdev_module_stop_done(struct raid_bdev *raid_bdev) 436 { 437 if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) { 438 spdk_io_device_unregister(raid_bdev, raid_bdev_io_device_unregister_cb); 439 } 440 } 441 442 static void 443 _raid_bdev_destruct(void *ctxt) 444 { 445 struct raid_bdev *raid_bdev = ctxt; 446 struct raid_base_bdev_info *base_info; 447 448 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destruct\n"); 449 450 assert(raid_bdev->process == NULL); 451 452 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 453 /* 454 * Close all base bdev descriptors for which call has come from below 455 * layers. Also close the descriptors if we have started shutdown. 
456 */ 457 if (g_shutdown_started || base_info->remove_scheduled == true) { 458 raid_bdev_free_base_bdev_resource(base_info); 459 } 460 } 461 462 if (g_shutdown_started) { 463 raid_bdev->state = RAID_BDEV_STATE_OFFLINE; 464 } 465 466 if (raid_bdev->module->stop != NULL) { 467 if (raid_bdev->module->stop(raid_bdev) == false) { 468 return; 469 } 470 } 471 472 raid_bdev_module_stop_done(raid_bdev); 473 } 474 475 static int 476 raid_bdev_destruct(void *ctx) 477 { 478 spdk_thread_exec_msg(spdk_thread_get_app_thread(), _raid_bdev_destruct, ctx); 479 480 return 1; 481 } 482 483 static int 484 raid_bdev_remap_dix_reftag(void *md_buf, uint64_t num_blocks, 485 struct spdk_bdev *bdev, uint32_t remapped_offset) 486 { 487 struct spdk_dif_ctx dif_ctx; 488 struct spdk_dif_error err_blk = {}; 489 int rc; 490 struct spdk_dif_ctx_init_ext_opts dif_opts; 491 struct iovec md_iov = { 492 .iov_base = md_buf, 493 .iov_len = num_blocks * bdev->md_len, 494 }; 495 496 if (md_buf == NULL) { 497 return 0; 498 } 499 500 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 501 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 502 rc = spdk_dif_ctx_init(&dif_ctx, 503 bdev->blocklen, bdev->md_len, bdev->md_interleave, 504 bdev->dif_is_head_of_md, bdev->dif_type, 505 SPDK_DIF_FLAGS_REFTAG_CHECK, 506 0, 0, 0, 0, 0, &dif_opts); 507 if (rc != 0) { 508 SPDK_ERRLOG("Initialization of DIF context failed\n"); 509 return rc; 510 } 511 512 spdk_dif_ctx_set_remapped_init_ref_tag(&dif_ctx, remapped_offset); 513 514 rc = spdk_dix_remap_ref_tag(&md_iov, num_blocks, &dif_ctx, &err_blk, false); 515 if (rc != 0) { 516 SPDK_ERRLOG("Remapping reference tag failed. 
type=%d, offset=%d" 517 PRIu32 "\n", err_blk.err_type, err_blk.err_offset); 518 } 519 520 return rc; 521 } 522 523 int 524 raid_bdev_verify_dix_reftag(struct iovec *iovs, int iovcnt, void *md_buf, 525 uint64_t num_blocks, struct spdk_bdev *bdev, uint32_t offset_blocks) 526 { 527 struct spdk_dif_ctx dif_ctx; 528 struct spdk_dif_error err_blk = {}; 529 int rc; 530 struct spdk_dif_ctx_init_ext_opts dif_opts; 531 struct iovec md_iov = { 532 .iov_base = md_buf, 533 .iov_len = num_blocks * bdev->md_len, 534 }; 535 536 if (md_buf == NULL) { 537 return 0; 538 } 539 540 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 541 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 542 rc = spdk_dif_ctx_init(&dif_ctx, 543 bdev->blocklen, bdev->md_len, bdev->md_interleave, 544 bdev->dif_is_head_of_md, bdev->dif_type, 545 SPDK_DIF_FLAGS_REFTAG_CHECK, 546 offset_blocks, 0, 0, 0, 0, &dif_opts); 547 if (rc != 0) { 548 SPDK_ERRLOG("Initialization of DIF context failed\n"); 549 return rc; 550 } 551 552 rc = spdk_dix_verify(iovs, iovcnt, &md_iov, num_blocks, &dif_ctx, &err_blk); 553 if (rc != 0) { 554 SPDK_ERRLOG("Reference tag check failed. type=%d, offset=%d" 555 PRIu32 "\n", err_blk.err_type, err_blk.err_offset); 556 } 557 558 return rc; 559 } 560 561 /** 562 * Raid bdev I/O read/write wrapper for spdk_bdev_readv_blocks_ext function. 563 */ 564 int 565 raid_bdev_readv_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 566 struct iovec *iov, int iovcnt, uint64_t offset_blocks, 567 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 568 struct spdk_bdev_ext_io_opts *opts) 569 { 570 return spdk_bdev_readv_blocks_ext(base_info->desc, ch, iov, iovcnt, 571 base_info->data_offset + offset_blocks, num_blocks, cb, cb_arg, opts); 572 } 573 574 /** 575 * Raid bdev I/O read/write wrapper for spdk_bdev_writev_blocks_ext function. 
576 */ 577 int 578 raid_bdev_writev_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 579 struct iovec *iov, int iovcnt, uint64_t offset_blocks, 580 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 581 struct spdk_bdev_ext_io_opts *opts) 582 { 583 int rc; 584 uint64_t remapped_offset_blocks = base_info->data_offset + offset_blocks; 585 586 if (spdk_unlikely(spdk_bdev_get_dif_type(&base_info->raid_bdev->bdev) != SPDK_DIF_DISABLE && 587 base_info->raid_bdev->bdev.dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) { 588 589 rc = raid_bdev_remap_dix_reftag(opts->metadata, num_blocks, &base_info->raid_bdev->bdev, 590 remapped_offset_blocks); 591 if (rc != 0) { 592 return rc; 593 } 594 } 595 596 return spdk_bdev_writev_blocks_ext(base_info->desc, ch, iov, iovcnt, 597 remapped_offset_blocks, num_blocks, cb, cb_arg, opts); 598 } 599 600 void 601 raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status) 602 { 603 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); 604 int rc; 605 606 if (raid_io->split.offset != RAID_OFFSET_BLOCKS_INVALID) { 607 struct iovec *split_iov = raid_io->split.iov; 608 const struct iovec *split_iov_orig = &raid_io->split.iov_copy; 609 610 /* 611 * Non-zero offset here means that this is the completion of the first part of the 612 * split I/O (the higher LBAs). Then, we submit the second part and set offset to 0. 
613 */ 614 if (raid_io->split.offset != 0) { 615 raid_io->offset_blocks = bdev_io->u.bdev.offset_blocks; 616 raid_io->md_buf = bdev_io->u.bdev.md_buf; 617 618 if (status == SPDK_BDEV_IO_STATUS_SUCCESS) { 619 raid_io->num_blocks = raid_io->split.offset; 620 raid_io->iovcnt = raid_io->iovs - bdev_io->u.bdev.iovs; 621 raid_io->iovs = bdev_io->u.bdev.iovs; 622 if (split_iov != NULL) { 623 raid_io->iovcnt++; 624 split_iov->iov_len = split_iov->iov_base - split_iov_orig->iov_base; 625 split_iov->iov_base = split_iov_orig->iov_base; 626 } 627 628 raid_io->split.offset = 0; 629 raid_io->base_bdev_io_submitted = 0; 630 raid_io->raid_ch = raid_io->raid_ch->process.ch_processed; 631 632 raid_io->raid_bdev->module->submit_rw_request(raid_io); 633 return; 634 } 635 } 636 637 raid_io->num_blocks = bdev_io->u.bdev.num_blocks; 638 raid_io->iovcnt = bdev_io->u.bdev.iovcnt; 639 raid_io->iovs = bdev_io->u.bdev.iovs; 640 if (split_iov != NULL) { 641 *split_iov = *split_iov_orig; 642 } 643 } 644 645 if (spdk_unlikely(raid_io->completion_cb != NULL)) { 646 raid_io->completion_cb(raid_io, status); 647 } else { 648 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 649 spdk_bdev_get_dif_type(bdev_io->bdev) != SPDK_DIF_DISABLE && 650 bdev_io->bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK && 651 status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 652 653 rc = raid_bdev_remap_dix_reftag(bdev_io->u.bdev.md_buf, 654 bdev_io->u.bdev.num_blocks, bdev_io->bdev, 655 bdev_io->u.bdev.offset_blocks); 656 if (rc != 0) { 657 status = SPDK_BDEV_IO_STATUS_FAILED; 658 } 659 } 660 spdk_bdev_io_complete(bdev_io, status); 661 } 662 } 663 664 /* 665 * brief: 666 * raid_bdev_io_complete_part - signal the completion of a part of the expected 667 * base bdev IOs and complete the raid_io if this is the final expected IO. 668 * The caller should first set raid_io->base_bdev_io_remaining. 
This function 669 * will decrement this counter by the value of the 'completed' parameter and 670 * complete the raid_io if the counter reaches 0. The caller is free to 671 * interpret the 'base_bdev_io_remaining' and 'completed' values as needed, 672 * it can represent e.g. blocks or IOs. 673 * params: 674 * raid_io - pointer to raid_bdev_io 675 * completed - the part of the raid_io that has been completed 676 * status - status of the base IO 677 * returns: 678 * true - if the raid_io is completed 679 * false - otherwise 680 */ 681 bool 682 raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, 683 enum spdk_bdev_io_status status) 684 { 685 assert(raid_io->base_bdev_io_remaining >= completed); 686 raid_io->base_bdev_io_remaining -= completed; 687 688 if (status != SPDK_BDEV_IO_STATUS_SUCCESS) { 689 raid_io->base_bdev_io_status = status; 690 } 691 692 if (raid_io->base_bdev_io_remaining == 0) { 693 raid_bdev_io_complete(raid_io, raid_io->base_bdev_io_status); 694 return true; 695 } else { 696 return false; 697 } 698 } 699 700 /* 701 * brief: 702 * raid_bdev_queue_io_wait function processes the IO which failed to submit. 703 * It will try to queue the IOs after storing the context to bdev wait queue logic. 
704 * params: 705 * raid_io - pointer to raid_bdev_io 706 * bdev - the block device that the IO is submitted to 707 * ch - io channel 708 * cb_fn - callback when the spdk_bdev_io for bdev becomes available 709 * returns: 710 * none 711 */ 712 void 713 raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, 714 struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn) 715 { 716 raid_io->waitq_entry.bdev = bdev; 717 raid_io->waitq_entry.cb_fn = cb_fn; 718 raid_io->waitq_entry.cb_arg = raid_io; 719 spdk_bdev_queue_io_wait(bdev, ch, &raid_io->waitq_entry); 720 } 721 722 static void 723 raid_base_bdev_reset_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 724 { 725 struct raid_bdev_io *raid_io = cb_arg; 726 727 spdk_bdev_free_io(bdev_io); 728 729 raid_bdev_io_complete_part(raid_io, 1, success ? 730 SPDK_BDEV_IO_STATUS_SUCCESS : 731 SPDK_BDEV_IO_STATUS_FAILED); 732 } 733 734 static void raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io); 735 736 static void 737 _raid_bdev_submit_reset_request(void *_raid_io) 738 { 739 struct raid_bdev_io *raid_io = _raid_io; 740 741 raid_bdev_submit_reset_request(raid_io); 742 } 743 744 /* 745 * brief: 746 * raid_bdev_submit_reset_request function submits reset requests 747 * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in 748 * which case it will queue it for later submission 749 * params: 750 * raid_io 751 * returns: 752 * none 753 */ 754 static void 755 raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io) 756 { 757 struct raid_bdev *raid_bdev; 758 int ret; 759 uint8_t i; 760 struct raid_base_bdev_info *base_info; 761 struct spdk_io_channel *base_ch; 762 763 raid_bdev = raid_io->raid_bdev; 764 765 if (raid_io->base_bdev_io_remaining == 0) { 766 raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; 767 } 768 769 for (i = raid_io->base_bdev_io_submitted; i < raid_bdev->num_base_bdevs; i++) { 770 base_info = 
&raid_bdev->base_bdev_info[i]; 771 base_ch = raid_io->raid_ch->base_channel[i]; 772 if (base_ch == NULL) { 773 raid_io->base_bdev_io_submitted++; 774 raid_bdev_io_complete_part(raid_io, 1, SPDK_BDEV_IO_STATUS_SUCCESS); 775 continue; 776 } 777 ret = spdk_bdev_reset(base_info->desc, base_ch, 778 raid_base_bdev_reset_complete, raid_io); 779 if (ret == 0) { 780 raid_io->base_bdev_io_submitted++; 781 } else if (ret == -ENOMEM) { 782 raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc), 783 base_ch, _raid_bdev_submit_reset_request); 784 return; 785 } else { 786 SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); 787 assert(false); 788 raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); 789 return; 790 } 791 } 792 } 793 794 static void 795 raid_bdev_io_split(struct raid_bdev_io *raid_io, uint64_t split_offset) 796 { 797 struct raid_bdev *raid_bdev = raid_io->raid_bdev; 798 size_t iov_offset = split_offset * raid_bdev->bdev.blocklen; 799 int i; 800 801 assert(split_offset != 0); 802 assert(raid_io->split.offset == RAID_OFFSET_BLOCKS_INVALID); 803 raid_io->split.offset = split_offset; 804 805 raid_io->offset_blocks += split_offset; 806 raid_io->num_blocks -= split_offset; 807 if (raid_io->md_buf != NULL) { 808 raid_io->md_buf += (split_offset * raid_bdev->bdev.md_len); 809 } 810 811 for (i = 0; i < raid_io->iovcnt; i++) { 812 struct iovec *iov = &raid_io->iovs[i]; 813 814 if (iov_offset < iov->iov_len) { 815 if (iov_offset == 0) { 816 raid_io->split.iov = NULL; 817 } else { 818 raid_io->split.iov = iov; 819 raid_io->split.iov_copy = *iov; 820 iov->iov_base += iov_offset; 821 iov->iov_len -= iov_offset; 822 } 823 raid_io->iovs += i; 824 raid_io->iovcnt -= i; 825 break; 826 } 827 828 iov_offset -= iov->iov_len; 829 } 830 } 831 832 static void 833 raid_bdev_submit_rw_request(struct raid_bdev_io *raid_io) 834 { 835 struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch; 836 837 if (raid_ch->process.offset != 
RAID_OFFSET_BLOCKS_INVALID) { 838 uint64_t offset_begin = raid_io->offset_blocks; 839 uint64_t offset_end = offset_begin + raid_io->num_blocks; 840 841 if (offset_end > raid_ch->process.offset) { 842 if (offset_begin < raid_ch->process.offset) { 843 /* 844 * If the I/O spans both the processed and unprocessed ranges, 845 * split it and first handle the unprocessed part. After it 846 * completes, the rest will be handled. 847 * This situation occurs when the process thread is not active 848 * or is waiting for the process window range to be locked 849 * (quiesced). When a window is being processed, such I/Os will be 850 * deferred by the bdev layer until the window is unlocked. 851 */ 852 SPDK_DEBUGLOG(bdev_raid, "split: process_offset: %lu offset_begin: %lu offset_end: %lu\n", 853 raid_ch->process.offset, offset_begin, offset_end); 854 raid_bdev_io_split(raid_io, raid_ch->process.offset - offset_begin); 855 } 856 } else { 857 /* Use the child channel, which corresponds to the already processed range */ 858 raid_io->raid_ch = raid_ch->process.ch_processed; 859 } 860 } 861 862 raid_io->raid_bdev->module->submit_rw_request(raid_io); 863 } 864 865 /* 866 * brief: 867 * Callback function to spdk_bdev_io_get_buf. 868 * params: 869 * ch - pointer to raid bdev io channel 870 * bdev_io - pointer to parent bdev_io on raid bdev device 871 * success - True if buffer is allocated or false otherwise. 
872 * returns: 873 * none 874 */ 875 static void 876 raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 877 bool success) 878 { 879 struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; 880 881 if (!success) { 882 raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); 883 return; 884 } 885 886 raid_bdev_submit_rw_request(raid_io); 887 } 888 889 void 890 raid_bdev_io_init(struct raid_bdev_io *raid_io, struct raid_bdev_io_channel *raid_ch, 891 enum spdk_bdev_io_type type, uint64_t offset_blocks, 892 uint64_t num_blocks, struct iovec *iovs, int iovcnt, void *md_buf, 893 struct spdk_memory_domain *memory_domain, void *memory_domain_ctx) 894 { 895 struct spdk_io_channel *ch = spdk_io_channel_from_ctx(raid_ch); 896 struct raid_bdev *raid_bdev = spdk_io_channel_get_io_device(ch); 897 898 raid_io->type = type; 899 raid_io->offset_blocks = offset_blocks; 900 raid_io->num_blocks = num_blocks; 901 raid_io->iovs = iovs; 902 raid_io->iovcnt = iovcnt; 903 raid_io->memory_domain = memory_domain; 904 raid_io->memory_domain_ctx = memory_domain_ctx; 905 raid_io->md_buf = md_buf; 906 907 raid_io->raid_bdev = raid_bdev; 908 raid_io->raid_ch = raid_ch; 909 raid_io->base_bdev_io_remaining = 0; 910 raid_io->base_bdev_io_submitted = 0; 911 raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 912 raid_io->completion_cb = NULL; 913 raid_io->split.offset = RAID_OFFSET_BLOCKS_INVALID; 914 } 915 916 /* 917 * brief: 918 * raid_bdev_submit_request function is the submit_request function pointer of 919 * raid bdev function table. This is used to submit the io on raid_bdev to below 920 * layers. 
921 * params: 922 * ch - pointer to raid bdev io channel 923 * bdev_io - pointer to parent bdev_io on raid bdev device 924 * returns: 925 * none 926 */ 927 static void 928 raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 929 { 930 struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; 931 932 raid_bdev_io_init(raid_io, spdk_io_channel_get_ctx(ch), bdev_io->type, 933 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 934 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.md_buf, 935 bdev_io->u.bdev.memory_domain, bdev_io->u.bdev.memory_domain_ctx); 936 937 switch (bdev_io->type) { 938 case SPDK_BDEV_IO_TYPE_READ: 939 spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb, 940 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 941 break; 942 case SPDK_BDEV_IO_TYPE_WRITE: 943 raid_bdev_submit_rw_request(raid_io); 944 break; 945 946 case SPDK_BDEV_IO_TYPE_RESET: 947 raid_bdev_submit_reset_request(raid_io); 948 break; 949 950 case SPDK_BDEV_IO_TYPE_FLUSH: 951 case SPDK_BDEV_IO_TYPE_UNMAP: 952 if (raid_io->raid_bdev->process != NULL) { 953 /* TODO: rebuild support */ 954 raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); 955 return; 956 } 957 raid_io->raid_bdev->module->submit_null_payload_request(raid_io); 958 break; 959 960 default: 961 SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type); 962 raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); 963 break; 964 } 965 } 966 967 /* 968 * brief: 969 * _raid_bdev_io_type_supported checks whether io_type is supported in 970 * all base bdev modules of raid bdev module. If anyone among the base_bdevs 971 * doesn't support, the raid device doesn't supports. 
972 * 973 * params: 974 * raid_bdev - pointer to raid bdev context 975 * io_type - io type 976 * returns: 977 * true - io_type is supported 978 * false - io_type is not supported 979 */ 980 inline static bool 981 _raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type) 982 { 983 struct raid_base_bdev_info *base_info; 984 985 if (io_type == SPDK_BDEV_IO_TYPE_FLUSH || 986 io_type == SPDK_BDEV_IO_TYPE_UNMAP) { 987 if (raid_bdev->module->submit_null_payload_request == NULL) { 988 return false; 989 } 990 } 991 992 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 993 if (base_info->desc == NULL) { 994 continue; 995 } 996 997 if (spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(base_info->desc), io_type) == false) { 998 return false; 999 } 1000 } 1001 1002 return true; 1003 } 1004 1005 /* 1006 * brief: 1007 * raid_bdev_io_type_supported is the io_supported function for bdev function 1008 * table which returns whether the particular io type is supported or not by 1009 * raid bdev module 1010 * params: 1011 * ctx - pointer to raid bdev context 1012 * type - io type 1013 * returns: 1014 * true - io_type is supported 1015 * false - io_type is not supported 1016 */ 1017 static bool 1018 raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 1019 { 1020 switch (io_type) { 1021 case SPDK_BDEV_IO_TYPE_READ: 1022 case SPDK_BDEV_IO_TYPE_WRITE: 1023 return true; 1024 1025 case SPDK_BDEV_IO_TYPE_FLUSH: 1026 case SPDK_BDEV_IO_TYPE_RESET: 1027 case SPDK_BDEV_IO_TYPE_UNMAP: 1028 return _raid_bdev_io_type_supported(ctx, io_type); 1029 1030 default: 1031 return false; 1032 } 1033 1034 return false; 1035 } 1036 1037 /* 1038 * brief: 1039 * raid_bdev_get_io_channel is the get_io_channel function table pointer for 1040 * raid bdev. 
This is used to return the io channel for this raid bdev 1041 * params: 1042 * ctxt - pointer to raid_bdev 1043 * returns: 1044 * pointer to io channel for raid bdev 1045 */ 1046 static struct spdk_io_channel * 1047 raid_bdev_get_io_channel(void *ctxt) 1048 { 1049 struct raid_bdev *raid_bdev = ctxt; 1050 1051 return spdk_get_io_channel(raid_bdev); 1052 } 1053 1054 void 1055 raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w) 1056 { 1057 struct raid_base_bdev_info *base_info; 1058 1059 assert(raid_bdev != NULL); 1060 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 1061 1062 spdk_json_write_named_uuid(w, "uuid", &raid_bdev->bdev.uuid); 1063 spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); 1064 spdk_json_write_named_string(w, "state", raid_bdev_state_to_str(raid_bdev->state)); 1065 spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); 1066 spdk_json_write_named_bool(w, "superblock", raid_bdev->superblock_enabled); 1067 spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs); 1068 spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered); 1069 spdk_json_write_named_uint32(w, "num_base_bdevs_operational", 1070 raid_bdev->num_base_bdevs_operational); 1071 if (raid_bdev->process) { 1072 struct raid_bdev_process *process = raid_bdev->process; 1073 uint64_t offset = process->window_offset; 1074 1075 spdk_json_write_named_object_begin(w, "process"); 1076 spdk_json_write_name(w, "type"); 1077 spdk_json_write_string(w, raid_bdev_process_to_str(process->type)); 1078 spdk_json_write_named_string(w, "target", process->target->name); 1079 spdk_json_write_named_object_begin(w, "progress"); 1080 spdk_json_write_named_uint64(w, "blocks", offset); 1081 spdk_json_write_named_uint32(w, "percent", offset * 100.0 / raid_bdev->bdev.blockcnt); 1082 spdk_json_write_object_end(w); 1083 spdk_json_write_object_end(w); 1084 } 
1085 spdk_json_write_name(w, "base_bdevs_list"); 1086 spdk_json_write_array_begin(w); 1087 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1088 spdk_json_write_object_begin(w); 1089 spdk_json_write_name(w, "name"); 1090 if (base_info->name) { 1091 spdk_json_write_string(w, base_info->name); 1092 } else { 1093 spdk_json_write_null(w); 1094 } 1095 spdk_json_write_named_uuid(w, "uuid", &base_info->uuid); 1096 spdk_json_write_named_bool(w, "is_configured", base_info->is_configured); 1097 spdk_json_write_named_uint64(w, "data_offset", base_info->data_offset); 1098 spdk_json_write_named_uint64(w, "data_size", base_info->data_size); 1099 spdk_json_write_object_end(w); 1100 } 1101 spdk_json_write_array_end(w); 1102 } 1103 1104 /* 1105 * brief: 1106 * raid_bdev_dump_info_json is the function table pointer for raid bdev 1107 * params: 1108 * ctx - pointer to raid_bdev 1109 * w - pointer to json context 1110 * returns: 1111 * 0 - success 1112 * non zero - failure 1113 */ 1114 static int 1115 raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 1116 { 1117 struct raid_bdev *raid_bdev = ctx; 1118 1119 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_dump_config_json\n"); 1120 1121 /* Dump the raid bdev configuration related information */ 1122 spdk_json_write_named_object_begin(w, "raid"); 1123 raid_bdev_write_info_json(raid_bdev, w); 1124 spdk_json_write_object_end(w); 1125 1126 return 0; 1127 } 1128 1129 /* 1130 * brief: 1131 * raid_bdev_write_config_json is the function table pointer for raid bdev 1132 * params: 1133 * bdev - pointer to spdk_bdev 1134 * w - pointer to json context 1135 * returns: 1136 * none 1137 */ 1138 static void 1139 raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1140 { 1141 struct raid_bdev *raid_bdev = bdev->ctxt; 1142 struct raid_base_bdev_info *base_info; 1143 1144 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 1145 1146 if (raid_bdev->superblock_enabled) { 1147 /* raid bdev configuration is stored in 
the superblock */ 1148 return; 1149 } 1150 1151 spdk_json_write_object_begin(w); 1152 1153 spdk_json_write_named_string(w, "method", "bdev_raid_create"); 1154 1155 spdk_json_write_named_object_begin(w, "params"); 1156 spdk_json_write_named_string(w, "name", bdev->name); 1157 spdk_json_write_named_uuid(w, "uuid", &raid_bdev->bdev.uuid); 1158 spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); 1159 spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); 1160 spdk_json_write_named_bool(w, "superblock", raid_bdev->superblock_enabled); 1161 1162 spdk_json_write_named_array_begin(w, "base_bdevs"); 1163 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1164 if (base_info->desc) { 1165 spdk_json_write_string(w, spdk_bdev_desc_get_bdev(base_info->desc)->name); 1166 } 1167 } 1168 spdk_json_write_array_end(w); 1169 spdk_json_write_object_end(w); 1170 1171 spdk_json_write_object_end(w); 1172 } 1173 1174 static int 1175 raid_bdev_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 1176 { 1177 struct raid_bdev *raid_bdev = ctx; 1178 struct raid_base_bdev_info *base_info; 1179 int domains_count = 0, rc = 0; 1180 1181 if (raid_bdev->module->memory_domains_supported == false) { 1182 return 0; 1183 } 1184 1185 spdk_spin_lock(&raid_bdev->base_bdev_lock); 1186 1187 /* First loop to get the number of memory domains */ 1188 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1189 if (base_info->desc == NULL) { 1190 continue; 1191 } 1192 rc = spdk_bdev_get_memory_domains(spdk_bdev_desc_get_bdev(base_info->desc), NULL, 0); 1193 if (rc < 0) { 1194 goto out; 1195 } 1196 domains_count += rc; 1197 } 1198 1199 if (!domains || array_size < domains_count) { 1200 goto out; 1201 } 1202 1203 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1204 if (base_info->desc == NULL) { 1205 continue; 1206 } 1207 rc = spdk_bdev_get_memory_domains(spdk_bdev_desc_get_bdev(base_info->desc), domains, array_size); 1208 if (rc < 0) { 
1209 goto out; 1210 } 1211 domains += rc; 1212 array_size -= rc; 1213 } 1214 out: 1215 spdk_spin_unlock(&raid_bdev->base_bdev_lock); 1216 1217 if (rc < 0) { 1218 return rc; 1219 } 1220 1221 return domains_count; 1222 } 1223 1224 /* g_raid_bdev_fn_table is the function table for raid bdev */ 1225 static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = { 1226 .destruct = raid_bdev_destruct, 1227 .submit_request = raid_bdev_submit_request, 1228 .io_type_supported = raid_bdev_io_type_supported, 1229 .get_io_channel = raid_bdev_get_io_channel, 1230 .dump_info_json = raid_bdev_dump_info_json, 1231 .write_config_json = raid_bdev_write_config_json, 1232 .get_memory_domains = raid_bdev_get_memory_domains, 1233 }; 1234 1235 struct raid_bdev * 1236 raid_bdev_find_by_name(const char *name) 1237 { 1238 struct raid_bdev *raid_bdev; 1239 1240 TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { 1241 if (strcmp(raid_bdev->bdev.name, name) == 0) { 1242 return raid_bdev; 1243 } 1244 } 1245 1246 return NULL; 1247 } 1248 1249 static struct { 1250 const char *name; 1251 enum raid_level value; 1252 } g_raid_level_names[] = { 1253 { "raid0", RAID0 }, 1254 { "0", RAID0 }, 1255 { "raid1", RAID1 }, 1256 { "1", RAID1 }, 1257 { "raid5f", RAID5F }, 1258 { "5f", RAID5F }, 1259 { "concat", CONCAT }, 1260 { } 1261 }; 1262 1263 const char *g_raid_state_names[] = { 1264 [RAID_BDEV_STATE_ONLINE] = "online", 1265 [RAID_BDEV_STATE_CONFIGURING] = "configuring", 1266 [RAID_BDEV_STATE_OFFLINE] = "offline", 1267 [RAID_BDEV_STATE_MAX] = NULL 1268 }; 1269 1270 static const char *g_raid_process_type_names[] = { 1271 [RAID_PROCESS_NONE] = "none", 1272 [RAID_PROCESS_REBUILD] = "rebuild", 1273 [RAID_PROCESS_MAX] = NULL 1274 }; 1275 1276 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 1277 typedef enum raid_level raid_level_t; 1278 typedef enum raid_bdev_state raid_bdev_state_t; 1279 1280 raid_level_t 1281 raid_bdev_str_to_level(const char *str) 1282 { 1283 unsigned int i; 1284 1285 assert(str != NULL); 1286 1287 for (i = 0; g_raid_level_names[i].name != NULL; i++) { 1288 if (strcasecmp(g_raid_level_names[i].name, str) == 0) { 1289 return g_raid_level_names[i].value; 1290 } 1291 } 1292 1293 return INVALID_RAID_LEVEL; 1294 } 1295 1296 const char * 1297 raid_bdev_level_to_str(enum raid_level level) 1298 { 1299 unsigned int i; 1300 1301 for (i = 0; g_raid_level_names[i].name != NULL; i++) { 1302 if (g_raid_level_names[i].value == level) { 1303 return g_raid_level_names[i].name; 1304 } 1305 } 1306 1307 return ""; 1308 } 1309 1310 raid_bdev_state_t 1311 raid_bdev_str_to_state(const char *str) 1312 { 1313 unsigned int i; 1314 1315 assert(str != NULL); 1316 1317 for (i = 0; i < RAID_BDEV_STATE_MAX; i++) { 1318 if (strcasecmp(g_raid_state_names[i], str) == 0) { 1319 break; 1320 } 1321 } 1322 1323 return i; 1324 } 1325 1326 const char * 1327 raid_bdev_state_to_str(enum raid_bdev_state state) 1328 { 1329 if (state >= RAID_BDEV_STATE_MAX) { 1330 return ""; 1331 } 1332 1333 return g_raid_state_names[state]; 1334 } 1335 1336 const char * 1337 raid_bdev_process_to_str(enum raid_process_type value) 1338 { 1339 if (value >= RAID_PROCESS_MAX) { 1340 return ""; 1341 } 1342 1343 return g_raid_process_type_names[value]; 1344 } 1345 1346 /* 1347 * brief: 1348 * raid_bdev_fini_start is called when bdev layer is starting the 1349 * shutdown process 1350 * params: 1351 * none 1352 * returns: 1353 * none 1354 */ 1355 static void 1356 raid_bdev_fini_start(void) 1357 { 1358 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_fini_start\n"); 1359 g_shutdown_started = true; 1360 } 1361 1362 /* 1363 * brief: 1364 * raid_bdev_exit is called on raid bdev module exit time by bdev layer 1365 * params: 1366 * none 1367 * returns: 1368 * none 1369 */ 1370 static void 1371 raid_bdev_exit(void) 
1372 { 1373 struct raid_bdev *raid_bdev, *tmp; 1374 1375 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_exit\n"); 1376 1377 TAILQ_FOREACH_SAFE(raid_bdev, &g_raid_bdev_list, global_link, tmp) { 1378 raid_bdev_cleanup_and_free(raid_bdev); 1379 } 1380 } 1381 1382 static void 1383 raid_bdev_opts_config_json(struct spdk_json_write_ctx *w) 1384 { 1385 spdk_json_write_object_begin(w); 1386 1387 spdk_json_write_named_string(w, "method", "bdev_raid_set_options"); 1388 1389 spdk_json_write_named_object_begin(w, "params"); 1390 spdk_json_write_named_uint32(w, "process_window_size_kb", g_opts.process_window_size_kb); 1391 spdk_json_write_object_end(w); 1392 1393 spdk_json_write_object_end(w); 1394 } 1395 1396 static int 1397 raid_bdev_config_json(struct spdk_json_write_ctx *w) 1398 { 1399 raid_bdev_opts_config_json(w); 1400 1401 return 0; 1402 } 1403 1404 /* 1405 * brief: 1406 * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid 1407 * module 1408 * params: 1409 * none 1410 * returns: 1411 * size of spdk_bdev_io context for raid 1412 */ 1413 static int 1414 raid_bdev_get_ctx_size(void) 1415 { 1416 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_get_ctx_size\n"); 1417 return sizeof(struct raid_bdev_io); 1418 } 1419 1420 static struct spdk_bdev_module g_raid_if = { 1421 .name = "raid", 1422 .module_init = raid_bdev_init, 1423 .fini_start = raid_bdev_fini_start, 1424 .module_fini = raid_bdev_exit, 1425 .config_json = raid_bdev_config_json, 1426 .get_ctx_size = raid_bdev_get_ctx_size, 1427 .examine_disk = raid_bdev_examine, 1428 .async_init = false, 1429 .async_fini = false, 1430 }; 1431 SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if) 1432 1433 /* 1434 * brief: 1435 * raid_bdev_init is the initialization function for raid bdev module 1436 * params: 1437 * none 1438 * returns: 1439 * 0 - success 1440 * non zero - failure 1441 */ 1442 static int 1443 raid_bdev_init(void) 1444 { 1445 return 0; 1446 } 1447 1448 static int 1449 _raid_bdev_create(const char *name, uint32_t strip_size, 
uint8_t num_base_bdevs, 1450 enum raid_level level, bool superblock_enabled, const struct spdk_uuid *uuid, 1451 struct raid_bdev **raid_bdev_out) 1452 { 1453 struct raid_bdev *raid_bdev; 1454 struct spdk_bdev *raid_bdev_gen; 1455 struct raid_bdev_module *module; 1456 struct raid_base_bdev_info *base_info; 1457 uint8_t min_operational; 1458 1459 if (strnlen(name, RAID_BDEV_SB_NAME_SIZE) == RAID_BDEV_SB_NAME_SIZE) { 1460 SPDK_ERRLOG("Raid bdev name '%s' exceeds %d characters\n", name, RAID_BDEV_SB_NAME_SIZE - 1); 1461 return -EINVAL; 1462 } 1463 1464 if (raid_bdev_find_by_name(name) != NULL) { 1465 SPDK_ERRLOG("Duplicate raid bdev name found: %s\n", name); 1466 return -EEXIST; 1467 } 1468 1469 if (level == RAID1) { 1470 if (strip_size != 0) { 1471 SPDK_ERRLOG("Strip size is not supported by raid1\n"); 1472 return -EINVAL; 1473 } 1474 } else if (spdk_u32_is_pow2(strip_size) == false) { 1475 SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size); 1476 return -EINVAL; 1477 } 1478 1479 module = raid_bdev_module_find(level); 1480 if (module == NULL) { 1481 SPDK_ERRLOG("Unsupported raid level '%d'\n", level); 1482 return -EINVAL; 1483 } 1484 1485 assert(module->base_bdevs_min != 0); 1486 if (num_base_bdevs < module->base_bdevs_min) { 1487 SPDK_ERRLOG("At least %u base devices required for %s\n", 1488 module->base_bdevs_min, 1489 raid_bdev_level_to_str(level)); 1490 return -EINVAL; 1491 } 1492 1493 switch (module->base_bdevs_constraint.type) { 1494 case CONSTRAINT_MAX_BASE_BDEVS_REMOVED: 1495 min_operational = num_base_bdevs - module->base_bdevs_constraint.value; 1496 break; 1497 case CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL: 1498 min_operational = module->base_bdevs_constraint.value; 1499 break; 1500 case CONSTRAINT_UNSET: 1501 if (module->base_bdevs_constraint.value != 0) { 1502 SPDK_ERRLOG("Unexpected constraint value '%u' provided for raid bdev '%s'.\n", 1503 (uint8_t)module->base_bdevs_constraint.value, name); 1504 return -EINVAL; 1505 } 1506 min_operational = 
num_base_bdevs; 1507 break; 1508 default: 1509 SPDK_ERRLOG("Unrecognised constraint type '%u' in module for raid level '%s'.\n", 1510 (uint8_t)module->base_bdevs_constraint.type, 1511 raid_bdev_level_to_str(module->level)); 1512 return -EINVAL; 1513 }; 1514 1515 if (min_operational == 0 || min_operational > num_base_bdevs) { 1516 SPDK_ERRLOG("Wrong constraint value for raid level '%s'.\n", 1517 raid_bdev_level_to_str(module->level)); 1518 return -EINVAL; 1519 } 1520 1521 raid_bdev = calloc(1, sizeof(*raid_bdev)); 1522 if (!raid_bdev) { 1523 SPDK_ERRLOG("Unable to allocate memory for raid bdev\n"); 1524 return -ENOMEM; 1525 } 1526 1527 spdk_spin_init(&raid_bdev->base_bdev_lock); 1528 raid_bdev->module = module; 1529 raid_bdev->num_base_bdevs = num_base_bdevs; 1530 raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs, 1531 sizeof(struct raid_base_bdev_info)); 1532 if (!raid_bdev->base_bdev_info) { 1533 SPDK_ERRLOG("Unable able to allocate base bdev info\n"); 1534 raid_bdev_free(raid_bdev); 1535 return -ENOMEM; 1536 } 1537 1538 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1539 base_info->raid_bdev = raid_bdev; 1540 } 1541 1542 /* strip_size_kb is from the rpc param. strip_size is in blocks and used 1543 * internally and set later. 
1544 */ 1545 raid_bdev->strip_size = 0; 1546 raid_bdev->strip_size_kb = strip_size; 1547 raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; 1548 raid_bdev->level = level; 1549 raid_bdev->min_base_bdevs_operational = min_operational; 1550 raid_bdev->superblock_enabled = superblock_enabled; 1551 1552 raid_bdev_gen = &raid_bdev->bdev; 1553 1554 raid_bdev_gen->name = strdup(name); 1555 if (!raid_bdev_gen->name) { 1556 SPDK_ERRLOG("Unable to allocate name for raid\n"); 1557 raid_bdev_free(raid_bdev); 1558 return -ENOMEM; 1559 } 1560 1561 raid_bdev_gen->product_name = "Raid Volume"; 1562 raid_bdev_gen->ctxt = raid_bdev; 1563 raid_bdev_gen->fn_table = &g_raid_bdev_fn_table; 1564 raid_bdev_gen->module = &g_raid_if; 1565 raid_bdev_gen->write_cache = 0; 1566 spdk_uuid_copy(&raid_bdev_gen->uuid, uuid); 1567 1568 TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link); 1569 1570 *raid_bdev_out = raid_bdev; 1571 1572 return 0; 1573 } 1574 1575 /* 1576 * brief: 1577 * raid_bdev_create allocates raid bdev based on passed configuration 1578 * params: 1579 * name - name for raid bdev 1580 * strip_size - strip size in KB 1581 * num_base_bdevs - number of base bdevs 1582 * level - raid level 1583 * superblock_enabled - true if raid should have superblock 1584 * uuid - uuid to set for the bdev 1585 * raid_bdev_out - the created raid bdev 1586 * returns: 1587 * 0 - success 1588 * non zero - failure 1589 */ 1590 int 1591 raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs, 1592 enum raid_level level, bool superblock_enabled, const struct spdk_uuid *uuid, 1593 struct raid_bdev **raid_bdev_out) 1594 { 1595 struct raid_bdev *raid_bdev; 1596 int rc; 1597 1598 assert(uuid != NULL); 1599 1600 rc = _raid_bdev_create(name, strip_size, num_base_bdevs, level, superblock_enabled, uuid, 1601 &raid_bdev); 1602 if (rc != 0) { 1603 return rc; 1604 } 1605 1606 if (superblock_enabled && spdk_uuid_is_null(uuid)) { 1607 /* we need to have the uuid to store in the superblock 
before the bdev is registered */ 1608 spdk_uuid_generate(&raid_bdev->bdev.uuid); 1609 } 1610 1611 raid_bdev->num_base_bdevs_operational = num_base_bdevs; 1612 1613 *raid_bdev_out = raid_bdev; 1614 1615 return 0; 1616 } 1617 1618 static void 1619 _raid_bdev_unregistering_cont(void *ctx) 1620 { 1621 struct raid_bdev *raid_bdev = ctx; 1622 1623 spdk_bdev_close(raid_bdev->self_desc); 1624 raid_bdev->self_desc = NULL; 1625 } 1626 1627 static void 1628 raid_bdev_unregistering_cont(void *ctx) 1629 { 1630 spdk_thread_exec_msg(spdk_thread_get_app_thread(), _raid_bdev_unregistering_cont, ctx); 1631 } 1632 1633 static int 1634 raid_bdev_process_add_finish_action(struct raid_bdev_process *process, spdk_msg_fn cb, void *cb_ctx) 1635 { 1636 struct raid_process_finish_action *finish_action; 1637 1638 assert(spdk_get_thread() == process->thread); 1639 assert(process->state < RAID_PROCESS_STATE_STOPPED); 1640 1641 finish_action = calloc(1, sizeof(*finish_action)); 1642 if (finish_action == NULL) { 1643 return -ENOMEM; 1644 } 1645 1646 finish_action->cb = cb; 1647 finish_action->cb_ctx = cb_ctx; 1648 1649 TAILQ_INSERT_TAIL(&process->finish_actions, finish_action, link); 1650 1651 return 0; 1652 } 1653 1654 static void 1655 raid_bdev_unregistering_stop_process(void *ctx) 1656 { 1657 struct raid_bdev_process *process = ctx; 1658 struct raid_bdev *raid_bdev = process->raid_bdev; 1659 int rc; 1660 1661 process->state = RAID_PROCESS_STATE_STOPPING; 1662 if (process->status == 0) { 1663 process->status = -ECANCELED; 1664 } 1665 1666 rc = raid_bdev_process_add_finish_action(process, raid_bdev_unregistering_cont, raid_bdev); 1667 if (rc != 0) { 1668 SPDK_ERRLOG("Failed to add raid bdev '%s' process finish action: %s\n", 1669 raid_bdev->bdev.name, spdk_strerror(-rc)); 1670 } 1671 } 1672 1673 static void 1674 raid_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) 1675 { 1676 struct raid_bdev *raid_bdev = event_ctx; 1677 1678 if (type == 
SPDK_BDEV_EVENT_REMOVE) { 1679 if (raid_bdev->process != NULL) { 1680 spdk_thread_send_msg(raid_bdev->process->thread, raid_bdev_unregistering_stop_process, 1681 raid_bdev->process); 1682 } else { 1683 raid_bdev_unregistering_cont(raid_bdev); 1684 } 1685 } 1686 } 1687 1688 static void 1689 raid_bdev_configure_cont(struct raid_bdev *raid_bdev) 1690 { 1691 struct spdk_bdev *raid_bdev_gen = &raid_bdev->bdev; 1692 int rc; 1693 1694 raid_bdev->state = RAID_BDEV_STATE_ONLINE; 1695 SPDK_DEBUGLOG(bdev_raid, "io device register %p\n", raid_bdev); 1696 SPDK_DEBUGLOG(bdev_raid, "blockcnt %" PRIu64 ", blocklen %u\n", 1697 raid_bdev_gen->blockcnt, raid_bdev_gen->blocklen); 1698 spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb, 1699 sizeof(struct raid_bdev_io_channel), 1700 raid_bdev_gen->name); 1701 rc = spdk_bdev_register(raid_bdev_gen); 1702 if (rc != 0) { 1703 SPDK_ERRLOG("Failed to register raid bdev '%s': %s\n", 1704 raid_bdev_gen->name, spdk_strerror(-rc)); 1705 goto err; 1706 } 1707 1708 /* 1709 * Open the bdev internally to delay unregistering if we need to stop a background process 1710 * first. The process may still need to unquiesce a range but it will fail because the 1711 * bdev's internal.spinlock is destroyed by the time the destruct callback is reached. 1712 * During application shutdown, bdevs automatically get unregistered by the bdev layer 1713 * so this is the only way currently to do this correctly. 1714 * TODO: try to handle this correctly in bdev layer instead. 
1715 */ 1716 rc = spdk_bdev_open_ext(raid_bdev_gen->name, false, raid_bdev_event_cb, raid_bdev, 1717 &raid_bdev->self_desc); 1718 if (rc != 0) { 1719 SPDK_ERRLOG("Failed to open raid bdev '%s': %s\n", 1720 raid_bdev_gen->name, spdk_strerror(-rc)); 1721 spdk_bdev_unregister(raid_bdev_gen, NULL, NULL); 1722 goto err; 1723 } 1724 1725 SPDK_DEBUGLOG(bdev_raid, "raid bdev generic %p\n", raid_bdev_gen); 1726 SPDK_DEBUGLOG(bdev_raid, "raid bdev is created with name %s, raid_bdev %p\n", 1727 raid_bdev_gen->name, raid_bdev); 1728 return; 1729 err: 1730 if (raid_bdev->module->stop != NULL) { 1731 raid_bdev->module->stop(raid_bdev); 1732 } 1733 spdk_io_device_unregister(raid_bdev, NULL); 1734 raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; 1735 } 1736 1737 static void 1738 raid_bdev_configure_write_sb_cb(int status, struct raid_bdev *raid_bdev, void *ctx) 1739 { 1740 if (status == 0) { 1741 raid_bdev_configure_cont(raid_bdev); 1742 } else { 1743 SPDK_ERRLOG("Failed to write raid bdev '%s' superblock: %s\n", 1744 raid_bdev->bdev.name, spdk_strerror(-status)); 1745 if (raid_bdev->module->stop != NULL) { 1746 raid_bdev->module->stop(raid_bdev); 1747 } 1748 } 1749 } 1750 1751 /* 1752 * brief: 1753 * If raid bdev config is complete, then only register the raid bdev to 1754 * bdev layer and remove this raid bdev from configuring list and 1755 * insert the raid bdev to configured list 1756 * params: 1757 * raid_bdev - pointer to raid bdev 1758 * returns: 1759 * 0 - success 1760 * non zero - failure 1761 */ 1762 static int 1763 raid_bdev_configure(struct raid_bdev *raid_bdev) 1764 { 1765 uint32_t data_block_size = spdk_bdev_get_data_block_size(&raid_bdev->bdev); 1766 int rc; 1767 1768 assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING); 1769 assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational); 1770 assert(raid_bdev->bdev.blocklen > 0); 1771 1772 /* The strip_size_kb is read in from user in KB. Convert to blocks here for 1773 * internal use. 
1774 */ 1775 raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / data_block_size; 1776 if (raid_bdev->strip_size == 0 && raid_bdev->level != RAID1) { 1777 SPDK_ERRLOG("Strip size cannot be smaller than the device block size\n"); 1778 return -EINVAL; 1779 } 1780 raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size); 1781 1782 rc = raid_bdev->module->start(raid_bdev); 1783 if (rc != 0) { 1784 SPDK_ERRLOG("raid module startup callback failed\n"); 1785 return rc; 1786 } 1787 1788 if (raid_bdev->superblock_enabled) { 1789 if (raid_bdev->sb == NULL) { 1790 rc = raid_bdev_alloc_superblock(raid_bdev, data_block_size); 1791 if (rc == 0) { 1792 raid_bdev_init_superblock(raid_bdev); 1793 } 1794 } else { 1795 assert(spdk_uuid_compare(&raid_bdev->sb->uuid, &raid_bdev->bdev.uuid) == 0); 1796 if (raid_bdev->sb->block_size != data_block_size) { 1797 SPDK_ERRLOG("blocklen does not match value in superblock\n"); 1798 rc = -EINVAL; 1799 } 1800 if (raid_bdev->sb->raid_size != raid_bdev->bdev.blockcnt) { 1801 SPDK_ERRLOG("blockcnt does not match value in superblock\n"); 1802 rc = -EINVAL; 1803 } 1804 } 1805 1806 if (rc != 0) { 1807 if (raid_bdev->module->stop != NULL) { 1808 raid_bdev->module->stop(raid_bdev); 1809 } 1810 return rc; 1811 } 1812 1813 raid_bdev_write_superblock(raid_bdev, raid_bdev_configure_write_sb_cb, NULL); 1814 } else { 1815 raid_bdev_configure_cont(raid_bdev); 1816 } 1817 1818 return 0; 1819 } 1820 1821 /* 1822 * brief: 1823 * If raid bdev is online and registered, change the bdev state to 1824 * configuring and unregister this raid device. 
Queue this raid device 1825 * in configuring list 1826 * params: 1827 * raid_bdev - pointer to raid bdev 1828 * cb_fn - callback function 1829 * cb_arg - argument to callback function 1830 * returns: 1831 * none 1832 */ 1833 static void 1834 raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, 1835 void *cb_arg) 1836 { 1837 if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { 1838 if (cb_fn) { 1839 cb_fn(cb_arg, 0); 1840 } 1841 return; 1842 } 1843 1844 raid_bdev->state = RAID_BDEV_STATE_OFFLINE; 1845 SPDK_DEBUGLOG(bdev_raid, "raid bdev state changing from online to offline\n"); 1846 1847 spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg); 1848 } 1849 1850 /* 1851 * brief: 1852 * raid_bdev_find_base_info_by_bdev function finds the base bdev info by bdev. 1853 * params: 1854 * base_bdev - pointer to base bdev 1855 * returns: 1856 * base bdev info if found, otherwise NULL. 1857 */ 1858 static struct raid_base_bdev_info * 1859 raid_bdev_find_base_info_by_bdev(struct spdk_bdev *base_bdev) 1860 { 1861 struct raid_bdev *raid_bdev; 1862 struct raid_base_bdev_info *base_info; 1863 1864 TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { 1865 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1866 if (base_info->desc != NULL && 1867 spdk_bdev_desc_get_bdev(base_info->desc) == base_bdev) { 1868 return base_info; 1869 } 1870 } 1871 } 1872 1873 return NULL; 1874 } 1875 1876 static void 1877 raid_bdev_remove_base_bdev_done(struct raid_base_bdev_info *base_info, int status) 1878 { 1879 struct raid_bdev *raid_bdev = base_info->raid_bdev; 1880 1881 assert(base_info->remove_scheduled); 1882 base_info->remove_scheduled = false; 1883 1884 if (status == 0) { 1885 raid_bdev->num_base_bdevs_operational--; 1886 if (raid_bdev->num_base_bdevs_operational < raid_bdev->min_base_bdevs_operational) { 1887 /* There is not enough base bdevs to keep the raid bdev operational. 
*/ 1888 raid_bdev_deconfigure(raid_bdev, base_info->remove_cb, base_info->remove_cb_ctx); 1889 return; 1890 } 1891 } 1892 1893 if (base_info->remove_cb != NULL) { 1894 base_info->remove_cb(base_info->remove_cb_ctx, status); 1895 } 1896 } 1897 1898 static void 1899 raid_bdev_remove_base_bdev_write_sb_cb(int status, struct raid_bdev *raid_bdev, void *ctx) 1900 { 1901 struct raid_base_bdev_info *base_info = ctx; 1902 1903 if (status != 0) { 1904 SPDK_ERRLOG("Failed to write raid bdev '%s' superblock: %s\n", 1905 raid_bdev->bdev.name, spdk_strerror(-status)); 1906 } 1907 1908 raid_bdev_remove_base_bdev_done(base_info, status); 1909 } 1910 1911 static void 1912 raid_bdev_remove_base_bdev_on_unquiesced(void *ctx, int status) 1913 { 1914 struct raid_base_bdev_info *base_info = ctx; 1915 struct raid_bdev *raid_bdev = base_info->raid_bdev; 1916 1917 if (status != 0) { 1918 SPDK_ERRLOG("Failed to unquiesce raid bdev %s: %s\n", 1919 raid_bdev->bdev.name, spdk_strerror(-status)); 1920 goto out; 1921 } 1922 1923 spdk_spin_lock(&raid_bdev->base_bdev_lock); 1924 raid_bdev_free_base_bdev_resource(base_info); 1925 spdk_spin_unlock(&raid_bdev->base_bdev_lock); 1926 1927 if (raid_bdev->sb) { 1928 struct raid_bdev_superblock *sb = raid_bdev->sb; 1929 uint8_t slot = raid_bdev_base_bdev_slot(base_info); 1930 uint8_t i; 1931 1932 for (i = 0; i < sb->base_bdevs_size; i++) { 1933 struct raid_bdev_sb_base_bdev *sb_base_bdev = &sb->base_bdevs[i]; 1934 1935 if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED && 1936 sb_base_bdev->slot == slot) { 1937 /* TODO: distinguish between failure and intentional removal */ 1938 sb_base_bdev->state = RAID_SB_BASE_BDEV_FAILED; 1939 1940 raid_bdev_write_superblock(raid_bdev, raid_bdev_remove_base_bdev_write_sb_cb, base_info); 1941 return; 1942 } 1943 } 1944 } 1945 out: 1946 raid_bdev_remove_base_bdev_done(base_info, status); 1947 } 1948 1949 static void 1950 raid_bdev_channel_remove_base_bdev(struct spdk_io_channel_iter *i) 1951 { 1952 struct 
raid_base_bdev_info *base_info = spdk_io_channel_iter_get_ctx(i); 1953 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 1954 struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(ch); 1955 uint8_t idx = raid_bdev_base_bdev_slot(base_info); 1956 1957 SPDK_DEBUGLOG(bdev_raid, "slot: %u raid_ch: %p\n", idx, raid_ch); 1958 1959 if (raid_ch->base_channel[idx] != NULL) { 1960 spdk_put_io_channel(raid_ch->base_channel[idx]); 1961 raid_ch->base_channel[idx] = NULL; 1962 } 1963 1964 if (raid_ch->process.ch_processed != NULL) { 1965 raid_ch->process.ch_processed->base_channel[idx] = NULL; 1966 } 1967 1968 spdk_for_each_channel_continue(i, 0); 1969 } 1970 1971 static void 1972 raid_bdev_channels_remove_base_bdev_done(struct spdk_io_channel_iter *i, int status) 1973 { 1974 struct raid_base_bdev_info *base_info = spdk_io_channel_iter_get_ctx(i); 1975 struct raid_bdev *raid_bdev = base_info->raid_bdev; 1976 1977 spdk_bdev_unquiesce(&raid_bdev->bdev, &g_raid_if, raid_bdev_remove_base_bdev_on_unquiesced, 1978 base_info); 1979 } 1980 1981 static void 1982 raid_bdev_remove_base_bdev_on_quiesced(void *ctx, int status) 1983 { 1984 struct raid_base_bdev_info *base_info = ctx; 1985 struct raid_bdev *raid_bdev = base_info->raid_bdev; 1986 1987 if (status != 0) { 1988 SPDK_ERRLOG("Failed to quiesce raid bdev %s: %s\n", 1989 raid_bdev->bdev.name, spdk_strerror(-status)); 1990 raid_bdev_remove_base_bdev_done(base_info, status); 1991 return; 1992 } 1993 1994 spdk_for_each_channel(raid_bdev, raid_bdev_channel_remove_base_bdev, base_info, 1995 raid_bdev_channels_remove_base_bdev_done); 1996 } 1997 1998 static int 1999 raid_bdev_remove_base_bdev_quiesce(struct raid_base_bdev_info *base_info) 2000 { 2001 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 2002 2003 return spdk_bdev_quiesce(&base_info->raid_bdev->bdev, &g_raid_if, 2004 raid_bdev_remove_base_bdev_on_quiesced, base_info); 2005 } 2006 2007 struct raid_bdev_process_base_bdev_remove_ctx { 2008 struct 
raid_bdev_process *process; 2009 struct raid_base_bdev_info *base_info; 2010 uint8_t num_base_bdevs_operational; 2011 }; 2012 2013 static void 2014 _raid_bdev_process_base_bdev_remove_cont(void *ctx) 2015 { 2016 struct raid_base_bdev_info *base_info = ctx; 2017 int ret; 2018 2019 ret = raid_bdev_remove_base_bdev_quiesce(base_info); 2020 if (ret != 0) { 2021 raid_bdev_remove_base_bdev_done(base_info, ret); 2022 } 2023 } 2024 2025 static void 2026 raid_bdev_process_base_bdev_remove_cont(void *_ctx) 2027 { 2028 struct raid_bdev_process_base_bdev_remove_ctx *ctx = _ctx; 2029 struct raid_base_bdev_info *base_info = ctx->base_info; 2030 2031 free(ctx); 2032 2033 spdk_thread_send_msg(spdk_thread_get_app_thread(), _raid_bdev_process_base_bdev_remove_cont, 2034 base_info); 2035 } 2036 2037 static void 2038 _raid_bdev_process_base_bdev_remove(void *_ctx) 2039 { 2040 struct raid_bdev_process_base_bdev_remove_ctx *ctx = _ctx; 2041 struct raid_bdev_process *process = ctx->process; 2042 int ret; 2043 2044 if (ctx->base_info != process->target && 2045 ctx->num_base_bdevs_operational > process->raid_bdev->min_base_bdevs_operational) { 2046 /* process doesn't need to be stopped */ 2047 raid_bdev_process_base_bdev_remove_cont(ctx); 2048 return; 2049 } 2050 2051 assert(process->state > RAID_PROCESS_STATE_INIT && 2052 process->state < RAID_PROCESS_STATE_STOPPED); 2053 2054 ret = raid_bdev_process_add_finish_action(process, raid_bdev_process_base_bdev_remove_cont, ctx); 2055 if (ret != 0) { 2056 raid_bdev_remove_base_bdev_done(ctx->base_info, ret); 2057 free(ctx); 2058 return; 2059 } 2060 2061 process->state = RAID_PROCESS_STATE_STOPPING; 2062 2063 if (process->status == 0) { 2064 process->status = -ENODEV; 2065 } 2066 } 2067 2068 static int 2069 raid_bdev_process_base_bdev_remove(struct raid_bdev_process *process, 2070 struct raid_base_bdev_info *base_info) 2071 { 2072 struct raid_bdev_process_base_bdev_remove_ctx *ctx; 2073 2074 assert(spdk_get_thread() == 
spdk_thread_get_app_thread()); 2075 2076 ctx = calloc(1, sizeof(*ctx)); 2077 if (ctx == NULL) { 2078 return -ENOMEM; 2079 } 2080 2081 /* 2082 * We have to send the process and num_base_bdevs_operational in the message ctx 2083 * because the process thread should not access raid_bdev's properties. Particularly, 2084 * raid_bdev->process may be cleared by the time the message is handled, but ctx->process 2085 * will still be valid until the process is fully stopped. 2086 */ 2087 ctx->base_info = base_info; 2088 ctx->process = process; 2089 /* 2090 * raid_bdev->num_base_bdevs_operational can't be used here because it is decremented 2091 * after the removal and more than one base bdev may be removed at the same time 2092 */ 2093 RAID_FOR_EACH_BASE_BDEV(process->raid_bdev, base_info) { 2094 if (!base_info->remove_scheduled && base_info->desc != NULL) { 2095 ctx->num_base_bdevs_operational++; 2096 } 2097 } 2098 2099 spdk_thread_send_msg(process->thread, _raid_bdev_process_base_bdev_remove, ctx); 2100 2101 return 0; 2102 } 2103 2104 static int 2105 _raid_bdev_remove_base_bdev(struct raid_base_bdev_info *base_info, 2106 raid_base_bdev_cb cb_fn, void *cb_ctx) 2107 { 2108 struct raid_bdev *raid_bdev = base_info->raid_bdev; 2109 int ret = 0; 2110 2111 SPDK_DEBUGLOG(bdev_raid, "%s\n", base_info->name); 2112 2113 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 2114 2115 if (base_info->remove_scheduled) { 2116 return -ENODEV; 2117 } 2118 2119 assert(base_info->desc); 2120 base_info->remove_scheduled = true; 2121 2122 if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { 2123 /* 2124 * As raid bdev is not registered yet or already unregistered, 2125 * so cleanup should be done here itself. 2126 * 2127 * Removing a base bdev at this stage does not change the number of operational 2128 * base bdevs, only the number of discovered base bdevs. 
2129 */ 2130 raid_bdev_free_base_bdev_resource(base_info); 2131 base_info->remove_scheduled = false; 2132 if (raid_bdev->num_base_bdevs_discovered == 0) { 2133 /* There is no base bdev for this raid, so free the raid device. */ 2134 raid_bdev_cleanup_and_free(raid_bdev); 2135 } 2136 if (cb_fn != NULL) { 2137 cb_fn(cb_ctx, 0); 2138 } 2139 } else if (raid_bdev->min_base_bdevs_operational == raid_bdev->num_base_bdevs) { 2140 /* This raid bdev does not tolerate removing a base bdev. */ 2141 raid_bdev->num_base_bdevs_operational--; 2142 raid_bdev_deconfigure(raid_bdev, cb_fn, cb_ctx); 2143 } else { 2144 base_info->remove_cb = cb_fn; 2145 base_info->remove_cb_ctx = cb_ctx; 2146 2147 if (raid_bdev->process != NULL) { 2148 ret = raid_bdev_process_base_bdev_remove(raid_bdev->process, base_info); 2149 } else { 2150 ret = raid_bdev_remove_base_bdev_quiesce(base_info); 2151 } 2152 2153 if (ret != 0) { 2154 base_info->remove_scheduled = false; 2155 } 2156 } 2157 2158 return ret; 2159 } 2160 2161 /* 2162 * brief: 2163 * raid_bdev_remove_base_bdev function is called by below layers when base_bdev 2164 * is removed. This function checks if this base bdev is part of any raid bdev 2165 * or not. If yes, it takes necessary action on that particular raid bdev. 
2166 * params: 2167 * base_bdev - pointer to base bdev which got removed 2168 * cb_fn - callback function 2169 * cb_arg - argument to callback function 2170 * returns: 2171 * 0 - success 2172 * non zero - failure 2173 */ 2174 int 2175 raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_base_bdev_cb cb_fn, void *cb_ctx) 2176 { 2177 struct raid_base_bdev_info *base_info; 2178 2179 /* Find the raid_bdev which has claimed this base_bdev */ 2180 base_info = raid_bdev_find_base_info_by_bdev(base_bdev); 2181 if (!base_info) { 2182 SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name); 2183 return -ENODEV; 2184 } 2185 2186 return _raid_bdev_remove_base_bdev(base_info, cb_fn, cb_ctx); 2187 } 2188 2189 static void 2190 raid_bdev_resize_write_sb_cb(int status, struct raid_bdev *raid_bdev, void *ctx) 2191 { 2192 if (status != 0) { 2193 SPDK_ERRLOG("Failed to write raid bdev '%s' superblock after resizing the bdev: %s\n", 2194 raid_bdev->bdev.name, spdk_strerror(-status)); 2195 } 2196 } 2197 2198 /* 2199 * brief: 2200 * raid_bdev_resize_base_bdev function is called by below layers when base_bdev 2201 * is resized. This function checks if the smallest size of the base_bdevs is changed. 2202 * If yes, call module handler to resize the raid_bdev if implemented. 2203 * params: 2204 * base_bdev - pointer to base bdev which got resized. 
2205 * returns: 2206 * none 2207 */ 2208 static void 2209 raid_bdev_resize_base_bdev(struct spdk_bdev *base_bdev) 2210 { 2211 struct raid_bdev *raid_bdev; 2212 struct raid_base_bdev_info *base_info; 2213 uint64_t blockcnt_old; 2214 2215 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_resize_base_bdev\n"); 2216 2217 base_info = raid_bdev_find_base_info_by_bdev(base_bdev); 2218 2219 /* Find the raid_bdev which has claimed this base_bdev */ 2220 if (!base_info) { 2221 SPDK_ERRLOG("raid_bdev whose base_bdev '%s' not found\n", base_bdev->name); 2222 return; 2223 } 2224 raid_bdev = base_info->raid_bdev; 2225 2226 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 2227 2228 SPDK_NOTICELOG("base_bdev '%s' was resized: old size %" PRIu64 ", new size %" PRIu64 "\n", 2229 base_bdev->name, base_info->blockcnt, base_bdev->blockcnt); 2230 2231 base_info->blockcnt = base_bdev->blockcnt; 2232 2233 if (!raid_bdev->module->resize) { 2234 return; 2235 } 2236 2237 blockcnt_old = raid_bdev->bdev.blockcnt; 2238 if (raid_bdev->module->resize(raid_bdev) == false) { 2239 return; 2240 } 2241 2242 SPDK_NOTICELOG("raid bdev '%s': block count was changed from %" PRIu64 " to %" PRIu64 "\n", 2243 raid_bdev->bdev.name, blockcnt_old, raid_bdev->bdev.blockcnt); 2244 2245 if (raid_bdev->superblock_enabled) { 2246 struct raid_bdev_superblock *sb = raid_bdev->sb; 2247 uint8_t i; 2248 2249 for (i = 0; i < sb->base_bdevs_size; i++) { 2250 struct raid_bdev_sb_base_bdev *sb_base_bdev = &sb->base_bdevs[i]; 2251 2252 if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED) { 2253 base_info = &raid_bdev->base_bdev_info[sb_base_bdev->slot]; 2254 sb_base_bdev->data_size = base_info->data_size; 2255 } 2256 } 2257 sb->raid_size = raid_bdev->bdev.blockcnt; 2258 raid_bdev_write_superblock(raid_bdev, raid_bdev_resize_write_sb_cb, NULL); 2259 } 2260 } 2261 2262 /* 2263 * brief: 2264 * raid_bdev_event_base_bdev function is called by below layers when base_bdev 2265 * triggers asynchronous event. 
2266 * params: 2267 * type - event details. 2268 * bdev - bdev that triggered event. 2269 * event_ctx - context for event. 2270 * returns: 2271 * none 2272 */ 2273 static void 2274 raid_bdev_event_base_bdev(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, 2275 void *event_ctx) 2276 { 2277 int rc; 2278 2279 switch (type) { 2280 case SPDK_BDEV_EVENT_REMOVE: 2281 rc = raid_bdev_remove_base_bdev(bdev, NULL, NULL); 2282 if (rc != 0) { 2283 SPDK_ERRLOG("Failed to remove base bdev %s: %s\n", 2284 spdk_bdev_get_name(bdev), spdk_strerror(-rc)); 2285 } 2286 break; 2287 case SPDK_BDEV_EVENT_RESIZE: 2288 raid_bdev_resize_base_bdev(bdev); 2289 break; 2290 default: 2291 SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); 2292 break; 2293 } 2294 } 2295 2296 /* 2297 * brief: 2298 * Deletes the specified raid bdev 2299 * params: 2300 * raid_bdev - pointer to raid bdev 2301 * cb_fn - callback function 2302 * cb_arg - argument to callback function 2303 */ 2304 void 2305 raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_arg) 2306 { 2307 struct raid_base_bdev_info *base_info; 2308 2309 SPDK_DEBUGLOG(bdev_raid, "delete raid bdev: %s\n", raid_bdev->bdev.name); 2310 2311 if (raid_bdev->destroy_started) { 2312 SPDK_DEBUGLOG(bdev_raid, "destroying raid bdev %s is already started\n", 2313 raid_bdev->bdev.name); 2314 if (cb_fn) { 2315 cb_fn(cb_arg, -EALREADY); 2316 } 2317 return; 2318 } 2319 2320 raid_bdev->destroy_started = true; 2321 2322 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 2323 base_info->remove_scheduled = true; 2324 2325 if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { 2326 /* 2327 * As raid bdev is not registered yet or already unregistered, 2328 * so cleanup should be done here itself. 2329 */ 2330 raid_bdev_free_base_bdev_resource(base_info); 2331 } 2332 } 2333 2334 if (raid_bdev->num_base_bdevs_discovered == 0) { 2335 /* There is no base bdev for this raid, so free the raid device. 
*/ 2336 raid_bdev_cleanup_and_free(raid_bdev); 2337 if (cb_fn) { 2338 cb_fn(cb_arg, 0); 2339 } 2340 } else { 2341 raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg); 2342 } 2343 } 2344 2345 static void 2346 raid_bdev_process_finish_write_sb_cb(int status, struct raid_bdev *raid_bdev, void *ctx) 2347 { 2348 if (status != 0) { 2349 SPDK_ERRLOG("Failed to write raid bdev '%s' superblock after background process finished: %s\n", 2350 raid_bdev->bdev.name, spdk_strerror(-status)); 2351 } 2352 } 2353 2354 static void 2355 raid_bdev_process_finish_write_sb(void *ctx) 2356 { 2357 struct raid_bdev *raid_bdev = ctx; 2358 struct raid_bdev_superblock *sb = raid_bdev->sb; 2359 struct raid_bdev_sb_base_bdev *sb_base_bdev; 2360 struct raid_base_bdev_info *base_info; 2361 uint8_t i; 2362 2363 for (i = 0; i < sb->base_bdevs_size; i++) { 2364 sb_base_bdev = &sb->base_bdevs[i]; 2365 2366 if (sb_base_bdev->state != RAID_SB_BASE_BDEV_CONFIGURED && 2367 sb_base_bdev->slot < raid_bdev->num_base_bdevs) { 2368 base_info = &raid_bdev->base_bdev_info[sb_base_bdev->slot]; 2369 if (base_info->is_configured) { 2370 sb_base_bdev->state = RAID_SB_BASE_BDEV_CONFIGURED; 2371 spdk_uuid_copy(&sb_base_bdev->uuid, &base_info->uuid); 2372 } 2373 } 2374 } 2375 2376 raid_bdev_write_superblock(raid_bdev, raid_bdev_process_finish_write_sb_cb, NULL); 2377 } 2378 2379 static void raid_bdev_process_free(struct raid_bdev_process *process); 2380 2381 static void 2382 _raid_bdev_process_finish_done(void *ctx) 2383 { 2384 struct raid_bdev_process *process = ctx; 2385 struct raid_process_finish_action *finish_action; 2386 2387 while ((finish_action = TAILQ_FIRST(&process->finish_actions)) != NULL) { 2388 TAILQ_REMOVE(&process->finish_actions, finish_action, link); 2389 finish_action->cb(finish_action->cb_ctx); 2390 free(finish_action); 2391 } 2392 2393 raid_bdev_process_free(process); 2394 2395 spdk_thread_exit(spdk_get_thread()); 2396 } 2397 2398 static void 2399 raid_bdev_process_finish_target_removed(void *ctx, int 
status) 2400 { 2401 struct raid_bdev_process *process = ctx; 2402 2403 if (status != 0) { 2404 SPDK_ERRLOG("Failed to remove target bdev: %s\n", spdk_strerror(-status)); 2405 } 2406 2407 spdk_thread_send_msg(process->thread, _raid_bdev_process_finish_done, process); 2408 } 2409 2410 static void 2411 raid_bdev_process_finish_unquiesced(void *ctx, int status) 2412 { 2413 struct raid_bdev_process *process = ctx; 2414 2415 if (status != 0) { 2416 SPDK_ERRLOG("Failed to unquiesce bdev: %s\n", spdk_strerror(-status)); 2417 } 2418 2419 if (process->status != 0) { 2420 struct raid_base_bdev_info *target = process->target; 2421 2422 if (target->desc != NULL && target->remove_scheduled == false) { 2423 _raid_bdev_remove_base_bdev(target, raid_bdev_process_finish_target_removed, process); 2424 return; 2425 } 2426 } 2427 2428 spdk_thread_send_msg(process->thread, _raid_bdev_process_finish_done, process); 2429 } 2430 2431 static void 2432 raid_bdev_process_finish_unquiesce(void *ctx) 2433 { 2434 struct raid_bdev_process *process = ctx; 2435 int rc; 2436 2437 rc = spdk_bdev_unquiesce(&process->raid_bdev->bdev, &g_raid_if, 2438 raid_bdev_process_finish_unquiesced, process); 2439 if (rc != 0) { 2440 raid_bdev_process_finish_unquiesced(process, rc); 2441 } 2442 } 2443 2444 static void 2445 raid_bdev_process_finish_done(void *ctx) 2446 { 2447 struct raid_bdev_process *process = ctx; 2448 struct raid_bdev *raid_bdev = process->raid_bdev; 2449 2450 if (process->raid_ch != NULL) { 2451 spdk_put_io_channel(spdk_io_channel_from_ctx(process->raid_ch)); 2452 } 2453 2454 process->state = RAID_PROCESS_STATE_STOPPED; 2455 2456 if (process->status == 0) { 2457 SPDK_NOTICELOG("Finished %s on raid bdev %s\n", 2458 raid_bdev_process_to_str(process->type), 2459 raid_bdev->bdev.name); 2460 if (raid_bdev->superblock_enabled) { 2461 spdk_thread_send_msg(spdk_thread_get_app_thread(), 2462 raid_bdev_process_finish_write_sb, 2463 raid_bdev); 2464 } 2465 } else { 2466 SPDK_WARNLOG("Finished %s on raid 
bdev %s: %s\n", 2467 raid_bdev_process_to_str(process->type), 2468 raid_bdev->bdev.name, 2469 spdk_strerror(-process->status)); 2470 } 2471 2472 spdk_thread_send_msg(spdk_thread_get_app_thread(), raid_bdev_process_finish_unquiesce, 2473 process); 2474 } 2475 2476 static void 2477 __raid_bdev_process_finish(struct spdk_io_channel_iter *i, int status) 2478 { 2479 struct raid_bdev_process *process = spdk_io_channel_iter_get_ctx(i); 2480 2481 spdk_thread_send_msg(process->thread, raid_bdev_process_finish_done, process); 2482 } 2483 2484 static void 2485 raid_bdev_channel_process_finish(struct spdk_io_channel_iter *i) 2486 { 2487 struct raid_bdev_process *process = spdk_io_channel_iter_get_ctx(i); 2488 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2489 struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(ch); 2490 2491 if (process->status == 0) { 2492 uint8_t slot = raid_bdev_base_bdev_slot(process->target); 2493 2494 raid_ch->base_channel[slot] = raid_ch->process.target_ch; 2495 raid_ch->process.target_ch = NULL; 2496 } 2497 2498 raid_bdev_ch_process_cleanup(raid_ch); 2499 2500 spdk_for_each_channel_continue(i, 0); 2501 } 2502 2503 static void 2504 raid_bdev_process_finish_quiesced(void *ctx, int status) 2505 { 2506 struct raid_bdev_process *process = ctx; 2507 struct raid_bdev *raid_bdev = process->raid_bdev; 2508 2509 if (status != 0) { 2510 SPDK_ERRLOG("Failed to quiesce bdev: %s\n", spdk_strerror(-status)); 2511 return; 2512 } 2513 2514 raid_bdev->process = NULL; 2515 spdk_for_each_channel(process->raid_bdev, raid_bdev_channel_process_finish, process, 2516 __raid_bdev_process_finish); 2517 } 2518 2519 static void 2520 _raid_bdev_process_finish(void *ctx) 2521 { 2522 struct raid_bdev_process *process = ctx; 2523 int rc; 2524 2525 rc = spdk_bdev_quiesce(&process->raid_bdev->bdev, &g_raid_if, 2526 raid_bdev_process_finish_quiesced, process); 2527 if (rc != 0) { 2528 raid_bdev_process_finish_quiesced(ctx, rc); 2529 } 2530 } 2531 2532 
static void 2533 raid_bdev_process_do_finish(struct raid_bdev_process *process) 2534 { 2535 spdk_thread_send_msg(spdk_thread_get_app_thread(), _raid_bdev_process_finish, process); 2536 } 2537 2538 static void raid_bdev_process_unlock_window_range(struct raid_bdev_process *process); 2539 static void raid_bdev_process_thread_run(struct raid_bdev_process *process); 2540 2541 static void 2542 raid_bdev_process_finish(struct raid_bdev_process *process, int status) 2543 { 2544 assert(spdk_get_thread() == process->thread); 2545 2546 if (process->status == 0) { 2547 process->status = status; 2548 } 2549 2550 if (process->state >= RAID_PROCESS_STATE_STOPPING) { 2551 return; 2552 } 2553 2554 assert(process->state == RAID_PROCESS_STATE_RUNNING); 2555 process->state = RAID_PROCESS_STATE_STOPPING; 2556 2557 if (process->window_range_locked) { 2558 raid_bdev_process_unlock_window_range(process); 2559 } else { 2560 raid_bdev_process_thread_run(process); 2561 } 2562 } 2563 2564 static void 2565 raid_bdev_process_window_range_unlocked(void *ctx, int status) 2566 { 2567 struct raid_bdev_process *process = ctx; 2568 2569 if (status != 0) { 2570 SPDK_ERRLOG("Failed to unlock LBA range: %s\n", spdk_strerror(-status)); 2571 raid_bdev_process_finish(process, status); 2572 return; 2573 } 2574 2575 process->window_range_locked = false; 2576 process->window_offset += process->window_size; 2577 2578 raid_bdev_process_thread_run(process); 2579 } 2580 2581 static void 2582 raid_bdev_process_unlock_window_range(struct raid_bdev_process *process) 2583 { 2584 int rc; 2585 2586 assert(process->window_range_locked == true); 2587 2588 rc = spdk_bdev_unquiesce_range(&process->raid_bdev->bdev, &g_raid_if, 2589 process->window_offset, process->max_window_size, 2590 raid_bdev_process_window_range_unlocked, process); 2591 if (rc != 0) { 2592 raid_bdev_process_window_range_unlocked(process, rc); 2593 } 2594 } 2595 2596 static void 2597 raid_bdev_process_channels_update_done(struct spdk_io_channel_iter *i, 
int status) 2598 { 2599 struct raid_bdev_process *process = spdk_io_channel_iter_get_ctx(i); 2600 2601 raid_bdev_process_unlock_window_range(process); 2602 } 2603 2604 static void 2605 raid_bdev_process_channel_update(struct spdk_io_channel_iter *i) 2606 { 2607 struct raid_bdev_process *process = spdk_io_channel_iter_get_ctx(i); 2608 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2609 struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(ch); 2610 2611 raid_ch->process.offset = process->window_offset + process->window_size; 2612 2613 spdk_for_each_channel_continue(i, 0); 2614 } 2615 2616 void 2617 raid_bdev_process_request_complete(struct raid_bdev_process_request *process_req, int status) 2618 { 2619 struct raid_bdev_process *process = process_req->process; 2620 2621 TAILQ_INSERT_TAIL(&process->requests, process_req, link); 2622 2623 assert(spdk_get_thread() == process->thread); 2624 assert(process->window_remaining >= process_req->num_blocks); 2625 2626 if (status != 0) { 2627 process->window_status = status; 2628 } 2629 2630 process->window_remaining -= process_req->num_blocks; 2631 if (process->window_remaining == 0) { 2632 if (process->window_status != 0) { 2633 raid_bdev_process_finish(process, process->window_status); 2634 return; 2635 } 2636 2637 spdk_for_each_channel(process->raid_bdev, raid_bdev_process_channel_update, process, 2638 raid_bdev_process_channels_update_done); 2639 } 2640 } 2641 2642 static int 2643 raid_bdev_submit_process_request(struct raid_bdev_process *process, uint64_t offset_blocks, 2644 uint32_t num_blocks) 2645 { 2646 struct raid_bdev *raid_bdev = process->raid_bdev; 2647 struct raid_bdev_process_request *process_req; 2648 int ret; 2649 2650 process_req = TAILQ_FIRST(&process->requests); 2651 if (process_req == NULL) { 2652 assert(process->window_remaining > 0); 2653 return 0; 2654 } 2655 2656 process_req->target = process->target; 2657 process_req->target_ch = process->raid_ch->process.target_ch; 2658 
process_req->offset_blocks = offset_blocks; 2659 process_req->num_blocks = num_blocks; 2660 process_req->iov.iov_len = num_blocks * raid_bdev->bdev.blocklen; 2661 2662 ret = raid_bdev->module->submit_process_request(process_req, process->raid_ch); 2663 if (ret <= 0) { 2664 if (ret < 0) { 2665 SPDK_ERRLOG("Failed to submit process request on %s: %s\n", 2666 raid_bdev->bdev.name, spdk_strerror(-ret)); 2667 process->window_status = ret; 2668 } 2669 return ret; 2670 } 2671 2672 process_req->num_blocks = ret; 2673 TAILQ_REMOVE(&process->requests, process_req, link); 2674 2675 return ret; 2676 } 2677 2678 static void 2679 _raid_bdev_process_thread_run(struct raid_bdev_process *process) 2680 { 2681 struct raid_bdev *raid_bdev = process->raid_bdev; 2682 uint64_t offset = process->window_offset; 2683 const uint64_t offset_end = spdk_min(offset + process->max_window_size, raid_bdev->bdev.blockcnt); 2684 int ret; 2685 2686 while (offset < offset_end) { 2687 ret = raid_bdev_submit_process_request(process, offset, offset_end - offset); 2688 if (ret <= 0) { 2689 break; 2690 } 2691 2692 process->window_remaining += ret; 2693 offset += ret; 2694 } 2695 2696 if (process->window_remaining > 0) { 2697 process->window_size = process->window_remaining; 2698 } else { 2699 raid_bdev_process_finish(process, process->window_status); 2700 } 2701 } 2702 2703 static void 2704 raid_bdev_process_window_range_locked(void *ctx, int status) 2705 { 2706 struct raid_bdev_process *process = ctx; 2707 2708 if (status != 0) { 2709 SPDK_ERRLOG("Failed to lock LBA range: %s\n", spdk_strerror(-status)); 2710 raid_bdev_process_finish(process, status); 2711 return; 2712 } 2713 2714 process->window_range_locked = true; 2715 2716 if (process->state == RAID_PROCESS_STATE_STOPPING) { 2717 raid_bdev_process_unlock_window_range(process); 2718 return; 2719 } 2720 2721 _raid_bdev_process_thread_run(process); 2722 } 2723 2724 static void 2725 raid_bdev_process_thread_run(struct raid_bdev_process *process) 2726 { 
2727 struct raid_bdev *raid_bdev = process->raid_bdev; 2728 int rc; 2729 2730 assert(spdk_get_thread() == process->thread); 2731 assert(process->window_remaining == 0); 2732 assert(process->window_range_locked == false); 2733 2734 if (process->state == RAID_PROCESS_STATE_STOPPING) { 2735 raid_bdev_process_do_finish(process); 2736 return; 2737 } 2738 2739 if (process->window_offset == raid_bdev->bdev.blockcnt) { 2740 SPDK_DEBUGLOG(bdev_raid, "process completed on %s\n", raid_bdev->bdev.name); 2741 raid_bdev_process_finish(process, 0); 2742 return; 2743 } 2744 2745 process->max_window_size = spdk_min(raid_bdev->bdev.blockcnt - process->window_offset, 2746 process->max_window_size); 2747 2748 rc = spdk_bdev_quiesce_range(&raid_bdev->bdev, &g_raid_if, 2749 process->window_offset, process->max_window_size, 2750 raid_bdev_process_window_range_locked, process); 2751 if (rc != 0) { 2752 raid_bdev_process_window_range_locked(process, rc); 2753 } 2754 } 2755 2756 static void 2757 raid_bdev_process_thread_init(void *ctx) 2758 { 2759 struct raid_bdev_process *process = ctx; 2760 struct raid_bdev *raid_bdev = process->raid_bdev; 2761 struct spdk_io_channel *ch; 2762 2763 process->thread = spdk_get_thread(); 2764 2765 ch = spdk_get_io_channel(raid_bdev); 2766 if (ch == NULL) { 2767 process->status = -ENOMEM; 2768 raid_bdev_process_do_finish(process); 2769 return; 2770 } 2771 2772 process->raid_ch = spdk_io_channel_get_ctx(ch); 2773 process->state = RAID_PROCESS_STATE_RUNNING; 2774 2775 SPDK_NOTICELOG("Started %s on raid bdev %s\n", 2776 raid_bdev_process_to_str(process->type), raid_bdev->bdev.name); 2777 2778 raid_bdev_process_thread_run(process); 2779 } 2780 2781 static void 2782 raid_bdev_channels_abort_start_process_done(struct spdk_io_channel_iter *i, int status) 2783 { 2784 struct raid_bdev_process *process = spdk_io_channel_iter_get_ctx(i); 2785 2786 _raid_bdev_remove_base_bdev(process->target, NULL, NULL); 2787 raid_bdev_process_free(process); 2788 2789 /* TODO: update sb 
*/ 2790 } 2791 2792 static void 2793 raid_bdev_channel_abort_start_process(struct spdk_io_channel_iter *i) 2794 { 2795 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2796 struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(ch); 2797 2798 raid_bdev_ch_process_cleanup(raid_ch); 2799 2800 spdk_for_each_channel_continue(i, 0); 2801 } 2802 2803 static void 2804 raid_bdev_channels_start_process_done(struct spdk_io_channel_iter *i, int status) 2805 { 2806 struct raid_bdev_process *process = spdk_io_channel_iter_get_ctx(i); 2807 struct raid_bdev *raid_bdev = process->raid_bdev; 2808 struct spdk_thread *thread; 2809 char thread_name[RAID_BDEV_SB_NAME_SIZE + 16]; 2810 2811 if (status != 0) { 2812 SPDK_ERRLOG("Failed to start %s on %s: %s\n", 2813 raid_bdev_process_to_str(process->type), raid_bdev->bdev.name, 2814 spdk_strerror(-status)); 2815 goto err; 2816 } 2817 2818 /* TODO: we may need to abort if a base bdev was removed before we got here */ 2819 2820 snprintf(thread_name, sizeof(thread_name), "%s_%s", 2821 raid_bdev->bdev.name, raid_bdev_process_to_str(process->type)); 2822 2823 thread = spdk_thread_create(thread_name, NULL); 2824 if (thread == NULL) { 2825 SPDK_ERRLOG("Failed to create %s thread for %s\n", 2826 raid_bdev_process_to_str(process->type), raid_bdev->bdev.name); 2827 goto err; 2828 } 2829 2830 raid_bdev->process = process; 2831 2832 spdk_thread_send_msg(thread, raid_bdev_process_thread_init, process); 2833 2834 return; 2835 err: 2836 spdk_for_each_channel(process->raid_bdev, raid_bdev_channel_abort_start_process, process, 2837 raid_bdev_channels_abort_start_process_done); 2838 } 2839 2840 static void 2841 raid_bdev_channel_start_process(struct spdk_io_channel_iter *i) 2842 { 2843 struct raid_bdev_process *process = spdk_io_channel_iter_get_ctx(i); 2844 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2845 struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(ch); 2846 int rc; 2847 2848 rc = 
raid_bdev_ch_process_setup(raid_ch, process); 2849 2850 spdk_for_each_channel_continue(i, rc); 2851 } 2852 2853 static void 2854 raid_bdev_process_start(struct raid_bdev_process *process) 2855 { 2856 struct raid_bdev *raid_bdev = process->raid_bdev; 2857 2858 assert(raid_bdev->module->submit_process_request != NULL); 2859 2860 spdk_for_each_channel(raid_bdev, raid_bdev_channel_start_process, process, 2861 raid_bdev_channels_start_process_done); 2862 } 2863 2864 static void 2865 raid_bdev_process_request_free(struct raid_bdev_process_request *process_req) 2866 { 2867 spdk_dma_free(process_req->iov.iov_base); 2868 spdk_dma_free(process_req->md_buf); 2869 free(process_req); 2870 } 2871 2872 static struct raid_bdev_process_request * 2873 raid_bdev_process_alloc_request(struct raid_bdev_process *process) 2874 { 2875 struct raid_bdev *raid_bdev = process->raid_bdev; 2876 struct raid_bdev_process_request *process_req; 2877 2878 process_req = calloc(1, sizeof(*process_req)); 2879 if (process_req == NULL) { 2880 return NULL; 2881 } 2882 2883 process_req->process = process; 2884 process_req->iov.iov_len = process->max_window_size * raid_bdev->bdev.blocklen; 2885 process_req->iov.iov_base = spdk_dma_malloc(process_req->iov.iov_len, 4096, 0); 2886 if (process_req->iov.iov_base == NULL) { 2887 free(process_req); 2888 return NULL; 2889 } 2890 if (spdk_bdev_is_md_separate(&raid_bdev->bdev)) { 2891 process_req->md_buf = spdk_dma_malloc(process->max_window_size * raid_bdev->bdev.md_len, 4096, 0); 2892 if (process_req->md_buf == NULL) { 2893 raid_bdev_process_request_free(process_req); 2894 return NULL; 2895 } 2896 } 2897 2898 return process_req; 2899 } 2900 2901 static void 2902 raid_bdev_process_free(struct raid_bdev_process *process) 2903 { 2904 struct raid_bdev_process_request *process_req; 2905 2906 while ((process_req = TAILQ_FIRST(&process->requests)) != NULL) { 2907 TAILQ_REMOVE(&process->requests, process_req, link); 2908 raid_bdev_process_request_free(process_req); 2909 } 
2910 2911 free(process); 2912 } 2913 2914 static struct raid_bdev_process * 2915 raid_bdev_process_alloc(struct raid_bdev *raid_bdev, enum raid_process_type type, 2916 struct raid_base_bdev_info *target) 2917 { 2918 struct raid_bdev_process *process; 2919 struct raid_bdev_process_request *process_req; 2920 int i; 2921 2922 process = calloc(1, sizeof(*process)); 2923 if (process == NULL) { 2924 return NULL; 2925 } 2926 2927 process->raid_bdev = raid_bdev; 2928 process->type = type; 2929 process->target = target; 2930 process->max_window_size = spdk_max(spdk_divide_round_up(g_opts.process_window_size_kb * 1024UL, 2931 spdk_bdev_get_data_block_size(&raid_bdev->bdev)), 2932 raid_bdev->bdev.write_unit_size); 2933 TAILQ_INIT(&process->requests); 2934 TAILQ_INIT(&process->finish_actions); 2935 2936 for (i = 0; i < RAID_BDEV_PROCESS_MAX_QD; i++) { 2937 process_req = raid_bdev_process_alloc_request(process); 2938 if (process_req == NULL) { 2939 raid_bdev_process_free(process); 2940 return NULL; 2941 } 2942 2943 TAILQ_INSERT_TAIL(&process->requests, process_req, link); 2944 } 2945 2946 return process; 2947 } 2948 2949 static int 2950 raid_bdev_start_rebuild(struct raid_base_bdev_info *target) 2951 { 2952 struct raid_bdev_process *process; 2953 2954 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 2955 2956 process = raid_bdev_process_alloc(target->raid_bdev, RAID_PROCESS_REBUILD, target); 2957 if (process == NULL) { 2958 return -ENOMEM; 2959 } 2960 2961 raid_bdev_process_start(process); 2962 2963 return 0; 2964 } 2965 2966 static void 2967 raid_bdev_configure_base_bdev_cont(struct raid_base_bdev_info *base_info) 2968 { 2969 struct raid_bdev *raid_bdev = base_info->raid_bdev; 2970 int rc; 2971 2972 /* TODO: defer if rebuild in progress on another base bdev */ 2973 assert(raid_bdev->process == NULL); 2974 2975 base_info->is_configured = true; 2976 2977 raid_bdev->num_base_bdevs_discovered++; 2978 assert(raid_bdev->num_base_bdevs_discovered <= 
raid_bdev->num_base_bdevs); 2979 assert(raid_bdev->num_base_bdevs_operational <= raid_bdev->num_base_bdevs); 2980 assert(raid_bdev->num_base_bdevs_operational >= raid_bdev->min_base_bdevs_operational); 2981 2982 /* 2983 * Configure the raid bdev when the number of discovered base bdevs reaches the number 2984 * of base bdevs we know to be operational members of the array. Usually this is equal 2985 * to the total number of base bdevs (num_base_bdevs) but can be less - when the array is 2986 * degraded. 2987 */ 2988 if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational) { 2989 rc = raid_bdev_configure(raid_bdev); 2990 if (rc != 0) { 2991 SPDK_ERRLOG("Failed to configure raid bdev: %s\n", spdk_strerror(-rc)); 2992 } 2993 } else if (raid_bdev->num_base_bdevs_discovered > raid_bdev->num_base_bdevs_operational) { 2994 assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE); 2995 raid_bdev->num_base_bdevs_operational++; 2996 rc = raid_bdev_start_rebuild(base_info); 2997 if (rc != 0) { 2998 SPDK_ERRLOG("Failed to start rebuild: %s\n", spdk_strerror(-rc)); 2999 _raid_bdev_remove_base_bdev(base_info, NULL, NULL); 3000 } 3001 } else { 3002 rc = 0; 3003 } 3004 3005 if (base_info->configure_cb != NULL) { 3006 base_info->configure_cb(base_info->configure_cb_ctx, rc); 3007 } 3008 } 3009 3010 static void 3011 raid_bdev_configure_base_bdev_check_sb_cb(const struct raid_bdev_superblock *sb, int status, 3012 void *ctx) 3013 { 3014 struct raid_base_bdev_info *base_info = ctx; 3015 3016 switch (status) { 3017 case 0: 3018 /* valid superblock found */ 3019 SPDK_ERRLOG("Existing raid superblock found on bdev %s\n", base_info->name); 3020 status = -EEXIST; 3021 raid_bdev_free_base_bdev_resource(base_info); 3022 break; 3023 case -EINVAL: 3024 /* no valid superblock */ 3025 raid_bdev_configure_base_bdev_cont(base_info); 3026 return; 3027 default: 3028 SPDK_ERRLOG("Failed to examine bdev %s: %s\n", 3029 base_info->name, spdk_strerror(-status)); 3030 break; 3031 } 3032 
3033 if (base_info->configure_cb != NULL) { 3034 base_info->configure_cb(base_info->configure_cb_ctx, status); 3035 } 3036 } 3037 3038 static int 3039 raid_bdev_configure_base_bdev(struct raid_base_bdev_info *base_info, bool existing, 3040 raid_base_bdev_cb cb_fn, void *cb_ctx) 3041 { 3042 struct raid_bdev *raid_bdev = base_info->raid_bdev; 3043 struct spdk_bdev_desc *desc; 3044 struct spdk_bdev *bdev; 3045 const struct spdk_uuid *bdev_uuid; 3046 int rc; 3047 3048 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 3049 assert(base_info->desc == NULL); 3050 3051 /* 3052 * Base bdev can be added by name or uuid. Here we assure both properties are set and valid 3053 * before claiming the bdev. 3054 */ 3055 3056 if (!spdk_uuid_is_null(&base_info->uuid)) { 3057 char uuid_str[SPDK_UUID_STRING_LEN]; 3058 const char *bdev_name; 3059 3060 spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &base_info->uuid); 3061 3062 /* UUID of a bdev is registered as its alias */ 3063 bdev = spdk_bdev_get_by_name(uuid_str); 3064 if (bdev == NULL) { 3065 return -ENODEV; 3066 } 3067 3068 bdev_name = spdk_bdev_get_name(bdev); 3069 3070 if (base_info->name == NULL) { 3071 assert(existing == true); 3072 base_info->name = strdup(bdev_name); 3073 if (base_info->name == NULL) { 3074 return -ENOMEM; 3075 } 3076 } else if (strcmp(base_info->name, bdev_name) != 0) { 3077 SPDK_ERRLOG("Name mismatch for base bdev '%s' - expected '%s'\n", 3078 bdev_name, base_info->name); 3079 return -EINVAL; 3080 } 3081 } 3082 3083 assert(base_info->name != NULL); 3084 3085 rc = spdk_bdev_open_ext(base_info->name, true, raid_bdev_event_base_bdev, NULL, &desc); 3086 if (rc != 0) { 3087 if (rc != -ENODEV) { 3088 SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", base_info->name); 3089 } 3090 return rc; 3091 } 3092 3093 bdev = spdk_bdev_desc_get_bdev(desc); 3094 bdev_uuid = spdk_bdev_get_uuid(bdev); 3095 3096 if (spdk_uuid_is_null(&base_info->uuid)) { 3097 spdk_uuid_copy(&base_info->uuid, bdev_uuid); 3098 } else if 
(spdk_uuid_compare(&base_info->uuid, bdev_uuid) != 0) { 3099 SPDK_ERRLOG("UUID mismatch for base bdev '%s'\n", base_info->name); 3100 spdk_bdev_close(desc); 3101 return -EINVAL; 3102 } 3103 3104 rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if); 3105 if (rc != 0) { 3106 SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n"); 3107 spdk_bdev_close(desc); 3108 return rc; 3109 } 3110 3111 SPDK_DEBUGLOG(bdev_raid, "bdev %s is claimed\n", bdev->name); 3112 3113 base_info->app_thread_ch = spdk_bdev_get_io_channel(desc); 3114 if (base_info->app_thread_ch == NULL) { 3115 SPDK_ERRLOG("Failed to get io channel\n"); 3116 spdk_bdev_module_release_bdev(bdev); 3117 spdk_bdev_close(desc); 3118 return -ENOMEM; 3119 } 3120 3121 base_info->desc = desc; 3122 base_info->blockcnt = bdev->blockcnt; 3123 3124 if (raid_bdev->superblock_enabled) { 3125 uint64_t data_offset; 3126 3127 if (base_info->data_offset == 0) { 3128 assert((RAID_BDEV_MIN_DATA_OFFSET_SIZE % spdk_bdev_get_data_block_size(bdev)) == 0); 3129 data_offset = RAID_BDEV_MIN_DATA_OFFSET_SIZE / spdk_bdev_get_data_block_size(bdev); 3130 } else { 3131 data_offset = base_info->data_offset; 3132 } 3133 3134 if (bdev->optimal_io_boundary != 0) { 3135 data_offset = spdk_divide_round_up(data_offset, 3136 bdev->optimal_io_boundary) * bdev->optimal_io_boundary; 3137 if (base_info->data_offset != 0 && base_info->data_offset != data_offset) { 3138 SPDK_WARNLOG("Data offset %lu on bdev '%s' is different than optimal value %lu\n", 3139 base_info->data_offset, base_info->name, data_offset); 3140 data_offset = base_info->data_offset; 3141 } 3142 } 3143 3144 base_info->data_offset = data_offset; 3145 } 3146 3147 if (base_info->data_offset >= bdev->blockcnt) { 3148 SPDK_ERRLOG("Data offset %lu exceeds base bdev capacity %lu on bdev '%s'\n", 3149 base_info->data_offset, bdev->blockcnt, base_info->name); 3150 rc = -EINVAL; 3151 goto out; 3152 } 3153 3154 if (base_info->data_size == 0) { 3155 base_info->data_size = 
bdev->blockcnt - base_info->data_offset; 3156 } else if (base_info->data_offset + base_info->data_size > bdev->blockcnt) { 3157 SPDK_ERRLOG("Data offset and size exceeds base bdev capacity %lu on bdev '%s'\n", 3158 bdev->blockcnt, base_info->name); 3159 rc = -EINVAL; 3160 goto out; 3161 } 3162 3163 if (!raid_bdev->module->dif_supported && spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3164 SPDK_ERRLOG("Base bdev '%s' has DIF or DIX enabled - unsupported RAID configuration\n", 3165 bdev->name); 3166 rc = -EINVAL; 3167 goto out; 3168 } 3169 3170 /* 3171 * Set the raid bdev properties if this is the first base bdev configured, 3172 * otherwise - verify. Assumption is that all the base bdevs for any raid bdev should 3173 * have the same blocklen and metadata format. 3174 */ 3175 if (raid_bdev->bdev.blocklen == 0) { 3176 raid_bdev->bdev.blocklen = bdev->blocklen; 3177 raid_bdev->bdev.md_len = spdk_bdev_get_md_size(bdev); 3178 raid_bdev->bdev.md_interleave = spdk_bdev_is_md_interleaved(bdev); 3179 raid_bdev->bdev.dif_type = spdk_bdev_get_dif_type(bdev); 3180 raid_bdev->bdev.dif_check_flags = bdev->dif_check_flags; 3181 raid_bdev->bdev.dif_is_head_of_md = spdk_bdev_is_dif_head_of_md(bdev); 3182 } else { 3183 if (raid_bdev->bdev.blocklen != bdev->blocklen) { 3184 SPDK_ERRLOG("Raid bdev '%s' blocklen %u differs from base bdev '%s' blocklen %u\n", 3185 raid_bdev->bdev.name, raid_bdev->bdev.blocklen, bdev->name, bdev->blocklen); 3186 rc = -EINVAL; 3187 goto out; 3188 } 3189 3190 if (raid_bdev->bdev.md_len != spdk_bdev_get_md_size(bdev) || 3191 raid_bdev->bdev.md_interleave != spdk_bdev_is_md_interleaved(bdev) || 3192 raid_bdev->bdev.dif_type != spdk_bdev_get_dif_type(bdev) || 3193 raid_bdev->bdev.dif_check_flags != bdev->dif_check_flags || 3194 raid_bdev->bdev.dif_is_head_of_md != spdk_bdev_is_dif_head_of_md(bdev)) { 3195 SPDK_ERRLOG("Raid bdev '%s' has different metadata format than base bdev '%s'\n", 3196 raid_bdev->bdev.name, bdev->name); 3197 rc = -EINVAL; 3198 goto 
out; 3199 } 3200 } 3201 3202 base_info->configure_cb = cb_fn; 3203 base_info->configure_cb_ctx = cb_ctx; 3204 3205 if (existing) { 3206 raid_bdev_configure_base_bdev_cont(base_info); 3207 } else { 3208 /* check for existing superblock when using a new bdev */ 3209 rc = raid_bdev_load_base_bdev_superblock(desc, base_info->app_thread_ch, 3210 raid_bdev_configure_base_bdev_check_sb_cb, base_info); 3211 if (rc) { 3212 SPDK_ERRLOG("Failed to read bdev %s superblock: %s\n", 3213 bdev->name, spdk_strerror(-rc)); 3214 } 3215 } 3216 out: 3217 if (rc != 0) { 3218 raid_bdev_free_base_bdev_resource(base_info); 3219 } 3220 return rc; 3221 } 3222 3223 static int 3224 _raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot, 3225 uint64_t data_offset, uint64_t data_size, 3226 raid_base_bdev_cb cb_fn, void *cb_ctx) 3227 { 3228 struct raid_base_bdev_info *base_info; 3229 3230 assert(name != NULL); 3231 3232 if (slot >= raid_bdev->num_base_bdevs) { 3233 return -EINVAL; 3234 } 3235 3236 base_info = &raid_bdev->base_bdev_info[slot]; 3237 3238 if (base_info->name != NULL) { 3239 SPDK_ERRLOG("Slot %u on raid bdev '%s' already assigned to bdev '%s'\n", 3240 slot, raid_bdev->bdev.name, base_info->name); 3241 return -EBUSY; 3242 } 3243 3244 if (!spdk_uuid_is_null(&base_info->uuid)) { 3245 char uuid_str[SPDK_UUID_STRING_LEN]; 3246 3247 spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &base_info->uuid); 3248 SPDK_ERRLOG("Slot %u on raid bdev '%s' already assigned to bdev with uuid %s\n", 3249 slot, raid_bdev->bdev.name, uuid_str); 3250 return -EBUSY; 3251 } 3252 3253 base_info->name = strdup(name); 3254 if (base_info->name == NULL) { 3255 return -ENOMEM; 3256 } 3257 3258 base_info->data_offset = data_offset; 3259 base_info->data_size = data_size; 3260 3261 return raid_bdev_configure_base_bdev(base_info, false, cb_fn, cb_ctx); 3262 } 3263 3264 int 3265 raid_bdev_attach_base_bdev(struct raid_bdev *raid_bdev, struct spdk_bdev *base_bdev, 3266 raid_base_bdev_cb 
cb_fn, void *cb_ctx) 3267 { 3268 struct raid_base_bdev_info *base_info = NULL, *iter; 3269 int rc; 3270 3271 SPDK_DEBUGLOG(bdev_raid, "attach_base_device: %s\n", base_bdev->name); 3272 3273 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 3274 3275 if (raid_bdev->process != NULL) { 3276 SPDK_ERRLOG("raid bdev '%s' is in process\n", 3277 raid_bdev->bdev.name); 3278 return -EPERM; 3279 } 3280 3281 if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { 3282 SPDK_ERRLOG("raid bdev '%s' must be in online state to attach base bdev\n", 3283 raid_bdev->bdev.name); 3284 return -EINVAL; 3285 } 3286 3287 RAID_FOR_EACH_BASE_BDEV(raid_bdev, iter) { 3288 if (iter->desc == NULL) { 3289 base_info = iter; 3290 break; 3291 } 3292 } 3293 3294 if (base_info == NULL) { 3295 SPDK_ERRLOG("no empty slot found in raid bdev '%s' for new base bdev '%s'\n", 3296 raid_bdev->bdev.name, base_bdev->name); 3297 return -EINVAL; 3298 } 3299 3300 assert(base_info->is_configured == false); 3301 assert(base_info->data_size != 0); 3302 3303 spdk_spin_lock(&raid_bdev->base_bdev_lock); 3304 3305 rc = _raid_bdev_add_base_device(raid_bdev, base_bdev->name, 3306 raid_bdev_base_bdev_slot(base_info), 3307 base_info->data_offset, base_info->data_size, 3308 cb_fn, cb_ctx); 3309 if (rc != 0) { 3310 SPDK_ERRLOG("base bdev '%s' attach failed: %s\n", base_bdev->name, spdk_strerror(-rc)); 3311 raid_bdev_free_base_bdev_resource(base_info); 3312 } 3313 3314 spdk_spin_unlock(&raid_bdev->base_bdev_lock); 3315 3316 return rc; 3317 } 3318 3319 /* 3320 * brief: 3321 * raid_bdev_add_base_device function is the actual function which either adds 3322 * the nvme base device to existing raid bdev or create a new raid bdev. It also claims 3323 * the base device and keep the open descriptor. 
3324 * params: 3325 * raid_bdev - pointer to raid bdev 3326 * name - name of the base bdev 3327 * slot - position to add base bdev 3328 * cb_fn - callback function 3329 * cb_ctx - argument to callback function 3330 * returns: 3331 * 0 - success 3332 * non zero - failure 3333 */ 3334 int 3335 raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot, 3336 raid_base_bdev_cb cb_fn, void *cb_ctx) 3337 { 3338 return _raid_bdev_add_base_device(raid_bdev, name, slot, 0, 0, cb_fn, cb_ctx); 3339 } 3340 3341 static int 3342 raid_bdev_create_from_sb(const struct raid_bdev_superblock *sb, struct raid_bdev **raid_bdev_out) 3343 { 3344 struct raid_bdev *raid_bdev; 3345 uint8_t i; 3346 int rc; 3347 3348 rc = _raid_bdev_create(sb->name, (sb->strip_size * sb->block_size) / 1024, sb->num_base_bdevs, 3349 sb->level, true, &sb->uuid, &raid_bdev); 3350 if (rc != 0) { 3351 return rc; 3352 } 3353 3354 rc = raid_bdev_alloc_superblock(raid_bdev, sb->block_size); 3355 if (rc != 0) { 3356 raid_bdev_free(raid_bdev); 3357 return rc; 3358 } 3359 3360 assert(sb->length <= RAID_BDEV_SB_MAX_LENGTH); 3361 memcpy(raid_bdev->sb, sb, sb->length); 3362 3363 for (i = 0; i < sb->base_bdevs_size; i++) { 3364 const struct raid_bdev_sb_base_bdev *sb_base_bdev = &sb->base_bdevs[i]; 3365 struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[sb_base_bdev->slot]; 3366 3367 if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED) { 3368 spdk_uuid_copy(&base_info->uuid, &sb_base_bdev->uuid); 3369 raid_bdev->num_base_bdevs_operational++; 3370 } 3371 3372 base_info->data_offset = sb_base_bdev->data_offset; 3373 base_info->data_size = sb_base_bdev->data_size; 3374 } 3375 3376 *raid_bdev_out = raid_bdev; 3377 return 0; 3378 } 3379 3380 static void 3381 raid_bdev_examine_no_sb(struct spdk_bdev *bdev) 3382 { 3383 struct raid_bdev *raid_bdev; 3384 struct raid_base_bdev_info *base_info; 3385 3386 TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { 3387 
RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 3388 if (base_info->desc == NULL && base_info->name != NULL && 3389 strcmp(bdev->name, base_info->name) == 0) { 3390 raid_bdev_configure_base_bdev(base_info, true, NULL, NULL); 3391 break; 3392 } 3393 } 3394 } 3395 } 3396 3397 static void 3398 raid_bdev_examine_sb(const struct raid_bdev_superblock *sb, struct spdk_bdev *bdev) 3399 { 3400 const struct raid_bdev_sb_base_bdev *sb_base_bdev = NULL; 3401 struct raid_bdev *raid_bdev; 3402 struct raid_base_bdev_info *iter, *base_info; 3403 uint8_t i; 3404 int rc; 3405 3406 if (sb->block_size != spdk_bdev_get_data_block_size(bdev)) { 3407 SPDK_WARNLOG("Bdev %s block size (%u) does not match the value in superblock (%u)\n", 3408 bdev->name, sb->block_size, spdk_bdev_get_data_block_size(bdev)); 3409 return; 3410 } 3411 3412 if (spdk_uuid_is_null(&sb->uuid)) { 3413 SPDK_WARNLOG("NULL raid bdev UUID in superblock on bdev %s\n", bdev->name); 3414 return; 3415 } 3416 3417 TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { 3418 if (spdk_uuid_compare(&raid_bdev->bdev.uuid, &sb->uuid) == 0) { 3419 break; 3420 } 3421 } 3422 3423 if (raid_bdev) { 3424 if (sb->seq_number > raid_bdev->sb->seq_number) { 3425 SPDK_DEBUGLOG(bdev_raid, 3426 "raid superblock seq_number on bdev %s (%lu) greater than existing raid bdev %s (%lu)\n", 3427 bdev->name, sb->seq_number, raid_bdev->bdev.name, raid_bdev->sb->seq_number); 3428 3429 if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) { 3430 SPDK_WARNLOG("Newer version of raid bdev %s superblock found on bdev %s but raid bdev is not in configuring state.\n", 3431 raid_bdev->bdev.name, bdev->name); 3432 return; 3433 } 3434 3435 /* remove and then recreate the raid bdev using the newer superblock */ 3436 raid_bdev_delete(raid_bdev, NULL, NULL); 3437 raid_bdev = NULL; 3438 } else if (sb->seq_number < raid_bdev->sb->seq_number) { 3439 SPDK_DEBUGLOG(bdev_raid, 3440 "raid superblock seq_number on bdev %s (%lu) smaller than existing raid bdev %s 
(%lu)\n", 3441 bdev->name, sb->seq_number, raid_bdev->bdev.name, raid_bdev->sb->seq_number); 3442 /* use the current raid bdev superblock */ 3443 sb = raid_bdev->sb; 3444 } 3445 } 3446 3447 for (i = 0; i < sb->base_bdevs_size; i++) { 3448 sb_base_bdev = &sb->base_bdevs[i]; 3449 3450 assert(spdk_uuid_is_null(&sb_base_bdev->uuid) == false); 3451 3452 if (spdk_uuid_compare(&sb_base_bdev->uuid, spdk_bdev_get_uuid(bdev)) == 0) { 3453 break; 3454 } 3455 } 3456 3457 if (i == sb->base_bdevs_size) { 3458 SPDK_DEBUGLOG(bdev_raid, "raid superblock does not contain this bdev's uuid\n"); 3459 return; 3460 } 3461 3462 if (!raid_bdev) { 3463 rc = raid_bdev_create_from_sb(sb, &raid_bdev); 3464 if (rc != 0) { 3465 SPDK_ERRLOG("Failed to create raid bdev %s: %s\n", 3466 sb->name, spdk_strerror(-rc)); 3467 return; 3468 } 3469 } 3470 3471 if (sb_base_bdev->state != RAID_SB_BASE_BDEV_CONFIGURED) { 3472 SPDK_NOTICELOG("Bdev %s is not an active member of raid bdev %s. Ignoring.\n", 3473 bdev->name, raid_bdev->bdev.name); 3474 return; 3475 } 3476 3477 base_info = NULL; 3478 RAID_FOR_EACH_BASE_BDEV(raid_bdev, iter) { 3479 if (spdk_uuid_compare(&iter->uuid, spdk_bdev_get_uuid(bdev)) == 0) { 3480 base_info = iter; 3481 break; 3482 } 3483 } 3484 3485 if (base_info == NULL) { 3486 SPDK_ERRLOG("Bdev %s is not a member of raid bdev %s\n", 3487 bdev->name, raid_bdev->bdev.name); 3488 return; 3489 } 3490 3491 rc = raid_bdev_configure_base_bdev(base_info, true, NULL, NULL); 3492 if (rc != 0) { 3493 SPDK_ERRLOG("Failed to configure bdev %s as base bdev of raid %s: %s\n", 3494 bdev->name, raid_bdev->bdev.name, spdk_strerror(-rc)); 3495 } 3496 } 3497 3498 struct raid_bdev_examine_ctx { 3499 struct spdk_bdev_desc *desc; 3500 struct spdk_io_channel *ch; 3501 }; 3502 3503 static void 3504 raid_bdev_examine_ctx_free(struct raid_bdev_examine_ctx *ctx) 3505 { 3506 if (!ctx) { 3507 return; 3508 } 3509 3510 if (ctx->ch) { 3511 spdk_put_io_channel(ctx->ch); 3512 } 3513 3514 if (ctx->desc) { 3515 
spdk_bdev_close(ctx->desc); 3516 } 3517 3518 free(ctx); 3519 } 3520 3521 static void 3522 raid_bdev_examine_load_sb_cb(const struct raid_bdev_superblock *sb, int status, void *_ctx) 3523 { 3524 struct raid_bdev_examine_ctx *ctx = _ctx; 3525 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(ctx->desc); 3526 3527 switch (status) { 3528 case 0: 3529 /* valid superblock found */ 3530 SPDK_DEBUGLOG(bdev_raid, "raid superblock found on bdev %s\n", bdev->name); 3531 raid_bdev_examine_sb(sb, bdev); 3532 break; 3533 case -EINVAL: 3534 /* no valid superblock, check if it can be claimed anyway */ 3535 raid_bdev_examine_no_sb(bdev); 3536 break; 3537 default: 3538 SPDK_ERRLOG("Failed to examine bdev %s: %s\n", 3539 bdev->name, spdk_strerror(-status)); 3540 break; 3541 } 3542 3543 raid_bdev_examine_ctx_free(ctx); 3544 spdk_bdev_module_examine_done(&g_raid_if); 3545 } 3546 3547 static void 3548 raid_bdev_examine_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) 3549 { 3550 } 3551 3552 /* 3553 * brief: 3554 * raid_bdev_examine function is the examine function call by the below layers 3555 * like bdev_nvme layer. This function will check if this base bdev can be 3556 * claimed by this raid bdev or not. 
3557 * params: 3558 * bdev - pointer to base bdev 3559 * returns: 3560 * none 3561 */ 3562 static void 3563 raid_bdev_examine(struct spdk_bdev *bdev) 3564 { 3565 struct raid_bdev_examine_ctx *ctx; 3566 int rc; 3567 3568 if (raid_bdev_find_base_info_by_bdev(bdev) != NULL) { 3569 goto done; 3570 } 3571 3572 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3573 raid_bdev_examine_no_sb(bdev); 3574 goto done; 3575 } 3576 3577 ctx = calloc(1, sizeof(*ctx)); 3578 if (!ctx) { 3579 SPDK_ERRLOG("Failed to examine bdev %s: %s\n", 3580 bdev->name, spdk_strerror(ENOMEM)); 3581 goto err; 3582 } 3583 3584 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, raid_bdev_examine_event_cb, NULL, 3585 &ctx->desc); 3586 if (rc) { 3587 SPDK_ERRLOG("Failed to open bdev %s: %s\n", 3588 bdev->name, spdk_strerror(-rc)); 3589 goto err; 3590 } 3591 3592 ctx->ch = spdk_bdev_get_io_channel(ctx->desc); 3593 if (!ctx->ch) { 3594 SPDK_ERRLOG("Failed to get io channel for bdev %s\n", bdev->name); 3595 goto err; 3596 } 3597 3598 rc = raid_bdev_load_base_bdev_superblock(ctx->desc, ctx->ch, raid_bdev_examine_load_sb_cb, ctx); 3599 if (rc) { 3600 SPDK_ERRLOG("Failed to read bdev %s superblock: %s\n", 3601 bdev->name, spdk_strerror(-rc)); 3602 goto err; 3603 } 3604 3605 return; 3606 err: 3607 raid_bdev_examine_ctx_free(ctx); 3608 done: 3609 spdk_bdev_module_examine_done(&g_raid_if); 3610 } 3611 3612 /* Log component for bdev raid bdev module */ 3613 SPDK_LOG_REGISTER_COMPONENT(bdev_raid) 3614