1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2018 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "bdev_raid.h" 8 #include "spdk/env.h" 9 #include "spdk/thread.h" 10 #include "spdk/log.h" 11 #include "spdk/string.h" 12 #include "spdk/util.h" 13 #include "spdk/json.h" 14 15 static bool g_shutdown_started = false; 16 17 /* List of all raid bdevs */ 18 struct raid_all_tailq g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list); 19 20 static TAILQ_HEAD(, raid_bdev_module) g_raid_modules = TAILQ_HEAD_INITIALIZER(g_raid_modules); 21 22 static struct raid_bdev_module * 23 raid_bdev_module_find(enum raid_level level) 24 { 25 struct raid_bdev_module *raid_module; 26 27 TAILQ_FOREACH(raid_module, &g_raid_modules, link) { 28 if (raid_module->level == level) { 29 return raid_module; 30 } 31 } 32 33 return NULL; 34 } 35 36 void 37 raid_bdev_module_list_add(struct raid_bdev_module *raid_module) 38 { 39 if (raid_bdev_module_find(raid_module->level) != NULL) { 40 SPDK_ERRLOG("module for raid level '%s' already registered.\n", 41 raid_bdev_level_to_str(raid_module->level)); 42 assert(false); 43 } else { 44 TAILQ_INSERT_TAIL(&g_raid_modules, raid_module, link); 45 } 46 } 47 48 /* Function declarations */ 49 static void raid_bdev_examine(struct spdk_bdev *bdev); 50 static int raid_bdev_init(void); 51 static void raid_bdev_deconfigure(struct raid_bdev *raid_bdev, 52 raid_bdev_destruct_cb cb_fn, void *cb_arg); 53 54 /* 55 * brief: 56 * raid_bdev_create_cb function is a cb function for raid bdev which creates the 57 * hierarchy from raid bdev to base bdev io channels. 
It will be called per core
 * params:
 * io_device - pointer to raid bdev io device represented by raid_bdev
 * ctx_buf - pointer to context buffer for raid bdev io channel
 * returns:
 * 0 - success
 * non zero - failure
 */
static int
raid_bdev_create_cb(void *io_device, void *ctx_buf)
{
	struct raid_bdev *raid_bdev = io_device;
	struct raid_bdev_io_channel *raid_ch = ctx_buf;
	struct raid_base_bdev_info *base_info;
	uint8_t idx;
	int rc = 0;

	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_create_cb, %p\n", raid_ch);

	assert(raid_bdev != NULL);
	assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE);

	raid_ch->num_channels = raid_bdev->num_base_bdevs;

	/* Zeroed array: slots without a base bdev descriptor stay NULL */
	raid_ch->base_channel = calloc(raid_ch->num_channels,
				       sizeof(struct spdk_io_channel *));
	if (raid_ch->base_channel == NULL) {
		SPDK_ERRLOG("Unable to allocate base bdevs io channel\n");
		return -ENOMEM;
	}

	spdk_spin_lock(&raid_bdev->base_bdev_lock);
	for (idx = 0; idx < raid_ch->num_channels; idx++) {
		base_info = &raid_bdev->base_bdev_info[idx];

		/*
		 * Get the spdk_io_channel for every base bdev that has a descriptor.
		 * The split logic uses these to send each child bdev io to the
		 * matching base bdev io channel.
		 */
		if (base_info->desc == NULL) {
			continue;
		}

		raid_ch->base_channel[idx] = spdk_bdev_get_io_channel(base_info->desc);
		if (raid_ch->base_channel[idx] == NULL) {
			SPDK_ERRLOG("Unable to create io channel for base bdev\n");
			rc = -ENOMEM;
			break;
		}
	}
	spdk_spin_unlock(&raid_bdev->base_bdev_lock);

	/* The module channel is optional and only requested once all base channels exist */
	if (rc == 0 && raid_bdev->module->get_io_channel != NULL) {
		raid_ch->module_channel = raid_bdev->module->get_io_channel(raid_bdev);
		if (raid_ch->module_channel == NULL) {
			SPDK_ERRLOG("Unable to create io channel for raid module\n");
			rc = -ENOMEM;
		}
	}

	if (rc == 0) {
		return 0;
	}

	/* Failure: release whatever base channels were obtained before bailing out */
	for (idx = 0; idx < raid_ch->num_channels; idx++) {
		if (raid_ch->base_channel[idx] == NULL) {
			continue;
		}
		spdk_put_io_channel(raid_ch->base_channel[idx]);
	}
	free(raid_ch->base_channel);
	raid_ch->base_channel = NULL;

	return rc;
}

/*
 * brief:
 * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the
 * hierarchy from raid bdev to base bdev io channels.
It will be called per core 131 * params: 132 * io_device - pointer to raid bdev io device represented by raid_bdev 133 * ctx_buf - pointer to context buffer for raid bdev io channel 134 * returns: 135 * none 136 */ 137 static void 138 raid_bdev_destroy_cb(void *io_device, void *ctx_buf) 139 { 140 struct raid_bdev_io_channel *raid_ch = ctx_buf; 141 uint8_t i; 142 143 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destroy_cb\n"); 144 145 assert(raid_ch != NULL); 146 assert(raid_ch->base_channel); 147 148 if (raid_ch->module_channel) { 149 spdk_put_io_channel(raid_ch->module_channel); 150 } 151 152 for (i = 0; i < raid_ch->num_channels; i++) { 153 /* Free base bdev channels */ 154 if (raid_ch->base_channel[i] != NULL) { 155 spdk_put_io_channel(raid_ch->base_channel[i]); 156 } 157 } 158 free(raid_ch->base_channel); 159 raid_ch->base_channel = NULL; 160 } 161 162 /* 163 * brief: 164 * raid_bdev_cleanup is used to cleanup raid_bdev related data 165 * structures. 166 * params: 167 * raid_bdev - pointer to raid_bdev 168 * returns: 169 * none 170 */ 171 static void 172 raid_bdev_cleanup(struct raid_bdev *raid_bdev) 173 { 174 struct raid_base_bdev_info *base_info; 175 176 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_cleanup, %p name %s, state %s\n", 177 raid_bdev, raid_bdev->bdev.name, raid_bdev_state_to_str(raid_bdev->state)); 178 assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); 179 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 180 181 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 182 assert(base_info->desc == NULL); 183 free(base_info->name); 184 } 185 186 TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link); 187 } 188 189 static void 190 raid_bdev_free(struct raid_bdev *raid_bdev) 191 { 192 spdk_spin_destroy(&raid_bdev->base_bdev_lock); 193 free(raid_bdev->base_bdev_info); 194 free(raid_bdev->bdev.name); 195 free(raid_bdev); 196 } 197 198 static void 199 raid_bdev_cleanup_and_free(struct raid_bdev *raid_bdev) 200 { 201 raid_bdev_cleanup(raid_bdev); 202 
raid_bdev_free(raid_bdev); 203 } 204 205 /* 206 * brief: 207 * free resource of base bdev for raid bdev 208 * params: 209 * base_info - raid base bdev info 210 * returns: 211 * 0 - success 212 * non zero - failure 213 */ 214 static void 215 raid_bdev_free_base_bdev_resource(struct raid_base_bdev_info *base_info) 216 { 217 struct raid_bdev *raid_bdev = base_info->raid_bdev; 218 219 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 220 221 free(base_info->name); 222 base_info->name = NULL; 223 224 if (base_info->desc == NULL) { 225 return; 226 } 227 228 spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(base_info->desc)); 229 spdk_bdev_close(base_info->desc); 230 base_info->desc = NULL; 231 232 assert(raid_bdev->num_base_bdevs_discovered); 233 raid_bdev->num_base_bdevs_discovered--; 234 } 235 236 static void 237 raid_bdev_io_device_unregister_cb(void *io_device) 238 { 239 struct raid_bdev *raid_bdev = io_device; 240 241 if (raid_bdev->num_base_bdevs_discovered == 0) { 242 /* Free raid_bdev when there are no base bdevs left */ 243 SPDK_DEBUGLOG(bdev_raid, "raid bdev base bdevs is 0, going to free all in destruct\n"); 244 raid_bdev_cleanup(raid_bdev); 245 spdk_bdev_destruct_done(&raid_bdev->bdev, 0); 246 raid_bdev_free(raid_bdev); 247 } else { 248 spdk_bdev_destruct_done(&raid_bdev->bdev, 0); 249 } 250 } 251 252 void 253 raid_bdev_module_stop_done(struct raid_bdev *raid_bdev) 254 { 255 if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) { 256 spdk_io_device_unregister(raid_bdev, raid_bdev_io_device_unregister_cb); 257 } 258 } 259 260 static void 261 _raid_bdev_destruct(void *ctxt) 262 { 263 struct raid_bdev *raid_bdev = ctxt; 264 struct raid_base_bdev_info *base_info; 265 266 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destruct\n"); 267 268 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 269 /* 270 * Close all base bdev descriptors for which call has come from below 271 * layers. Also close the descriptors if we have started shutdown. 
272 */ 273 if (g_shutdown_started || base_info->remove_scheduled == true) { 274 raid_bdev_free_base_bdev_resource(base_info); 275 } 276 } 277 278 if (g_shutdown_started) { 279 raid_bdev->state = RAID_BDEV_STATE_OFFLINE; 280 } 281 282 if (raid_bdev->module->stop != NULL) { 283 if (raid_bdev->module->stop(raid_bdev) == false) { 284 return; 285 } 286 } 287 288 raid_bdev_module_stop_done(raid_bdev); 289 } 290 291 static int 292 raid_bdev_destruct(void *ctx) 293 { 294 spdk_thread_exec_msg(spdk_thread_get_app_thread(), _raid_bdev_destruct, ctx); 295 296 return 1; 297 } 298 299 void 300 raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status) 301 { 302 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); 303 304 spdk_bdev_io_complete(bdev_io, status); 305 } 306 307 /* 308 * brief: 309 * raid_bdev_io_complete_part - signal the completion of a part of the expected 310 * base bdev IOs and complete the raid_io if this is the final expected IO. 311 * The caller should first set raid_io->base_bdev_io_remaining. This function 312 * will decrement this counter by the value of the 'completed' parameter and 313 * complete the raid_io if the counter reaches 0. The caller is free to 314 * interpret the 'base_bdev_io_remaining' and 'completed' values as needed, 315 * it can represent e.g. blocks or IOs. 
316 * params: 317 * raid_io - pointer to raid_bdev_io 318 * completed - the part of the raid_io that has been completed 319 * status - status of the base IO 320 * returns: 321 * true - if the raid_io is completed 322 * false - otherwise 323 */ 324 bool 325 raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, 326 enum spdk_bdev_io_status status) 327 { 328 assert(raid_io->base_bdev_io_remaining >= completed); 329 raid_io->base_bdev_io_remaining -= completed; 330 331 if (status != SPDK_BDEV_IO_STATUS_SUCCESS) { 332 raid_io->base_bdev_io_status = status; 333 } 334 335 if (raid_io->base_bdev_io_remaining == 0) { 336 raid_bdev_io_complete(raid_io, raid_io->base_bdev_io_status); 337 return true; 338 } else { 339 return false; 340 } 341 } 342 343 /* 344 * brief: 345 * raid_bdev_queue_io_wait function processes the IO which failed to submit. 346 * It will try to queue the IOs after storing the context to bdev wait queue logic. 347 * params: 348 * raid_io - pointer to raid_bdev_io 349 * bdev - the block device that the IO is submitted to 350 * ch - io channel 351 * cb_fn - callback when the spdk_bdev_io for bdev becomes available 352 * returns: 353 * none 354 */ 355 void 356 raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, 357 struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn) 358 { 359 raid_io->waitq_entry.bdev = bdev; 360 raid_io->waitq_entry.cb_fn = cb_fn; 361 raid_io->waitq_entry.cb_arg = raid_io; 362 spdk_bdev_queue_io_wait(bdev, ch, &raid_io->waitq_entry); 363 } 364 365 static void 366 raid_base_bdev_reset_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 367 { 368 struct raid_bdev_io *raid_io = cb_arg; 369 370 spdk_bdev_free_io(bdev_io); 371 372 raid_bdev_io_complete_part(raid_io, 1, success ? 
373 SPDK_BDEV_IO_STATUS_SUCCESS : 374 SPDK_BDEV_IO_STATUS_FAILED); 375 } 376 377 static void raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io); 378 379 static void 380 _raid_bdev_submit_reset_request(void *_raid_io) 381 { 382 struct raid_bdev_io *raid_io = _raid_io; 383 384 raid_bdev_submit_reset_request(raid_io); 385 } 386 387 /* 388 * brief: 389 * raid_bdev_submit_reset_request function submits reset requests 390 * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in 391 * which case it will queue it for later submission 392 * params: 393 * raid_io 394 * returns: 395 * none 396 */ 397 static void 398 raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io) 399 { 400 struct raid_bdev *raid_bdev; 401 int ret; 402 uint8_t i; 403 struct raid_base_bdev_info *base_info; 404 struct spdk_io_channel *base_ch; 405 406 raid_bdev = raid_io->raid_bdev; 407 408 if (raid_io->base_bdev_io_remaining == 0) { 409 raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; 410 } 411 412 for (i = raid_io->base_bdev_io_submitted; i < raid_bdev->num_base_bdevs; i++) { 413 base_info = &raid_bdev->base_bdev_info[i]; 414 base_ch = raid_io->raid_ch->base_channel[i]; 415 if (base_ch == NULL) { 416 raid_io->base_bdev_io_submitted++; 417 raid_bdev_io_complete_part(raid_io, 1, SPDK_BDEV_IO_STATUS_SUCCESS); 418 continue; 419 } 420 ret = spdk_bdev_reset(base_info->desc, base_ch, 421 raid_base_bdev_reset_complete, raid_io); 422 if (ret == 0) { 423 raid_io->base_bdev_io_submitted++; 424 } else if (ret == -ENOMEM) { 425 raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc), 426 base_ch, _raid_bdev_submit_reset_request); 427 return; 428 } else { 429 SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); 430 assert(false); 431 raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); 432 return; 433 } 434 } 435 } 436 437 /* 438 * brief: 439 * Callback function to spdk_bdev_io_get_buf. 
440 * params: 441 * ch - pointer to raid bdev io channel 442 * bdev_io - pointer to parent bdev_io on raid bdev device 443 * success - True if buffer is allocated or false otherwise. 444 * returns: 445 * none 446 */ 447 static void 448 raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 449 bool success) 450 { 451 struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; 452 453 if (!success) { 454 raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); 455 return; 456 } 457 458 raid_io->raid_bdev->module->submit_rw_request(raid_io); 459 } 460 461 /* 462 * brief: 463 * raid_bdev_submit_request function is the submit_request function pointer of 464 * raid bdev function table. This is used to submit the io on raid_bdev to below 465 * layers. 466 * params: 467 * ch - pointer to raid bdev io channel 468 * bdev_io - pointer to parent bdev_io on raid bdev device 469 * returns: 470 * none 471 */ 472 static void 473 raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 474 { 475 struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; 476 477 raid_io->raid_bdev = bdev_io->bdev->ctxt; 478 raid_io->raid_ch = spdk_io_channel_get_ctx(ch); 479 raid_io->base_bdev_io_remaining = 0; 480 raid_io->base_bdev_io_submitted = 0; 481 raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 482 483 switch (bdev_io->type) { 484 case SPDK_BDEV_IO_TYPE_READ: 485 spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb, 486 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 487 break; 488 case SPDK_BDEV_IO_TYPE_WRITE: 489 raid_io->raid_bdev->module->submit_rw_request(raid_io); 490 break; 491 492 case SPDK_BDEV_IO_TYPE_RESET: 493 raid_bdev_submit_reset_request(raid_io); 494 break; 495 496 case SPDK_BDEV_IO_TYPE_FLUSH: 497 case SPDK_BDEV_IO_TYPE_UNMAP: 498 raid_io->raid_bdev->module->submit_null_payload_request(raid_io); 499 break; 500 501 default: 502 SPDK_ERRLOG("submit request, invalid io type 
%u\n", bdev_io->type); 503 raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); 504 break; 505 } 506 } 507 508 /* 509 * brief: 510 * _raid_bdev_io_type_supported checks whether io_type is supported in 511 * all base bdev modules of raid bdev module. If anyone among the base_bdevs 512 * doesn't support, the raid device doesn't supports. 513 * 514 * params: 515 * raid_bdev - pointer to raid bdev context 516 * io_type - io type 517 * returns: 518 * true - io_type is supported 519 * false - io_type is not supported 520 */ 521 inline static bool 522 _raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type) 523 { 524 struct raid_base_bdev_info *base_info; 525 526 if (io_type == SPDK_BDEV_IO_TYPE_FLUSH || 527 io_type == SPDK_BDEV_IO_TYPE_UNMAP) { 528 if (raid_bdev->module->submit_null_payload_request == NULL) { 529 return false; 530 } 531 } 532 533 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 534 if (base_info->desc == NULL) { 535 continue; 536 } 537 538 if (spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(base_info->desc), io_type) == false) { 539 return false; 540 } 541 } 542 543 return true; 544 } 545 546 /* 547 * brief: 548 * raid_bdev_io_type_supported is the io_supported function for bdev function 549 * table which returns whether the particular io type is supported or not by 550 * raid bdev module 551 * params: 552 * ctx - pointer to raid bdev context 553 * type - io type 554 * returns: 555 * true - io_type is supported 556 * false - io_type is not supported 557 */ 558 static bool 559 raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 560 { 561 switch (io_type) { 562 case SPDK_BDEV_IO_TYPE_READ: 563 case SPDK_BDEV_IO_TYPE_WRITE: 564 return true; 565 566 case SPDK_BDEV_IO_TYPE_FLUSH: 567 case SPDK_BDEV_IO_TYPE_RESET: 568 case SPDK_BDEV_IO_TYPE_UNMAP: 569 return _raid_bdev_io_type_supported(ctx, io_type); 570 571 default: 572 return false; 573 } 574 575 return false; 576 } 577 578 /* 579 * brief: 580 * 
raid_bdev_get_io_channel is the get_io_channel function table pointer for 581 * raid bdev. This is used to return the io channel for this raid bdev 582 * params: 583 * ctxt - pointer to raid_bdev 584 * returns: 585 * pointer to io channel for raid bdev 586 */ 587 static struct spdk_io_channel * 588 raid_bdev_get_io_channel(void *ctxt) 589 { 590 struct raid_bdev *raid_bdev = ctxt; 591 592 return spdk_get_io_channel(raid_bdev); 593 } 594 595 void 596 raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w) 597 { 598 struct raid_base_bdev_info *base_info; 599 char uuid_str[SPDK_UUID_STRING_LEN]; 600 601 assert(raid_bdev != NULL); 602 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 603 604 spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &raid_bdev->bdev.uuid); 605 spdk_json_write_named_string(w, "uuid", uuid_str); 606 spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); 607 spdk_json_write_named_string(w, "state", raid_bdev_state_to_str(raid_bdev->state)); 608 spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); 609 spdk_json_write_named_bool(w, "superblock", raid_bdev->superblock_enabled); 610 spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs); 611 spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered); 612 spdk_json_write_name(w, "base_bdevs_list"); 613 spdk_json_write_array_begin(w); 614 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 615 if (base_info->desc) { 616 spdk_json_write_string(w, spdk_bdev_desc_get_bdev(base_info->desc)->name); 617 } else { 618 spdk_json_write_null(w); 619 } 620 } 621 spdk_json_write_array_end(w); 622 } 623 624 /* 625 * brief: 626 * raid_bdev_dump_info_json is the function table pointer for raid bdev 627 * params: 628 * ctx - pointer to raid_bdev 629 * w - pointer to json context 630 * returns: 631 * 0 - success 632 * non zero - failure 633 */ 634 static int 635 
raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 636 { 637 struct raid_bdev *raid_bdev = ctx; 638 639 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_dump_config_json\n"); 640 641 /* Dump the raid bdev configuration related information */ 642 spdk_json_write_named_object_begin(w, "raid"); 643 raid_bdev_write_info_json(raid_bdev, w); 644 spdk_json_write_object_end(w); 645 646 return 0; 647 } 648 649 /* 650 * brief: 651 * raid_bdev_write_config_json is the function table pointer for raid bdev 652 * params: 653 * bdev - pointer to spdk_bdev 654 * w - pointer to json context 655 * returns: 656 * none 657 */ 658 static void 659 raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 660 { 661 struct raid_bdev *raid_bdev = bdev->ctxt; 662 struct raid_base_bdev_info *base_info; 663 char uuid_str[SPDK_UUID_STRING_LEN]; 664 665 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 666 667 if (raid_bdev->superblock_enabled) { 668 /* raid bdev configuration is stored in the superblock */ 669 return; 670 } 671 672 spdk_json_write_object_begin(w); 673 674 spdk_json_write_named_string(w, "method", "bdev_raid_create"); 675 676 spdk_json_write_named_object_begin(w, "params"); 677 spdk_json_write_named_string(w, "name", bdev->name); 678 spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &raid_bdev->bdev.uuid); 679 spdk_json_write_named_string(w, "uuid", uuid_str); 680 spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); 681 spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); 682 spdk_json_write_named_bool(w, "superblock", raid_bdev->superblock_enabled); 683 684 spdk_json_write_named_array_begin(w, "base_bdevs"); 685 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 686 if (base_info->desc) { 687 spdk_json_write_string(w, spdk_bdev_desc_get_bdev(base_info->desc)->name); 688 } 689 } 690 spdk_json_write_array_end(w); 691 spdk_json_write_object_end(w); 692 693 spdk_json_write_object_end(w); 694 
} 695 696 static int 697 raid_bdev_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 698 { 699 struct raid_bdev *raid_bdev = ctx; 700 struct raid_base_bdev_info *base_info; 701 int domains_count = 0, rc = 0; 702 703 if (raid_bdev->module->memory_domains_supported == false) { 704 return 0; 705 } 706 707 spdk_spin_lock(&raid_bdev->base_bdev_lock); 708 709 /* First loop to get the number of memory domains */ 710 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 711 if (base_info->desc == NULL) { 712 continue; 713 } 714 rc = spdk_bdev_get_memory_domains(spdk_bdev_desc_get_bdev(base_info->desc), NULL, 0); 715 if (rc < 0) { 716 goto out; 717 } 718 domains_count += rc; 719 } 720 721 if (!domains || array_size < domains_count) { 722 goto out; 723 } 724 725 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 726 if (base_info->desc == NULL) { 727 continue; 728 } 729 rc = spdk_bdev_get_memory_domains(spdk_bdev_desc_get_bdev(base_info->desc), domains, array_size); 730 if (rc < 0) { 731 goto out; 732 } 733 domains += rc; 734 array_size -= rc; 735 } 736 out: 737 spdk_spin_unlock(&raid_bdev->base_bdev_lock); 738 739 if (rc < 0) { 740 return rc; 741 } 742 743 return domains_count; 744 } 745 746 /* g_raid_bdev_fn_table is the function table for raid bdev */ 747 static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = { 748 .destruct = raid_bdev_destruct, 749 .submit_request = raid_bdev_submit_request, 750 .io_type_supported = raid_bdev_io_type_supported, 751 .get_io_channel = raid_bdev_get_io_channel, 752 .dump_info_json = raid_bdev_dump_info_json, 753 .write_config_json = raid_bdev_write_config_json, 754 .get_memory_domains = raid_bdev_get_memory_domains, 755 }; 756 757 struct raid_bdev * 758 raid_bdev_find_by_name(const char *name) 759 { 760 struct raid_bdev *raid_bdev; 761 762 TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { 763 if (strcmp(raid_bdev->bdev.name, name) == 0) { 764 return raid_bdev; 765 } 766 } 767 768 return NULL; 769 } 
770 771 static struct { 772 const char *name; 773 enum raid_level value; 774 } g_raid_level_names[] = { 775 { "raid0", RAID0 }, 776 { "0", RAID0 }, 777 { "raid1", RAID1 }, 778 { "1", RAID1 }, 779 { "raid5f", RAID5F }, 780 { "5f", RAID5F }, 781 { "concat", CONCAT }, 782 { } 783 }; 784 785 static struct { 786 const char *name; 787 enum raid_bdev_state value; 788 } g_raid_state_names[] = { 789 { "online", RAID_BDEV_STATE_ONLINE }, 790 { "configuring", RAID_BDEV_STATE_CONFIGURING }, 791 { "offline", RAID_BDEV_STATE_OFFLINE }, 792 { } 793 }; 794 795 /* We have to use the typedef in the function declaration to appease astyle. */ 796 typedef enum raid_level raid_level_t; 797 typedef enum raid_bdev_state raid_bdev_state_t; 798 799 raid_level_t 800 raid_bdev_str_to_level(const char *str) 801 { 802 unsigned int i; 803 804 assert(str != NULL); 805 806 for (i = 0; g_raid_level_names[i].name != NULL; i++) { 807 if (strcasecmp(g_raid_level_names[i].name, str) == 0) { 808 return g_raid_level_names[i].value; 809 } 810 } 811 812 return INVALID_RAID_LEVEL; 813 } 814 815 const char * 816 raid_bdev_level_to_str(enum raid_level level) 817 { 818 unsigned int i; 819 820 for (i = 0; g_raid_level_names[i].name != NULL; i++) { 821 if (g_raid_level_names[i].value == level) { 822 return g_raid_level_names[i].name; 823 } 824 } 825 826 return ""; 827 } 828 829 raid_bdev_state_t 830 raid_bdev_str_to_state(const char *str) 831 { 832 unsigned int i; 833 834 assert(str != NULL); 835 836 for (i = 0; g_raid_state_names[i].name != NULL; i++) { 837 if (strcasecmp(g_raid_state_names[i].name, str) == 0) { 838 return g_raid_state_names[i].value; 839 } 840 } 841 842 return RAID_BDEV_STATE_MAX; 843 } 844 845 const char * 846 raid_bdev_state_to_str(enum raid_bdev_state state) 847 { 848 unsigned int i; 849 850 for (i = 0; g_raid_state_names[i].name != NULL; i++) { 851 if (g_raid_state_names[i].value == state) { 852 return g_raid_state_names[i].name; 853 } 854 } 855 856 assert(false); 857 return ""; 858 } 859 
860 /* 861 * brief: 862 * raid_bdev_fini_start is called when bdev layer is starting the 863 * shutdown process 864 * params: 865 * none 866 * returns: 867 * none 868 */ 869 static void 870 raid_bdev_fini_start(void) 871 { 872 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_fini_start\n"); 873 g_shutdown_started = true; 874 } 875 876 /* 877 * brief: 878 * raid_bdev_exit is called on raid bdev module exit time by bdev layer 879 * params: 880 * none 881 * returns: 882 * none 883 */ 884 static void 885 raid_bdev_exit(void) 886 { 887 struct raid_bdev *raid_bdev, *tmp; 888 889 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_exit\n"); 890 891 TAILQ_FOREACH_SAFE(raid_bdev, &g_raid_bdev_list, global_link, tmp) { 892 raid_bdev_cleanup_and_free(raid_bdev); 893 } 894 } 895 896 /* 897 * brief: 898 * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid 899 * module 900 * params: 901 * none 902 * returns: 903 * size of spdk_bdev_io context for raid 904 */ 905 static int 906 raid_bdev_get_ctx_size(void) 907 { 908 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_get_ctx_size\n"); 909 return sizeof(struct raid_bdev_io); 910 } 911 912 static struct spdk_bdev_module g_raid_if = { 913 .name = "raid", 914 .module_init = raid_bdev_init, 915 .fini_start = raid_bdev_fini_start, 916 .module_fini = raid_bdev_exit, 917 .get_ctx_size = raid_bdev_get_ctx_size, 918 .examine_config = raid_bdev_examine, 919 .async_init = false, 920 .async_fini = false, 921 }; 922 SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if) 923 924 /* 925 * brief: 926 * raid_bdev_init is the initialization function for raid bdev module 927 * params: 928 * none 929 * returns: 930 * 0 - success 931 * non zero - failure 932 */ 933 static int 934 raid_bdev_init(void) 935 { 936 return 0; 937 } 938 939 /* 940 * brief: 941 * raid_bdev_create allocates raid bdev based on passed configuration 942 * params: 943 * name - name for raid bdev 944 * strip_size - strip size in KB 945 * num_base_bdevs - number of base bdevs 946 * level - raid level 947 * 
superblock_enabled - true if raid should have superblock
 * uuid - uuid to set for the bdev
 * raid_bdev_out - the created raid bdev
 * returns:
 * 0 - success
 * non zero - failure
 */
int
raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs,
		 enum raid_level level, bool superblock_enabled, const struct spdk_uuid *uuid,
		 struct raid_bdev **raid_bdev_out)
{
	struct raid_bdev *new_raid;
	struct spdk_bdev *gen_bdev;
	struct raid_bdev_module *raid_module;
	struct raid_base_bdev_info *slot;
	uint8_t min_operational;

	if (raid_bdev_find_by_name(name) != NULL) {
		SPDK_ERRLOG("Duplicate raid bdev name found: %s\n", name);
		return -EEXIST;
	}

	/* raid1 takes no strip size; every other level needs a power of two */
	if (level == RAID1 && strip_size != 0) {
		SPDK_ERRLOG("Strip size is not supported by raid1\n");
		return -EINVAL;
	}
	if (level != RAID1 && spdk_u32_is_pow2(strip_size) == false) {
		SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size);
		return -EINVAL;
	}

	raid_module = raid_bdev_module_find(level);
	if (raid_module == NULL) {
		SPDK_ERRLOG("Unsupported raid level '%d'\n", level);
		return -EINVAL;
	}

	assert(raid_module->base_bdevs_min != 0);
	if (num_base_bdevs < raid_module->base_bdevs_min) {
		SPDK_ERRLOG("At least %u base devices required for %s\n",
			    raid_module->base_bdevs_min,
			    raid_bdev_level_to_str(level));
		return -EINVAL;
	}

	/* Translate the module's constraint into a minimum number of operational base bdevs */
	switch (raid_module->base_bdevs_constraint.type) {
	case CONSTRAINT_UNSET:
		if (raid_module->base_bdevs_constraint.value != 0) {
			SPDK_ERRLOG("Unexpected constraint value '%u' provided for raid bdev '%s'.\n",
				    (uint8_t)raid_module->base_bdevs_constraint.value, name);
			return -EINVAL;
		}
		min_operational = num_base_bdevs;
		break;
	case CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL:
		min_operational = raid_module->base_bdevs_constraint.value;
		break;
	case CONSTRAINT_MAX_BASE_BDEVS_REMOVED:
		min_operational = num_base_bdevs - raid_module->base_bdevs_constraint.value;
		break;
	default:
		SPDK_ERRLOG("Unrecognised constraint type '%u' in module for raid level '%s'.\n",
			    (uint8_t)raid_module->base_bdevs_constraint.type,
			    raid_bdev_level_to_str(raid_module->level));
		return -EINVAL;
	}

	if (min_operational == 0 || min_operational > num_base_bdevs) {
		SPDK_ERRLOG("Wrong constraint value for raid level '%s'.\n",
			    raid_bdev_level_to_str(raid_module->level));
		return -EINVAL;
	}

	new_raid = calloc(1, sizeof(*new_raid));
	if (new_raid == NULL) {
		SPDK_ERRLOG("Unable to allocate memory for raid bdev\n");
		return -ENOMEM;
	}

	/* From here on raid_bdev_free() undoes everything, including the lock */
	spdk_spin_init(&new_raid->base_bdev_lock);
	new_raid->module = raid_module;
	new_raid->num_base_bdevs = num_base_bdevs;
	new_raid->base_bdev_info = calloc(new_raid->num_base_bdevs,
					  sizeof(struct raid_base_bdev_info));
	if (new_raid->base_bdev_info == NULL) {
		SPDK_ERRLOG("Unable able to allocate base bdev info\n");
		raid_bdev_free(new_raid);
		return -ENOMEM;
	}

	RAID_FOR_EACH_BASE_BDEV(new_raid, slot) {
		slot->raid_bdev = new_raid;
	}

	/* strip_size_kb is from the rpc param. strip_size is in blocks and used
	 * internally and set later.
	 */
	new_raid->strip_size_kb = strip_size;
	new_raid->strip_size = 0;
	new_raid->level = level;
	new_raid->state = RAID_BDEV_STATE_CONFIGURING;
	new_raid->superblock_enabled = superblock_enabled;
	new_raid->min_base_bdevs_operational = min_operational;

	gen_bdev = &new_raid->bdev;

	gen_bdev->name = strdup(name);
	if (gen_bdev->name == NULL) {
		SPDK_ERRLOG("Unable to allocate name for raid\n");
		raid_bdev_free(new_raid);
		return -ENOMEM;
	}

	gen_bdev->product_name = "Raid Volume";
	gen_bdev->ctxt = new_raid;
	gen_bdev->fn_table = &g_raid_bdev_fn_table;
	gen_bdev->module = &g_raid_if;
	gen_bdev->write_cache = 0;
	spdk_uuid_copy(&gen_bdev->uuid, uuid);

	TAILQ_INSERT_TAIL(&g_raid_bdev_list, new_raid, global_link);

	*raid_bdev_out = new_raid;

	return 0;
}

/*
 * brief:
 * Check underlying block devices against support for metadata. Do not configure
 * md support when parameters from block devices are inconsistent.
 * params:
 * raid_bdev - pointer to raid bdev
 * returns:
 * 0 - The raid bdev md parameters were successfully configured.
 * non zero - Failed to configure md.
1085 */ 1086 static int 1087 raid_bdev_configure_md(struct raid_bdev *raid_bdev) 1088 { 1089 struct spdk_bdev *base_bdev; 1090 uint8_t i; 1091 1092 for (i = 0; i < raid_bdev->num_base_bdevs; i++) { 1093 base_bdev = spdk_bdev_desc_get_bdev(raid_bdev->base_bdev_info[i].desc); 1094 1095 /* Currently, RAID bdevs do not support DIF or DIX, so a RAID bdev cannot 1096 * be created on top of any bdev which supports it */ 1097 if (spdk_bdev_get_dif_type(base_bdev) != SPDK_DIF_DISABLE) { 1098 SPDK_ERRLOG("at least one base bdev has DIF or DIX enabled " 1099 "- unsupported RAID configuration\n"); 1100 return -EPERM; 1101 } 1102 1103 if (i == 0) { 1104 raid_bdev->bdev.md_len = spdk_bdev_get_md_size(base_bdev); 1105 raid_bdev->bdev.md_interleave = spdk_bdev_is_md_interleaved(base_bdev); 1106 continue; 1107 } 1108 1109 if (raid_bdev->bdev.md_len != spdk_bdev_get_md_size(base_bdev) || 1110 raid_bdev->bdev.md_interleave != spdk_bdev_is_md_interleaved(base_bdev)) { 1111 SPDK_ERRLOG("base bdevs are configured with different metadata formats\n"); 1112 return -EPERM; 1113 } 1114 } 1115 1116 return 0; 1117 } 1118 1119 /* 1120 * brief: 1121 * If raid bdev config is complete, then only register the raid bdev to 1122 * bdev layer and remove this raid bdev from configuring list and 1123 * insert the raid bdev to configured list 1124 * params: 1125 * raid_bdev - pointer to raid bdev 1126 * returns: 1127 * 0 - success 1128 * non zero - failure 1129 */ 1130 static int 1131 raid_bdev_configure(struct raid_bdev *raid_bdev) 1132 { 1133 uint32_t blocklen = 0; 1134 struct spdk_bdev *raid_bdev_gen; 1135 struct raid_base_bdev_info *base_info; 1136 struct spdk_bdev *base_bdev; 1137 int rc = 0; 1138 1139 assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING); 1140 assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs); 1141 1142 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1143 assert(base_info->desc != NULL); 1144 base_bdev = spdk_bdev_desc_get_bdev(base_info->desc); 1145 /* 
Check blocklen for all base bdevs that it should be same */ 1146 if (blocklen == 0) { 1147 blocklen = base_bdev->blocklen; 1148 } else if (blocklen != base_bdev->blocklen) { 1149 /* 1150 * Assumption is that all the base bdevs for any raid bdev should 1151 * have same blocklen 1152 */ 1153 SPDK_ERRLOG("Blocklen of various bdevs not matching\n"); 1154 return -EINVAL; 1155 } 1156 } 1157 assert(blocklen > 0); 1158 1159 /* The strip_size_kb is read in from user in KB. Convert to blocks here for 1160 * internal use. 1161 */ 1162 raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen; 1163 if (raid_bdev->strip_size == 0 && raid_bdev->level != RAID1) { 1164 SPDK_ERRLOG("Strip size cannot be smaller than the device block size\n"); 1165 return -EINVAL; 1166 } 1167 raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size); 1168 raid_bdev->blocklen_shift = spdk_u32log2(blocklen); 1169 1170 raid_bdev_gen = &raid_bdev->bdev; 1171 raid_bdev_gen->blocklen = blocklen; 1172 1173 rc = raid_bdev_configure_md(raid_bdev); 1174 if (rc != 0) { 1175 SPDK_ERRLOG("raid metadata configuration failed\n"); 1176 return rc; 1177 } 1178 1179 rc = raid_bdev->module->start(raid_bdev); 1180 if (rc != 0) { 1181 SPDK_ERRLOG("raid module startup callback failed\n"); 1182 return rc; 1183 } 1184 raid_bdev->state = RAID_BDEV_STATE_ONLINE; 1185 SPDK_DEBUGLOG(bdev_raid, "io device register %p\n", raid_bdev); 1186 SPDK_DEBUGLOG(bdev_raid, "blockcnt %" PRIu64 ", blocklen %u\n", 1187 raid_bdev_gen->blockcnt, raid_bdev_gen->blocklen); 1188 spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb, 1189 sizeof(struct raid_bdev_io_channel), 1190 raid_bdev->bdev.name); 1191 rc = spdk_bdev_register(raid_bdev_gen); 1192 if (rc != 0) { 1193 SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n"); 1194 if (raid_bdev->module->stop != NULL) { 1195 raid_bdev->module->stop(raid_bdev); 1196 } 1197 spdk_io_device_unregister(raid_bdev, NULL); 1198 raid_bdev->state 
= RAID_BDEV_STATE_CONFIGURING; 1199 return rc; 1200 } 1201 SPDK_DEBUGLOG(bdev_raid, "raid bdev generic %p\n", raid_bdev_gen); 1202 SPDK_DEBUGLOG(bdev_raid, "raid bdev is created with name %s, raid_bdev %p\n", 1203 raid_bdev_gen->name, raid_bdev); 1204 1205 return 0; 1206 } 1207 1208 /* 1209 * brief: 1210 * If raid bdev is online and registered, change the bdev state to 1211 * configuring and unregister this raid device. Queue this raid device 1212 * in configuring list 1213 * params: 1214 * raid_bdev - pointer to raid bdev 1215 * cb_fn - callback function 1216 * cb_arg - argument to callback function 1217 * returns: 1218 * none 1219 */ 1220 static void 1221 raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, 1222 void *cb_arg) 1223 { 1224 if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { 1225 if (cb_fn) { 1226 cb_fn(cb_arg, 0); 1227 } 1228 return; 1229 } 1230 1231 raid_bdev->state = RAID_BDEV_STATE_OFFLINE; 1232 assert(raid_bdev->num_base_bdevs_discovered); 1233 SPDK_DEBUGLOG(bdev_raid, "raid bdev state changing from online to offline\n"); 1234 1235 spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg); 1236 } 1237 1238 /* 1239 * brief: 1240 * raid_bdev_find_base_info_by_bdev function finds the base bdev info by bdev. 1241 * params: 1242 * base_bdev - pointer to base bdev 1243 * returns: 1244 * base bdev info if found, otherwise NULL. 
1245 */ 1246 static struct raid_base_bdev_info * 1247 raid_bdev_find_base_info_by_bdev(struct spdk_bdev *base_bdev) 1248 { 1249 struct raid_bdev *raid_bdev; 1250 struct raid_base_bdev_info *base_info; 1251 1252 TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { 1253 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1254 if (base_info->desc != NULL && 1255 spdk_bdev_desc_get_bdev(base_info->desc) == base_bdev) { 1256 return base_info; 1257 } 1258 } 1259 } 1260 1261 return NULL; 1262 } 1263 1264 static void 1265 raid_bdev_remove_base_bdev_on_unquiesced(void *ctx, int status) 1266 { 1267 struct raid_base_bdev_info *base_info = ctx; 1268 struct raid_bdev *raid_bdev = base_info->raid_bdev; 1269 1270 base_info->remove_scheduled = false; 1271 1272 if (status != 0) { 1273 SPDK_ERRLOG("Failed to unquiesce raid bdev %s: %s\n", 1274 raid_bdev->bdev.name, spdk_strerror(-status)); 1275 goto out; 1276 } 1277 1278 spdk_spin_lock(&raid_bdev->base_bdev_lock); 1279 raid_bdev_free_base_bdev_resource(base_info); 1280 spdk_spin_unlock(&raid_bdev->base_bdev_lock); 1281 out: 1282 if (base_info->remove_cb != NULL) { 1283 base_info->remove_cb(base_info->remove_cb_ctx, status); 1284 } 1285 } 1286 1287 static void 1288 raid_bdev_channel_remove_base_bdev(struct spdk_io_channel_iter *i) 1289 { 1290 struct raid_base_bdev_info *base_info = spdk_io_channel_iter_get_ctx(i); 1291 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 1292 struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(ch); 1293 uint8_t idx = base_info - base_info->raid_bdev->base_bdev_info; 1294 1295 SPDK_DEBUGLOG(bdev_raid, "slot: %u raid_ch: %p\n", idx, raid_ch); 1296 1297 if (raid_ch->base_channel[idx] != NULL) { 1298 spdk_put_io_channel(raid_ch->base_channel[idx]); 1299 raid_ch->base_channel[idx] = NULL; 1300 } 1301 1302 spdk_for_each_channel_continue(i, 0); 1303 } 1304 1305 static void 1306 raid_bdev_channels_remove_base_bdev_done(struct spdk_io_channel_iter *i, int status) 1307 { 1308 struct 
raid_base_bdev_info *base_info = spdk_io_channel_iter_get_ctx(i); 1309 struct raid_bdev *raid_bdev = base_info->raid_bdev; 1310 1311 spdk_bdev_unquiesce(&raid_bdev->bdev, &g_raid_if, raid_bdev_remove_base_bdev_on_unquiesced, 1312 base_info); 1313 } 1314 1315 static void 1316 raid_bdev_remove_base_bdev_on_quiesced(void *ctx, int status) 1317 { 1318 struct raid_base_bdev_info *base_info = ctx; 1319 struct raid_bdev *raid_bdev = base_info->raid_bdev; 1320 1321 if (status != 0) { 1322 SPDK_ERRLOG("Failed to quiesce raid bdev %s: %s\n", 1323 raid_bdev->bdev.name, spdk_strerror(-status)); 1324 base_info->remove_scheduled = false; 1325 if (base_info->remove_cb != NULL) { 1326 base_info->remove_cb(base_info->remove_cb_ctx, status); 1327 } 1328 return; 1329 } 1330 1331 spdk_for_each_channel(raid_bdev, raid_bdev_channel_remove_base_bdev, base_info, 1332 raid_bdev_channels_remove_base_bdev_done); 1333 } 1334 1335 /* 1336 * brief: 1337 * raid_bdev_remove_base_bdev function is called by below layers when base_bdev 1338 * is removed. This function checks if this base bdev is part of any raid bdev 1339 * or not. If yes, it takes necessary action on that particular raid bdev. 
1340 * params: 1341 * base_bdev - pointer to base bdev which got removed 1342 * cb_fn - callback function 1343 * cb_arg - argument to callback function 1344 * returns: 1345 * 0 - success 1346 * non zero - failure 1347 */ 1348 int 1349 raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_bdev_remove_base_bdev_cb cb_fn, 1350 void *cb_ctx) 1351 { 1352 struct raid_bdev *raid_bdev; 1353 struct raid_base_bdev_info *base_info; 1354 1355 SPDK_DEBUGLOG(bdev_raid, "%s\n", base_bdev->name); 1356 1357 /* Find the raid_bdev which has claimed this base_bdev */ 1358 base_info = raid_bdev_find_base_info_by_bdev(base_bdev); 1359 if (!base_info) { 1360 SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name); 1361 return -ENODEV; 1362 } 1363 raid_bdev = base_info->raid_bdev; 1364 1365 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 1366 1367 if (base_info->remove_scheduled) { 1368 return 0; 1369 } 1370 1371 assert(base_info->desc); 1372 base_info->remove_scheduled = true; 1373 base_info->remove_cb = cb_fn; 1374 base_info->remove_cb_ctx = cb_ctx; 1375 1376 if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { 1377 /* 1378 * As raid bdev is not registered yet or already unregistered, 1379 * so cleanup should be done here itself. 1380 */ 1381 raid_bdev_free_base_bdev_resource(base_info); 1382 if (raid_bdev->num_base_bdevs_discovered == 0) { 1383 /* There is no base bdev for this raid, so free the raid device. */ 1384 raid_bdev_cleanup_and_free(raid_bdev); 1385 } 1386 } else if (raid_bdev->num_base_bdevs_discovered == raid_bdev->min_base_bdevs_operational) { 1387 /* 1388 * After this base bdev is removed there will not be enough base bdevs 1389 * to keep the raid bdev operational. 
1390 */ 1391 raid_bdev_deconfigure(raid_bdev, cb_fn, cb_ctx); 1392 } else { 1393 int ret; 1394 1395 ret = spdk_bdev_quiesce(&raid_bdev->bdev, &g_raid_if, 1396 raid_bdev_remove_base_bdev_on_quiesced, base_info); 1397 if (ret != 0) { 1398 base_info->remove_scheduled = false; 1399 } 1400 } 1401 1402 return 0; 1403 } 1404 1405 /* 1406 * brief: 1407 * raid_bdev_resize_base_bdev function is called by below layers when base_bdev 1408 * is resized. This function checks if the smallest size of the base_bdevs is changed. 1409 * If yes, call module handler to resize the raid_bdev if implemented. 1410 * params: 1411 * base_bdev - pointer to base bdev which got resized. 1412 * returns: 1413 * none 1414 */ 1415 static void 1416 raid_bdev_resize_base_bdev(struct spdk_bdev *base_bdev) 1417 { 1418 struct raid_bdev *raid_bdev; 1419 struct raid_base_bdev_info *base_info; 1420 1421 SPDK_DEBUGLOG(bdev_raid, "raid_bdev_resize_base_bdev\n"); 1422 1423 base_info = raid_bdev_find_base_info_by_bdev(base_bdev); 1424 1425 /* Find the raid_bdev which has claimed this base_bdev */ 1426 if (!base_info) { 1427 SPDK_ERRLOG("raid_bdev whose base_bdev '%s' not found\n", base_bdev->name); 1428 return; 1429 } 1430 raid_bdev = base_info->raid_bdev; 1431 1432 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 1433 1434 SPDK_NOTICELOG("base_bdev '%s' was resized: old size %" PRIu64 ", new size %" PRIu64 "\n", 1435 base_bdev->name, base_info->blockcnt, base_bdev->blockcnt); 1436 1437 if (raid_bdev->module->resize) { 1438 raid_bdev->module->resize(raid_bdev); 1439 } 1440 } 1441 1442 /* 1443 * brief: 1444 * raid_bdev_event_base_bdev function is called by below layers when base_bdev 1445 * triggers asynchronous event. 1446 * params: 1447 * type - event details. 1448 * bdev - bdev that triggered event. 1449 * event_ctx - context for event. 
1450 * returns: 1451 * none 1452 */ 1453 static void 1454 raid_bdev_event_base_bdev(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, 1455 void *event_ctx) 1456 { 1457 int rc; 1458 1459 switch (type) { 1460 case SPDK_BDEV_EVENT_REMOVE: 1461 rc = raid_bdev_remove_base_bdev(bdev, NULL, NULL); 1462 if (rc != 0) { 1463 SPDK_ERRLOG("Failed to remove base bdev %s: %s\n", 1464 spdk_bdev_get_name(bdev), spdk_strerror(-rc)); 1465 } 1466 break; 1467 case SPDK_BDEV_EVENT_RESIZE: 1468 raid_bdev_resize_base_bdev(bdev); 1469 break; 1470 default: 1471 SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); 1472 break; 1473 } 1474 } 1475 1476 /* 1477 * brief: 1478 * Deletes the specified raid bdev 1479 * params: 1480 * raid_bdev - pointer to raid bdev 1481 * cb_fn - callback function 1482 * cb_arg - argument to callback function 1483 */ 1484 void 1485 raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_arg) 1486 { 1487 struct raid_base_bdev_info *base_info; 1488 1489 SPDK_DEBUGLOG(bdev_raid, "delete raid bdev: %s\n", raid_bdev->bdev.name); 1490 1491 if (raid_bdev->destroy_started) { 1492 SPDK_DEBUGLOG(bdev_raid, "destroying raid bdev %s is already started\n", 1493 raid_bdev->bdev.name); 1494 if (cb_fn) { 1495 cb_fn(cb_arg, -EALREADY); 1496 } 1497 return; 1498 } 1499 1500 raid_bdev->destroy_started = true; 1501 1502 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1503 base_info->remove_scheduled = true; 1504 1505 if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { 1506 /* 1507 * As raid bdev is not registered yet or already unregistered, 1508 * so cleanup should be done here itself. 1509 */ 1510 raid_bdev_free_base_bdev_resource(base_info); 1511 } 1512 } 1513 1514 if (raid_bdev->num_base_bdevs_discovered == 0) { 1515 /* There is no base bdev for this raid, so free the raid device. 
*/ 1516 raid_bdev_cleanup_and_free(raid_bdev); 1517 if (cb_fn) { 1518 cb_fn(cb_arg, 0); 1519 } 1520 } else { 1521 raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg); 1522 } 1523 } 1524 1525 static int 1526 raid_bdev_configure_base_bdev(struct raid_base_bdev_info *base_info) 1527 { 1528 struct raid_bdev *raid_bdev = base_info->raid_bdev; 1529 struct spdk_bdev_desc *desc; 1530 struct spdk_bdev *bdev; 1531 int rc; 1532 1533 assert(spdk_get_thread() == spdk_thread_get_app_thread()); 1534 assert(base_info->name != NULL); 1535 assert(base_info->desc == NULL); 1536 1537 rc = spdk_bdev_open_ext(base_info->name, true, raid_bdev_event_base_bdev, NULL, &desc); 1538 if (rc != 0) { 1539 if (rc != -ENODEV) { 1540 SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", base_info->name); 1541 } 1542 return rc; 1543 } 1544 1545 bdev = spdk_bdev_desc_get_bdev(desc); 1546 1547 rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if); 1548 if (rc != 0) { 1549 SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n"); 1550 spdk_bdev_close(desc); 1551 return rc; 1552 } 1553 1554 SPDK_DEBUGLOG(bdev_raid, "bdev %s is claimed\n", bdev->name); 1555 1556 assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); 1557 1558 base_info->desc = desc; 1559 base_info->blockcnt = bdev->blockcnt; 1560 base_info->data_offset = 0; 1561 base_info->data_size = bdev->blockcnt; 1562 raid_bdev->num_base_bdevs_discovered++; 1563 assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs); 1564 1565 if (raid_bdev->superblock_enabled) { 1566 assert((RAID_BDEV_MIN_DATA_OFFSET_SIZE % bdev->blocklen) == 0); 1567 base_info->data_offset = RAID_BDEV_MIN_DATA_OFFSET_SIZE / bdev->blocklen; 1568 1569 if (bdev->optimal_io_boundary) { 1570 base_info->data_offset = spdk_divide_round_up(base_info->data_offset, 1571 bdev->optimal_io_boundary) * bdev->optimal_io_boundary; 1572 } 1573 1574 if (base_info->data_offset >= bdev->blockcnt) { 1575 SPDK_ERRLOG("Data offset %lu exceeds base bdev capacity %lu on bdev 
'%s'\n", 1576 base_info->data_offset, bdev->blockcnt, base_info->name); 1577 return -EINVAL; 1578 } 1579 1580 base_info->data_size = bdev->blockcnt - base_info->data_offset; 1581 } 1582 1583 if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) { 1584 rc = raid_bdev_configure(raid_bdev); 1585 if (rc != 0) { 1586 SPDK_ERRLOG("Failed to configure raid bdev\n"); 1587 return rc; 1588 } 1589 } 1590 1591 return 0; 1592 } 1593 1594 /* 1595 * brief: 1596 * raid_bdev_add_base_device function is the actual function which either adds 1597 * the nvme base device to existing raid bdev or create a new raid bdev. It also claims 1598 * the base device and keep the open descriptor. 1599 * params: 1600 * raid_bdev - pointer to raid bdev 1601 * name - name of the base bdev 1602 * slot - position to add base bdev 1603 * returns: 1604 * 0 - success 1605 * non zero - failure 1606 */ 1607 int 1608 raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot) 1609 { 1610 struct raid_base_bdev_info *base_info; 1611 int rc; 1612 1613 if (slot >= raid_bdev->num_base_bdevs) { 1614 return -EINVAL; 1615 } 1616 1617 base_info = &raid_bdev->base_bdev_info[slot]; 1618 1619 if (base_info->name != NULL) { 1620 SPDK_ERRLOG("Slot %u on raid bdev '%s' already assigned to bdev '%s'\n", 1621 slot, raid_bdev->bdev.name, base_info->name); 1622 return -EBUSY; 1623 } 1624 1625 base_info->name = strdup(name); 1626 if (base_info->name == NULL) { 1627 return -ENOMEM; 1628 } 1629 1630 rc = raid_bdev_configure_base_bdev(base_info); 1631 if (rc != 0) { 1632 if (rc != -ENODEV) { 1633 SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", name); 1634 } 1635 return rc; 1636 } 1637 1638 return 0; 1639 } 1640 1641 /* 1642 * brief: 1643 * raid_bdev_examine function is the examine function call by the below layers 1644 * like bdev_nvme layer. This function will check if this base bdev can be 1645 * claimed by this raid bdev or not. 
1646 * params: 1647 * bdev - pointer to base bdev 1648 * returns: 1649 * none 1650 */ 1651 static void 1652 raid_bdev_examine(struct spdk_bdev *bdev) 1653 { 1654 struct raid_bdev *raid_bdev; 1655 struct raid_base_bdev_info *base_info; 1656 1657 TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { 1658 RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { 1659 if (base_info->desc == NULL && base_info->name != NULL && 1660 strcmp(bdev->name, base_info->name) == 0) { 1661 raid_bdev_configure_base_bdev(base_info); 1662 break; 1663 } 1664 } 1665 } 1666 1667 spdk_bdev_module_examine_done(&g_raid_if); 1668 } 1669 1670 /* Log component for bdev raid bdev module */ 1671 SPDK_LOG_REGISTER_COMPONENT(bdev_raid) 1672