1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2018 Intel Corporation. 3 * All rights reserved. 4 */ 5 6 #ifndef SPDK_BDEV_RAID_INTERNAL_H 7 #define SPDK_BDEV_RAID_INTERNAL_H 8 9 #include "spdk/bdev_module.h" 10 #include "spdk/uuid.h" 11 12 #define RAID_BDEV_MIN_DATA_OFFSET_SIZE (1024*1024) /* 1 MiB */ 13 14 enum raid_level { 15 INVALID_RAID_LEVEL = -1, 16 RAID0 = 0, 17 RAID1 = 1, 18 RAID5F = 95, /* 0x5f */ 19 CONCAT = 99, 20 }; 21 22 /* 23 * Raid state describes the state of the raid. This raid bdev can be either in 24 * configured list or configuring list 25 */ 26 enum raid_bdev_state { 27 /* raid bdev is ready and is seen by upper layers */ 28 RAID_BDEV_STATE_ONLINE, 29 30 /* 31 * raid bdev is configuring, not all underlying bdevs are present. 32 * And can't be seen by upper layers. 33 */ 34 RAID_BDEV_STATE_CONFIGURING, 35 36 /* 37 * In offline state, raid bdev layer will complete all incoming commands without 38 * submitting to underlying base nvme bdevs 39 */ 40 RAID_BDEV_STATE_OFFLINE, 41 42 /* raid bdev state max, new states should be added before this */ 43 RAID_BDEV_STATE_MAX 44 }; 45 46 enum raid_process_type { 47 RAID_PROCESS_NONE, 48 RAID_PROCESS_REBUILD, 49 RAID_PROCESS_MAX 50 }; 51 52 typedef void (*raid_base_bdev_cb)(void *ctx, int status); 53 54 /* 55 * raid_base_bdev_info contains information for the base bdevs which are part of some 56 * raid. This structure contains the per base bdev information. Whatever is 57 * required per base device for raid bdev will be kept here 58 */ 59 struct raid_base_bdev_info { 60 /* The raid bdev that this base bdev belongs to */ 61 struct raid_bdev *raid_bdev; 62 63 /* name of the bdev */ 64 char *name; 65 66 /* uuid of the bdev */ 67 struct spdk_uuid uuid; 68 69 /* 70 * Pointer to base bdev descriptor opened by raid bdev. This is NULL when the bdev for 71 * this slot is missing. 72 */ 73 struct spdk_bdev_desc *desc; 74 75 /* offset in blocks from the start of the base bdev to the start of the data region */ 76 uint64_t data_offset; 77 78 /* size in blocks of the base bdev's data region */ 79 uint64_t data_size; 80 81 /* 82 * When underlying base device calls the hot plug function on drive removal, 83 * this flag will be set and later after doing some processing, base device 84 * descriptor will be closed 85 */ 86 bool remove_scheduled; 87 88 /* callback for base bdev removal */ 89 raid_base_bdev_cb remove_cb; 90 91 /* context of the callback */ 92 void *remove_cb_ctx; 93 94 /* Hold the number of blocks to know how large the base bdev is resized. */ 95 uint64_t blockcnt; 96 97 /* io channel for the app thread */ 98 struct spdk_io_channel *app_thread_ch; 99 100 /* Set to true when base bdev has completed the configuration process */ 101 bool is_configured; 102 103 /* Set to true if this base bdev is the target of a background process */ 104 bool is_process_target; 105 106 /* Set to true to indicate that the base bdev is being removed because of a failure */ 107 bool is_failed; 108 109 /* callback for base bdev configuration */ 110 raid_base_bdev_cb configure_cb; 111 112 /* context of the callback */ 113 void *configure_cb_ctx; 114 }; 115 116 struct raid_bdev_io; 117 typedef void (*raid_bdev_io_completion_cb)(struct raid_bdev_io *raid_io, 118 enum spdk_bdev_io_status status); 119 120 /* 121 * raid_bdev_io is the context part of bdev_io. It contains the information 122 * related to bdev_io for a raid bdev 123 */ 124 struct raid_bdev_io { 125 /* The raid bdev associated with this IO */ 126 struct raid_bdev *raid_bdev; 127 128 uint64_t offset_blocks; 129 uint64_t num_blocks; 130 struct iovec *iovs; 131 int iovcnt; 132 enum spdk_bdev_io_type type; 133 struct spdk_memory_domain *memory_domain; 134 void *memory_domain_ctx; 135 void *md_buf; 136 137 /* WaitQ entry, used only in waitq logic */ 138 struct spdk_bdev_io_wait_entry waitq_entry; 139 140 /* Context of the original channel for this IO */ 141 struct raid_bdev_io_channel *raid_ch; 142 143 /* Used for tracking progress on io requests sent to member disks. */ 144 uint64_t base_bdev_io_remaining; 145 uint8_t base_bdev_io_submitted; 146 enum spdk_bdev_io_status base_bdev_io_status; 147 /* This will be the raid_io completion status unless any base io's status is different. */ 148 enum spdk_bdev_io_status base_bdev_io_status_default; 149 150 /* Private data for the raid module */ 151 void *module_private; 152 153 /* Custom completion callback. Overrides bdev_io completion if set. */ 154 raid_bdev_io_completion_cb completion_cb; 155 156 struct { 157 uint64_t offset; 158 struct iovec *iov; 159 struct iovec iov_copy; 160 } split; 161 }; 162 163 struct raid_bdev_process_request { 164 struct raid_bdev_process *process; 165 struct raid_base_bdev_info *target; 166 struct spdk_io_channel *target_ch; 167 uint64_t offset_blocks; 168 uint32_t num_blocks; 169 struct iovec iov; 170 void *md_buf; 171 /* bdev_io is raid_io's driver_ctx - don't reorder them! 172 * These are needed for re-using raid module I/O functions for process I/O. */ 173 struct spdk_bdev_io bdev_io; 174 struct raid_bdev_io raid_io; 175 TAILQ_ENTRY(raid_bdev_process_request) link; 176 }; 177 178 typedef void (*raid_bdev_configure_cb)(void *cb_ctx, int rc); 179 180 /* 181 * raid_bdev is the single entity structure which contains SPDK block device 182 * and the information related to any raid bdev either configured or 183 * in configuring list. io device is created on this. 184 */ 185 struct raid_bdev { 186 /* raid bdev device, this will get registered in bdev layer */ 187 struct spdk_bdev bdev; 188 189 /* the raid bdev descriptor, opened for internal use */ 190 struct spdk_bdev_desc *self_desc; 191 192 /* link of raid bdev to link it to global raid bdev list */ 193 TAILQ_ENTRY(raid_bdev) global_link; 194 195 /* array of base bdev info */ 196 struct raid_base_bdev_info *base_bdev_info; 197 198 /* strip size of raid bdev in blocks */ 199 uint32_t strip_size; 200 201 /* strip size of raid bdev in KB */ 202 uint32_t strip_size_kb; 203 204 /* strip size bit shift for optimized calculation */ 205 uint32_t strip_size_shift; 206 207 /* state of raid bdev */ 208 enum raid_bdev_state state; 209 210 /* number of base bdevs comprising raid bdev */ 211 uint8_t num_base_bdevs; 212 213 /* number of base bdevs discovered */ 214 uint8_t num_base_bdevs_discovered; 215 216 /* 217 * Number of operational base bdevs, i.e. how many we know/expect to be working. This 218 * will be less than num_base_bdevs when starting a degraded array. 219 */ 220 uint8_t num_base_bdevs_operational; 221 222 /* minimum number of viable base bdevs that are required by array to operate */ 223 uint8_t min_base_bdevs_operational; 224 225 /* Raid Level of this raid bdev */ 226 enum raid_level level; 227 228 /* Set to true if destroy of this raid bdev is started. */ 229 bool destroy_started; 230 231 /* Module for RAID-level specific operations */ 232 struct raid_bdev_module *module; 233 234 /* Private data for the raid module */ 235 void *module_private; 236 237 /* Superblock */ 238 bool superblock_enabled; 239 struct raid_bdev_superblock *sb; 240 241 /* Superblock buffer used for I/O */ 242 void *sb_io_buf; 243 uint32_t sb_io_buf_size; 244 245 /* Raid bdev background process, e.g. rebuild */ 246 struct raid_bdev_process *process; 247 248 /* Callback and context for raid_bdev configuration */ 249 raid_bdev_configure_cb configure_cb; 250 void *configure_cb_ctx; 251 }; 252 253 #define RAID_FOR_EACH_BASE_BDEV(r, i) \ 254 for (i = r->base_bdev_info; i < r->base_bdev_info + r->num_base_bdevs; i++) 255 256 struct raid_bdev_io_channel; 257 258 /* TAIL head for raid bdev list */ 259 TAILQ_HEAD(raid_all_tailq, raid_bdev); 260 261 extern struct raid_all_tailq g_raid_bdev_list; 262 263 typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc); 264 265 int raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs, 266 enum raid_level level, bool superblock, const struct spdk_uuid *uuid, 267 struct raid_bdev **raid_bdev_out); 268 void raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_ctx); 269 int raid_bdev_add_base_bdev(struct raid_bdev *raid_bdev, const char *name, 270 raid_base_bdev_cb cb_fn, void *cb_ctx); 271 struct raid_bdev *raid_bdev_find_by_name(const char *name); 272 enum raid_level raid_bdev_str_to_level(const char *str); 273 const char *raid_bdev_level_to_str(enum raid_level level); 274 enum raid_bdev_state raid_bdev_str_to_state(const char *str); 275 const char *raid_bdev_state_to_str(enum raid_bdev_state state); 276 const char *raid_bdev_process_to_str(enum raid_process_type value); 277 void raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w); 278 int raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_base_bdev_cb cb_fn, void *cb_ctx); 279 280 /* 281 * RAID module descriptor 282 */ 283 struct raid_bdev_module { 284 /* RAID level implemented by this module */ 285 enum raid_level level; 286 287 /* Minimum required number of base bdevs. Must be > 0. */ 288 uint8_t base_bdevs_min; 289 290 /* 291 * RAID constraint. Determines number of base bdevs that can be removed 292 * without failing the array. 293 */ 294 struct { 295 enum { 296 CONSTRAINT_UNSET = 0, 297 CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 298 CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL, 299 } type; 300 uint8_t value; 301 } base_bdevs_constraint; 302 303 /* Set to true if this module supports memory domains. */ 304 bool memory_domains_supported; 305 306 /* Set to true if this module supports DIF/DIX */ 307 bool dif_supported; 308 309 /* 310 * Called when the raid is starting, right before changing the state to 311 * online and registering the bdev. Parameters of the bdev like blockcnt 312 * should be set here. 313 * 314 * Non-zero return value will abort the startup process. 315 */ 316 int (*start)(struct raid_bdev *raid_bdev); 317 318 /* 319 * Called when the raid is stopping, right before changing the state to 320 * offline and unregistering the bdev. Optional. 321 * 322 * The function should return false if it is asynchronous. Then, after 323 * the async operation has completed and the module is fully stopped 324 * raid_bdev_module_stop_done() must be called. 325 */ 326 bool (*stop)(struct raid_bdev *raid_bdev); 327 328 /* Handler for R/W requests */ 329 void (*submit_rw_request)(struct raid_bdev_io *raid_io); 330 331 /* Handler for requests without payload (flush, unmap). Optional. */ 332 void (*submit_null_payload_request)(struct raid_bdev_io *raid_io); 333 334 /* 335 * Called when the bdev's IO channel is created to get the module's private IO channel. 336 * Optional. 337 */ 338 struct spdk_io_channel *(*get_io_channel)(struct raid_bdev *raid_bdev); 339 340 /* 341 * Called when a base_bdev is resized to resize the raid if the condition 342 * is satisfied. Optional. 343 * 344 * Returns true if the resize was performed. 345 */ 346 bool (*resize)(struct raid_bdev *raid_bdev); 347 348 /* Handler for raid process requests. Required for raid modules with redundancy. */ 349 int (*submit_process_request)(struct raid_bdev_process_request *process_req, 350 struct raid_bdev_io_channel *raid_ch); 351 352 TAILQ_ENTRY(raid_bdev_module) link; 353 }; 354 355 void raid_bdev_module_list_add(struct raid_bdev_module *raid_module); 356 357 #define __RAID_MODULE_REGISTER(line) __RAID_MODULE_REGISTER_(line) 358 #define __RAID_MODULE_REGISTER_(line) raid_module_register_##line 359 360 #define RAID_MODULE_REGISTER(_module) \ 361 __attribute__((constructor)) static void \ 362 __RAID_MODULE_REGISTER(__LINE__)(void) \ 363 { \ 364 raid_bdev_module_list_add(_module); \ 365 } 366 367 bool raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, 368 enum spdk_bdev_io_status status); 369 void raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, 370 struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn); 371 void raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status); 372 void raid_bdev_module_stop_done(struct raid_bdev *raid_bdev); 373 struct spdk_io_channel *raid_bdev_channel_get_base_channel(struct raid_bdev_io_channel *raid_ch, 374 uint8_t idx); 375 void *raid_bdev_channel_get_module_ctx(struct raid_bdev_io_channel *raid_ch); 376 struct raid_base_bdev_info *raid_bdev_channel_get_base_info(struct raid_bdev_io_channel *raid_ch, 377 struct spdk_bdev *base_bdev); 378 void raid_bdev_process_request_complete(struct raid_bdev_process_request *process_req, int status); 379 void raid_bdev_io_init(struct raid_bdev_io *raid_io, struct raid_bdev_io_channel *raid_ch, 380 enum spdk_bdev_io_type type, uint64_t offset_blocks, 381 uint64_t num_blocks, struct iovec *iovs, int iovcnt, void *md_buf, 382 struct spdk_memory_domain *memory_domain, void *memory_domain_ctx); 383 void raid_bdev_fail_base_bdev(struct raid_base_bdev_info *base_info); 384 385 static inline uint8_t 386 raid_bdev_base_bdev_slot(struct raid_base_bdev_info *base_info) 387 { 388 return base_info - base_info->raid_bdev->base_bdev_info; 389 } 390 391 static inline void 392 raid_bdev_io_set_default_status(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status) 393 { 394 assert(raid_io->base_bdev_io_submitted == 0); 395 raid_io->base_bdev_io_status = status; 396 raid_io->base_bdev_io_status_default = status; 397 } 398 399 int raid_bdev_remap_dix_reftag(void *md_buf, uint64_t num_blocks, 400 struct spdk_bdev *bdev, uint32_t remapped_offset); 401 int raid_bdev_verify_dix_reftag(struct iovec *iovs, int iovcnt, void *md_buf, 402 uint64_t num_blocks, struct spdk_bdev *bdev, uint32_t offset_blocks); 403 404 /** 405 * Raid bdev I/O read/write wrapper for spdk_bdev_readv_blocks_ext function. 406 */ 407 static inline int 408 raid_bdev_readv_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 409 struct iovec *iov, int iovcnt, uint64_t offset_blocks, 410 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 411 struct spdk_bdev_ext_io_opts *opts) 412 { 413 return spdk_bdev_readv_blocks_ext(base_info->desc, ch, iov, iovcnt, 414 base_info->data_offset + offset_blocks, num_blocks, cb, cb_arg, opts); 415 } 416 417 /** 418 * Raid bdev I/O read/write wrapper for spdk_bdev_writev_blocks_ext function. 419 */ 420 static inline int 421 raid_bdev_writev_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 422 struct iovec *iov, int iovcnt, uint64_t offset_blocks, 423 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 424 struct spdk_bdev_ext_io_opts *opts) 425 { 426 int rc; 427 uint64_t remapped_offset_blocks = base_info->data_offset + offset_blocks; 428 429 if (spdk_unlikely(spdk_bdev_get_dif_type(&base_info->raid_bdev->bdev) != SPDK_DIF_DISABLE && 430 (base_info->raid_bdev->bdev.dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK))) { 431 rc = raid_bdev_remap_dix_reftag(opts->metadata, num_blocks, &base_info->raid_bdev->bdev, 432 remapped_offset_blocks); 433 if (rc != 0) { 434 return rc; 435 } 436 } 437 438 return spdk_bdev_writev_blocks_ext(base_info->desc, ch, iov, iovcnt, 439 remapped_offset_blocks, num_blocks, cb, cb_arg, opts); 440 } 441 442 /** 443 * Raid bdev I/O read/write wrapper for spdk_bdev_unmap_blocks function. 444 */ 445 static inline int 446 raid_bdev_unmap_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 447 uint64_t offset_blocks, uint64_t num_blocks, 448 spdk_bdev_io_completion_cb cb, void *cb_arg) 449 { 450 return spdk_bdev_unmap_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks, 451 num_blocks, cb, cb_arg); 452 } 453 454 /** 455 * Raid bdev I/O read/write wrapper for spdk_bdev_flush_blocks function. 456 */ 457 static inline int 458 raid_bdev_flush_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 459 uint64_t offset_blocks, uint64_t num_blocks, 460 spdk_bdev_io_completion_cb cb, void *cb_arg) 461 { 462 return spdk_bdev_flush_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks, 463 num_blocks, cb, cb_arg); 464 } 465 466 /* 467 * Definitions related to raid bdev superblock 468 */ 469 470 #define RAID_BDEV_SB_VERSION_MAJOR 1 471 #define RAID_BDEV_SB_VERSION_MINOR 0 472 473 #define RAID_BDEV_SB_NAME_SIZE 64 474 475 enum raid_bdev_sb_base_bdev_state { 476 RAID_SB_BASE_BDEV_MISSING = 0, 477 RAID_SB_BASE_BDEV_CONFIGURED = 1, 478 RAID_SB_BASE_BDEV_FAILED = 2, 479 RAID_SB_BASE_BDEV_SPARE = 3, 480 }; 481 482 struct raid_bdev_sb_base_bdev { 483 /* uuid of the base bdev */ 484 struct spdk_uuid uuid; 485 /* offset in blocks from base device start to the start of raid data area */ 486 uint64_t data_offset; 487 /* size in blocks of the base device raid data area */ 488 uint64_t data_size; 489 /* state of the base bdev */ 490 uint32_t state; 491 /* feature/status flags */ 492 uint32_t flags; 493 /* slot number of this base bdev in the raid */ 494 uint8_t slot; 495 496 uint8_t reserved[23]; 497 }; 498 SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_sb_base_bdev) == 64, "incorrect size"); 499 500 struct raid_bdev_superblock { 501 #define RAID_BDEV_SB_SIG "SPDKRAID" 502 uint8_t signature[8]; 503 struct { 504 /* incremented when a breaking change in the superblock structure is made */ 505 uint16_t major; 506 /* incremented for changes in the superblock that are backward compatible */ 507 uint16_t minor; 508 } version; 509 /* length in bytes of the entire superblock */ 510 uint32_t length; 511 /* crc32c checksum of the entire superblock */ 512 uint32_t crc; 513 /* feature/status flags */ 514 uint32_t flags; 515 /* unique id of the raid bdev */ 516 struct spdk_uuid uuid; 517 /* name of the raid bdev */ 518 uint8_t name[RAID_BDEV_SB_NAME_SIZE]; 519 /* size of the raid bdev in blocks */ 520 uint64_t raid_size; 521 /* the raid bdev block size - must be the same for all base bdevs */ 522 uint32_t block_size; 523 /* the raid level */ 524 uint32_t level; 525 /* strip (chunk) size in blocks */ 526 uint32_t strip_size; 527 /* state of the raid */ 528 uint32_t state; 529 /* sequence number, incremented on every superblock update */ 530 uint64_t seq_number; 531 /* number of raid base devices */ 532 uint8_t num_base_bdevs; 533 534 uint8_t reserved[118]; 535 536 /* size of the base bdevs array */ 537 uint8_t base_bdevs_size; 538 /* array of base bdev descriptors */ 539 struct raid_bdev_sb_base_bdev base_bdevs[]; 540 }; 541 SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_superblock) == 256, "incorrect size"); 542 543 #define RAID_BDEV_SB_MAX_LENGTH (sizeof(struct raid_bdev_superblock) + UINT8_MAX * sizeof(struct raid_bdev_sb_base_bdev)) 544 545 SPDK_STATIC_ASSERT(RAID_BDEV_SB_MAX_LENGTH < RAID_BDEV_MIN_DATA_OFFSET_SIZE, 546 "Incorrect min data offset"); 547 548 typedef void (*raid_bdev_write_sb_cb)(int status, struct raid_bdev *raid_bdev, void *ctx); 549 typedef void (*raid_bdev_load_sb_cb)(const struct raid_bdev_superblock *sb, int status, void *ctx); 550 551 int raid_bdev_alloc_superblock(struct raid_bdev *raid_bdev, uint32_t block_size); 552 void raid_bdev_free_superblock(struct raid_bdev *raid_bdev); 553 void raid_bdev_init_superblock(struct raid_bdev *raid_bdev); 554 void raid_bdev_write_superblock(struct raid_bdev *raid_bdev, raid_bdev_write_sb_cb cb, 555 void *cb_ctx); 556 int raid_bdev_load_base_bdev_superblock(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 557 raid_bdev_load_sb_cb cb, void *cb_ctx); 558 559 struct spdk_raid_bdev_opts { 560 /* Size of the background process window in KiB */ 561 uint32_t process_window_size_kb; 562 /* Maximum bandwidth in MiB to process per second */ 563 uint32_t process_max_bandwidth_mb_sec; 564 }; 565 566 void raid_bdev_get_opts(struct spdk_raid_bdev_opts *opts); 567 int raid_bdev_set_opts(const struct spdk_raid_bdev_opts *opts); 568 569 #endif /* SPDK_BDEV_RAID_INTERNAL_H */ 570