1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2018 Intel Corporation. 3 * All rights reserved. 4 */ 5 6 #ifndef SPDK_BDEV_RAID_INTERNAL_H 7 #define SPDK_BDEV_RAID_INTERNAL_H 8 9 #include "spdk/bdev_module.h" 10 #include "spdk/uuid.h" 11 12 #define RAID_BDEV_MIN_DATA_OFFSET_SIZE (1024*1024) /* 1 MiB */ 13 14 enum raid_level { 15 INVALID_RAID_LEVEL = -1, 16 RAID0 = 0, 17 RAID1 = 1, 18 RAID5F = 95, /* 0x5f */ 19 CONCAT = 99, 20 }; 21 22 /* 23 * Raid state describes the state of the raid. This raid bdev can be either in 24 * configured list or configuring list 25 */ 26 enum raid_bdev_state { 27 /* raid bdev is ready and is seen by upper layers */ 28 RAID_BDEV_STATE_ONLINE, 29 30 /* 31 * raid bdev is configuring, not all underlying bdevs are present. 32 * And can't be seen by upper layers. 33 */ 34 RAID_BDEV_STATE_CONFIGURING, 35 36 /* 37 * In offline state, raid bdev layer will complete all incoming commands without 38 * submitting to underlying base nvme bdevs 39 */ 40 RAID_BDEV_STATE_OFFLINE, 41 42 /* raid bdev state max, new states should be added before this */ 43 RAID_BDEV_STATE_MAX 44 }; 45 46 typedef void (*raid_bdev_remove_base_bdev_cb)(void *ctx, int status); 47 48 /* 49 * raid_base_bdev_info contains information for the base bdevs which are part of some 50 * raid. This structure contains the per base bdev information. Whatever is 51 * required per base device for raid bdev will be kept here 52 */ 53 struct raid_base_bdev_info { 54 /* The raid bdev that this base bdev belongs to */ 55 struct raid_bdev *raid_bdev; 56 57 /* name of the bdev */ 58 char *name; 59 60 /* uuid of the bdev */ 61 struct spdk_uuid uuid; 62 63 /* 64 * Pointer to base bdev descriptor opened by raid bdev. This is NULL when the bdev for 65 * this slot is missing. 66 */ 67 struct spdk_bdev_desc *desc; 68 69 /* offset in blocks from the start of the base bdev to the start of the data region */ 70 uint64_t data_offset; 71 72 /* size in blocks of the base bdev's data region */ 73 uint64_t data_size; 74 75 /* 76 * When underlying base device calls the hot plug function on drive removal, 77 * this flag will be set and later after doing some processing, base device 78 * descriptor will be closed 79 */ 80 bool remove_scheduled; 81 82 /* callback for base bdev removal */ 83 raid_bdev_remove_base_bdev_cb remove_cb; 84 85 /* context of the callback */ 86 void *remove_cb_ctx; 87 88 /* Hold the number of blocks to know how large the base bdev is resized. */ 89 uint64_t blockcnt; 90 91 /* io channel for the app thread */ 92 struct spdk_io_channel *app_thread_ch; 93 94 /* Set to true when base bdev has completed the configuration process */ 95 bool is_configured; 96 }; 97 98 struct raid_bdev_io; 99 typedef void (*raid_bdev_io_completion_cb)(struct raid_bdev_io *raid_io, 100 enum spdk_bdev_io_status status); 101 102 /* 103 * raid_bdev_io is the context part of bdev_io. It contains the information 104 * related to bdev_io for a raid bdev 105 */ 106 struct raid_bdev_io { 107 /* The raid bdev associated with this IO */ 108 struct raid_bdev *raid_bdev; 109 110 uint64_t offset_blocks; 111 uint64_t num_blocks; 112 struct iovec *iovs; 113 int iovcnt; 114 enum spdk_bdev_io_type type; 115 struct spdk_memory_domain *memory_domain; 116 void *memory_domain_ctx; 117 void *md_buf; 118 119 /* WaitQ entry, used only in waitq logic */ 120 struct spdk_bdev_io_wait_entry waitq_entry; 121 122 /* Context of the original channel for this IO */ 123 struct raid_bdev_io_channel *raid_ch; 124 125 /* Used for tracking progress on io requests sent to member disks. */ 126 uint64_t base_bdev_io_remaining; 127 uint8_t base_bdev_io_submitted; 128 enum spdk_bdev_io_status base_bdev_io_status; 129 130 /* Private data for the raid module */ 131 void *module_private; 132 133 /* Custom completion callback. Overrides bdev_io completion if set. */ 134 raid_bdev_io_completion_cb completion_cb; 135 }; 136 137 /* 138 * raid_bdev is the single entity structure which contains SPDK block device 139 * and the information related to any raid bdev either configured or 140 * in configuring list. io device is created on this. 141 */ 142 struct raid_bdev { 143 /* raid bdev device, this will get registered in bdev layer */ 144 struct spdk_bdev bdev; 145 146 /* link of raid bdev to link it to global raid bdev list */ 147 TAILQ_ENTRY(raid_bdev) global_link; 148 149 /* array of base bdev info */ 150 struct raid_base_bdev_info *base_bdev_info; 151 152 /* lock to protect the base bdev array */ 153 struct spdk_spinlock base_bdev_lock; 154 155 /* strip size of raid bdev in blocks */ 156 uint32_t strip_size; 157 158 /* strip size of raid bdev in KB */ 159 uint32_t strip_size_kb; 160 161 /* strip size bit shift for optimized calculation */ 162 uint32_t strip_size_shift; 163 164 /* block length bit shift for optimized calculation */ 165 uint32_t blocklen_shift; 166 167 /* state of raid bdev */ 168 enum raid_bdev_state state; 169 170 /* number of base bdevs comprising raid bdev */ 171 uint8_t num_base_bdevs; 172 173 /* number of base bdevs discovered */ 174 uint8_t num_base_bdevs_discovered; 175 176 /* 177 * Number of operational base bdevs, i.e. how many we know/expect to be working. This 178 * will be less than num_base_bdevs when starting a degraded array. 179 */ 180 uint8_t num_base_bdevs_operational; 181 182 /* minimum number of viable base bdevs that are required by array to operate */ 183 uint8_t min_base_bdevs_operational; 184 185 /* Raid Level of this raid bdev */ 186 enum raid_level level; 187 188 /* Set to true if destroy of this raid bdev is started. */ 189 bool destroy_started; 190 191 /* Module for RAID-level specific operations */ 192 struct raid_bdev_module *module; 193 194 /* Private data for the raid module */ 195 void *module_private; 196 197 /* Superblock */ 198 struct raid_bdev_superblock *sb; 199 }; 200 201 #define RAID_FOR_EACH_BASE_BDEV(r, i) \ 202 for (i = r->base_bdev_info; i < r->base_bdev_info + r->num_base_bdevs; i++) 203 204 struct raid_bdev_io_channel; 205 206 /* TAIL head for raid bdev list */ 207 TAILQ_HEAD(raid_all_tailq, raid_bdev); 208 209 extern struct raid_all_tailq g_raid_bdev_list; 210 211 typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc); 212 213 int raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs, 214 enum raid_level level, bool superblock, const struct spdk_uuid *uuid, 215 struct raid_bdev **raid_bdev_out); 216 void raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_ctx); 217 int raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot); 218 struct raid_bdev *raid_bdev_find_by_name(const char *name); 219 enum raid_level raid_bdev_str_to_level(const char *str); 220 const char *raid_bdev_level_to_str(enum raid_level level); 221 enum raid_bdev_state raid_bdev_str_to_state(const char *str); 222 const char *raid_bdev_state_to_str(enum raid_bdev_state state); 223 void raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w); 224 int raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_bdev_remove_base_bdev_cb cb_fn, 225 void *cb_ctx); 226 struct spdk_io_channel *raid_bdev_channel_get_base_channel(struct raid_bdev_io_channel *raid_ch, 227 uint8_t idx); 228 void *raid_bdev_channel_get_module_ctx(struct raid_bdev_io_channel *raid_ch); 229 230 /* 231 * RAID module descriptor 232 */ 233 struct raid_bdev_module { 234 /* RAID level implemented by this module */ 235 enum raid_level level; 236 237 /* Minimum required number of base bdevs. Must be > 0. */ 238 uint8_t base_bdevs_min; 239 240 /* 241 * RAID constraint. Determines number of base bdevs that can be removed 242 * without failing the array. 243 */ 244 struct { 245 enum { 246 CONSTRAINT_UNSET = 0, 247 CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 248 CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL, 249 } type; 250 uint8_t value; 251 } base_bdevs_constraint; 252 253 /* Set to true if this module supports memory domains. */ 254 bool memory_domains_supported; 255 256 /* 257 * Called when the raid is starting, right before changing the state to 258 * online and registering the bdev. Parameters of the bdev like blockcnt 259 * should be set here. 260 * 261 * Non-zero return value will abort the startup process. 262 */ 263 int (*start)(struct raid_bdev *raid_bdev); 264 265 /* 266 * Called when the raid is stopping, right before changing the state to 267 * offline and unregistering the bdev. Optional. 268 * 269 * The function should return false if it is asynchronous. Then, after 270 * the async operation has completed and the module is fully stopped 271 * raid_bdev_module_stop_done() must be called. 272 */ 273 bool (*stop)(struct raid_bdev *raid_bdev); 274 275 /* Handler for R/W requests */ 276 void (*submit_rw_request)(struct raid_bdev_io *raid_io); 277 278 /* Handler for requests without payload (flush, unmap). Optional. */ 279 void (*submit_null_payload_request)(struct raid_bdev_io *raid_io); 280 281 /* 282 * Called when the bdev's IO channel is created to get the module's private IO channel. 283 * Optional. 284 */ 285 struct spdk_io_channel *(*get_io_channel)(struct raid_bdev *raid_bdev); 286 287 /* 288 * Called when a base_bdev is resized to resize the raid if the condition 289 * is satisfied. 290 */ 291 void (*resize)(struct raid_bdev *raid_bdev); 292 293 TAILQ_ENTRY(raid_bdev_module) link; 294 }; 295 296 void raid_bdev_module_list_add(struct raid_bdev_module *raid_module); 297 298 #define __RAID_MODULE_REGISTER(line) __RAID_MODULE_REGISTER_(line) 299 #define __RAID_MODULE_REGISTER_(line) raid_module_register_##line 300 301 #define RAID_MODULE_REGISTER(_module) \ 302 __attribute__((constructor)) static void \ 303 __RAID_MODULE_REGISTER(__LINE__)(void) \ 304 { \ 305 raid_bdev_module_list_add(_module); \ 306 } 307 308 bool raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, 309 enum spdk_bdev_io_status status); 310 void raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, 311 struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn); 312 void raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status); 313 void raid_bdev_module_stop_done(struct raid_bdev *raid_bdev); 314 void raid_bdev_io_init(struct raid_bdev_io *raid_io, struct raid_bdev_io_channel *raid_ch, 315 enum spdk_bdev_io_type type, uint64_t offset_blocks, 316 uint64_t num_blocks, struct iovec *iovs, int iovcnt, void *md_buf, 317 struct spdk_memory_domain *memory_domain, void *memory_domain_ctx); 318 319 static inline uint8_t 320 raid_bdev_base_bdev_slot(struct raid_base_bdev_info *base_info) 321 { 322 return base_info - base_info->raid_bdev->base_bdev_info; 323 } 324 325 /** 326 * Raid bdev I/O read/write wrapper for spdk_bdev_readv_blocks_ext function. 327 */ 328 static inline int 329 raid_bdev_readv_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 330 struct iovec *iov, int iovcnt, uint64_t offset_blocks, 331 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 332 struct spdk_bdev_ext_io_opts *opts) 333 { 334 return spdk_bdev_readv_blocks_ext(base_info->desc, ch, iov, iovcnt, 335 base_info->data_offset + offset_blocks, num_blocks, cb, cb_arg, opts); 336 } 337 338 /** 339 * Raid bdev I/O read/write wrapper for spdk_bdev_writev_blocks_ext function. 340 */ 341 static inline int 342 raid_bdev_writev_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 343 struct iovec *iov, int iovcnt, uint64_t offset_blocks, 344 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 345 struct spdk_bdev_ext_io_opts *opts) 346 { 347 return spdk_bdev_writev_blocks_ext(base_info->desc, ch, iov, iovcnt, 348 base_info->data_offset + offset_blocks, num_blocks, cb, cb_arg, opts); 349 } 350 351 /** 352 * Raid bdev I/O read/write wrapper for spdk_bdev_unmap_blocks function. 353 */ 354 static inline int 355 raid_bdev_unmap_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 356 uint64_t offset_blocks, uint64_t num_blocks, 357 spdk_bdev_io_completion_cb cb, void *cb_arg) 358 { 359 return spdk_bdev_unmap_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks, 360 num_blocks, cb, cb_arg); 361 } 362 363 /** 364 * Raid bdev I/O read/write wrapper for spdk_bdev_flush_blocks function. 365 */ 366 static inline int 367 raid_bdev_flush_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch, 368 uint64_t offset_blocks, uint64_t num_blocks, 369 spdk_bdev_io_completion_cb cb, void *cb_arg) 370 { 371 return spdk_bdev_flush_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks, 372 num_blocks, cb, cb_arg); 373 } 374 375 /* 376 * Definitions related to raid bdev superblock 377 */ 378 379 #define RAID_BDEV_SB_VERSION_MAJOR 1 380 #define RAID_BDEV_SB_VERSION_MINOR 0 381 382 #define RAID_BDEV_SB_NAME_SIZE 64 383 384 enum raid_bdev_sb_base_bdev_state { 385 RAID_SB_BASE_BDEV_MISSING = 0, 386 RAID_SB_BASE_BDEV_CONFIGURED = 1, 387 RAID_SB_BASE_BDEV_FAILED = 2, 388 RAID_SB_BASE_BDEV_SPARE = 3, 389 }; 390 391 struct raid_bdev_sb_base_bdev { 392 /* uuid of the base bdev */ 393 struct spdk_uuid uuid; 394 /* offset in blocks from base device start to the start of raid data area */ 395 uint64_t data_offset; 396 /* size in blocks of the base device raid data area */ 397 uint64_t data_size; 398 /* state of the base bdev */ 399 uint32_t state; 400 /* feature/status flags */ 401 uint32_t flags; 402 /* slot number of this base bdev in the raid */ 403 uint8_t slot; 404 405 uint8_t reserved[23]; 406 }; 407 SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_sb_base_bdev) == 64, "incorrect size"); 408 409 struct raid_bdev_superblock { 410 #define RAID_BDEV_SB_SIG "SPDKRAID" 411 uint8_t signature[8]; 412 struct { 413 /* incremented when a breaking change in the superblock structure is made */ 414 uint16_t major; 415 /* incremented for changes in the superblock that are backward compatible */ 416 uint16_t minor; 417 } version; 418 /* length in bytes of the entire superblock */ 419 uint32_t length; 420 /* crc32c checksum of the entire superblock */ 421 uint32_t crc; 422 /* feature/status flags */ 423 uint32_t flags; 424 /* unique id of the raid bdev */ 425 struct spdk_uuid uuid; 426 /* name of the raid bdev */ 427 uint8_t name[RAID_BDEV_SB_NAME_SIZE]; 428 /* size of the raid bdev in blocks */ 429 uint64_t raid_size; 430 /* the raid bdev block size - must be the same for all base bdevs */ 431 uint32_t block_size; 432 /* the raid level */ 433 uint32_t level; 434 /* strip (chunk) size in blocks */ 435 uint32_t strip_size; 436 /* state of the raid */ 437 uint32_t state; 438 /* sequence number, incremented on every superblock update */ 439 uint64_t seq_number; 440 /* number of raid base devices */ 441 uint8_t num_base_bdevs; 442 443 uint8_t reserved[118]; 444 445 /* size of the base bdevs array */ 446 uint8_t base_bdevs_size; 447 /* array of base bdev descriptors */ 448 struct raid_bdev_sb_base_bdev base_bdevs[]; 449 }; 450 SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_superblock) == 256, "incorrect size"); 451 452 #define RAID_BDEV_SB_MAX_LENGTH \ 453 SPDK_ALIGN_CEIL((sizeof(struct raid_bdev_superblock) + UINT8_MAX * sizeof(struct raid_bdev_sb_base_bdev)), 0x1000) 454 455 SPDK_STATIC_ASSERT(RAID_BDEV_SB_MAX_LENGTH < RAID_BDEV_MIN_DATA_OFFSET_SIZE, 456 "Incorrect min data offset"); 457 458 typedef void (*raid_bdev_write_sb_cb)(int status, struct raid_bdev *raid_bdev, void *ctx); 459 typedef void (*raid_bdev_load_sb_cb)(const struct raid_bdev_superblock *sb, int status, void *ctx); 460 461 void raid_bdev_init_superblock(struct raid_bdev *raid_bdev); 462 void raid_bdev_write_superblock(struct raid_bdev *raid_bdev, raid_bdev_write_sb_cb cb, 463 void *cb_ctx); 464 int raid_bdev_load_base_bdev_superblock(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 465 raid_bdev_load_sb_cb cb, void *cb_ctx); 466 467 #endif /* SPDK_BDEV_RAID_INTERNAL_H */ 468