/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2019 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by lower layers to notify the raid
 * module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to lower layers (child io)
 * success - bdev_io completion status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}

/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
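/*
 * Illustrative mapping example (hypothetical numbers, not taken from any real
 * configuration): with strip_size = 128 blocks (strip_size_shift = 7) and
 * num_base_bdevs = 4, a read at offset_blocks = 1000, num_blocks = 16 maps as:
 *   start_strip     = 1000 >> 7 = 7   (end_strip = (1000 + 16 - 1) >> 7 = 7,
 *                                      so the I/O stays within one strip)
 *   pd_strip        = 7 / 4 = 1
 *   pd_idx          = 7 % 4 = 3       (member disk 3)
 *   offset_in_strip = 1000 & 127 = 104
 *   pd_lba          = (1 << 7) + 104 = 232
 *   pd_blocks       = 16
 */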
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_ext_io_opts io_opts = {};
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint64_t pd_strip;
	uint32_t offset_in_strip;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint8_t pd_idx;
	int ret = 0;
	uint64_t start_strip;
	uint64_t end_strip;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	start_strip = raid_io->offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (raid_io->offset_blocks + raid_io->num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = raid_io->offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = raid_io->num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * base bdev LBA, child io length in blocks, buffer, completion function
	 * and callback context.
	 */
	assert(raid_ch != NULL);
	base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);

	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = raid_io->memory_domain;
	io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
	io_opts.metadata = raid_io->md_buf;

	if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
						 raid_io->iovs, raid_io->iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
						  raid_io->iovs, raid_io->iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Recvd not supported io type %u\n", raid_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t strip_size;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;
	uint64_t start_offset_in_strip;
	uint64_t end_offset_in_strip;
	uint8_t start_disk;
	uint8_t end_disk;
	uint8_t n_disks_involved;
};

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t start_strip;
	uint64_t end_strip;
	uint64_t total_blocks;

	io_range->strip_size = strip_size;
	total_blocks = offset_blocks + num_blocks - (num_blocks > 0);

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = total_blocks >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have an unaligned start LBA offset.
	 * The end strip may have an unaligned end LBA offset.
	 * Strips between them are certainly aligned to strip boundaries
	 * in both offset and length.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = total_blocks % strip_size;

	/* The base bdev indexes in which the start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base_bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and the last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
		      ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}
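/*
 * Illustrative range-split example (hypothetical numbers): with
 * strip_size = 128 blocks, num_base_bdevs = 4, and an UNMAP at
 * offset_blocks = 100, num_blocks = 400, _raid0_get_io_range() yields
 * start_strip = 0, end_strip = 3, start_disk = 0, end_disk = 3,
 * start_offset_in_strip = 100, end_offset_in_strip = 115 and
 * n_disks_involved = 4. _raid0_split_io_range() then produces, per disk:
 *   disk 0: offset_in_disk = 100, nblocks_in_disk =  28
 *   disk 1: offset_in_disk =   0, nblocks_in_disk = 128
 *   disk 2: offset_in_disk =   0, nblocks_in_disk = 128
 *   disk 3: offset_in_disk =   0, nblocks_in_disk = 116
 * for a total of 400 blocks.
 */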
static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests with a range but without payload, like FLUSH and UNMAP, to member disks;
 * it will submit as many as possible unless one base io request fails with -ENOMEM,
 * in which case it will queue itself for later submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev *raid_bdev;
	struct raid_bdev_io_range io_range;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    raid_io->offset_blocks, raid_io->num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Base bdevs are walked in order from start_disk to end_disk,
		 * wrapping around, so start_disk's index may be larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, disk_idx);

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (raid_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = raid_bdev_unmap_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = raid_bdev_flush_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", raid_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}
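/*
 * Illustrative capacity example for raid0_start() below (hypothetical
 * numbers): if the smallest base bdev exposes data_size = 1000 blocks,
 * strip_size_shift = 7 (128-block strips) and there are 4 base bdevs, then
 * base_bdev_data_size = (1000 >> 7) << 7 = 896 blocks and the raid bdev
 * block count becomes 896 * 4 = 3584 blocks.
 */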
static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	uint64_t base_bdev_data_size;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate the minimum block count across all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
	}

	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	/*
	 * Take the minimum block count based approach: the total block count
	 * of the raid bdev is the number of base bdevs times the strip-aligned
	 * minimum block count of any base bdev.
	 */
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);

	raid_bdev->bdev.blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* There is no need to split reads/writes on a single-bdev RAID. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static void
raid0_resize(struct raid_bdev *raid_bdev)
{
	uint64_t blockcnt;
	int rc;
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;
	uint64_t base_bdev_data_size;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);

		min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt - base_info->data_offset);
	}

	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
	blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (blockcnt == raid_bdev->bdev.blockcnt) {
		return;
	}

	rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to notify blockcount change\n");
		return;
	}

	SPDK_NOTICELOG("raid0 '%s': block count was changed from %" PRIu64 " to %" PRIu64 "\n",
		       raid_bdev->bdev.name,
		       raid_bdev->bdev.blockcnt,
		       blockcnt);

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}
}
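/*
 * Usage sketch (assuming the standard SPDK rpc.py script; flag spellings may
 * differ between releases): a raid0 bdev handled by this module is typically
 * created with something like
 *   scripts/rpc.py bdev_raid_create -n Raid0 -z 64 -r 0 -b "Nvme0n1 Nvme1n1"
 * where -z is the strip size in KiB and -b lists the base bdevs.
 */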
static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
	.resize = raid0_resize,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)