/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2019 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by lower layers to notify raid
 * module that particular bdev_io is completed.
 * params:
 * bdev_io - pointer to bdev io submitted to lower layers, like child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;
	int rc;

	if (success) {
		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
				  spdk_bdev_get_dif_type(bdev_io->bdev) != SPDK_DIF_DISABLE &&
				  bdev_io->bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {

			rc = raid_bdev_verify_dix_reftag(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
							 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.num_blocks,
							 bdev_io->bdev, bdev_io->u.bdev.offset_blocks);
			if (rc != 0) {
				SPDK_ERRLOG("Reftag verify failed.\n");
				raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
				/* Free the child io on this early-return path too,
				 * otherwise it would leak. */
				spdk_bdev_free_io(bdev_io);
				return;
			}
		}

		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}

	spdk_bdev_free_io(bdev_io);
}

static void raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}
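/*
 * Worked example of the address mapping implemented below (editor's
 * illustration, derived from the code): with strip_size = 64 blocks
 * (strip_size_shift = 6) and num_base_bdevs = 4, an I/O at
 * offset_blocks = 200 falls in raid strip 200 >> 6 = 3, so it is routed
 * to member disk pd_idx = 3 % 4 = 3 at physical strip pd_strip = 3 / 4 = 0;
 * with offset_in_strip = 200 & 63 = 8, the child I/O starts at
 * pd_lba = (0 << 6) + 8 = 8 on that disk.
 */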
/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * raid_io - pointer to parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_ext_io_opts io_opts = {};
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint64_t pd_strip;
	uint32_t offset_in_strip;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint8_t pd_idx;
	int ret = 0;
	uint64_t start_strip;
	uint64_t end_strip;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	start_strip = raid_io->offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (raid_io->offset_blocks + raid_io->num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = raid_io->offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = raid_io->num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev
	 * descriptor, base bdev LBA, child io length in blocks, buffer,
	 * completion function and callback context.
	 */
	assert(raid_ch != NULL);
	base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);

	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = raid_io->memory_domain;
	io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
	io_opts.metadata = raid_io->md_buf;

	if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
						 raid_io->iovs, raid_io->iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		struct spdk_bdev *bdev = &base_info->raid_bdev->bdev;

		if (spdk_unlikely(spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE &&
				  bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
			ret = raid_bdev_verify_dix_reftag(raid_io->iovs, raid_io->iovcnt,
							  io_opts.metadata, pd_blocks, bdev,
							  raid_io->offset_blocks);
			if (ret != 0) {
				SPDK_ERRLOG("bdev io submit error due to DIX verify failure\n");
				raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
				return;
			}
		}

		ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
						  raid_io->iovs, raid_io->iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", raid_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t strip_size;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;
	uint64_t start_offset_in_strip;
	uint64_t end_offset_in_strip;
	uint8_t start_disk;
	uint8_t end_disk;
	uint8_t n_disks_involved;
};
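/*
 * Worked example for the range math below (editor's illustration, derived
 * from the code): with num_base_bdevs = 3, strip_size = 8 blocks
 * (strip_size_shift = 3), offset_blocks = 10 and num_blocks = 20, the last
 * block is 29, so start_strip = 1 and end_strip = 3.  That gives
 * start_disk = 1, end_disk = 0, n_disks_involved = 3,
 * start_offset_in_strip = 2 and end_offset_in_strip = 5.
 * _raid0_split_io_range() then yields (offset_in_disk, nblocks_in_disk) =
 * (2, 6) on disk 1, (0, 8) on disk 2 and (8, 6) on disk 0, which together
 * cover all 20 blocks.
 */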
static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t start_strip;
	uint64_t end_strip;
	uint64_t total_blocks;

	io_range->strip_size = strip_size;
	/* Inclusive index of the last block in the range; the "- (num_blocks > 0)"
	 * keeps it from overrunning when num_blocks is 0. */
	total_blocks = offset_blocks + num_blocks - (num_blocks > 0);

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = total_blocks >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have unaligned start LBA offset.
	 * The end strip may have unaligned end LBA offset.
	 * Strips between them certainly have aligned offset and length to boundaries.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = total_blocks % strip_size;

	/* The base bdev indexes in which start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base_bdevs are involved in io operation.
	 * Number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64 ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}

static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}
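/*
 * Completion callback shared by all child I/Os of a null-payload request
 * (editor's note): raid_bdev_io_complete_part() accounts one finished child
 * against raid_io->base_bdev_io_remaining and completes the parent raid_io
 * once every child has reported in, failing the parent if any child failed.
 */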
static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests that carry a range but no payload, like FLUSH and UNMAP,
 * to member disks; it submits as many as possible, unless a base io
 * request fails with -ENOMEM, in which case it queues itself for later
 * submission.
 * params:
 * raid_io - pointer to parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev *raid_bdev;
	struct raid_bdev_io_range io_range;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    raid_io->offset_blocks, raid_io->num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Child I/Os are issued to the disks in order from start_disk
		 * to end_disk, wrapping around; start_disk's index may be
		 * larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, disk_idx);

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (raid_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = raid_bdev_unmap_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = raid_bdev_flush_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", raid_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}
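/*
 * Size math example for raid0_start() below (editor's illustration, derived
 * from the code): with strip_size = 64 blocks (strip_size_shift = 6), four
 * base bdevs and a smallest base bdev data size of 1000 blocks, each member
 * contributes (1000 >> 6) << 6 = 960 blocks (rounded down to a whole number
 * of strips), so the raid0 bdev exposes 960 * 4 = 3840 blocks.
 */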
static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	uint64_t base_bdev_data_size;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate minimum block count from all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
	}

	/* Round the usable size of each member down to a whole number of strips */
	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	/*
	 * Take the minimum-block-count-based approach: the total block count
	 * of the raid bdev is the number of base bdevs times the strip-aligned
	 * minimum block count of any base bdev.
	 */
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);

	raid_bdev->bdev.blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* Do not need to split reads/writes on single bdev RAID modules. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static bool
raid0_resize(struct raid_bdev *raid_bdev)
{
	uint64_t blockcnt;
	int rc;
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;
	uint64_t base_bdev_data_size;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);

		min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt - base_info->data_offset);
	}

	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
	blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (blockcnt == raid_bdev->bdev.blockcnt) {
		return false;
	}

	rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to notify blockcount change\n");
		return false;
	}

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	return true;
}

static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.dif_supported = true,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
	.resize = raid0_resize,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)
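/*
 * Usage sketch (editor's note, not part of the original source): once this
 * module is registered, a raid0 bdev can be created at runtime through
 * SPDK's bdev_raid_create RPC, e.g.:
 *
 *   scripts/rpc.py bdev_raid_create -n Raid0 -z 64 -r 0 -b "Nvme0n1 Nvme1n1"
 *
 * which stripes Nvme0n1 and Nvme1n1 with a 64 KiB strip size; the bdev names
 * here are placeholders for whatever base bdevs exist in the target.
 */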