/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by lower layers to notify raid
 * module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to bdev io submitted to lower layers, like child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}
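
/*
 * Illustration of the raid0 address mapping performed by
 * raid0_submit_rw_request() below, using hypothetical example values
 * (not taken from any particular configuration):
 *
 *   strip_size = 8 blocks (strip_size_shift = 3), num_base_bdevs = 4,
 *   offset_blocks = 26, num_blocks = 4
 *
 *   start_strip     = 26 >> 3      = 3
 *   pd_idx          = 3 % 4        = 3   (fourth member disk)
 *   pd_strip        = 3 / 4        = 0   (first strip on that disk)
 *   offset_in_strip = 26 & (8 - 1) = 2
 *   pd_lba          = (0 << 3) + 2 = 2
 *   pd_blocks       = num_blocks   = 4
 *
 * The request is therefore submitted as a 4-block I/O at LBA 2 of base
 * bdev 3.
 */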
/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * raid_io - pointer to parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint64_t pd_strip;
	uint32_t offset_in_strip;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint8_t pd_idx;
	int ret = 0;
	uint64_t start_strip;
	uint64_t end_strip;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
		/* Fail the I/O instead of dereferencing a NULL descriptor when asserts are disabled. */
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	/*
	 * Submit the child I/O to the bdev layer using the base bdev
	 * descriptor, base bdev LBA, child I/O length in blocks, buffer,
	 * completion function and function callback context.
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch,
						 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, bdev_io->u.bdev.ext_opts);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch,
						  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, bdev_io->u.bdev.ext_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t strip_size;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;
	uint64_t start_offset_in_strip;
	uint64_t end_offset_in_strip;
	uint8_t start_disk;
	uint8_t end_disk;
	uint8_t n_disks_involved;
};
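
/*
 * Illustration of how a null-payload range is split across member disks by
 * _raid0_get_io_range() and _raid0_split_io_range() below, using
 * hypothetical example values:
 *
 *   strip_size = 8 blocks, num_base_bdevs = 3,
 *   offset_blocks = 10, num_blocks = 30 (raid blocks 10..39, strips 1..4)
 *
 *   start_disk = 1 % 3 = 1, end_disk = 4 % 3 = 1, n_disks_involved = 3
 *
 *   disk 1: strips 1 and 4 -> offset_in_disk 2, nblocks_in_disk 14
 *   disk 2: strip 2        -> offset_in_disk 0, nblocks_in_disk 8
 *   disk 0: strip 3        -> offset_in_disk 8, nblocks_in_disk 8
 */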
static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t start_strip;
	uint64_t end_strip;

	io_range->strip_size = strip_size;

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have an unaligned start LBA offset.
	 * The end strip may have an unaligned end LBA offset.
	 * Strips between them certainly have offset and length aligned to strip boundaries.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;

	/* The base bdev indexes in which start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base bdevs are involved in the I/O operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
		      ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}

static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}
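
/*
 * brief:
 * raid0_base_io_complete is the completion callback for one base bdev I/O
 * that was submitted as part of a split null-payload request. It accounts
 * for one completed part of the parent raid_io via
 * raid_bdev_io_complete_part() and frees the child bdev_io.
 * params:
 * bdev_io - pointer to child bdev_io submitted to a base bdev
 * success - status of the child bdev_io
 * cb_arg - parent raid_bdev_io
 * returns:
 * none
 */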
static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * I/O requests with a range but without payload, like FLUSH and UNMAP, to
 * member disks; it submits as many as possible unless one base I/O request
 * fails with -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io;
	struct raid_bdev *raid_bdev;
	struct raid_bdev_io_range io_range;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Base bdevs are iterated from start_disk to end_disk, wrapping
		 * around; the index of start_disk may be larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_io->raid_ch->base_channel[disk_idx];

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						_raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static int raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate minimum block count from all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
	}

	/*
	 * Take the minimum block count based approach where the total block
	 * count of the raid bdev is the number of base bdevs times the
	 * minimum block count of any base bdev, rounded down to a strip
	 * boundary.
	 */
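	/*
	 * Example with hypothetical values: min_blockcnt = 1000,
	 * strip_size = 64 blocks (strip_size_shift = 6), num_base_bdevs = 4.
	 * 1000 is rounded down to a strip multiple, (1000 >> 6) << 6 = 960,
	 * so the raid bdev exposes 960 * 4 = 3840 blocks.
	 */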
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
				    raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* Do not need to split reads/writes on single bdev RAID modules. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)