/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by the lower layers to notify
 * the raid module that a particular bdev_io has completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers (the child io)
 * success - bdev_io completion status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}

/*
 * brief:
 * raid0_submit_rw_request function submits a read or write I/O to the correct
 * member disk of a raid0 bdev.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint64_t pd_strip;
	uint32_t offset_in_strip;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint8_t pd_idx;
	int ret = 0;
	uint64_t start_strip;
	uint64_t end_strip;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * base bdev LBA, child io length in blocks, buffer, completion function
	 * and function callback context.
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch,
						 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, bdev_io->u.bdev.ext_opts);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch,
						  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, bdev_io->u.bdev.ext_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		/* No resources available right now; retry from the io_wait queue. */
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}
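/*
 * Worked example of the mapping above (illustrative values, not from the
 * code): assume num_base_bdevs = 3 and strip_size = 8 blocks
 * (strip_size_shift = 3), and a read at offset_blocks = 50 with
 * num_blocks = 4:
 *
 *   start_strip     = 50 >> 3      = 6   (equal to end_strip, so no boundary is crossed)
 *   pd_idx          = 6 % 3        = 0   (member disk 0 holds strip 6)
 *   pd_strip        = 6 / 3        = 2   (strip 6 is that disk's third strip)
 *   offset_in_strip = 50 & (8 - 1) = 2
 *   pd_lba          = (2 << 3) + 2 = 18  (LBA on the member disk)
 *
 * The child read is then submitted to member disk 0 at LBA 18 for 4 blocks.
 */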
/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t strip_size;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;
	uint64_t start_offset_in_strip;
	uint64_t end_offset_in_strip;
	uint8_t start_disk;
	uint8_t end_disk;
	uint8_t n_disks_involved;
};

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t start_strip;
	uint64_t end_strip;

	io_range->strip_size = strip_size;

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have an unaligned start LBA offset.
	 * The end strip may have an unaligned end LBA offset.
	 * Strips between them certainly have offset and length aligned to
	 * strip boundaries.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;

	/* The base bdev indexes in which the start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base_bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and the last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}
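/*
 * Worked example for _raid0_get_io_range() above (illustrative values):
 * with num_base_bdevs = 3, strip_size = 8 blocks (strip_size_shift = 3),
 * offset_blocks = 20 and num_blocks = 30:
 *
 *   start_strip = 20 >> 3 = 2          end_strip = 49 >> 3 = 6
 *   start_strip_in_disk = 2 / 3 = 0    end_strip_in_disk = 6 / 3 = 2
 *   start_offset_in_strip = 20 % 8 = 4
 *   end_offset_in_strip   = 49 % 8 = 1
 *   start_disk = 2 % 3 = 2             end_disk = 6 % 3 = 0
 *   n_disks_involved = spdk_min(6 - 2 + 1, 3) = 3
 */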
static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64 ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}

static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}
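/*
 * Continuing the worked example above, the submission loop below calls
 * _raid0_split_io_range() for each involved disk, walking from start_disk
 * and wrapping modulo num_base_bdevs (disks 2, 0, 1):
 *
 *   disk 2 (start_disk): offset_in_disk = 4, nblocks_in_disk = 12
 *   disk 0 (end_disk):   offset_in_disk = 8, nblocks_in_disk = 10
 *   disk 1:              offset_in_disk = 8, nblocks_in_disk = 8
 *
 * The per-disk block counts sum to the original num_blocks
 * (12 + 10 + 8 = 30).
 */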
/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests with a range but without payload, such as FLUSH and UNMAP,
 * to member disks; it will submit as many as possible unless one base io
 * request fails with -ENOMEM, in which case it will queue itself for later
 * submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io;
	struct raid_bdev *raid_bdev;
	struct raid_bdev_io_range io_range;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* base_bdevs are walked from start_disk to end_disk.
		 * It is possible that the index of start_disk is larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_io->raid_ch->base_channel[disk_idx];

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			/* Queue the raid_io for retry; base_bdev_io_submitted is
			 * preserved so that submission resumes with the next
			 * unsubmitted base bdev.
			 */
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						_raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}
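/*
 * Capacity sketch for raid0_start() below (illustrative values): each base
 * bdev contributes its block count rounded down to a whole number of strips.
 * With min_blockcnt = 1000, strip_size_shift = 7 (128-block strips) and
 * num_base_bdevs = 3:
 *
 *   blockcnt = ((1000 >> 7) << 7) * 3 = 896 * 3 = 2688
 */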
static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate the minimum block count across all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
	}

	/*
	 * Take the minimum block count based approach, where the total block
	 * count of the raid bdev is the number of base bdevs times the minimum
	 * block count of any base bdev, rounded down to a strip boundary.
	 */
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
				    raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* There is no need to split reads/writes on a single-bdev RAID module. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)
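/*
 * Usage sketch (the exact RPC options may differ between SPDK releases, so
 * check scripts/rpc.py bdev_raid_create --help): a two-member raid0 bdev
 * with a 64 KiB strip can typically be created with:
 *
 *   scripts/rpc.py bdev_raid_create -n Raid0 -z 64 -r 0 -b "Nvme0n1 Nvme1n1"
 */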