/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk_internal/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by the lower layers to notify
 * the raid module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers, like a child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}
/*
 * brief:
 * raid0_submit_rw_request function submits an I/O request to the correct
 * member disk of a raid0 bdev.
 * params:
 * raid_io - pointer to the parent raid I/O context
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint64_t pd_strip;
	uint32_t offset_in_strip;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint8_t pd_idx;
	int ret = 0;
	uint64_t start_strip;
	uint64_t end_strip;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * the base bdev lba, the child io length in blocks, the buffer, the
	 * completion function and the function callback context.
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks(base_info->desc, base_ch,
					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					     pd_lba, pd_blocks, raid0_bdev_io_completion,
					     raid_io);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks(base_info->desc, base_ch,
					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					      pd_lba, pd_blocks, raid0_bdev_io_completion,
					      raid_io);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
		/* Fail the parent I/O below instead of leaving it hanging. */
		ret = -EIO;
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}
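
/*
 * Worked example of the mapping above (illustrative numbers, not taken from
 * the code): with num_base_bdevs = 4 and strip_size = 8 blocks
 * (strip_size_shift = 3), a 4-block I/O at offset_blocks = 100 maps as:
 *
 *   start_strip     = 100 >> 3            = 12
 *   end_strip       = (100 + 4 - 1) >> 3  = 12  (same strip, no boundary crossed)
 *   pd_strip        = 12 / 4              = 3
 *   pd_idx          = 12 % 4              = 0
 *   offset_in_strip = 100 & (8 - 1)       = 4
 *   pd_lba          = (3 << 3) + 4        = 28
 *
 * so the child I/O of 4 blocks is submitted to base bdev 0 at LBA 28.
 */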
/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t strip_size;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;
	uint64_t start_offset_in_strip;
	uint64_t end_offset_in_strip;
	uint8_t start_disk;
	uint8_t end_disk;
	uint8_t n_disks_involved;
};

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t start_strip;
	uint64_t end_strip;

	io_range->strip_size = strip_size;

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have an unaligned start LBA offset.
	 * The end strip may have an unaligned end LBA offset.
	 * Strips between them certainly have offset and length aligned to
	 * strip boundaries.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;

	/* The base bdev indexes in which the start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and the last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0,
		      "raid_bdev (strip_size 0x%lx) splits IO to base_bdev (%u) at (0x%lx, 0x%lx).\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}
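
/*
 * Worked example of _raid0_get_io_range() and _raid0_split_io_range()
 * (illustrative numbers): num_base_bdevs = 3, strip_size = 8 blocks
 * (strip_size_shift = 3), offset_blocks = 20, num_blocks = 12:
 *
 *   start_strip = 2, end_strip = 3
 *   start_disk = 2 % 3 = 2, end_disk = 3 % 3 = 0
 *   start_strip_in_disk = 0, end_strip_in_disk = 1
 *   start_offset_in_strip = 20 % 8 = 4, end_offset_in_strip = 31 % 8 = 7
 *   n_disks_involved = spdk_min(3 - 2 + 1, 3) = 2
 *
 * The split then yields, per involved disk:
 *   disk 2 (start_disk): offset_in_disk = 4, nblocks_in_disk = 4
 *                        (tail of its strip 0, raid blocks 20-23)
 *   disk 0 (end_disk):   offset_in_disk = 8, nblocks_in_disk = 8
 *                        (all of its strip 1, raid blocks 24-31)
 *
 * for a total of 4 + 8 = 12 blocks = num_blocks.
 */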
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests with a range but without a payload, like FLUSH and UNMAP,
 * to the member disks; it submits as many as possible unless a base io
 * request fails with -ENOMEM, in which case it queues itself for later
 * submission.
 * params:
 * raid_io - pointer to the parent raid I/O context
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io;
	struct raid_bdev *raid_bdev;
	struct raid_bdev_io_range io_range;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Base bdevs are walked from start_disk to end_disk, wrapping
		 * around; the index of start_disk may be larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_io->raid_ch->base_channel[disk_idx];

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request: invalid io type with null payload %u\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						_raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate the minimum block count from all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
	}

	/*
	 * Take the minimum-block-count-based approach, where the total block
	 * count of the raid bdev is the number of base bdevs times the minimum
	 * block count of any base bdev, rounded down to a strip boundary.
	 */
	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0, "min blockcount %lu, numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
				    raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* No need to split reads/writes on a single-bdev RAID level. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}
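
/*
 * Example of the blockcnt calculation above (illustrative numbers): with
 * min_blockcnt = 1003, strip_size_shift = 3 (8-block strips) and
 * num_base_bdevs = 4, each base bdev contributes
 * (1003 >> 3) << 3 = 1000 blocks (the 3 trailing blocks that do not fill a
 * whole strip are left unused), so the raid0 bdev exposes
 * 1000 * 4 = 4000 blocks.
 */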
static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT("bdev_raid0", SPDK_LOG_BDEV_RAID0)
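
/*
 * Usage sketch (an assumption, not part of this file: the exact RPC name and
 * flags depend on the SPDK version in use). A raid0 bdev striped across two
 * NVMe namespaces with a 64 KiB strip size could be created with:
 *
 *   scripts/rpc.py bdev_raid_create -n Raid0 -z 64 -r 0 -b "Nvme0n1 Nvme1n1"
 */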