/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2022 Intel Corporation.
 * Copyright (c) Peng Yu yupeng0921@gmail.com.
 * All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

struct concat_block_range {
	uint64_t start;
	uint64_t length;
};

/*
 * brief:
 * concat_bdev_io_completion function is called by lower layers to notify raid
 * module that particular bdev_io is completed.
 * params:
 * bdev_io - pointer to bdev io submitted to lower layers, like child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void concat_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_rw_request(raid_io);
}

/*
 * brief:
 * concat_submit_rw_request function is used to submit I/O to the correct
 * member disk for concat bdevs.
 * params:
 * raid_io - pointer to parent raid_bdev_io
 * returns:
 * none
 */
static void
concat_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct concat_block_range *block_range = raid_bdev->module_private;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	int pd_idx;
	int ret = 0;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;
	struct spdk_bdev_ext_io_opts io_opts = {};
	int i;

	pd_idx = -1;
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		if (block_range[i].start > bdev_io->u.bdev.offset_blocks) {
			break;
		}
		pd_idx = i;
	}
	assert(pd_idx >= 0);
	assert(bdev_io->u.bdev.offset_blocks >= block_range[pd_idx].start);
	pd_lba = bdev_io->u.bdev.offset_blocks - block_range[pd_idx].start;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * base bdev lba, child io length in blocks, buffer, completion
	 * function and function callback context
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];

	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = bdev_io->u.bdev.memory_domain;
	io_opts.memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
	io_opts.metadata = bdev_io->u.bdev.md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch,
						 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						 pd_lba, pd_blocks, concat_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch,
						  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						  pd_lba, pd_blocks, concat_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _concat_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void concat_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_null_payload_request(raid_io);
}

static void
concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * concat_submit_null_payload_request function submits the next batch of
 * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
 * it will submit as many as possible unless one base io request fails with -ENOMEM,
 * in which case it will queue itself for later submission.
 * params:
 * raid_io - pointer to parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io;
	struct raid_bdev *raid_bdev;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint64_t offset_blocks;
	uint64_t num_blocks;
	struct concat_block_range *block_range;
	int i, start_idx, stop_idx;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;
	block_range = raid_bdev->module_private;

	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
	start_idx = -1;
	stop_idx = -1;
	/*
	 * Go through all base bdevs, find the first bdev and the last bdev
	 */
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		/* skip the bdevs before the offset_blocks */
		if (offset_blocks >= block_range[i].start + block_range[i].length) {
			continue;
		}
		if (start_idx == -1) {
			start_idx = i;
		} else {
			/*
			 * The offset_blocks might be at the middle of the first bdev.
			 * Besides the first bdev, the offset_blocks should always be
			 * at the start of the bdev.
			 */
			assert(offset_blocks == block_range[i].start);
		}
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		if (num_blocks == 0) {
			stop_idx = i;
			break;
		}
	}
	assert(start_idx >= 0);
	assert(stop_idx >= 0);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1;
	}
	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
	for (i = start_idx; i <= stop_idx; i++) {
		assert(offset_blocks >= block_range[i].start);
		assert(offset_blocks < block_range[i].start + block_range[i].length);
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		/*
		 * Skip the IOs we have submitted
		 */
		if (i < start_idx + raid_io->base_bdev_io_submitted) {
			continue;
		}
		base_info = &raid_bdev->base_bdev_info[i];
		base_ch = raid_io->raid_ch->base_channel[i];
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}
		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _concat_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static int
concat_start(struct raid_bdev *raid_bdev)
{
	uint64_t total_blockcnt = 0;
	struct raid_base_bdev_info *base_info;
	struct concat_block_range *block_range;

	block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range));
	if (!block_range) {
		SPDK_ERRLOG("Can not allocate block_range, num_base_bdevs: %u",
			    raid_bdev->num_base_bdevs);
		return -ENOMEM;
	}

	int idx = 0;
	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		uint64_t strip_cnt = spdk_bdev_desc_get_bdev(base_info->desc)->blockcnt >>
				     raid_bdev->strip_size_shift;
		uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift;

		block_range[idx].start = total_blockcnt;
		block_range[idx].length = pd_block_cnt;
		total_blockcnt += pd_block_cnt;
		idx++;
	}

	raid_bdev->module_private = block_range;

	SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = total_blockcnt;

	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;

	return 0;
}

static bool
concat_stop(struct raid_bdev *raid_bdev)
{
	struct concat_block_range *block_range = raid_bdev->module_private;

	free(block_range);

	return true;
}

static struct raid_bdev_module g_concat_module = {
	.level = CONCAT,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.start = concat_start,
	.stop = concat_stop,
	.submit_rw_request = concat_submit_rw_request,
	.submit_null_payload_request = concat_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_concat_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_concat)
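
/*
 * Editor's illustrative note (not part of the original SPDK sources): a minimal
 * worked example of the address mapping built by concat_start and consumed by
 * concat_submit_rw_request, assuming two hypothetical base bdevs and a
 * strip_size_shift of 3 (strip size of 8 blocks).
 *
 *   base bdev 0: 1000 blocks -> strip_cnt = 1000 >> 3 = 125
 *                               pd_block_cnt = 125 << 3 = 1000
 *                               block_range[0] = { .start = 0,    .length = 1000 }
 *   base bdev 1: 2005 blocks -> strip_cnt = 2005 >> 3 = 250
 *                               pd_block_cnt = 250 << 3 = 2000 (trailing 5 blocks unused)
 *                               block_range[1] = { .start = 1000, .length = 2000 }
 *
 * raid_bdev->bdev.blockcnt becomes 3000. A read or write at offset_blocks 1500
 * selects pd_idx 1 in concat_submit_rw_request (the last range whose start is
 * <= 1500) and is issued to base bdev 1 at pd_lba = 1500 - 1000 = 500. Because
 * concat_start sets optimal_io_boundary to the strip size and enables
 * split_on_optimal_io_boundary, the generic bdev layer splits reads and writes
 * at strip boundaries, so a child read/write never spans two member disks and a
 * single child I/O per request suffices here. FLUSH and UNMAP are not split
 * that way, which is why concat_submit_null_payload_request fans a single
 * request out across the affected member disks and tracks progress with
 * base_bdev_io_remaining / base_bdev_io_submitted.
 */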