/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2022 Intel Corporation.
 * All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/likely.h"
#include "spdk/log.h"
#include "spdk/accel.h"

/* Maximum concurrent full stripe writes per io channel */
#define RAID5F_MAX_STRIPES 32

struct chunk {
	/* Corresponds to base_bdev index */
	uint8_t index;

	/* Array of iovecs */
	struct iovec *iovs;

	/* Number of used iovecs */
	int iovcnt;

	/* Total number of available iovecs in the array */
	int iovcnt_max;

	/* Pointer to buffer with I/O metadata */
	void *md_buf;

	/* Shallow copy of IO request parameters */
	struct spdk_bdev_ext_io_opts ext_opts;
};

struct stripe_request;
typedef void (*stripe_req_xor_cb)(struct stripe_request *stripe_req, int status);

struct stripe_request {
	enum stripe_request_type {
		STRIPE_REQ_WRITE,
		STRIPE_REQ_RECONSTRUCT,
	} type;

	struct raid5f_io_channel *r5ch;

	/* The associated raid_bdev_io */
	struct raid_bdev_io *raid_io;

	/* The stripe's index in the raid array. */
	uint64_t stripe_index;

	/* The stripe's parity chunk */
	struct chunk *parity_chunk;

	union {
		struct {
			/* Buffer for stripe parity */
			void *parity_buf;

			/* Buffer for stripe io metadata parity */
			void *parity_md_buf;
		} write;

		struct {
			/* Array of buffers for reading chunk data */
			void **chunk_buffers;

			/* Array of buffers for reading chunk metadata */
			void **chunk_md_buffers;

			/* Chunk to reconstruct from parity */
			struct chunk *chunk;

			/* Offset from chunk start */
			uint64_t chunk_offset;
		} reconstruct;
	};

	/* Array of iovec iterators for each chunk */
	struct spdk_ioviter *chunk_iov_iters;

	/* Array of source buffer pointers for parity calculation */
	void **chunk_xor_buffers;

	/* Array of source buffer pointers for parity calculation of io metadata */
	void **chunk_xor_md_buffers;

	struct {
		size_t len;
		size_t remaining;
		size_t remaining_md;
		int status;
		stripe_req_xor_cb cb;
	} xor;

	TAILQ_ENTRY(stripe_request) link;

	/* Array of chunks corresponding to base_bdevs */
	struct chunk chunks[0];
};

struct raid5f_info {
	/* The parent raid bdev */
	struct raid_bdev *raid_bdev;

	/* Number of data blocks in a stripe (without parity) */
	uint64_t stripe_blocks;

	/* Number of stripes on this array */
	uint64_t total_stripes;

	/* Alignment for buffer allocation */
	size_t buf_alignment;
};

struct raid5f_io_channel {
	/* All available stripe requests on this channel */
	struct {
		TAILQ_HEAD(, stripe_request) write;
		TAILQ_HEAD(, stripe_request) reconstruct;
	} free_stripe_requests;

	/* accel_fw channel */
	struct spdk_io_channel *accel_ch;

	/* For retrying xor if accel_ch runs out of resources */
	TAILQ_HEAD(, stripe_request) xor_retry_queue;

	/* For iterating over chunk iovecs during xor calculation */
	void **chunk_xor_buffers;
	struct iovec **chunk_xor_iovs;
	size_t *chunk_xor_iovcnt;
};

#define __CHUNK_IN_RANGE(req, c) \
	c < req->chunks + raid5f_ch_to_r5f_info(req->r5ch)->raid_bdev->num_base_bdevs

#define FOR_EACH_CHUNK_FROM(req, c, from) \
	for (c = from; __CHUNK_IN_RANGE(req, c); c++)

#define FOR_EACH_CHUNK(req, c) \
	FOR_EACH_CHUNK_FROM(req, c, req->chunks)

#define __NEXT_DATA_CHUNK(req, c) \
	c == req->parity_chunk ? c+1 : c

#define FOR_EACH_DATA_CHUNK(req, c) \
	for (c = __NEXT_DATA_CHUNK(req, req->chunks); __CHUNK_IN_RANGE(req, c); \
	     c = __NEXT_DATA_CHUNK(req, c+1))

static inline struct raid5f_info *
raid5f_ch_to_r5f_info(struct raid5f_io_channel *r5ch)
{
	return spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(r5ch));
}

static inline struct stripe_request *
raid5f_chunk_stripe_req(struct chunk *chunk)
{
	return SPDK_CONTAINEROF((chunk - chunk->index), struct stripe_request, chunks);
}

static inline uint8_t
raid5f_stripe_data_chunks_num(const struct raid_bdev *raid_bdev)
{
	return raid_bdev->min_base_bdevs_operational;
}
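/*
 * Parity placement rotates across stripes: with parity index
 * data_chunks_num - (stripe_index % num_base_bdevs), stripe 0 puts parity on
 * the last base bdev and each subsequent stripe shifts it one position to the
 * left. For example, with 4 base bdevs, stripes 0, 1, 2, 3 place parity at
 * chunk indices 3, 2, 1, 0, and then the pattern repeats.
 */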
static inline uint8_t
raid5f_stripe_parity_chunk_index(const struct raid_bdev *raid_bdev, uint64_t stripe_index)
{
	return raid5f_stripe_data_chunks_num(raid_bdev) - stripe_index % raid_bdev->num_base_bdevs;
}

static inline void
raid5f_stripe_request_release(struct stripe_request *stripe_req)
{
	if (spdk_likely(stripe_req->type == STRIPE_REQ_WRITE)) {
		TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests.write, stripe_req, link);
	} else if (stripe_req->type == STRIPE_REQ_RECONSTRUCT) {
		TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests.reconstruct, stripe_req, link);
	} else {
		assert(false);
	}
}

static void raid5f_xor_stripe_retry(struct stripe_request *stripe_req);

static void
raid5f_xor_stripe_done(struct stripe_request *stripe_req)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;

	if (stripe_req->xor.status != 0) {
		SPDK_ERRLOG("stripe xor failed: %s\n", spdk_strerror(-stripe_req->xor.status));
	}

	stripe_req->xor.cb(stripe_req, stripe_req->xor.status);

	if (!TAILQ_EMPTY(&r5ch->xor_retry_queue)) {
		stripe_req = TAILQ_FIRST(&r5ch->xor_retry_queue);
		TAILQ_REMOVE(&r5ch->xor_retry_queue, stripe_req, link);
		raid5f_xor_stripe_retry(stripe_req);
	}
}

static void raid5f_xor_stripe_continue(struct stripe_request *stripe_req);

static void
_raid5f_xor_stripe_cb(struct stripe_request *stripe_req, int status)
{
	if (status != 0) {
		stripe_req->xor.status = status;
	}

	if (stripe_req->xor.remaining + stripe_req->xor.remaining_md == 0) {
		raid5f_xor_stripe_done(stripe_req);
	}
}

static void
raid5f_xor_stripe_cb(void *_stripe_req, int status)
{
	struct stripe_request *stripe_req = _stripe_req;

	stripe_req->xor.remaining -= stripe_req->xor.len;

	if (stripe_req->xor.remaining > 0) {
		stripe_req->xor.len = spdk_ioviter_nextv(stripe_req->chunk_iov_iters,
				      stripe_req->r5ch->chunk_xor_buffers);
		raid5f_xor_stripe_continue(stripe_req);
	}

	_raid5f_xor_stripe_cb(stripe_req, status);
}

static void
raid5f_xor_stripe_md_cb(void *_stripe_req, int status)
{
	struct stripe_request *stripe_req = _stripe_req;

	stripe_req->xor.remaining_md = 0;

	_raid5f_xor_stripe_cb(stripe_req, status);
}
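/*
 * The stripe is XORed in segments: spdk_ioviter walks all chunks' iovecs in
 * lockstep and yields the largest length that is contiguous in every chunk.
 * One accel xor operation is submitted per segment; xor.remaining tracks the
 * data bytes still to be processed and xor.remaining_md the metadata bytes.
 */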
static void
raid5f_xor_stripe_continue(struct stripe_request *stripe_req)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
	uint8_t i;
	int ret;

	assert(stripe_req->xor.len > 0);

	for (i = 0; i < n_src; i++) {
		stripe_req->chunk_xor_buffers[i] = r5ch->chunk_xor_buffers[i];
	}

	ret = spdk_accel_submit_xor(r5ch->accel_ch, r5ch->chunk_xor_buffers[n_src],
				    stripe_req->chunk_xor_buffers, n_src, stripe_req->xor.len,
				    raid5f_xor_stripe_cb, stripe_req);
	if (spdk_unlikely(ret)) {
		if (ret == -ENOMEM) {
			TAILQ_INSERT_HEAD(&r5ch->xor_retry_queue, stripe_req, link);
		} else {
			stripe_req->xor.status = ret;
			raid5f_xor_stripe_done(stripe_req);
		}
	}
}

static void
raid5f_xor_stripe(struct stripe_request *stripe_req, stripe_req_xor_cb cb)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct chunk *chunk;
	struct chunk *dest_chunk;
	uint64_t num_blocks;
	uint8_t c;

	assert(cb != NULL);

	if (spdk_likely(stripe_req->type == STRIPE_REQ_WRITE)) {
		num_blocks = raid_bdev->strip_size;
		dest_chunk = stripe_req->parity_chunk;
	} else if (stripe_req->type == STRIPE_REQ_RECONSTRUCT) {
		num_blocks = bdev_io->u.bdev.num_blocks;
		dest_chunk = stripe_req->reconstruct.chunk;
	} else {
		assert(false);
	}

	c = 0;
	FOR_EACH_CHUNK(stripe_req, chunk) {
		if (chunk == dest_chunk) {
			continue;
		}
		r5ch->chunk_xor_iovs[c] = chunk->iovs;
		r5ch->chunk_xor_iovcnt[c] = chunk->iovcnt;
		c++;
	}
	r5ch->chunk_xor_iovs[c] = dest_chunk->iovs;
	r5ch->chunk_xor_iovcnt[c] = dest_chunk->iovcnt;

	stripe_req->xor.len = spdk_ioviter_firstv(stripe_req->chunk_iov_iters,
			      raid_bdev->num_base_bdevs,
			      r5ch->chunk_xor_iovs,
			      r5ch->chunk_xor_iovcnt,
			      r5ch->chunk_xor_buffers);
	stripe_req->xor.remaining = num_blocks << raid_bdev->blocklen_shift;
	stripe_req->xor.status = 0;
	stripe_req->xor.cb = cb;

	if (spdk_bdev_io_get_md_buf(bdev_io)) {
		uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
		uint64_t len = num_blocks * spdk_bdev_get_md_size(&raid_bdev->bdev);
		int ret;

		stripe_req->xor.remaining_md = len;

		c = 0;
		FOR_EACH_CHUNK(stripe_req, chunk) {
			if (chunk != dest_chunk) {
				stripe_req->chunk_xor_md_buffers[c] = chunk->md_buf;
				c++;
			}
		}

		ret = spdk_accel_submit_xor(stripe_req->r5ch->accel_ch, dest_chunk->md_buf,
					    stripe_req->chunk_xor_md_buffers, n_src, len,
					    raid5f_xor_stripe_md_cb, stripe_req);
		if (spdk_unlikely(ret)) {
			if (ret == -ENOMEM) {
				TAILQ_INSERT_HEAD(&stripe_req->r5ch->xor_retry_queue, stripe_req, link);
			} else {
				stripe_req->xor.status = ret;
				raid5f_xor_stripe_done(stripe_req);
			}
			return;
		}
	}

	raid5f_xor_stripe_continue(stripe_req);
}

static void
raid5f_xor_stripe_retry(struct stripe_request *stripe_req)
{
	if (stripe_req->xor.remaining_md) {
		raid5f_xor_stripe(stripe_req, stripe_req->xor.cb);
	} else {
		raid5f_xor_stripe_continue(stripe_req);
	}
}

static void
raid5f_stripe_request_chunk_write_complete(struct stripe_request *stripe_req,
		enum spdk_bdev_io_status status)
{
	if (raid_bdev_io_complete_part(stripe_req->raid_io, 1, status)) {
		raid5f_stripe_request_release(stripe_req);
	}
}
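/*
 * Reconstruct completion flow: every chunk accounts for one completion part.
 * The chunk being reconstructed completes its part already at submission, the
 * remaining chunks complete theirs when their reads finish - except the last
 * read, which instead kicks off the XOR and lets the XOR completion below
 * supply the final part.
 */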
static void
raid5f_stripe_request_reconstruct_xor_done(struct stripe_request *stripe_req, int status)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;

	raid5f_stripe_request_release(stripe_req);

	raid_bdev_io_complete_part(raid_io, 1,
				   status == 0 ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
}

static void
raid5f_stripe_request_chunk_read_complete(struct stripe_request *stripe_req,
		enum spdk_bdev_io_status status)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;

	if (raid_io->base_bdev_io_remaining == 1) {
		if (raid_io->base_bdev_io_status == SPDK_BDEV_IO_STATUS_SUCCESS &&
		    status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			raid5f_xor_stripe(stripe_req, raid5f_stripe_request_reconstruct_xor_done);
			return;
		}
		raid5f_stripe_request_release(stripe_req);
	}

	raid_bdev_io_complete_part(raid_io, 1, status);
}

static void
raid5f_chunk_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct chunk *chunk = cb_arg;
	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
	enum spdk_bdev_io_status status = success ? SPDK_BDEV_IO_STATUS_SUCCESS :
					  SPDK_BDEV_IO_STATUS_FAILED;

	spdk_bdev_free_io(bdev_io);

	if (spdk_likely(stripe_req->type == STRIPE_REQ_WRITE)) {
		raid5f_stripe_request_chunk_write_complete(stripe_req, status);
	} else if (stripe_req->type == STRIPE_REQ_RECONSTRUCT) {
		raid5f_stripe_request_chunk_read_complete(stripe_req, status);
	} else {
		assert(false);
	}
}

static void raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req);

static void
raid5f_chunk_submit_retry(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;
	struct stripe_request *stripe_req = raid_io->module_private;

	raid5f_stripe_request_submit_chunks(stripe_req);
}

static inline void
raid5f_init_ext_io_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
{
	memset(opts, 0, sizeof(*opts));
	opts->size = sizeof(*opts);
	opts->memory_domain = bdev_io->u.bdev.memory_domain;
	opts->memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
	opts->metadata = bdev_io->u.bdev.md_buf;
}
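/*
 * base_bdev_io_submitted counts the chunks already handled and doubles as the
 * resume index: when a submission hits -ENOMEM, raid5f_chunk_submit_retry
 * restarts raid5f_stripe_request_submit_chunks from the first chunk that has
 * not been submitted yet.
 */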
static int
raid5f_chunk_submit(struct chunk *chunk)
{
	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk->index];
	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk->index];
	uint64_t base_offset_blocks = (stripe_req->stripe_index << raid_bdev->strip_size_shift);
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &chunk->ext_opts);
	chunk->ext_opts.metadata = chunk->md_buf;

	raid_io->base_bdev_io_submitted++;

	switch (stripe_req->type) {
	case STRIPE_REQ_WRITE:
		if (base_ch == NULL) {
			raid_bdev_io_complete_part(raid_io, 1, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		ret = raid_bdev_writev_blocks_ext(base_info, base_ch, chunk->iovs, chunk->iovcnt,
						  base_offset_blocks, raid_bdev->strip_size,
						  raid5f_chunk_complete_bdev_io, chunk,
						  &chunk->ext_opts);
		break;
	case STRIPE_REQ_RECONSTRUCT:
		if (chunk == stripe_req->reconstruct.chunk) {
			raid_bdev_io_complete_part(raid_io, 1, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		base_offset_blocks += stripe_req->reconstruct.chunk_offset;

		ret = raid_bdev_readv_blocks_ext(base_info, base_ch, chunk->iovs, chunk->iovcnt,
						 base_offset_blocks, bdev_io->u.bdev.num_blocks,
						 raid5f_chunk_complete_bdev_io, chunk,
						 &chunk->ext_opts);
		break;
	default:
		assert(false);
		ret = -EINVAL;
		break;
	}

	if (spdk_unlikely(ret)) {
		raid_io->base_bdev_io_submitted--;
		if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, raid5f_chunk_submit_retry);
		} else {
			/*
			 * Implicitly complete any I/Os not yet submitted as FAILED. Every
			 * chunk of the stripe accounts for exactly one completion part,
			 * for both write and reconstruct requests. If completing these
			 * means there are no more to complete for the stripe request, we
			 * can release the stripe request as well.
			 */
			uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs -
							      raid_io->base_bdev_io_submitted;

			if (raid_bdev_io_complete_part(raid_io, base_bdev_io_not_submitted,
						       SPDK_BDEV_IO_STATUS_FAILED)) {
				raid5f_stripe_request_release(stripe_req);
			}
		}
	}

	return ret;
}

static int
raid5f_chunk_set_iovcnt(struct chunk *chunk, int iovcnt)
{
	if (iovcnt > chunk->iovcnt_max) {
		struct iovec *iovs = chunk->iovs;

		iovs = realloc(iovs, iovcnt * sizeof(*iovs));
		if (!iovs) {
			return -ENOMEM;
		}
		chunk->iovs = iovs;
		chunk->iovcnt_max = iovcnt;
	}
	chunk->iovcnt = iovcnt;

	return 0;
}
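/*
 * A full-stripe write arrives as a single iovec array covering stripe_blocks
 * of data. The mapping below slices that array into per-chunk iovec arrays of
 * strip_size blocks each, without copying, and points the parity chunk at the
 * request's preallocated parity buffer.
 */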
static int
raid5f_stripe_request_map_iovecs(struct stripe_request *stripe_req)
{
	struct raid_bdev *raid_bdev = stripe_req->raid_io->raid_bdev;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(stripe_req->raid_io);
	const struct iovec *raid_io_iovs = bdev_io->u.bdev.iovs;
	int raid_io_iovcnt = bdev_io->u.bdev.iovcnt;
	void *raid_io_md = spdk_bdev_io_get_md_buf(bdev_io);
	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct chunk *chunk;
	int raid_io_iov_idx = 0;
	size_t raid_io_offset = 0;
	size_t raid_io_iov_offset = 0;
	int i;

	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
		int chunk_iovcnt = 0;
		uint64_t len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
		size_t off = raid_io_iov_offset;
		int ret;

		for (i = raid_io_iov_idx; i < raid_io_iovcnt; i++) {
			chunk_iovcnt++;
			off += raid_io_iovs[i].iov_len;
			if (off >= raid_io_offset + len) {
				break;
			}
		}

		assert(raid_io_iov_idx + chunk_iovcnt <= raid_io_iovcnt);

		ret = raid5f_chunk_set_iovcnt(chunk, chunk_iovcnt);
		if (ret) {
			return ret;
		}

		if (raid_io_md) {
			chunk->md_buf = raid_io_md +
					(raid_io_offset >> raid_bdev->blocklen_shift) * raid_io_md_size;
		}

		for (i = 0; i < chunk_iovcnt; i++) {
			struct iovec *chunk_iov = &chunk->iovs[i];
			const struct iovec *raid_io_iov = &raid_io_iovs[raid_io_iov_idx];
			size_t chunk_iov_offset = raid_io_offset - raid_io_iov_offset;

			chunk_iov->iov_base = raid_io_iov->iov_base + chunk_iov_offset;
			chunk_iov->iov_len = spdk_min(len, raid_io_iov->iov_len - chunk_iov_offset);
			raid_io_offset += chunk_iov->iov_len;
			len -= chunk_iov->iov_len;

			if (raid_io_offset >= raid_io_iov_offset + raid_io_iov->iov_len) {
				raid_io_iov_idx++;
				raid_io_iov_offset += raid_io_iov->iov_len;
			}
		}

		if (spdk_unlikely(len > 0)) {
			return -EINVAL;
		}
	}

	stripe_req->parity_chunk->iovs[0].iov_base = stripe_req->write.parity_buf;
	stripe_req->parity_chunk->iovs[0].iov_len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
	stripe_req->parity_chunk->iovcnt = 1;
	stripe_req->parity_chunk->md_buf = stripe_req->write.parity_md_buf;

	return 0;
}

static void
raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct chunk *start = &stripe_req->chunks[raid_io->base_bdev_io_submitted];
	struct chunk *chunk;

	FOR_EACH_CHUNK_FROM(stripe_req, chunk, start) {
		if (spdk_unlikely(raid5f_chunk_submit(chunk) != 0)) {
			break;
		}
	}
}

static inline void
raid5f_stripe_request_init(struct stripe_request *stripe_req, struct raid_bdev_io *raid_io,
			   uint64_t stripe_index)
{
	stripe_req->raid_io = raid_io;
	stripe_req->stripe_index = stripe_index;
	stripe_req->parity_chunk = &stripe_req->chunks[raid5f_stripe_parity_chunk_index(raid_io->raid_bdev,
				   stripe_index)];
}

static void
raid5f_stripe_write_request_xor_done(struct stripe_request *stripe_req, int status)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;

	if (status != 0) {
		raid5f_stripe_request_release(stripe_req);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else {
		raid5f_stripe_request_submit_chunks(stripe_req);
	}
}
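/*
 * Write path: parity is computed first; the chunk writes are only submitted
 * once the XOR completes successfully. If the parity chunk's base bdev is
 * missing, the XOR is skipped and the data chunks are written directly.
 */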
static int
raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel);
	struct stripe_request *stripe_req;
	int ret;

	stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests.write);
	if (!stripe_req) {
		return -ENOMEM;
	}

	raid5f_stripe_request_init(stripe_req, raid_io, stripe_index);

	ret = raid5f_stripe_request_map_iovecs(stripe_req);
	if (spdk_unlikely(ret)) {
		return ret;
	}

	TAILQ_REMOVE(&r5ch->free_stripe_requests.write, stripe_req, link);

	raid_io->module_private = stripe_req;
	raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;

	if (raid_io->raid_ch->base_channel[stripe_req->parity_chunk->index] != NULL) {
		raid5f_xor_stripe(stripe_req, raid5f_stripe_write_request_xor_done);
	} else {
		raid5f_stripe_write_request_xor_done(stripe_req, 0);
	}

	return 0;
}

static void
raid5f_chunk_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	raid_bdev_io_complete(raid_io, success ? SPDK_BDEV_IO_STATUS_SUCCESS :
			      SPDK_BDEV_IO_STATUS_FAILED);
}

static void raid5f_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid5f_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid5f_submit_rw_request(raid_io);
}

static int
raid5f_submit_reconstruct_read(struct raid_bdev_io *raid_io, uint64_t stripe_index,
			       uint8_t chunk_idx, uint64_t chunk_offset)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel);
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	void *bdev_io_md = spdk_bdev_io_get_md_buf(bdev_io);
	struct stripe_request *stripe_req;
	struct chunk *chunk;
	int buf_idx;

	stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests.reconstruct);
	if (!stripe_req) {
		return -ENOMEM;
	}

	raid5f_stripe_request_init(stripe_req, raid_io, stripe_index);

	stripe_req->reconstruct.chunk = &stripe_req->chunks[chunk_idx];
	stripe_req->reconstruct.chunk_offset = chunk_offset;
	buf_idx = 0;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		if (chunk == stripe_req->reconstruct.chunk) {
			int i;
			int ret;

			ret = raid5f_chunk_set_iovcnt(chunk, bdev_io->u.bdev.iovcnt);
			if (ret) {
				return ret;
			}

			for (i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
				chunk->iovs[i] = bdev_io->u.bdev.iovs[i];
			}

			chunk->md_buf = bdev_io_md;
		} else {
			struct iovec *iov = &chunk->iovs[0];

			iov->iov_base = stripe_req->reconstruct.chunk_buffers[buf_idx];
			iov->iov_len = bdev_io->u.bdev.num_blocks << raid_bdev->blocklen_shift;
			chunk->iovcnt = 1;

			if (bdev_io_md) {
				chunk->md_buf = stripe_req->reconstruct.chunk_md_buffers[buf_idx];
			}

			buf_idx++;
		}
	}

	raid_io->module_private = stripe_req;
	raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;

	TAILQ_REMOVE(&r5ch->free_stripe_requests.reconstruct, stripe_req, link);

	raid5f_stripe_request_submit_chunks(stripe_req);

	return 0;
}
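/*
 * A read maps to a single chunk of the stripe. chunk_data_idx counts only
 * data chunks, so the on-array chunk index has to skip over the parity chunk:
 * e.g. with parity at index 1, data chunks 0, 1, 2 live at chunk indices
 * 0, 2, 3.
 */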
static int
raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index,
			   uint64_t stripe_offset)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint8_t chunk_data_idx = stripe_offset >> raid_bdev->strip_size_shift;
	uint8_t p_idx = raid5f_stripe_parity_chunk_index(raid_bdev, stripe_index);
	uint8_t chunk_idx = chunk_data_idx < p_idx ? chunk_data_idx : chunk_data_idx + 1;
	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk_idx];
	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk_idx];
	uint64_t chunk_offset = stripe_offset - (chunk_data_idx << raid_bdev->strip_size_shift);
	uint64_t base_offset_blocks = (stripe_index << raid_bdev->strip_size_shift) + chunk_offset;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct spdk_bdev_ext_io_opts io_opts;
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &io_opts);
	if (base_ch == NULL) {
		return raid5f_submit_reconstruct_read(raid_io, stripe_index, chunk_idx, chunk_offset);
	}

	ret = raid_bdev_readv_blocks_ext(base_info, base_ch, bdev_io->u.bdev.iovs,
					 bdev_io->u.bdev.iovcnt,
					 base_offset_blocks, bdev_io->u.bdev.num_blocks, raid5f_chunk_read_complete, raid_io,
					 &io_opts);

	if (spdk_unlikely(ret == -ENOMEM)) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid5f_submit_rw_request);
		return 0;
	}

	return ret;
}
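/*
 * The bdev layer guarantees the invariants asserted below:
 * optimal_io_boundary/split_on_optimal_io_boundary keep reads within a single
 * strip, and write_unit_size/split_on_write_unit make every write exactly one
 * full stripe (see raid5f_start).
 */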
static void
raid5f_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_info *r5f_info = raid_bdev->module_private;
	uint64_t offset_blocks = bdev_io->u.bdev.offset_blocks;
	uint64_t stripe_index = offset_blocks / r5f_info->stripe_blocks;
	uint64_t stripe_offset = offset_blocks % r5f_info->stripe_blocks;
	int ret;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		assert(bdev_io->u.bdev.num_blocks <= raid_bdev->strip_size);
		ret = raid5f_submit_read_request(raid_io, stripe_index, stripe_offset);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		assert(stripe_offset == 0);
		assert(bdev_io->u.bdev.num_blocks == r5f_info->stripe_blocks);
		ret = raid5f_submit_write_request(raid_io, stripe_index);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (spdk_unlikely(ret)) {
		raid_bdev_io_complete(raid_io, ret == -ENOMEM ? SPDK_BDEV_IO_STATUS_NOMEM :
				      SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid5f_stripe_request_free(struct stripe_request *stripe_req)
{
	struct chunk *chunk;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		free(chunk->iovs);
	}

	if (stripe_req->type == STRIPE_REQ_WRITE) {
		spdk_dma_free(stripe_req->write.parity_buf);
		spdk_dma_free(stripe_req->write.parity_md_buf);
	} else if (stripe_req->type == STRIPE_REQ_RECONSTRUCT) {
		struct raid5f_info *r5f_info = raid5f_ch_to_r5f_info(stripe_req->r5ch);
		struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
		uint8_t i;

		if (stripe_req->reconstruct.chunk_buffers) {
			for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) {
				spdk_dma_free(stripe_req->reconstruct.chunk_buffers[i]);
			}
			free(stripe_req->reconstruct.chunk_buffers);
		}

		if (stripe_req->reconstruct.chunk_md_buffers) {
			for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) {
				spdk_dma_free(stripe_req->reconstruct.chunk_md_buffers[i]);
			}
			free(stripe_req->reconstruct.chunk_md_buffers);
		}
	} else {
		assert(false);
	}

	free(stripe_req->chunk_xor_buffers);
	free(stripe_req->chunk_xor_md_buffers);
	free(stripe_req->chunk_iov_iters);

	free(stripe_req);
}
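/*
 * Stripe requests are allocated with all buffers they may need up front: an
 * iovec array for every chunk, a parity (and parity metadata) buffer for
 * writes, or per-chunk read buffers for reconstructs, all aligned to the
 * strictest base bdev requirement.
 */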
static struct stripe_request *
raid5f_stripe_request_alloc(struct raid5f_io_channel *r5ch, enum stripe_request_type type)
{
	struct raid5f_info *r5f_info = raid5f_ch_to_r5f_info(r5ch);
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct stripe_request *stripe_req;
	struct chunk *chunk;
	size_t chunk_len;

	stripe_req = calloc(1, sizeof(*stripe_req) + sizeof(*chunk) * raid_bdev->num_base_bdevs);
	if (!stripe_req) {
		return NULL;
	}

	stripe_req->r5ch = r5ch;
	stripe_req->type = type;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		chunk->index = chunk - stripe_req->chunks;
		chunk->iovcnt_max = 4;
		chunk->iovs = calloc(chunk->iovcnt_max, sizeof(chunk->iovs[0]));
		if (!chunk->iovs) {
			goto err;
		}
	}

	chunk_len = raid_bdev->strip_size << raid_bdev->blocklen_shift;

	if (type == STRIPE_REQ_WRITE) {
		stripe_req->write.parity_buf = spdk_dma_malloc(chunk_len, r5f_info->buf_alignment, NULL);
		if (!stripe_req->write.parity_buf) {
			goto err;
		}

		if (raid_io_md_size != 0) {
			stripe_req->write.parity_md_buf = spdk_dma_malloc(raid_bdev->strip_size * raid_io_md_size,
							  r5f_info->buf_alignment, NULL);
			if (!stripe_req->write.parity_md_buf) {
				goto err;
			}
		}
	} else if (type == STRIPE_REQ_RECONSTRUCT) {
		uint8_t n = raid5f_stripe_data_chunks_num(raid_bdev);
		void *buf;
		uint8_t i;

		stripe_req->reconstruct.chunk_buffers = calloc(n, sizeof(void *));
		if (!stripe_req->reconstruct.chunk_buffers) {
			goto err;
		}

		for (i = 0; i < n; i++) {
			buf = spdk_dma_malloc(chunk_len, r5f_info->buf_alignment, NULL);
			if (!buf) {
				goto err;
			}
			stripe_req->reconstruct.chunk_buffers[i] = buf;
		}

		if (raid_io_md_size != 0) {
			stripe_req->reconstruct.chunk_md_buffers = calloc(n, sizeof(void *));
			if (!stripe_req->reconstruct.chunk_md_buffers) {
				goto err;
			}

			for (i = 0; i < n; i++) {
				buf = spdk_dma_malloc(raid_bdev->strip_size * raid_io_md_size, r5f_info->buf_alignment, NULL);
				if (!buf) {
					goto err;
				}
				stripe_req->reconstruct.chunk_md_buffers[i] = buf;
			}
		}
	} else {
		assert(false);
		return NULL;
	}

	stripe_req->chunk_iov_iters = malloc(SPDK_IOVITER_SIZE(raid_bdev->num_base_bdevs));
	if (!stripe_req->chunk_iov_iters) {
		goto err;
	}

	stripe_req->chunk_xor_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
					       sizeof(stripe_req->chunk_xor_buffers[0]));
	if (!stripe_req->chunk_xor_buffers) {
		goto err;
	}

	stripe_req->chunk_xor_md_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
					   sizeof(stripe_req->chunk_xor_md_buffers[0]));
	if (!stripe_req->chunk_xor_md_buffers) {
		goto err;
	}

	return stripe_req;
err:
	raid5f_stripe_request_free(stripe_req);
	return NULL;
}

static void
raid5f_ioch_destroy(void *io_device, void *ctx_buf)
{
	struct raid5f_io_channel *r5ch = ctx_buf;
	struct stripe_request *stripe_req;

	assert(TAILQ_EMPTY(&r5ch->xor_retry_queue));

	while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests.write))) {
		TAILQ_REMOVE(&r5ch->free_stripe_requests.write, stripe_req, link);
		raid5f_stripe_request_free(stripe_req);
	}

	while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests.reconstruct))) {
		TAILQ_REMOVE(&r5ch->free_stripe_requests.reconstruct, stripe_req, link);
		raid5f_stripe_request_free(stripe_req);
	}

	if (r5ch->accel_ch) {
		spdk_put_io_channel(r5ch->accel_ch);
	}

	free(r5ch->chunk_xor_buffers);
	free(r5ch->chunk_xor_iovs);
	free(r5ch->chunk_xor_iovcnt);
}
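/*
 * Each io channel preallocates RAID5F_MAX_STRIPES stripe requests of each
 * type. When the free list is empty, submission fails with -ENOMEM and the
 * request is completed with SPDK_BDEV_IO_STATUS_NOMEM, which the bdev layer
 * retries later.
 */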
static int
raid5f_ioch_create(void *io_device, void *ctx_buf)
{
	struct raid5f_io_channel *r5ch = ctx_buf;
	struct raid5f_info *r5f_info = io_device;
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	struct stripe_request *stripe_req;
	int i;

	TAILQ_INIT(&r5ch->free_stripe_requests.write);
	TAILQ_INIT(&r5ch->free_stripe_requests.reconstruct);
	TAILQ_INIT(&r5ch->xor_retry_queue);

	for (i = 0; i < RAID5F_MAX_STRIPES; i++) {
		stripe_req = raid5f_stripe_request_alloc(r5ch, STRIPE_REQ_WRITE);
		if (!stripe_req) {
			goto err;
		}

		TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests.write, stripe_req, link);
	}

	for (i = 0; i < RAID5F_MAX_STRIPES; i++) {
		stripe_req = raid5f_stripe_request_alloc(r5ch, STRIPE_REQ_RECONSTRUCT);
		if (!stripe_req) {
			goto err;
		}

		TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests.reconstruct, stripe_req, link);
	}

	r5ch->accel_ch = spdk_accel_get_io_channel();
	if (!r5ch->accel_ch) {
		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
		goto err;
	}

	r5ch->chunk_xor_buffers = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_buffers));
	if (!r5ch->chunk_xor_buffers) {
		goto err;
	}

	r5ch->chunk_xor_iovs = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_iovs));
	if (!r5ch->chunk_xor_iovs) {
		goto err;
	}

	r5ch->chunk_xor_iovcnt = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_iovcnt));
	if (!r5ch->chunk_xor_iovcnt) {
		goto err;
	}

	return 0;
err:
	SPDK_ERRLOG("Failed to initialize io channel\n");
	raid5f_ioch_destroy(r5f_info, r5ch);
	return -ENOMEM;
}

static int
raid5f_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	uint64_t base_bdev_data_size;
	struct raid_base_bdev_info *base_info;
	struct raid5f_info *r5f_info;
	size_t alignment = 0;

	r5f_info = calloc(1, sizeof(*r5f_info));
	if (!r5f_info) {
		SPDK_ERRLOG("Failed to allocate r5f_info\n");
		return -ENOMEM;
	}
	r5f_info->raid_bdev = raid_bdev;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);

		min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
		alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_bdev));
	}

	base_bdev_data_size = (min_blockcnt / raid_bdev->strip_size) * raid_bdev->strip_size;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	r5f_info->total_stripes = min_blockcnt / raid_bdev->strip_size;
	r5f_info->stripe_blocks = raid_bdev->strip_size * raid5f_stripe_data_chunks_num(raid_bdev);
	r5f_info->buf_alignment = alignment;

	raid_bdev->bdev.blockcnt = r5f_info->stripe_blocks * r5f_info->total_stripes;
	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;
	raid_bdev->bdev.write_unit_size = r5f_info->stripe_blocks;
	raid_bdev->bdev.split_on_write_unit = true;

	raid_bdev->module_private = r5f_info;

	spdk_io_device_register(r5f_info, raid5f_ioch_create, raid5f_ioch_destroy,
				sizeof(struct raid5f_io_channel), NULL);

	return 0;
}

static void
raid5f_io_device_unregister_done(void *io_device)
{
	struct raid5f_info *r5f_info = io_device;

	raid_bdev_module_stop_done(r5f_info->raid_bdev);

	free(r5f_info);
}

static bool
raid5f_stop(struct raid_bdev *raid_bdev)
{
	struct raid5f_info *r5f_info = raid_bdev->module_private;

	spdk_io_device_unregister(r5f_info, raid5f_io_device_unregister_done);

	return false;
}

static struct spdk_io_channel *
raid5f_get_io_channel(struct raid_bdev *raid_bdev)
{
	struct raid5f_info *r5f_info = raid_bdev->module_private;

	return spdk_get_io_channel(r5f_info);
}

static struct raid_bdev_module g_raid5f_module = {
	.level = RAID5F,
	.base_bdevs_min = 3,
	.base_bdevs_constraint = {CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 1},
	.start = raid5f_start,
	.stop = raid5f_stop,
	.submit_rw_request = raid5f_submit_rw_request,
	.get_io_channel = raid5f_get_io_channel,
};
RAID_MODULE_REGISTER(&g_raid5f_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid5f)