/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2022 Intel Corporation.
 * Copyright (c) Peng Yu yupeng0921@gmail.com.
 * All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

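/*
 * Each base bdev owns one contiguous range [start, start + length) of the
 * concatenated address space. concat_start() builds one entry per base bdev,
 * in member order, and stores the array in raid_bdev->module_private.
 */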
struct concat_block_range {
	uint64_t start;
	uint64_t length;
};

/*
 * brief:
 * concat_bdev_io_completion function is called by the lower layers to notify
 * the raid module that a particular bdev_io has completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers (the child io)
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void concat_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_rw_request(raid_io);
}

/*
 * brief:
 * concat_submit_rw_request function submits I/O to the correct member disk
 * of a concat bdev.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
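/*
 * Worked example (hypothetical sizes): with two member disks of 100 blocks
 * each, block_range is {0, 100} and {100, 100}. A read at offset_blocks 150
 * selects pd_idx 1 and translates to pd_lba 50 on the second member. Because
 * the bdev layer splits I/O on the strip boundary (see concat_start()) and
 * each member's length is a whole number of strips, a single rw request never
 * spans two members, so one child I/O is always enough here.
 */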
static void
concat_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct concat_block_range *block_range = raid_bdev->module_private;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	int pd_idx;
	int ret = 0;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;
	struct spdk_bdev_ext_io_opts io_opts = {};
	int i;

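	/* Find the last member whose range starts at or before offset_blocks. */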
	pd_idx = -1;
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		if (block_range[i].start > raid_io->offset_blocks) {
			break;
		}
		pd_idx = i;
	}
	assert(pd_idx >= 0);
	assert(raid_io->offset_blocks >= block_range[pd_idx].start);
	pd_lba = raid_io->offset_blocks - block_range[pd_idx].start;
	pd_blocks = raid_io->num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev
	 * descriptor, base bdev lba, child io length in blocks, buffer,
	 * completion function and function callback context
	 */
	assert(raid_ch != NULL);
	base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);

	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = raid_io->memory_domain;
	io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
	io_opts.metadata = raid_io->md_buf;

	if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
						 raid_io->iovs, raid_io->iovcnt,
						 pd_lba, pd_blocks, concat_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
						  raid_io->iovs, raid_io->iovcnt,
						  pd_lba, pd_blocks, concat_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", raid_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _concat_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; this should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void concat_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_null_payload_request(raid_io);
}

static void
concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * concat_submit_null_payload_request function submits the next batch of
 * io requests that have a range but no payload, like FLUSH and UNMAP, to the
 * member disks; it submits as many as possible unless a base io request fails
 * with -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
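/*
 * Worked example (hypothetical sizes): with two member disks of 100 blocks
 * each, an UNMAP of blocks [90, 120) is split into [90, 100) on member 0
 * (pd_lba 90, 10 blocks) and [0, 20) on member 1 (pd_lba 0, 20 blocks).
 */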
static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev *raid_bdev;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint64_t offset_blocks;
	uint64_t num_blocks;
	struct concat_block_range *block_range;
	int i, start_idx, stop_idx;

	raid_bdev = raid_io->raid_bdev;
	block_range = raid_bdev->module_private;

	offset_blocks = raid_io->offset_blocks;
	num_blocks = raid_io->num_blocks;
	start_idx = -1;
	stop_idx = -1;
	/*
	 * Go through all base bdevs to find the first and the last bdev
	 * touched by this request.
	 */
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		/* skip the bdevs that end before offset_blocks */
		if (offset_blocks >= block_range[i].start + block_range[i].length) {
			continue;
		}
		if (start_idx == -1) {
			start_idx = i;
		} else {
			/*
			 * offset_blocks may fall in the middle of the first bdev;
			 * for every subsequent bdev it must be exactly at the
			 * start of that bdev.
			 */
			assert(offset_blocks == block_range[i].start);
		}
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		if (num_blocks == 0) {
			stop_idx = i;
			break;
		}
	}
	assert(start_idx >= 0);
	assert(stop_idx >= 0);

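	/*
	 * base_bdev_io_remaining is zero on the first pass; when this function
	 * is re-entered after an -ENOMEM requeue, the count is already set and
	 * must not be reset.
	 */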
	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1;
	}
	offset_blocks = raid_io->offset_blocks;
	num_blocks = raid_io->num_blocks;
	for (i = start_idx; i <= stop_idx; i++) {
		assert(offset_blocks >= block_range[i].start);
		assert(offset_blocks < block_range[i].start + block_range[i].length);
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		/*
		 * Skip the IOs we have already submitted
		 */
		if (i < start_idx + raid_io->base_bdev_io_submitted) {
			continue;
		}
		base_info = &raid_bdev->base_bdev_info[i];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, i);
		switch (raid_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = raid_bdev_unmap_blocks(base_info, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = raid_bdev_flush_blocks(base_info, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		default:
			SPDK_ERRLOG("submit request: invalid io type with null payload %u\n", raid_io->type);
			assert(false);
			ret = -EIO;
		}
		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _concat_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; this should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static int
concat_start(struct raid_bdev *raid_bdev)
{
	uint64_t total_blockcnt = 0;
	struct raid_base_bdev_info *base_info;
	struct concat_block_range *block_range;

	block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range));
	if (!block_range) {
		SPDK_ERRLOG("Cannot allocate block_range, num_base_bdevs: %u\n",
			    raid_bdev->num_base_bdevs);
		return -ENOMEM;
	}

	int idx = 0;
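	/*
	 * Round each member's usable size down to a whole number of strips so
	 * that member boundaries always coincide with the strip boundaries the
	 * bdev layer splits on (see optimal_io_boundary below).
	 */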
	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		uint64_t strip_cnt = base_info->data_size >> raid_bdev->strip_size_shift;
		uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift;

		base_info->data_size = pd_block_cnt;

		block_range[idx].start = total_blockcnt;
		block_range[idx].length = pd_block_cnt;
		total_blockcnt += pd_block_cnt;
		idx++;
	}

	raid_bdev->module_private = block_range;

	SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = total_blockcnt;

	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;

	return 0;
}
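
/*
 * A concat bdev is created through the same RPC as the other raid levels.
 * Illustrative invocation (option names may vary across SPDK versions; check
 * `scripts/rpc.py bdev_raid_create --help`):
 *
 *   scripts/rpc.py bdev_raid_create -n Concat0 -r concat -z 64 -b "Nvme0n1 Nvme1n1"
 */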

static bool
concat_stop(struct raid_bdev *raid_bdev)
{
	struct concat_block_range *block_range = raid_bdev->module_private;

	free(block_range);

	return true;
}

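/*
 * Module descriptor for the concat raid level. base_bdevs_min is 1, so a
 * concat bdev may consist of a single member disk.
 */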
static struct raid_bdev_module g_concat_module = {
	.level = CONCAT,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.start = concat_start,
	.stop = concat_stop,
	.submit_rw_request = concat_submit_rw_request,
	.submit_null_payload_request = concat_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_concat_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_concat)