xref: /spdk/module/bdev/raid/concat.c (revision 6e03e49b3610bfd79d1539578ae1e138d5ed1b5c)
1488570ebSJim Harris /*   SPDX-License-Identifier: BSD-3-Clause
2a6dbe372Spaul luse  *   Copyright (C) 2022 Intel Corporation.
364eebbd1Syupeng  *   Copyright (c) Peng Yu yupeng0921@gmail.com.
464eebbd1Syupeng  *   All rights reserved.
564eebbd1Syupeng  */
664eebbd1Syupeng 
764eebbd1Syupeng #include "bdev_raid.h"
864eebbd1Syupeng 
964eebbd1Syupeng #include "spdk/env.h"
1064eebbd1Syupeng #include "spdk/thread.h"
1164eebbd1Syupeng #include "spdk/string.h"
1264eebbd1Syupeng #include "spdk/util.h"
1364eebbd1Syupeng 
1464eebbd1Syupeng #include "spdk/log.h"
1564eebbd1Syupeng 
/*
 * Address-translation entry for one member disk: the half-open range
 * [start, start + length) of raid bdev blocks that this member serves.
 * concat_start() builds one entry per base bdev, in member order, and
 * stores the array in raid_bdev->module_private.
 */
struct concat_block_range {
	uint64_t start;		/* first raid bdev block mapped to this member */
	uint64_t length;	/* number of raid bdev blocks on this member */
};
2064eebbd1Syupeng 
2164eebbd1Syupeng /*
2264eebbd1Syupeng  * brief:
2364eebbd1Syupeng  * concat_bdev_io_completion function is called by lower layers to notify raid
2464eebbd1Syupeng  * module that particular bdev_io is completed.
2564eebbd1Syupeng  * params:
2664eebbd1Syupeng  * bdev_io - pointer to bdev io submitted to lower layers, like child io
2764eebbd1Syupeng  * success - bdev_io status
2864eebbd1Syupeng  * cb_arg - function callback context (parent raid_bdev_io)
2964eebbd1Syupeng  * returns:
3064eebbd1Syupeng  * none
3164eebbd1Syupeng  */
3264eebbd1Syupeng static void
concat_bdev_io_completion(struct spdk_bdev_io * bdev_io,bool success,void * cb_arg)3364eebbd1Syupeng concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3464eebbd1Syupeng {
3564eebbd1Syupeng 	struct raid_bdev_io *raid_io = cb_arg;
3664eebbd1Syupeng 
3764eebbd1Syupeng 	spdk_bdev_free_io(bdev_io);
3864eebbd1Syupeng 
3964eebbd1Syupeng 	if (success) {
4064eebbd1Syupeng 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
4164eebbd1Syupeng 	} else {
4264eebbd1Syupeng 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
4364eebbd1Syupeng 	}
4464eebbd1Syupeng }
4564eebbd1Syupeng 
static void concat_submit_rw_request(struct raid_bdev_io *raid_io);

/*
 * Thunk with the spdk_bdev_io_wait_cb signature used to retry a read/write
 * request that was queued after an -ENOMEM from the base bdev.
 */
static void
_concat_submit_rw_request(void *_raid_io)
{
	concat_submit_rw_request((struct raid_bdev_io *)_raid_io);
}
5564eebbd1Syupeng 
5664eebbd1Syupeng /*
5764eebbd1Syupeng  * brief:
5864eebbd1Syupeng  * concat_submit_rw_request function is used to submit I/O to the correct
5964eebbd1Syupeng  * member disk for concat bdevs.
6064eebbd1Syupeng  * params:
6164eebbd1Syupeng  * raid_io
6264eebbd1Syupeng  * returns:
6364eebbd1Syupeng  * none
6464eebbd1Syupeng  */
6564eebbd1Syupeng static void
concat_submit_rw_request(struct raid_bdev_io * raid_io)6664eebbd1Syupeng concat_submit_rw_request(struct raid_bdev_io *raid_io)
6764eebbd1Syupeng {
6864eebbd1Syupeng 	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
6964eebbd1Syupeng 	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
7064eebbd1Syupeng 	struct concat_block_range	*block_range = raid_bdev->module_private;
7164eebbd1Syupeng 	uint64_t			pd_lba;
7264eebbd1Syupeng 	uint64_t			pd_blocks;
7364eebbd1Syupeng 	int				pd_idx;
7464eebbd1Syupeng 	int				ret = 0;
7564eebbd1Syupeng 	struct raid_base_bdev_info	*base_info;
7664eebbd1Syupeng 	struct spdk_io_channel		*base_ch;
7755f94793SKonrad Sztyber 	struct spdk_bdev_ext_io_opts	io_opts = {};
7864eebbd1Syupeng 	int i;
7964eebbd1Syupeng 
8064eebbd1Syupeng 	pd_idx = -1;
8164eebbd1Syupeng 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
82a4e1703eSArtur Paszkiewicz 		if (block_range[i].start > raid_io->offset_blocks) {
8364eebbd1Syupeng 			break;
8464eebbd1Syupeng 		}
8564eebbd1Syupeng 		pd_idx = i;
8664eebbd1Syupeng 	}
8764eebbd1Syupeng 	assert(pd_idx >= 0);
88a4e1703eSArtur Paszkiewicz 	assert(raid_io->offset_blocks >= block_range[pd_idx].start);
89a4e1703eSArtur Paszkiewicz 	pd_lba = raid_io->offset_blocks - block_range[pd_idx].start;
90a4e1703eSArtur Paszkiewicz 	pd_blocks = raid_io->num_blocks;
9164eebbd1Syupeng 	base_info = &raid_bdev->base_bdev_info[pd_idx];
9264eebbd1Syupeng 	if (base_info->desc == NULL) {
9364eebbd1Syupeng 		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
9464eebbd1Syupeng 		assert(0);
9564eebbd1Syupeng 	}
9664eebbd1Syupeng 
9764eebbd1Syupeng 	/*
9864eebbd1Syupeng 	 * Submit child io to bdev layer with using base bdev descriptors, base
9964eebbd1Syupeng 	 * bdev lba, base bdev child io length in blocks, buffer, completion
10064eebbd1Syupeng 	 * function and function callback context
10164eebbd1Syupeng 	 */
10264eebbd1Syupeng 	assert(raid_ch != NULL);
103*6e03e49bSArtur Paszkiewicz 	base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);
10455f94793SKonrad Sztyber 
10555f94793SKonrad Sztyber 	io_opts.size = sizeof(io_opts);
106a4e1703eSArtur Paszkiewicz 	io_opts.memory_domain = raid_io->memory_domain;
107a4e1703eSArtur Paszkiewicz 	io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
108a4e1703eSArtur Paszkiewicz 	io_opts.metadata = raid_io->md_buf;
10955f94793SKonrad Sztyber 
110a4e1703eSArtur Paszkiewicz 	if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
111614ca6d2SArtur Paszkiewicz 		ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
112a4e1703eSArtur Paszkiewicz 						 raid_io->iovs, raid_io->iovcnt,
11364eebbd1Syupeng 						 pd_lba, pd_blocks, concat_bdev_io_completion,
11455f94793SKonrad Sztyber 						 raid_io, &io_opts);
115a4e1703eSArtur Paszkiewicz 	} else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
116614ca6d2SArtur Paszkiewicz 		ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
117a4e1703eSArtur Paszkiewicz 						  raid_io->iovs, raid_io->iovcnt,
11864eebbd1Syupeng 						  pd_lba, pd_blocks, concat_bdev_io_completion,
11955f94793SKonrad Sztyber 						  raid_io, &io_opts);
120ba460005SKrzysztof Smolinski 	} else {
121a4e1703eSArtur Paszkiewicz 		SPDK_ERRLOG("Recvd not supported io type %u\n", raid_io->type);
12264eebbd1Syupeng 		assert(0);
12364eebbd1Syupeng 	}
12464eebbd1Syupeng 
12564eebbd1Syupeng 	if (ret == -ENOMEM) {
1268d1993a5SArtur Paszkiewicz 		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
1278d1993a5SArtur Paszkiewicz 					base_ch, _concat_submit_rw_request);
12864eebbd1Syupeng 	} else if (ret != 0) {
12964eebbd1Syupeng 		SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
13064eebbd1Syupeng 		assert(false);
13164eebbd1Syupeng 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
13264eebbd1Syupeng 	}
13364eebbd1Syupeng }
13464eebbd1Syupeng 
static void concat_submit_null_payload_request(struct raid_bdev_io *raid_io);

/*
 * Thunk with the spdk_bdev_io_wait_cb signature used to retry a null-payload
 * request (UNMAP/FLUSH) that was queued after an -ENOMEM from a base bdev.
 */
static void
_concat_submit_null_payload_request(void *_raid_io)
{
	concat_submit_null_payload_request((struct raid_bdev_io *)_raid_io);
}
14464eebbd1Syupeng 
14564eebbd1Syupeng static void
concat_base_io_complete(struct spdk_bdev_io * bdev_io,bool success,void * cb_arg)14664eebbd1Syupeng concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
14764eebbd1Syupeng {
14864eebbd1Syupeng 	struct raid_bdev_io *raid_io = cb_arg;
14964eebbd1Syupeng 
15064eebbd1Syupeng 	raid_bdev_io_complete_part(raid_io, 1, success ?
15164eebbd1Syupeng 				   SPDK_BDEV_IO_STATUS_SUCCESS :
15264eebbd1Syupeng 				   SPDK_BDEV_IO_STATUS_FAILED);
15364eebbd1Syupeng 
15464eebbd1Syupeng 	spdk_bdev_free_io(bdev_io);
15564eebbd1Syupeng }
15664eebbd1Syupeng 
15764eebbd1Syupeng /*
15864eebbd1Syupeng  * brief:
15964eebbd1Syupeng  * concat_submit_null_payload_request function submits the next batch of
16064eebbd1Syupeng  * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
16164eebbd1Syupeng  * it will submit as many as possible unless one base io request fails with -ENOMEM,
16264eebbd1Syupeng  * in which case it will queue itself for later submission.
16364eebbd1Syupeng  * params:
16464eebbd1Syupeng  * bdev_io - pointer to parent bdev_io on raid bdev device
16564eebbd1Syupeng  * returns:
16664eebbd1Syupeng  * none
16764eebbd1Syupeng  */
/*
 * brief:
 * concat_submit_null_payload_request function submits the next batch of
 * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
 * it will submit as many as possible unless one base io request fails with -ENOMEM,
 * in which case it will queue itself for later submission.
 * params:
 * raid_io - parent raid I/O (UNMAP or FLUSH) spanning one or more members
 * returns:
 * none
 */
static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev		*raid_bdev;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint64_t			offset_blocks;
	uint64_t			num_blocks;
	struct concat_block_range	*block_range;
	int				i, start_idx, stop_idx;

	raid_bdev = raid_io->raid_bdev;
	block_range = raid_bdev->module_private;

	offset_blocks = raid_io->offset_blocks;
	num_blocks = raid_io->num_blocks;
	start_idx = -1;
	stop_idx = -1;
	/*
	 * Go through all base bdevs, find the first bdev and the last bdev
	 */
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		/* skip the bdevs before the offset_blocks */
		if (offset_blocks >= block_range[i].start + block_range[i].length) {
			continue;
		}
		if (start_idx == -1) {
			start_idx = i;
		} else {
			/*
			 * The offset_blocks might be at the middle of the first bdev.
			 * Besides the first bdev, the offset_blocks should be always
			 * at the start of the bdev.
			 */
			assert(offset_blocks == block_range[i].start);
		}
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		if (num_blocks == 0) {
			stop_idx = i;
			break;
		}
	}
	assert(start_idx >= 0);
	assert(stop_idx >= 0);

	/*
	 * base_bdev_io_remaining is non-zero when this is a resubmission after
	 * -ENOMEM; only initialize the part count on the first pass.
	 */
	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1;
	}
	/* Second pass: recompute each member's slice and submit the child I/Os. */
	offset_blocks = raid_io->offset_blocks;
	num_blocks = raid_io->num_blocks;
	for (i = start_idx; i <= stop_idx; i++) {
		assert(offset_blocks >= block_range[i].start);
		assert(offset_blocks < block_range[i].start + block_range[i].length);
		pd_lba = offset_blocks -  block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		/*
		 * Skip the IOs we have submitted
		 */
		if (i < start_idx + raid_io->base_bdev_io_submitted) {
			continue;
		}
		base_info = &raid_bdev->base_bdev_info[i];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, i);
		switch (raid_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = raid_bdev_unmap_blocks(base_info, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = raid_bdev_flush_blocks(base_info, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", raid_io->type);
			assert(false);
			ret = -EIO;
		}
		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			/* Queue for retry; base_bdev_io_submitted marks where to resume. */
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _concat_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}
26964eebbd1Syupeng 
2708dd1cd21SBen Walker static int
concat_start(struct raid_bdev * raid_bdev)2718dd1cd21SBen Walker concat_start(struct raid_bdev *raid_bdev)
27264eebbd1Syupeng {
27364eebbd1Syupeng 	uint64_t total_blockcnt = 0;
27464eebbd1Syupeng 	struct raid_base_bdev_info *base_info;
27564eebbd1Syupeng 	struct concat_block_range *block_range;
27664eebbd1Syupeng 
27764eebbd1Syupeng 	block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range));
27864eebbd1Syupeng 	if (!block_range) {
27964eebbd1Syupeng 		SPDK_ERRLOG("Can not allocate block_range, num_base_bdevs: %u",
28064eebbd1Syupeng 			    raid_bdev->num_base_bdevs);
28164eebbd1Syupeng 		return -ENOMEM;
28264eebbd1Syupeng 	}
28364eebbd1Syupeng 
28464eebbd1Syupeng 	int idx = 0;
28564eebbd1Syupeng 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
286deed7d2fSArtur Paszkiewicz 		uint64_t strip_cnt = base_info->data_size >> raid_bdev->strip_size_shift;
28764eebbd1Syupeng 		uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift;
28864eebbd1Syupeng 
289deed7d2fSArtur Paszkiewicz 		base_info->data_size = pd_block_cnt;
290deed7d2fSArtur Paszkiewicz 
29164eebbd1Syupeng 		block_range[idx].start = total_blockcnt;
29264eebbd1Syupeng 		block_range[idx].length = pd_block_cnt;
29364eebbd1Syupeng 		total_blockcnt += pd_block_cnt;
29464eebbd1Syupeng 		idx++;
29564eebbd1Syupeng 	}
29664eebbd1Syupeng 
29764eebbd1Syupeng 	raid_bdev->module_private = block_range;
29864eebbd1Syupeng 
29964eebbd1Syupeng 	SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ",  numbasedev %u, strip size shift %u\n",
30064eebbd1Syupeng 		      total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
30164eebbd1Syupeng 	raid_bdev->bdev.blockcnt = total_blockcnt;
30264eebbd1Syupeng 
30364eebbd1Syupeng 	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
30464eebbd1Syupeng 	raid_bdev->bdev.split_on_optimal_io_boundary = true;
30564eebbd1Syupeng 
30664eebbd1Syupeng 	return 0;
30764eebbd1Syupeng }
30864eebbd1Syupeng 
3099cf1ab5bSArtur Paszkiewicz static bool
concat_stop(struct raid_bdev * raid_bdev)31064eebbd1Syupeng concat_stop(struct raid_bdev *raid_bdev)
31164eebbd1Syupeng {
31264eebbd1Syupeng 	struct concat_block_range *block_range = raid_bdev->module_private;
31364eebbd1Syupeng 
31464eebbd1Syupeng 	free(block_range);
3159cf1ab5bSArtur Paszkiewicz 
3169cf1ab5bSArtur Paszkiewicz 	return true;
31764eebbd1Syupeng }
31864eebbd1Syupeng 
/*
 * Module descriptor registering the CONCAT raid level with the raid bdev
 * framework; the framework dispatches start/stop and I/O submission through
 * these callbacks.
 */
static struct raid_bdev_module g_concat_module = {
	.level = CONCAT,
	.base_bdevs_min = 1,	/* a concat of a single member is allowed */
	.memory_domains_supported = true,
	.start = concat_start,
	.stop = concat_stop,
	.submit_rw_request = concat_submit_rw_request,
	.submit_null_payload_request = concat_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_concat_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_concat)
331