xref: /spdk/module/bdev/raid/concat.c (revision 60982c759db49b4f4579f16e3b24df0725ba4b94)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2022 Intel Corporation.
 *   Copyright (c) Peng Yu yupeng0921@gmail.com.
 *   All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

struct concat_block_range {
	uint64_t start;
	uint64_t length;
};

/*
 * brief:
 * concat_bdev_io_completion function is called by the lower layers to notify
 * the raid module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to bdev io submitted to lower layers, like child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void concat_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_rw_request(raid_io);
}

/*
 * brief:
 * concat_submit_rw_request function is used to submit I/O to the correct
 * member disk for concat bdevs.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
concat_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
	struct concat_block_range	*block_range = raid_bdev->module_private;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	int				pd_idx;
	int				ret = 0;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;
	struct spdk_bdev_ext_io_opts	io_opts = {};
	int i;

	pd_idx = -1;
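	/*
	 * Find the base bdev whose block range contains offset_blocks. Because
	 * the concat bdev splits I/O on the strip boundary (see concat_start),
	 * a single request never spans two base bdevs.
	 */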
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		if (block_range[i].start > bdev_io->u.bdev.offset_blocks) {
			break;
		}
		pd_idx = i;
	}
	assert(pd_idx >= 0);
	assert(bdev_io->u.bdev.offset_blocks >= block_range[pd_idx].start);
	pd_lba = bdev_io->u.bdev.offset_blocks - block_range[pd_idx].start;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc is NULL for pd_idx %d\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * base bdev LBA, child io length in blocks, buffer, completion
	 * function and callback context.
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];

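	/* Pass the parent's memory domain and metadata buffer through to the base bdev. */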
	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = bdev_io->u.bdev.memory_domain;
	io_opts.memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
	io_opts.metadata = bdev_io->u.bdev.md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch,
						 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						 pd_lba, pd_blocks, concat_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch,
						  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						  pd_lba, pd_blocks, concat_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _concat_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submission failed with an error other than ENOMEM; this should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void concat_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_null_payload_request(raid_io);
}

static void
concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

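	/*
	 * Account for one finished child I/O; the parent raid_io is completed
	 * once all of its outstanding base bdev I/Os have reported back.
	 */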
	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * concat_submit_null_payload_request function submits the next batch of
 * I/O requests that carry a block range but no payload, such as FLUSH and UNMAP,
 * to the member disks; it submits as many as possible unless a base io request
 * fails with -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io;
	struct raid_bdev		*raid_bdev;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint64_t			offset_blocks;
	uint64_t			num_blocks;
	struct concat_block_range	*block_range;
	int				i, start_idx, stop_idx;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;
	block_range = raid_bdev->module_private;

	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
	start_idx = -1;
	stop_idx = -1;
	/*
	 * Go through all base bdevs and find the first and the last bdev
	 * covered by this request.
	 */
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		/* skip the bdevs before the offset_blocks */
		if (offset_blocks >= block_range[i].start + block_range[i].length) {
			continue;
		}
		if (start_idx == -1) {
			start_idx = i;
		} else {
			/*
			 * The offset_blocks may fall in the middle of the first bdev.
			 * For every bdev after the first, offset_blocks must be at
			 * the start of that bdev.
			 */
			assert(offset_blocks == block_range[i].start);
		}
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		if (num_blocks == 0) {
			stop_idx = i;
			break;
		}
	}
	assert(start_idx >= 0);
	assert(stop_idx >= 0);

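	/*
	 * Record the total number of child I/Os only on the first submission
	 * attempt; on an -ENOMEM requeue the remaining count is preserved and
	 * already-submitted children are skipped below.
	 */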
	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1;
	}
	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
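	/* Walk the affected base bdevs again, this time submitting one child I/O per bdev. */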
	for (i = start_idx; i <= stop_idx; i++) {
		assert(offset_blocks >= block_range[i].start);
		assert(offset_blocks < block_range[i].start + block_range[i].length);
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		/*
		 * Skip the IOs we have submitted
		 */
		if (i < start_idx + raid_io->base_bdev_io_submitted) {
			continue;
		}
		base_info = &raid_bdev->base_bdev_info[i];
		base_ch = raid_io->raid_ch->base_channel[i];
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		default:
			SPDK_ERRLOG("Received invalid io type %u for a null payload request\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}
		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _concat_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submission failed with an error other than ENOMEM; this should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static int
concat_start(struct raid_bdev *raid_bdev)
{
	uint64_t total_blockcnt = 0;
	struct raid_base_bdev_info *base_info;
	struct concat_block_range *block_range;

	block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range));
	if (!block_range) {
		SPDK_ERRLOG("Cannot allocate block_range, num_base_bdevs: %u\n",
			    raid_bdev->num_base_bdevs);
		return -ENOMEM;
	}

	int idx = 0;
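	/*
	 * Round each base bdev's capacity down to a whole number of strips so
	 * that every block range boundary falls on a strip boundary.
	 */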
	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		uint64_t strip_cnt = spdk_bdev_desc_get_bdev(base_info->desc)->blockcnt >>
				     raid_bdev->strip_size_shift;
		uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift;

		block_range[idx].start = total_blockcnt;
		block_range[idx].length = pd_block_cnt;
		total_blockcnt += pd_block_cnt;
		idx++;
	}

	raid_bdev->module_private = block_range;

	SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ", num_base_bdevs %u, strip size shift %u\n",
		      total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = total_blockcnt;

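	/*
	 * Splitting I/O on the strip boundary guarantees that a read or write
	 * never crosses from one base bdev into the next, so
	 * concat_submit_rw_request only ever targets a single member disk.
	 */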
	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;

	return 0;
}

static bool
concat_stop(struct raid_bdev *raid_bdev)
{
	struct concat_block_range *block_range = raid_bdev->module_private;

	free(block_range);

	return true;
}

static struct raid_bdev_module g_concat_module = {
	.level = CONCAT,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.start = concat_start,
	.stop = concat_stop,
	.submit_rw_request = concat_submit_rw_request,
	.submit_null_payload_request = concat_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_concat_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_concat)