xref: /spdk/module/bdev/raid/concat.c (revision a1dfa7ec92a6c49538482c8bb73f0b1ce040441f)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2022 Intel Corporation.
 *   Copyright (c) Peng Yu yupeng0921@gmail.com.
 *   All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

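/* Start and length (in blocks) of a base bdev's slice of the concatenated address space */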
struct concat_block_range {
	uint64_t start;
	uint64_t length;
};

/*
 * brief:
 * concat_bdev_io_completion function is called by lower layers to notify the
 * raid module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to lower layers, i.e. the child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void concat_submit_rw_request(struct raid_bdev_io *raid_io);

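/*
 * Retry callback registered with raid_bdev_queue_io_wait() when a previous
 * submission failed with -ENOMEM.
 */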
static void
_concat_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_rw_request(raid_io);
}

/*
 * brief:
 * concat_submit_rw_request function is used to submit I/O to the correct
 * member disk for concat bdevs.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
concat_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
	struct concat_block_range	*block_range = raid_bdev->module_private;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	int				pd_idx;
	int				ret = 0;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;
	int i;

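	/*
	 * Find the base bdev whose block range contains offset_blocks.  The bdev
	 * layer splits I/O on the strip boundary (see concat_start), so a single
	 * rw request never spans two base bdevs.
	 */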
	pd_idx = -1;
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		if (block_range[i].start > bdev_io->u.bdev.offset_blocks) {
			break;
		}
		pd_idx = i;
	}
	assert(pd_idx >= 0);
	assert(bdev_io->u.bdev.offset_blocks >= block_range[pd_idx].start);
	pd_lba = bdev_io->u.bdev.offset_blocks - block_range[pd_idx].start;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %d\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * base bdev lba, child io length in blocks, buffer, completion function
	 * and function callback context
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];
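	/*
	 * Use the extended I/O API when extended options were passed down with the
	 * parent io; otherwise use the _with_md variants so the metadata buffer is
	 * preserved.
	 */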
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io->u.bdev.ext_opts != NULL) {
			ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch,
							 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
							 pd_lba, pd_blocks, concat_bdev_io_completion,
							 raid_io, bdev_io->u.bdev.ext_opts);
		} else {
			ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch,
							     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
							     bdev_io->u.bdev.md_buf,
							     pd_lba, pd_blocks,
							     concat_bdev_io_completion, raid_io);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io->u.bdev.ext_opts != NULL) {
			ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch,
							  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
							  pd_lba, pd_blocks, concat_bdev_io_completion,
							  raid_io, bdev_io->u.bdev.ext_opts);
		} else {
			ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch,
							      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
							      bdev_io->u.bdev.md_buf,
							      pd_lba, pd_blocks,
							      concat_bdev_io_completion, raid_io);
		}
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

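	/*
	 * On -ENOMEM, queue the request and retry it from _concat_submit_rw_request()
	 * once the bdev layer has resources available again.
	 */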
	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_concat_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void concat_submit_null_payload_request(struct raid_bdev_io *raid_io);

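/*
 * Retry callback registered with raid_bdev_queue_io_wait() for null payload
 * requests that previously failed with -ENOMEM.
 */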
static void
_concat_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_null_payload_request(raid_io);
}

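/*
 * brief:
 * concat_base_io_complete is the completion callback for null payload child ios;
 * it reports one completed part to the parent raid_io, which is completed once
 * all child ios have finished.
 * params:
 * bdev_io - pointer to the child bdev_io
 * success - child bdev_io status
 * cb_arg - parent raid_bdev_io
 * returns:
 * none
 */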
static void
concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * concat_submit_null_payload_request function submits the next batch of
 * io requests with a block range but without payload, like FLUSH and UNMAP,
 * to the member disks; it submits as many as possible unless a base io request
 * fails with -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io;
	struct raid_bdev		*raid_bdev;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint64_t			offset_blocks;
	uint64_t			num_blocks;
	struct concat_block_range	*block_range;
	int				i, start_idx, stop_idx;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;
	block_range = raid_bdev->module_private;

	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
	start_idx = -1;
	stop_idx = -1;
	/*
	 * Go through all base bdevs and find the first and the last base bdevs
	 * touched by the request.
	 */
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		/* skip the bdevs before the offset_blocks */
		if (offset_blocks >= block_range[i].start + block_range[i].length) {
			continue;
		}
		if (start_idx == -1) {
			start_idx = i;
		} else {
			/*
			 * The offset_blocks might be in the middle of the first bdev.
			 * Except for the first bdev, the offset_blocks should always
			 * be at the start of the bdev.
			 */
			assert(offset_blocks == block_range[i].start);
		}
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		if (num_blocks == 0) {
			stop_idx = i;
			break;
		}
	}
	assert(start_idx >= 0);
	assert(stop_idx >= 0);

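	/*
	 * base_bdev_io_remaining is set only on the first invocation; on a retry
	 * after -ENOMEM it already holds the total number of child ios.
	 */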
	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1;
	}
	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
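	/* Walk the affected base bdevs again, this time submitting the child ios */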
	for (i = start_idx; i <= stop_idx; i++) {
		assert(offset_blocks >= block_range[i].start);
		assert(offset_blocks < block_range[i].start + block_range[i].length);
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		/*
		 * Skip the ios we have already submitted
		 */
		if (i < start_idx + raid_io->base_bdev_io_submitted) {
			continue;
		}
		base_info = &raid_bdev->base_bdev_info[i];
		base_ch = raid_io->raid_ch->base_channel[i];
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		default:
			SPDK_ERRLOG("submit request: invalid io type %u with null payload\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}
		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						_concat_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

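/*
 * brief:
 * concat_start is called when the raid bdev is started; it builds the table of
 * block ranges that each base bdev covers and sets the total block count and
 * the optimal io boundary of the concat bdev.
 * params:
 * raid_bdev - pointer to the raid bdev
 * returns:
 * 0 on success, -ENOMEM if the block range table cannot be allocated
 */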
static int
concat_start(struct raid_bdev *raid_bdev)
{
	uint64_t total_blockcnt = 0;
	struct raid_base_bdev_info *base_info;
	struct concat_block_range *block_range;

	block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range));
	if (!block_range) {
		SPDK_ERRLOG("Cannot allocate block_range, num_base_bdevs: %u\n",
			    raid_bdev->num_base_bdevs);
		return -ENOMEM;
	}

	int idx = 0;
	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
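		/* Round each base bdev down to a whole number of strips */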
		uint64_t strip_cnt = base_info->bdev->blockcnt >> raid_bdev->strip_size_shift;
		uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift;

		block_range[idx].start = total_blockcnt;
		block_range[idx].length = pd_block_cnt;
		total_blockcnt += pd_block_cnt;
		idx++;
	}

	raid_bdev->module_private = block_range;

	SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = total_blockcnt;

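	/*
	 * Splitting on the strip size boundary guarantees that a read or write
	 * request never spans two base bdevs.
	 */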
	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;

	return 0;
}

static bool
concat_stop(struct raid_bdev *raid_bdev)
{
	struct concat_block_range *block_range = raid_bdev->module_private;

	free(block_range);

	return true;
}

static struct raid_bdev_module g_concat_module = {
	.level = CONCAT,
	.base_bdevs_min = 1,
	.start = concat_start,
	.stop = concat_stop,
	.submit_rw_request = concat_submit_rw_request,
	.submit_null_payload_request = concat_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_concat_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_concat)