/* xref: /spdk/module/bdev/raid/concat.c (revision 927f1fd57bd004df581518466ec4c1b8083e5d23) */
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Peng Yu yupeng0921@gmail.com.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

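/*
 * Each base bdev is mapped to a contiguous range of the concatenated bdev's
 * logical block address space: "start" is the first logical block served by
 * that base bdev and "length" is the number of blocks it contributes.
 * The array built in concat_start() is ordered by "start".
 */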
struct concat_block_range {
	uint64_t start;
	uint64_t length;
};

/*
 * brief:
 * concat_bdev_io_completion function is called by the lower layers to notify
 * the raid module that a particular bdev_io has completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers (the child io)
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
concat_submit_rw_request(struct raid_bdev_io *raid_io);

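/*
 * Thin wrapper with the void * callback signature used by raid_bdev_queue_io_wait(),
 * so a request that failed with -ENOMEM can be resubmitted later.
 */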
static void
_concat_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_rw_request(raid_io);
}

/*
 * brief:
 * concat_submit_rw_request function submits a read or write request to the
 * correct member disk of a concat bdev.
 * params:
 * raid_io - pointer to the parent raid bdev io
 * returns:
 * none
 */
static void
concat_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
	struct concat_block_range	*block_range = raid_bdev->module_private;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	int				pd_idx;
	int				ret = 0;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;
	int i;

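	/*
	 * block_range is sorted by start, so a linear scan finds the last base
	 * bdev whose range begins at or before offset_blocks; that member disk
	 * serves this request. Hypothetical example: with two base bdevs of 100
	 * and 200 usable blocks, the ranges are {0, 100} and {100, 200}, and an
	 * I/O at offset_blocks = 150 is routed to pd_idx = 1 with pd_lba = 50.
	 */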
	pd_idx = -1;
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		if (block_range[i].start > bdev_io->u.bdev.offset_blocks) {
			break;
		}
		pd_idx = i;
	}
	assert(pd_idx >= 0);
	assert(bdev_io->u.bdev.offset_blocks >= block_range[pd_idx].start);
	pd_lba = bdev_io->u.bdev.offset_blocks - block_range[pd_idx].start;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * base bdev lba, child io length in blocks, buffer, completion function
	 * and function callback context
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks(base_info->desc, base_ch,
					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					     pd_lba, pd_blocks, concat_bdev_io_completion,
					     raid_io);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks(base_info->desc, base_ch,
					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					      pd_lba, pd_blocks, concat_bdev_io_completion,
					      raid_io);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

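	/*
	 * -ENOMEM means the base bdev had no spdk_bdev_io available; queue the
	 * raid_io to be resubmitted once the base bdev frees one. Any other
	 * error is unexpected and fails the parent io.
	 */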
	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_concat_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; this should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_concat_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	concat_submit_null_payload_request(raid_io);
}

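/*
 * Completion callback for each base bdev io of a null payload request. The
 * parent raid_io is completed once every outstanding base io has reported in,
 * which raid_bdev_io_complete_part() tracks via base_bdev_io_remaining.
 */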
static void
concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * concat_submit_null_payload_request function submits io requests that carry
 * a block range but no payload, such as FLUSH and UNMAP, to the member disks.
 * It submits as many base ios as possible; if one fails with -ENOMEM, the
 * request queues itself and resumes from where it left off when resubmitted.
 * params:
 * raid_io - pointer to the parent raid bdev io
 * returns:
 * none
 */
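/*
 * Hypothetical example: with member ranges {0, 100} and {100, 200}, an UNMAP of
 * 120 blocks starting at offset 90 is split into two base ios: 10 blocks at
 * pd_lba 90 on member 0 and 110 blocks at pd_lba 0 on member 1.
 */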
static void
concat_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io;
	struct raid_bdev		*raid_bdev;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint64_t			offset_blocks;
	uint64_t			num_blocks;
	struct concat_block_range	*block_range;
	int				i, start_idx, stop_idx;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;
	block_range = raid_bdev->module_private;

	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
	start_idx = -1;
	stop_idx = -1;
	/*
	 * First pass: walk all base bdevs to find the first and the last member
	 * touched by this request.
	 */
	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		/* skip the bdevs that end before offset_blocks */
		if (offset_blocks >= block_range[i].start + block_range[i].length) {
			continue;
		}
		if (start_idx == -1) {
			start_idx = i;
		} else {
			/*
			 * offset_blocks may fall in the middle of the first member.
			 * For every member after the first, offset_blocks must be
			 * exactly at the start of that member's range.
			 */
			assert(offset_blocks == block_range[i].start);
		}
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		if (num_blocks == 0) {
			stop_idx = i;
			break;
		}
	}
	assert(start_idx >= 0);
	assert(stop_idx >= 0);

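	/*
	 * Record how many base ios this request splits into. Skip this if it is
	 * already set, i.e. when the request is resubmitted after -ENOMEM.
	 */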
	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1;
	}
	offset_blocks = bdev_io->u.bdev.offset_blocks;
	num_blocks = bdev_io->u.bdev.num_blocks;
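	/*
	 * Second pass: recompute each member's lba and block count and submit
	 * the base ios, skipping any that were already submitted before a
	 * previous -ENOMEM.
	 */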
	for (i = start_idx; i <= stop_idx; i++) {
		assert(offset_blocks >= block_range[i].start);
		assert(offset_blocks < block_range[i].start + block_range[i].length);
		pd_lba = offset_blocks - block_range[i].start;
		pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba);
		offset_blocks += pd_blocks;
		num_blocks -= pd_blocks;
		/*
		 * Skip the ios we have already submitted
		 */
		if (i < start_idx + raid_io->base_bdev_io_submitted) {
			continue;
		}
		base_info = &raid_bdev->base_bdev_info[i];
		base_ch = raid_io->raid_ch->base_channel[i];
		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     pd_lba, pd_blocks,
						     concat_base_io_complete, raid_io);
			break;
		default:
			SPDK_ERRLOG("submit request: invalid io type with null payload %u\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}
		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						_concat_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; this should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}


static int
concat_start(struct raid_bdev *raid_bdev)
{
	uint64_t total_blockcnt = 0;
	struct raid_base_bdev_info *base_info;
	struct concat_block_range *block_range;

	block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range));
	if (!block_range) {
		SPDK_ERRLOG("Cannot allocate block_range, num_base_bdevs: %u\n",
			    raid_bdev->num_base_bdevs);
		return -ENOMEM;
	}

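	/*
	 * Each member contributes its block count rounded down to a whole number
	 * of strips, so the boundaries between members always fall on strip
	 * boundaries. Hypothetical example: with strip_size_shift = 3 (8-block
	 * strips), a base bdev of 100 blocks contributes 12 strips = 96 blocks.
	 */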
	int idx = 0;
	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		uint64_t strip_cnt = base_info->bdev->blockcnt >> raid_bdev->strip_size_shift;
		uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift;

		block_range[idx].start = total_blockcnt;
		block_range[idx].length = pd_block_cnt;
		total_blockcnt += pd_block_cnt;
		idx++;
	}

	raid_bdev->module_private = block_range;

	SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ", num_base_bdevs %u, strip size shift %u\n",
		      total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = total_blockcnt;

	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;

	return 0;
}

static void
concat_stop(struct raid_bdev *raid_bdev)
{
	struct concat_block_range *block_range = raid_bdev->module_private;

	free(block_range);
}

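/*
 * Registration of the concat raid module: the framework invokes start/stop on
 * raid bdev creation/deletion and routes rw and null payload requests to the
 * submit callbacks above. A single base bdev is enough to build a concat bdev.
 */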
static struct raid_bdev_module g_concat_module = {
	.level = CONCAT,
	.base_bdevs_min = 1,
	.start = concat_start,
	.stop = concat_stop,
	.submit_rw_request = concat_submit_rw_request,
	.submit_null_payload_request = concat_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_concat_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_concat)