xref: /spdk/module/bdev/raid/raid0.c (revision 0ed85362c8132a2d1927757fbcade66b6660d26a)
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk_internal/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by the lower layers to notify
 * the raid module that a particular bdev_io has completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers, i.e. the child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}

/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
	uint64_t			pd_strip;
	uint32_t			offset_in_strip;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint8_t				pd_idx;
	int				ret = 0;
	uint64_t			start_strip;
	uint64_t			end_strip;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}
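
	/*
	 * Worked example of the mapping above (hypothetical numbers, not
	 * taken from the surrounding code): with strip_size = 64 blocks
	 * (strip_size_shift = 6) and num_base_bdevs = 4, an I/O at
	 * offset_blocks = 200 yields start_strip = 200 >> 6 = 3,
	 * pd_idx = 3 % 4 = 3, pd_strip = 3 / 4 = 0 and
	 * offset_in_strip = 200 & 63 = 8, so pd_lba = (0 << 6) + 8 = 8
	 * on base bdev 3.
	 */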

	/*
	 * Submit the child I/O to the bdev layer, passing the base bdev
	 * descriptor, base bdev LBA, child I/O length in blocks, buffer,
	 * completion function and function callback context.
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks(base_info->desc, base_ch,
					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					     pd_lba, pd_blocks, raid0_bdev_io_completion,
					     raid_io);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks(base_info->desc, base_ch,
					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					      pd_lba, pd_blocks, raid0_bdev_io_completion,
					      raid_io);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t	strip_size;
	uint64_t	start_strip_in_disk;
	uint64_t	end_strip_in_disk;
	uint64_t	start_offset_in_strip;
	uint64_t	end_offset_in_strip;
	uint8_t		start_disk;
	uint8_t		end_disk;
	uint8_t		n_disks_involved;
};

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t	start_strip;
	uint64_t	end_strip;

	io_range->strip_size = strip_size;

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may start at an unaligned LBA offset and the last
	 * strip may end at one. All strips in between are aligned to strip
	 * boundaries in both offset and length.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;

	/* The base bdev indexes in which start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base bdevs are involved in the I/O operation.
	 * The number is between 1 and num_base_bdevs; it is 1 when the first
	 * and last strips are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}
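
/* Worked example for _raid0_get_io_range() (hypothetical numbers, not from
 * the code above): with num_base_bdevs = 3, strip_size = 8 blocks
 * (strip_size_shift = 3), offset_blocks = 10 and num_blocks = 30, the I/O
 * covers raid strips 1 through 4, so start_strip_in_disk = 0,
 * end_strip_in_disk = 1, start_offset_in_strip = 2, end_offset_in_strip = 7,
 * start_disk = 1, end_disk = 1 and n_disks_involved = min(4, 3) = 3.
 */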

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0,
		      "raid_bdev (strip_size 0x%lx) splits IO to base_bdev (%u) at (0x%lx, 0x%lx).\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}
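
/* Continuing the hypothetical example above: for disk_idx = 1 (both the
 * start and end disk) the range spans in-disk strips 0 and 1, so
 * offset_in_disk = 2 and nblocks_in_disk = (2 - 1) * 8 + 7 - 2 + 1 = 14;
 * for disk_idx = 2 the result is (0, 8); for disk_idx = 0 the start strip
 * is bumped to in-disk strip 1, giving (8, 8). Each involved disk thus
 * receives exactly one contiguous range.
 */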

static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}
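
/* Each child completion accounts for one outstanding base bdev I/O:
 * raid_bdev_io_complete_part() (defined in bdev_raid.c) decrements
 * base_bdev_io_remaining and completes the parent raid_io once it reaches
 * zero, reporting FAILED if any child failed. For example, an UNMAP fanned
 * out to three member disks completes only after all three children do.
 */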

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * I/O requests that have a range but no payload, such as FLUSH and UNMAP,
 * to the member disks; it submits as many as possible unless a base I/O
 * request fails with -ENOMEM, in which case it queues itself for later
 * submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io;
	struct raid_bdev		*raid_bdev;
	struct raid_bdev_io_range	io_range;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

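	/* Resume point for requests requeued on -ENOMEM: base_bdev_io_submitted
	 * is preserved across retries, so e.g. with n_disks_involved = 3 and a
	 * failure on the second child submit, the retry skips the first child
	 * and submits only the remaining two.
	 */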
	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Base bdevs are walked from start_disk to end_disk, wrapping
		 * around modulo num_base_bdevs; the index of start_disk may
		 * therefore be larger than that of end_disk.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_io->raid_ch->base_channel[disk_idx];

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request: invalid io type %u with null payload\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						_raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate the minimum block count from all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
	}

	/*
	 * The total block count of the raid bdev is the strip-aligned minimum
	 * block count of any base bdev multiplied by the number of base bdevs.
	 */
	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0, "min blockcount %lu, numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
				    raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;
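
	/*
	 * Example with hypothetical numbers: min_blockcnt = 1000,
	 * strip_size_shift = 6 (64-block strips) and num_base_bdevs = 4
	 * give ((1000 >> 6) << 6) = 960 usable blocks per base bdev, so
	 * bdev.blockcnt = 960 * 4 = 3840.
	 */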

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* There is no need to split reads/writes on a single-bdev RAID0. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT("bdev_raid0", SPDK_LOG_BDEV_RAID0)
399