xref: /spdk/module/bdev/raid/raid0.c (revision 927f1fd57bd004df581518466ec4c1b8083e5d23)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *   Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include "bdev_raid.h"
36 
37 #include "spdk/env.h"
38 #include "spdk/thread.h"
39 #include "spdk/string.h"
40 #include "spdk/util.h"
41 
42 #include "spdk/log.h"
43 
44 /*
45  * brief:
46  * raid0_bdev_io_completion function is called by lower layers to notify raid
47  * module that particular bdev_io is completed.
48  * params:
49  * bdev_io - pointer to bdev io submitted to lower layers, like child io
50  * success - bdev_io status
51  * cb_arg - function callback context (parent raid_bdev_io)
52  * returns:
53  * none
54  */
55 static void
56 raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
57 {
58 	struct raid_bdev_io *raid_io = cb_arg;
59 
60 	spdk_bdev_free_io(bdev_io);
61 
62 	if (success) {
63 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
64 	} else {
65 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
66 	}
67 }
68 
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io);

/* Trampoline with a void * context, used to re-submit a read/write raid I/O
 * from the bdev io_wait queue after an earlier -ENOMEM.
 */
static void
_raid0_submit_rw_request(void *_raid_io)
{
	raid0_submit_rw_request((struct raid_bdev_io *)_raid_io);
}
79 
80 /*
81  * brief:
82  * raid0_submit_rw_request function is used to submit I/O to the correct
83  * member disk for raid0 bdevs.
84  * params:
85  * raid_io
86  * returns:
87  * none
88  */
89 static void
90 raid0_submit_rw_request(struct raid_bdev_io *raid_io)
91 {
92 	struct spdk_bdev_io		*bdev_io = spdk_bdev_io_from_ctx(raid_io);
93 	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
94 	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
95 	uint64_t			pd_strip;
96 	uint32_t			offset_in_strip;
97 	uint64_t			pd_lba;
98 	uint64_t			pd_blocks;
99 	uint8_t				pd_idx;
100 	int				ret = 0;
101 	uint64_t			start_strip;
102 	uint64_t			end_strip;
103 	struct raid_base_bdev_info	*base_info;
104 	struct spdk_io_channel		*base_ch;
105 
106 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
107 	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
108 		    raid_bdev->strip_size_shift;
109 	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
110 		assert(false);
111 		SPDK_ERRLOG("I/O spans strip boundary!\n");
112 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
113 		return;
114 	}
115 
116 	pd_strip = start_strip / raid_bdev->num_base_bdevs;
117 	pd_idx = start_strip % raid_bdev->num_base_bdevs;
118 	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
119 	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
120 	pd_blocks = bdev_io->u.bdev.num_blocks;
121 	base_info = &raid_bdev->base_bdev_info[pd_idx];
122 	if (base_info->desc == NULL) {
123 		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
124 		assert(0);
125 	}
126 
127 	/*
128 	 * Submit child io to bdev layer with using base bdev descriptors, base
129 	 * bdev lba, base bdev child io length in blocks, buffer, completion
130 	 * function and function callback context
131 	 */
132 	assert(raid_ch != NULL);
133 	assert(raid_ch->base_channel);
134 	base_ch = raid_ch->base_channel[pd_idx];
135 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
136 		ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch,
137 						 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
138 						 pd_lba, pd_blocks, raid0_bdev_io_completion,
139 						 raid_io, bdev_io->u.bdev.ext_opts);
140 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
141 		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch,
142 						  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
143 						  pd_lba, pd_blocks, raid0_bdev_io_completion,
144 						  raid_io, bdev_io->u.bdev.ext_opts);
145 	} else {
146 		SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
147 		assert(0);
148 	}
149 
150 	if (ret == -ENOMEM) {
151 		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
152 					_raid0_submit_rw_request);
153 	} else if (ret != 0) {
154 		SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
155 		assert(false);
156 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
157 	}
158 }
159 
/* raid0 IO range: describes how one null-payload I/O (unmap/flush) maps onto
 * the member disks, as computed by _raid0_get_io_range().
 */
struct raid_bdev_io_range {
	uint64_t	strip_size;		/* strip size of the raid bdev, in blocks */
	uint64_t	start_strip_in_disk;	/* first strip index, in per-disk scope */
	uint64_t	end_strip_in_disk;	/* last strip index, in per-disk scope */
	uint64_t	start_offset_in_strip;	/* unaligned head offset within the first strip */
	uint64_t	end_offset_in_strip;	/* unaligned tail offset within the last strip */
	uint8_t		start_disk;		/* member disk holding the first strip */
	uint8_t		end_disk;		/* member disk holding the last strip */
	uint8_t		n_disks_involved;	/* number of member disks touched (1..num_base_bdevs) */
};
171 
172 static inline void
173 _raid0_get_io_range(struct raid_bdev_io_range *io_range,
174 		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
175 		    uint64_t offset_blocks, uint64_t num_blocks)
176 {
177 	uint64_t	start_strip;
178 	uint64_t	end_strip;
179 
180 	io_range->strip_size = strip_size;
181 
182 	/* The start and end strip index in raid0 bdev scope */
183 	start_strip = offset_blocks >> strip_size_shift;
184 	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
185 	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
186 	io_range->end_strip_in_disk = end_strip / num_base_bdevs;
187 
188 	/* The first strip may have unaligned start LBA offset.
189 	 * The end strip may have unaligned end LBA offset.
190 	 * Strips between them certainly have aligned offset and length to boundaries.
191 	 */
192 	io_range->start_offset_in_strip = offset_blocks % strip_size;
193 	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;
194 
195 	/* The base bdev indexes in which start and end strips are located */
196 	io_range->start_disk = start_strip % num_base_bdevs;
197 	io_range->end_disk = end_strip % num_base_bdevs;
198 
199 	/* Calculate how many base_bdevs are involved in io operation.
200 	 * Number of base bdevs involved is between 1 and num_base_bdevs.
201 	 * It will be 1 if the first strip and last strip are the same one.
202 	 */
203 	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
204 }
205 
206 static inline void
207 _raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
208 		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
209 {
210 	uint64_t n_strips_in_disk;
211 	uint64_t start_offset_in_disk;
212 	uint64_t end_offset_in_disk;
213 	uint64_t offset_in_disk;
214 	uint64_t nblocks_in_disk;
215 	uint64_t start_strip_in_disk;
216 	uint64_t end_strip_in_disk;
217 
218 	start_strip_in_disk = io_range->start_strip_in_disk;
219 	if (disk_idx < io_range->start_disk) {
220 		start_strip_in_disk += 1;
221 	}
222 
223 	end_strip_in_disk = io_range->end_strip_in_disk;
224 	if (disk_idx > io_range->end_disk) {
225 		end_strip_in_disk -= 1;
226 	}
227 
228 	assert(end_strip_in_disk >= start_strip_in_disk);
229 	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;
230 
231 	if (disk_idx == io_range->start_disk) {
232 		start_offset_in_disk = io_range->start_offset_in_strip;
233 	} else {
234 		start_offset_in_disk = 0;
235 	}
236 
237 	if (disk_idx == io_range->end_disk) {
238 		end_offset_in_disk = io_range->end_offset_in_strip;
239 	} else {
240 		end_offset_in_disk = io_range->strip_size - 1;
241 	}
242 
243 	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
244 	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
245 			  + end_offset_in_disk - start_offset_in_disk + 1;
246 
247 	SPDK_DEBUGLOG(bdev_raid0,
248 		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
249 		      ").\n",
250 		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);
251 
252 	*_offset_in_disk = offset_in_disk;
253 	*_nblocks_in_disk = nblocks_in_disk;
254 }
255 
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

/* Trampoline with a void * context, used to re-submit a null-payload raid
 * I/O from the bdev io_wait queue after an earlier -ENOMEM.
 */
static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	raid0_submit_null_payload_request((struct raid_bdev_io *)_raid_io);
}
266 
267 static void
268 raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
269 {
270 	struct raid_bdev_io *raid_io = cb_arg;
271 
272 	raid_bdev_io_complete_part(raid_io, 1, success ?
273 				   SPDK_BDEV_IO_STATUS_SUCCESS :
274 				   SPDK_BDEV_IO_STATUS_FAILED);
275 
276 	spdk_bdev_free_io(bdev_io);
277 }
278 
279 /*
280  * brief:
281  * raid0_submit_null_payload_request function submits the next batch of
282  * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
283  * it will submit as many as possible unless one base io request fails with -ENOMEM,
284  * in which case it will queue itself for later submission.
285  * params:
286  * bdev_io - pointer to parent bdev_io on raid bdev device
287  * returns:
288  * none
289  */
290 static void
291 raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
292 {
293 	struct spdk_bdev_io		*bdev_io;
294 	struct raid_bdev		*raid_bdev;
295 	struct raid_bdev_io_range	io_range;
296 	int				ret;
297 	struct raid_base_bdev_info	*base_info;
298 	struct spdk_io_channel		*base_ch;
299 
300 	bdev_io = spdk_bdev_io_from_ctx(raid_io);
301 	raid_bdev = raid_io->raid_bdev;
302 
303 	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
304 			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
305 			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
306 
307 	if (raid_io->base_bdev_io_remaining == 0) {
308 		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
309 	}
310 
311 	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
312 		uint8_t disk_idx;
313 		uint64_t offset_in_disk;
314 		uint64_t nblocks_in_disk;
315 
316 		/* base_bdev is started from start_disk to end_disk.
317 		 * It is possible that index of start_disk is larger than end_disk's.
318 		 */
319 		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
320 		base_info = &raid_bdev->base_bdev_info[disk_idx];
321 		base_ch = raid_io->raid_ch->base_channel[disk_idx];
322 
323 		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
324 
325 		switch (bdev_io->type) {
326 		case SPDK_BDEV_IO_TYPE_UNMAP:
327 			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
328 						     offset_in_disk, nblocks_in_disk,
329 						     raid0_base_io_complete, raid_io);
330 			break;
331 
332 		case SPDK_BDEV_IO_TYPE_FLUSH:
333 			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
334 						     offset_in_disk, nblocks_in_disk,
335 						     raid0_base_io_complete, raid_io);
336 			break;
337 
338 		default:
339 			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
340 			assert(false);
341 			ret = -EIO;
342 		}
343 
344 		if (ret == 0) {
345 			raid_io->base_bdev_io_submitted++;
346 		} else if (ret == -ENOMEM) {
347 			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
348 						_raid0_submit_null_payload_request);
349 			return;
350 		} else {
351 			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
352 			assert(false);
353 			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
354 			return;
355 		}
356 	}
357 }
358 
359 static int raid0_start(struct raid_bdev *raid_bdev)
360 {
361 	uint64_t min_blockcnt = UINT64_MAX;
362 	struct raid_base_bdev_info *base_info;
363 
364 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
365 		/* Calculate minimum block count from all base bdevs */
366 		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
367 	}
368 
369 	/*
370 	 * Take the minimum block count based approach where total block count
371 	 * of raid bdev is the number of base bdev times the minimum block count
372 	 * of any base bdev.
373 	 */
374 	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ",  numbasedev %u, strip size shift %u\n",
375 		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
376 	raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
377 				    raid_bdev->strip_size_shift)  * raid_bdev->num_base_bdevs;
378 
379 	if (raid_bdev->num_base_bdevs > 1) {
380 		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
381 		raid_bdev->bdev.split_on_optimal_io_boundary = true;
382 	} else {
383 		/* Do not need to split reads/writes on single bdev RAID modules. */
384 		raid_bdev->bdev.optimal_io_boundary = 0;
385 		raid_bdev->bdev.split_on_optimal_io_boundary = false;
386 	}
387 
388 	return 0;
389 }
390 
/* raid0 module descriptor: a single base bdev is permitted (degenerate
 * striping), and both read/write and null-payload (unmap/flush) submission
 * paths are provided.
 */
static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)
401