/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/io_channel.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk_internal/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by the lower layers to notify
 * the raid module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers, i.e. the child io
 * success - bdev_io status
 * cb_arg - callback context, i.e. the parent io pointer
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io         *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid0_waitq_io_process(void *ctx);

/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * bdev_io - parent bdev io
 * start_strip - start strip number of this io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct spdk_bdev_io *bdev_io, uint64_t start_strip)
{
	struct raid_bdev_io		*raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
	struct raid_bdev_io_channel	*raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
	struct raid_bdev		*raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
	uint64_t			pd_strip;
	uint32_t			offset_in_strip;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint8_t				pd_idx;
	int				ret = 0;

	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = bdev_io->u.bdev.num_blocks;
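
	/*
	 * Worked example of the mapping above, with illustrative values (not
	 * taken from any particular configuration): assume 4 base bdevs and
	 * strip_size = 128 blocks (strip_size_shift = 7). An I/O at
	 * offset_blocks = 650 lands in start_strip = 650 >> 7 = 5, which is
	 * local strip pd_strip = 5 / 4 = 1 on member pd_idx = 5 % 4 = 1, so
	 * the child I/O starts at pd_lba = (1 << 7) + (650 & 127) = 138 on
	 * that disk.
	 */
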
	if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * base bdev lba, child io length in blocks, buffer, completion function
	 * and callback context.
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
					     raid_ch->base_channel[pd_idx],
					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					     pd_lba, pd_blocks, raid0_bdev_io_completion,
					     bdev_io);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
					      raid_ch->base_channel[pd_idx],
					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					      pd_lba, pd_blocks, raid0_bdev_io_completion,
					      bdev_io);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret) {
		raid_bdev_queue_io_wait(bdev_io, pd_idx, raid0_waitq_io_process, ret);
	}
}

/*
 * brief:
 * raid0_waitq_io_process function is the callback registered by the raid bdev
 * module with the bdev layer when a child bdev_io for a raid0 bdev could not
 * be allocated; it is invoked once resources become available again.
 * params:
 * ctx - pointer to the parent spdk_bdev_io
 * returns:
 * none
 */
static void
raid0_waitq_io_process(void *ctx)
{
	struct spdk_bdev_io     *bdev_io = ctx;
	struct raid_bdev	*raid_bdev;
	uint64_t		start_strip;

	/*
	 * Try to submit the child io of the parent bdev io again. If it fails
	 * once more due to a resource shortage, raid0_submit_rw_request will
	 * re-queue it.
	 */
	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
	raid0_submit_rw_request(bdev_io, start_strip);
}

/*
 * brief:
 * raid0_start_rw_request function is the submit_request function for
 * read/write requests for raid0 bdevs.
 * params:
 * ch - pointer to raid bdev io channel
 * bdev_io - pointer to parent bdev_io on raid bdev device
 * returns:
 * none
 */
void
raid0_start_rw_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct raid_bdev_io		*raid_io;
	struct raid_bdev		*raid_bdev;
	uint64_t			start_strip = 0;
	uint64_t			end_strip = 0;

	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
	raid_io->ch = ch;
	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		assert(false);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}
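
	/*
	 * Illustrative numbers for the check above: with strip_size = 128
	 * blocks (strip_size_shift = 7), an I/O at offset_blocks = 120 with
	 * num_blocks = 16 gives start_strip = 0 and end_strip = 135 >> 7 = 1,
	 * so it crosses a strip boundary and is rejected; read/write I/O must
	 * fit within a single strip.
	 */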
	raid0_submit_rw_request(bdev_io, start_strip);
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t	strip_size;
	uint64_t	start_strip_in_disk;
	uint64_t	end_strip_in_disk;
	uint64_t	start_offset_in_strip;
	uint64_t	end_offset_in_strip;
	uint8_t		start_disk;
	uint8_t		end_disk;
	uint8_t		n_disks_involved;
};

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t	start_strip;
	uint64_t	end_strip;

	io_range->strip_size = strip_size;

	/* The start and end strip indices in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may begin at an unaligned LBA offset and the end
	 * strip may end at one; all strips between them are aligned to strip
	 * boundaries in both offset and length.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;

	/* The base bdev indices in which the start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and the last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}
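
/*
 * Worked example for _raid0_get_io_range, with illustrative values: for
 * num_base_bdevs = 3, strip_size = 64 blocks (strip_size_shift = 6),
 * offset_blocks = 100 and num_blocks = 200, the I/O covers strips 1..4
 * (start_strip = 100 >> 6 = 1, end_strip = 299 >> 6 = 4). Then
 * start_strip_in_disk = 1 / 3 = 0, end_strip_in_disk = 4 / 3 = 1,
 * start_offset_in_strip = 100 % 64 = 36, end_offset_in_strip = 299 % 64 = 43,
 * start_disk = 1 % 3 = 1, end_disk = 4 % 3 = 1, and
 * n_disks_involved = min(4, 3) = 3: strips 1 and 4 live on disk 1,
 * strip 2 on disk 2, and strip 3 on disk 0.
 */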

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64 ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}
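
/*
 * Continuing the illustrative example above for disk_idx = 0: since
 * 0 < start_disk (1), start_strip_in_disk becomes 0 + 1 = 1; since 0 is not
 * greater than end_disk (1), end_strip_in_disk stays 1, so
 * n_strips_in_disk = 1. Disk 0 is neither the start nor the end disk, so the
 * range covers the whole strip: offset_in_disk = 1 * 64 = 64 and
 * nblocks_in_disk = 64, i.e. local blocks 64..127, which hold raid strip 3.
 */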

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests with a range but without payload, such as FLUSH and UNMAP,
 * to the member disks; it submits as many child requests as possible until
 * one base io request fails with -ENOMEM, in which case it queues itself for
 * later submission.
 * params:
 * bdev_io - pointer to parent bdev_io on raid bdev device
 * returns:
 * none
 */
void
raid0_submit_null_payload_request(void *_bdev_io)
{
	struct spdk_bdev_io		*bdev_io = _bdev_io;
	struct raid_bdev_io		*raid_io;
	struct raid_bdev		*raid_bdev;
	struct raid_bdev_io_channel	*raid_ch;
	struct raid_bdev_io_range	io_range;
	int				ret;

	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
	raid_ch = spdk_io_channel_get_ctx(raid_io->ch);

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);

	raid_io->base_bdev_io_expected = io_range.n_disks_involved;

	while (raid_io->base_bdev_io_submitted < raid_io->base_bdev_io_expected) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Child ios are submitted to the base bdevs from start_disk
		 * through end_disk, wrapping around the array; the index of
		 * start_disk may therefore be larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
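
		/* For instance (illustrative values): with num_base_bdevs = 3
		 * and start_disk = 1, successive iterations target disks
		 * 1, 2 and 0.
		 */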

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(raid_bdev->base_bdev_info[disk_idx].desc,
						     raid_ch->base_channel[disk_idx],
						     offset_in_disk, nblocks_in_disk,
						     raid_bdev_base_io_completion, bdev_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(raid_bdev->base_bdev_info[disk_idx].desc,
						     raid_ch->base_channel[disk_idx],
						     offset_in_disk, nblocks_in_disk,
						     raid_bdev_base_io_completion, bdev_io);
			break;

		default:
			SPDK_ERRLOG("submit request: invalid io type %u with null payload\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else {
			raid_bdev_queue_io_wait(bdev_io, disk_idx,
						raid0_submit_null_payload_request, ret);
			return;
		}
	}
}

SPDK_LOG_REGISTER_COMPONENT("bdev_raid0", SPDK_LOG_BDEV_RAID0)