xref: /spdk/module/bdev/raid/raid0.c (revision 60982c759db49b4f4579f16e3b24df0725ba4b94)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2019 Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"
/*
 * brief:
 * raid0_bdev_io_completion function is called by the lower layers to notify
 * the raid module that a particular bdev_io has completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers (the child io)
 * success - bdev_io status
 * cb_arg - callback context (the parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void raid0_submit_rw_request(struct raid_bdev_io *raid_io);

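/* Wrapper with the void * signature expected by raid_bdev_queue_io_wait() */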
static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}

/*
 * brief:
 * raid0_submit_rw_request function submits an I/O request to the correct
 * member disk of a raid0 bdev.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct spdk_bdev_ext_io_opts	io_opts = {};
	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
	uint64_t			pd_strip;
	uint32_t			offset_in_strip;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint8_t				pd_idx;
	int				ret = 0;
	uint64_t			start_strip;
	uint64_t			end_strip;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

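	/*
	 * Illustrative example (hypothetical numbers): with num_base_bdevs = 4
	 * and strip_size = 128 blocks (strip_size_shift = 7), an I/O at
	 * offset_blocks = 1000 falls in strip 1000 >> 7 = 7, so it is routed
	 * to base bdev 7 % 4 = 3 at strip 7 / 4 = 1 within that disk, i.e.
	 * pd_lba = (1 << 7) + (1000 & 127) = 232.
	 */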
	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child I/O to the bdev layer using the base bdev
	 * descriptor, base bdev LBA, child I/O length in blocks, buffer,
	 * completion function and callback context.
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];

	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = bdev_io->u.bdev.memory_domain;
	io_opts.memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
	io_opts.metadata = bdev_io->u.bdev.md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch,
						 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch,
						  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; this should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t	strip_size;
	uint64_t	start_strip_in_disk;
	uint64_t	end_strip_in_disk;
	uint64_t	start_offset_in_strip;
	uint64_t	end_offset_in_strip;
	uint8_t		start_disk;
	uint8_t		end_disk;
	uint8_t		n_disks_involved;
};

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t	start_strip;
	uint64_t	end_strip;
	uint64_t	total_blocks;

	io_range->strip_size = strip_size;
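	/* Inclusive index of the last block in the range; the (num_blocks > 0)
	 * term avoids underflow for a zero-length I/O.
	 */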
	total_blocks = offset_blocks + num_blocks - (num_blocks > 0);

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = total_blocks >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have an unaligned start LBA offset.
	 * The end strip may have an unaligned end LBA offset.
	 * Strips between them are certainly aligned to strip boundaries
	 * in both offset and length.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = total_blocks % strip_size;

	/* The base bdev indexes in which the start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and the last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}
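
/*
 * Worked example with hypothetical numbers: for num_base_bdevs = 3,
 * strip_size = 8 (strip_size_shift = 3), offset_blocks = 10 and
 * num_blocks = 30, the I/O covers raid bdev strips 1 (10 >> 3) through
 * 4 (39 >> 3). _raid0_get_io_range() then yields start_disk = 1,
 * end_disk = 1, start_strip_in_disk = 0, end_strip_in_disk = 1,
 * start_offset_in_strip = 2, end_offset_in_strip = 7 and
 * n_disks_involved = 3, since strips 1..4 wrap across all three disks.
 */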

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
		      ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}
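
/*
 * Continuing the hypothetical example above (3 disks, strip_size 8,
 * raid strips 1..4): disk 1 holds both the start strip (raid strip 1,
 * disk strip 0) and the end strip (raid strip 4, disk strip 1), so it
 * gets offset_in_disk = 2 and nblocks_in_disk = (2 - 1) * 8 + 7 - 2 + 1 = 14,
 * while disks 2 and 0 each cover one full middle strip of 8 blocks.
 */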

static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

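/* Wrapper with the void * signature expected by raid_bdev_queue_io_wait() */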
static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * I/O requests that have a range but no payload, such as FLUSH and UNMAP,
 * to the member disks; it submits as many as possible unless a base io
 * request fails with -ENOMEM, in which case it queues itself for later
 * submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io;
	struct raid_bdev		*raid_bdev;
	struct raid_bdev_io_range	io_range;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);

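	/*
	 * Initialize the remaining count only on the first pass; when this
	 * function is re-entered after an -ENOMEM requeue, the count carried
	 * over from the first submission attempt must be preserved.
	 */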
	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Base bdevs are walked from start_disk to end_disk, wrapping
		 * around; the index of start_disk may be larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_io->raid_ch->base_channel[disk_idx];

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request: invalid null payload io type %u\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; this should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static uint64_t
raid0_calculate_blockcnt(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate the minimum block count across all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, spdk_bdev_desc_get_bdev(base_info->desc)->blockcnt);
	}

	/*
	 * Take the minimum-block-count-based approach: the total block count
	 * of the raid bdev is the minimum block count of any base bdev,
	 * rounded down to a whole number of strips, times the number of
	 * base bdevs.
	 */
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);

	return ((min_blockcnt >> raid_bdev->strip_size_shift) <<
		raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;
}
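
/*
 * Example with hypothetical sizes: three base bdevs of 1000, 1100 and 1050
 * blocks with strip_size 128 give min_blockcnt = 1000, which rounds down to
 * 7 whole strips (896 blocks), so the raid0 bdev exposes 896 * 3 = 2688
 * blocks.
 */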

static int
raid0_start(struct raid_bdev *raid_bdev)
{
	raid_bdev->bdev.blockcnt = raid0_calculate_blockcnt(raid_bdev);

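	/*
	 * Splitting on the strip boundary in the generic bdev layer is what
	 * allows raid0_submit_rw_request() to assume that a read/write never
	 * spans more than one strip.
	 */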
	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* No need to split reads/writes on a single-bdev RAID module. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static void
raid0_resize(struct raid_bdev *raid_bdev)
{
	uint64_t blockcnt;
	int rc;

	blockcnt = raid0_calculate_blockcnt(raid_bdev);

	if (blockcnt == raid_bdev->bdev.blockcnt) {
		return;
	}

	SPDK_NOTICELOG("raid0 '%s': min blockcount was changed from %" PRIu64 " to %" PRIu64 "\n",
		       raid_bdev->bdev.name,
		       raid_bdev->bdev.blockcnt,
		       blockcnt);

	rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to notify blockcount change\n");
	}
}

static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
	.resize = raid0_resize,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)