xref: /spdk/module/bdev/raid/raid0.c (revision 8afdeef3becfe9409cc9e7372bd0bc10e8b7d46d)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2019 Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by the lower layers to notify the
 * raid module that a particular bdev_io has completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers, i.e. the child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;
	int rc;

	if (success) {
		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
				  spdk_bdev_get_dif_type(bdev_io->bdev) != SPDK_DIF_DISABLE &&
				  bdev_io->bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {

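			/* Re-verify DIX reference tags on a successful read: the ref tags in
			 * the metadata returned by the base bdev are checked here before the
			 * parent raid_io is completed.
			 */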
			rc = raid_bdev_verify_dix_reftag(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
							 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.num_blocks, bdev_io->bdev,
							 bdev_io->u.bdev.offset_blocks);
			if (rc != 0) {
				SPDK_ERRLOG("Reftag verify failed.\n");
				raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
				return;
			}
		}

		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}

	spdk_bdev_free_io(bdev_io);
}

static void raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}

/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_ext_io_opts	io_opts = {};
	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
	uint64_t			pd_strip;
	uint32_t			offset_in_strip;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint8_t				pd_idx;
	int				ret = 0;
	uint64_t			start_strip;
	uint64_t			end_strip;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	start_strip = raid_io->offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (raid_io->offset_blocks + raid_io->num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

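	/* Illustrative mapping (hypothetical values): with a strip_size of 64 blocks
	 * (strip_size_shift 6) and 3 base bdevs, an I/O at offset_blocks 200 lands in
	 * start_strip 3, giving pd_idx 0 (3 % 3), pd_strip 1 (3 / 3),
	 * offset_in_strip 8 (200 & 63) and pd_lba 72 ((1 << 6) + 8) on base bdev 0.
	 */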
	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = raid_io->offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = raid_io->num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor, base
	 * bdev lba, child io length in blocks, buffer, completion function and
	 * callback context.
	 */
	assert(raid_ch != NULL);
	base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);

	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = raid_io->memory_domain;
	io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
	io_opts.metadata = raid_io->md_buf;

	if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
						 raid_io->iovs, raid_io->iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		struct spdk_bdev *bdev = &base_info->raid_bdev->bdev;

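		/* If the raid bdev has DIF/DIX enabled with reference tag checking, verify
		 * the reference tags in the write payload against the raid bdev's offset
		 * before submitting the write to the base bdev.
		 */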
		if (spdk_unlikely(spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE &&
				  bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
			ret = raid_bdev_verify_dix_reftag(raid_io->iovs, raid_io->iovcnt, io_opts.metadata,
							  pd_blocks, bdev, raid_io->offset_blocks);
			if (ret != 0) {
				SPDK_ERRLOG("bdev io submit error due to DIX verify failure\n");
				raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
				return;
			}
		}

		ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
						  raid_io->iovs, raid_io->iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Recvd not supported io type %u\n", raid_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t	strip_size;
	uint64_t	start_strip_in_disk;
	uint64_t	end_strip_in_disk;
	uint64_t	start_offset_in_strip;
	uint64_t	end_offset_in_strip;
	uint8_t		start_disk;
	uint8_t		end_disk;
	uint8_t		n_disks_involved;
};

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t	start_strip;
	uint64_t	end_strip;
	uint64_t	total_blocks;

	io_range->strip_size = strip_size;
	total_blocks = offset_blocks + num_blocks - (num_blocks > 0);
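	/* Despite its name, total_blocks is the inclusive index of the last block touched
	 * by this I/O; it degenerates to offset_blocks when num_blocks is 0.
	 */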

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = total_blocks >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have an unaligned start LBA offset and the end strip may
	 * have an unaligned end LBA offset.
	 * Strips between them are certainly aligned to strip boundaries in both offset
	 * and length.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = total_blocks % strip_size;

	/* The base bdev indexes in which the start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and the last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}
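
/* Illustrative range (hypothetical values): with strip_size 64 (shift 6) and
 * 3 base bdevs, an unmap at offset_blocks 100 for num_blocks 200 covers strips
 * 1..4 (last block 299): start_disk = 1, end_disk = 4 % 3 = 1,
 * start_strip_in_disk = 0, end_strip_in_disk = 1, start_offset_in_strip = 36,
 * end_offset_in_strip = 43 and n_disks_involved = min(4, 3) = 3.
 */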

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
		      ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}
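
/* Continuing the illustrative range above (strip_size 64, 3 base bdevs, blocks
 * 100..299): disk 1 is both start_disk and end_disk and gets offset_in_disk 36
 * and nblocks_in_disk 72; disk 2 gets one full strip at offset 0 for 64 blocks;
 * disk 0 gets one full strip at offset 64 for 64 blocks. The per-disk counts sum
 * back to the original 200 blocks.
 */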

static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests that carry a range but no payload, such as FLUSH and UNMAP, to the
 * member disks; it submits as many as possible unless a base io request fails with
 * -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev		*raid_bdev;
	struct raid_bdev_io_range	io_range;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    raid_io->offset_blocks, raid_io->num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}
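
	/* base_bdev_io_remaining is initialized only on the first pass (it is still 0
	 * then), while base_bdev_io_submitted counts the child ios already sent. If a
	 * submission below returns -ENOMEM, this function is re-queued via
	 * raid_bdev_queue_io_wait and the loop resumes from the first disk that has not
	 * been submitted to yet.
	 */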

	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Base bdevs are walked from start_disk to end_disk, wrapping around the
		 * array; the index of start_disk may therefore be larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, disk_idx);

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (raid_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = raid_bdev_unmap_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = raid_bdev_flush_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", raid_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	uint64_t base_bdev_data_size;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate minimum block count from all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
	}

	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
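	/* For example (hypothetical values), min_blockcnt 1000 with a 128-block strip
	 * (strip_size_shift 7) rounds down to base_bdev_data_size 896, so only whole
	 * strips of each base bdev are used.
	 */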

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	/*
	 * Take the minimum block count based approach, where the total block count of
	 * the raid bdev is the number of base bdevs times the strip-aligned minimum
	 * block count of any base bdev.
	 */
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ",  numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);

	raid_bdev->bdev.blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* There is no need to split reads/writes on a single-bdev RAID0. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static bool
raid0_resize(struct raid_bdev *raid_bdev)
{
	uint64_t blockcnt;
	int rc;
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;
	uint64_t base_bdev_data_size;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);

		min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt - base_info->data_offset);
	}

	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
	blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (blockcnt == raid_bdev->bdev.blockcnt) {
		return false;
	}

	rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to notify blockcount change\n");
		return false;
	}

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	return true;
}

static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.dif_supported = true,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
	.resize = raid0_resize,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)