/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2019 Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by lower layers to notify the
 * raid module that a particular bdev_io has completed.
 * params:
 * bdev_io - pointer to bdev io submitted to lower layers, like child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}

/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_ext_io_opts	io_opts = {};
	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
	uint64_t			pd_strip;
	uint32_t			offset_in_strip;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint8_t				pd_idx;
	int				ret = 0;
	uint64_t			start_strip;
	uint64_t			end_strip;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	start_strip = raid_io->offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (raid_io->offset_blocks + raid_io->num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

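	/*
	 * Map the raid bdev LBA onto a (member disk, disk LBA) pair.
	 * Worked example (illustrative values, not taken from this file):
	 * with 4 base bdevs and a strip size of 128 blocks
	 * (strip_size_shift 7), an I/O at offset_blocks 640 falls in
	 * strip 5, which is strip 1 (pd_strip) on member disk 1 (pd_idx),
	 * so pd_lba = (1 << 7) + 0 = 128.
	 */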
	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = raid_io->offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = raid_io->num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev
	 * descriptor, base bdev lba, child io length in blocks, buffer,
	 * completion function and function callback context.
	 */
	assert(raid_ch != NULL);
	base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);

	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = raid_io->memory_domain;
	io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
	io_opts.metadata = raid_io->md_buf;

	if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
						 raid_io->iovs, raid_io->iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
						  raid_io->iovs, raid_io->iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", raid_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error (not ENOMEM); this should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t	strip_size;
	uint64_t	start_strip_in_disk;
	uint64_t	end_strip_in_disk;
	uint64_t	start_offset_in_strip;
	uint64_t	end_offset_in_strip;
	uint8_t		start_disk;
	uint8_t		end_disk;
	uint8_t		n_disks_involved;
};

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t	start_strip;
	uint64_t	end_strip;
	uint64_t	total_blocks;

	io_range->strip_size = strip_size;
	/* Despite the name, total_blocks is the index of the last block
	 * covered by the I/O; subtracting (num_blocks > 0) makes the range
	 * inclusive while avoiding underflow when num_blocks is 0. */
	total_blocks = offset_blocks + num_blocks - (num_blocks > 0);

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = total_blocks >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have an unaligned start LBA offset.
	 * The end strip may have an unaligned end LBA offset.
	 * Strips between them are certainly aligned to strip boundaries in
	 * both offset and length.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = total_blocks % strip_size;

	/* The base bdev indexes in which the start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and the last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}
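
/*
 * Worked example (illustrative values, not taken from this file): with 3 base
 * bdevs, strip_size 128 and strip_size_shift 7, an I/O at offset_blocks 100
 * with num_blocks 400 covers strips 0..3 (strip 3 wraps back to disk 0), so
 * start_disk = 0, end_disk = 0, start_offset_in_strip = 100,
 * end_offset_in_strip = 499 % 128 = 115 and n_disks_involved = min(4, 3) = 3.
 */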

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	/* Disks before start_disk only see strips after the first one;
	 * disks after end_disk only see strips before the last one. */
	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
		      ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}
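
/*
 * Continuing the illustrative example above: disk 0 (both start_disk and
 * end_disk) gets offset_in_disk 100 and nblocks_in_disk (2 - 1) * 128 +
 * 115 - 100 + 1 = 144, while disks 1 and 2 each get one full strip
 * (offset 0, 128 blocks), for a total of 400 blocks.
 */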

static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests with a range but without payload, like FLUSH and UNMAP, to
 * member disks; it submits as many as possible unless a base io request
 * fails with -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev		*raid_bdev;
	struct raid_bdev_io_range	io_range;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    raid_io->offset_blocks, raid_io->num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

	/* base_bdev_io_submitted is preserved when the request is requeued on
	 * -ENOMEM, so a retry resumes with the first base bdev that has not
	 * been submitted to yet. */
	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* base bdevs are walked from start_disk to end_disk.
		 * The index of start_disk may be larger than end_disk's, in
		 * which case the walk wraps around.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, disk_idx);

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (raid_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = raid_bdev_unmap_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = raid_bdev_flush_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request: invalid io type %u with null payload\n", raid_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error (not ENOMEM); this should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}


static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	uint64_t base_bdev_data_size;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate minimum block count from all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
	}

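	/* Round the smallest base bdev capacity down to a whole number of
	 * strips; e.g. (illustrative) min_blockcnt 1000 with strip_size 128
	 * (shift 7) yields 896 usable blocks per base bdev. */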
	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	/*
	 * Take the minimum block count based approach: the total block count
	 * of the raid bdev is the number of base bdevs multiplied by the
	 * strip-aligned minimum block count of any base bdev.
	 */
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);

	raid_bdev->bdev.blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* There is no need to split reads/writes on a single-bdev RAID. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static void
raid0_resize(struct raid_bdev *raid_bdev)
{
	uint64_t blockcnt;
	int rc;
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;
	uint64_t base_bdev_data_size;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);

		min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt - base_info->data_offset);
	}

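	/* As in raid0_start(), round the new minimum down to a whole number
	 * of strips before scaling by the number of base bdevs. */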
	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
	blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (blockcnt == raid_bdev->bdev.blockcnt) {
		return;
	}

	/* Log before notifying: spdk_bdev_notify_blockcnt_change() updates
	 * bdev.blockcnt, so logging afterwards would print the new value for
	 * both the old and the new block count. */
	SPDK_NOTICELOG("raid0 '%s': changing block count from %" PRIu64 " to %" PRIu64 "\n",
		       raid_bdev->bdev.name,
		       raid_bdev->bdev.blockcnt,
		       blockcnt);

	rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to notify blockcount change\n");
		return;
	}

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}
}

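/*
 * Illustrative usage (not part of this file): a raid0 bdev built from two
 * existing bdevs can be created at runtime with the bdev_raid_create RPC,
 * e.g. `rpc.py bdev_raid_create -n Raid0 -z 64 -r 0 -b "Nvme0n1 Nvme1n1"`,
 * where -z is the strip size in KiB. Once created, the raid framework routes
 * I/O to such bdevs through the callbacks registered below.
 */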
static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
	.resize = raid0_resize,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)