xref: /spdk/module/bdev/raid/raid1.c (revision 33712560bf41a135559d7731fd55583c645ca714)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2022 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "bdev_raid.h"
7 
8 #include "spdk/likely.h"
9 #include "spdk/log.h"
10 
/* Module-private context of a raid1 bdev; registered as an io_device in raid1_start(). */
struct raid1_info {
	/* The parent raid bdev */
	struct raid_bdev *raid_bdev;
};
15 
/* Per-channel context of a raid1 bdev (module ctx of the raid bdev channel). */
struct raid1_io_channel {
	/* Array of per-base_bdev counters of outstanding read blocks on this channel.
	 * Zero-length array (GNU extension, not a C99 flexible array member because
	 * it is the struct's only field): the real element count is
	 * raid_bdev->num_base_bdevs, allocated as part of the channel context via
	 * the ctx size passed to spdk_io_device_register() in raid1_start(). */
	uint64_t read_blocks_outstanding[0];
};
20 
21 static void
22 raid1_channel_inc_read_counters(struct raid_bdev_io_channel *raid_ch, uint8_t idx,
23 				uint64_t num_blocks)
24 {
25 	struct raid1_io_channel *raid1_ch = raid_bdev_channel_get_module_ctx(raid_ch);
26 
27 	assert(raid1_ch->read_blocks_outstanding[idx] <= UINT64_MAX - num_blocks);
28 	raid1_ch->read_blocks_outstanding[idx] += num_blocks;
29 }
30 
31 static void
32 raid1_channel_dec_read_counters(struct raid_bdev_io_channel *raid_ch, uint8_t idx,
33 				uint64_t num_blocks)
34 {
35 	struct raid1_io_channel *raid1_ch = raid_bdev_channel_get_module_ctx(raid_ch);
36 
37 	assert(raid1_ch->read_blocks_outstanding[idx] >= num_blocks);
38 	raid1_ch->read_blocks_outstanding[idx] -= num_blocks;
39 }
40 
41 static void
42 raid1_init_ext_io_opts(struct spdk_bdev_ext_io_opts *opts, struct raid_bdev_io *raid_io)
43 {
44 	memset(opts, 0, sizeof(*opts));
45 	opts->size = sizeof(*opts);
46 	opts->memory_domain = raid_io->memory_domain;
47 	opts->memory_domain_ctx = raid_io->memory_domain_ctx;
48 	opts->metadata = raid_io->md_buf;
49 }
50 
51 static void
52 raid1_write_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
53 {
54 	struct raid_bdev_io *raid_io = cb_arg;
55 
56 	if (!success) {
57 		struct raid_base_bdev_info *base_info;
58 
59 		base_info = raid_bdev_channel_get_base_info(raid_io->raid_ch, bdev_io->bdev);
60 		if (base_info) {
61 			raid_bdev_fail_base_bdev(base_info);
62 		}
63 	}
64 
65 	spdk_bdev_free_io(bdev_io);
66 
67 	raid_bdev_io_complete_part(raid_io, 1, success ?
68 				   SPDK_BDEV_IO_STATUS_SUCCESS :
69 				   SPDK_BDEV_IO_STATUS_FAILED);
70 }
71 
72 static struct raid_base_bdev_info *
73 raid1_get_read_io_base_bdev(struct raid_bdev_io *raid_io)
74 {
75 	assert(raid_io->type == SPDK_BDEV_IO_TYPE_READ);
76 	return &raid_io->raid_bdev->base_bdev_info[raid_io->base_bdev_io_submitted];
77 }
78 
79 static void
80 raid1_correct_read_error_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
81 {
82 	struct raid_bdev_io *raid_io = cb_arg;
83 
84 	spdk_bdev_free_io(bdev_io);
85 
86 	if (!success) {
87 		struct raid_base_bdev_info *base_info = raid1_get_read_io_base_bdev(raid_io);
88 
89 		/* Writing to the bdev that had the read error failed so fail the base bdev
90 		 * but complete the raid_io successfully. */
91 		raid_bdev_fail_base_bdev(base_info);
92 	}
93 
94 	raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
95 }
96 
97 static void
98 raid1_correct_read_error(void *_raid_io)
99 {
100 	struct raid_bdev_io *raid_io = _raid_io;
101 	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
102 	struct spdk_bdev_ext_io_opts io_opts;
103 	struct raid_base_bdev_info *base_info;
104 	struct spdk_io_channel *base_ch;
105 	uint8_t i;
106 	int ret;
107 
108 	i = raid_io->base_bdev_io_submitted;
109 	base_info = &raid_bdev->base_bdev_info[i];
110 	base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, i);
111 	assert(base_ch != NULL);
112 
113 	raid1_init_ext_io_opts(&io_opts, raid_io);
114 	ret = raid_bdev_writev_blocks_ext(base_info, base_ch, raid_io->iovs, raid_io->iovcnt,
115 					  raid_io->offset_blocks, raid_io->num_blocks,
116 					  raid1_correct_read_error_completion, raid_io, &io_opts);
117 	if (spdk_unlikely(ret != 0)) {
118 		if (ret == -ENOMEM) {
119 			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
120 						base_ch, raid1_correct_read_error);
121 		} else {
122 			raid_bdev_fail_base_bdev(base_info);
123 			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
124 		}
125 	}
126 }
127 
128 static void raid1_read_other_base_bdev(void *_raid_io);
129 
130 static void
131 raid1_read_other_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
132 {
133 	struct raid_bdev_io *raid_io = cb_arg;
134 
135 	spdk_bdev_free_io(bdev_io);
136 
137 	if (!success) {
138 		assert(raid_io->base_bdev_io_remaining > 0);
139 		raid_io->base_bdev_io_remaining--;
140 		raid1_read_other_base_bdev(raid_io);
141 		return;
142 	}
143 
144 	/* try to correct the read error by writing data read from the other base bdev */
145 	raid1_correct_read_error(raid_io);
146 }
147 
/* Try to serve a failed read from the next available base bdev.  Also used as
 * an io_wait retry callback, so the scan position is kept in the raid_io itself. */
static void
raid1_read_other_base_bdev(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct spdk_bdev_ext_io_opts io_opts;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;
	uint8_t i;
	int ret;

	/* base_bdev_io_remaining counts the base bdevs not yet tried; resume the
	 * scan at the first untried index. */
	for (i = raid_bdev->num_base_bdevs - raid_io->base_bdev_io_remaining; i < raid_bdev->num_base_bdevs;
	     i++) {
		base_info = &raid_bdev->base_bdev_info[i];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, i);

		/* Skip missing base bdevs and the one that already failed this read. */
		if (base_ch == NULL || i == raid_io->base_bdev_io_submitted) {
			raid_io->base_bdev_io_remaining--;
			continue;
		}

		raid1_init_ext_io_opts(&io_opts, raid_io);
		ret = raid_bdev_readv_blocks_ext(base_info, base_ch, raid_io->iovs, raid_io->iovcnt,
						 raid_io->offset_blocks, raid_io->num_blocks,
						 raid1_read_other_completion, raid_io, &io_opts);
		if (spdk_unlikely(ret != 0)) {
			if (ret == -ENOMEM) {
				/* Retry from the same position once resources free up. */
				raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
							base_ch, raid1_read_other_base_bdev);
			} else {
				/* Unrecoverable submission error - give up below. */
				break;
			}
		}
		/* Read submitted (or queued for retry) - wait for its completion. */
		return;
	}

	/* No replica could serve the read: fail the base bdev that originally
	 * errored and complete the raid_io with failure. */
	base_info = raid1_get_read_io_base_bdev(raid_io);
	raid_bdev_fail_base_bdev(base_info);

	raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
}
189 
190 static void
191 raid1_read_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
192 {
193 	struct raid_bdev_io *raid_io = cb_arg;
194 
195 	spdk_bdev_free_io(bdev_io);
196 
197 	raid1_channel_dec_read_counters(raid_io->raid_ch, raid_io->base_bdev_io_submitted,
198 					raid_io->num_blocks);
199 
200 	if (!success) {
201 		raid_io->base_bdev_io_remaining = raid_io->raid_bdev->num_base_bdevs;
202 		raid1_read_other_base_bdev(raid_io);
203 		return;
204 	}
205 
206 	raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
207 }
208 
209 static void raid1_submit_rw_request(struct raid_bdev_io *raid_io);
210 
/* void * adapter around raid1_submit_rw_request() for io_wait retry callbacks. */
static void
_raid1_submit_rw_request(void *_raid_io)
{
	raid1_submit_rw_request((struct raid_bdev_io *)_raid_io);
}
218 
219 static uint8_t
220 raid1_channel_next_read_base_bdev(struct raid_bdev *raid_bdev, struct raid_bdev_io_channel *raid_ch)
221 {
222 	struct raid1_io_channel *raid1_ch = raid_bdev_channel_get_module_ctx(raid_ch);
223 	uint64_t read_blocks_min = UINT64_MAX;
224 	uint8_t idx = UINT8_MAX;
225 	uint8_t i;
226 
227 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
228 		if (raid_bdev_channel_get_base_channel(raid_ch, i) != NULL &&
229 		    raid1_ch->read_blocks_outstanding[i] < read_blocks_min) {
230 			read_blocks_min = raid1_ch->read_blocks_outstanding[i];
231 			idx = i;
232 		}
233 	}
234 
235 	return idx;
236 }
237 
238 static int
239 raid1_submit_read_request(struct raid_bdev_io *raid_io)
240 {
241 	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
242 	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
243 	struct spdk_bdev_ext_io_opts io_opts;
244 	struct raid_base_bdev_info *base_info;
245 	struct spdk_io_channel *base_ch;
246 	uint8_t idx;
247 	int ret;
248 
249 	idx = raid1_channel_next_read_base_bdev(raid_bdev, raid_ch);
250 	if (spdk_unlikely(idx == UINT8_MAX)) {
251 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
252 		return 0;
253 	}
254 
255 	base_info = &raid_bdev->base_bdev_info[idx];
256 	base_ch = raid_bdev_channel_get_base_channel(raid_ch, idx);
257 
258 	raid1_init_ext_io_opts(&io_opts, raid_io);
259 	ret = raid_bdev_readv_blocks_ext(base_info, base_ch, raid_io->iovs, raid_io->iovcnt,
260 					 raid_io->offset_blocks, raid_io->num_blocks,
261 					 raid1_read_bdev_io_completion, raid_io, &io_opts);
262 
263 	if (spdk_likely(ret == 0)) {
264 		raid1_channel_inc_read_counters(raid_ch, idx, raid_io->num_blocks);
265 		raid_io->base_bdev_io_submitted = idx;
266 	} else if (spdk_unlikely(ret == -ENOMEM)) {
267 		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
268 					base_ch, _raid1_submit_rw_request);
269 		return 0;
270 	}
271 
272 	return ret;
273 }
274 
/* Mirror a write to every base bdev.  May be re-entered as an io_wait retry
 * callback after -ENOMEM: base_bdev_io_submitted tracks how many base bdevs
 * have already been handled, so submission resumes where it left off. */
static int
raid1_submit_write_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct spdk_bdev_ext_io_opts io_opts;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;
	uint8_t idx;
	uint64_t base_bdev_io_not_submitted;
	int ret = 0;

	/* First invocation for this raid_io: expect one completion per base bdev. */
	if (raid_io->base_bdev_io_submitted == 0) {
		raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;
		raid_bdev_io_set_default_status(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}

	raid1_init_ext_io_opts(&io_opts, raid_io);
	/* Resume at the first base bdev not yet submitted to. */
	for (idx = raid_io->base_bdev_io_submitted; idx < raid_bdev->num_base_bdevs; idx++) {
		base_info = &raid_bdev->base_bdev_info[idx];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, idx);

		if (base_ch == NULL) {
			/* skip a missing base bdev's slot */
			raid_io->base_bdev_io_submitted++;
			raid_bdev_io_complete_part(raid_io, 1, SPDK_BDEV_IO_STATUS_FAILED);
			continue;
		}

		ret = raid_bdev_writev_blocks_ext(base_info, base_ch, raid_io->iovs, raid_io->iovcnt,
						  raid_io->offset_blocks, raid_io->num_blocks,
						  raid1_write_bdev_io_completion, raid_io, &io_opts);
		if (spdk_unlikely(ret != 0)) {
			if (spdk_unlikely(ret == -ENOMEM)) {
				/* Retry from the current index once resources free up. */
				raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
							base_ch, _raid1_submit_rw_request);
				return 0;
			}

			/* Hard submission error: account for all writes that will
			 * never be submitted so the raid_io can complete. */
			base_bdev_io_not_submitted = raid_bdev->num_base_bdevs -
						     raid_io->base_bdev_io_submitted;
			raid_bdev_io_complete_part(raid_io, base_bdev_io_not_submitted,
						   SPDK_BDEV_IO_STATUS_FAILED);
			return 0;
		}

		raid_io->base_bdev_io_submitted++;
	}

	/* No base bdev accepted the write at all. */
	if (raid_io->base_bdev_io_submitted == 0) {
		ret = -ENODEV;
	}

	return ret;
}
329 
330 static void
331 raid1_submit_rw_request(struct raid_bdev_io *raid_io)
332 {
333 	int ret;
334 
335 	switch (raid_io->type) {
336 	case SPDK_BDEV_IO_TYPE_READ:
337 		ret = raid1_submit_read_request(raid_io);
338 		break;
339 	case SPDK_BDEV_IO_TYPE_WRITE:
340 		ret = raid1_submit_write_request(raid_io);
341 		break;
342 	default:
343 		ret = -EINVAL;
344 		break;
345 	}
346 
347 	if (spdk_unlikely(ret != 0)) {
348 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
349 	}
350 }
351 
/* io_device channel destroy callback - nothing to tear down; the counter array
 * is freed together with the channel context itself. */
static void
raid1_ioch_destroy(void *io_device, void *ctx_buf)
{
}
356 
/* io_device channel create callback.  No explicit initialization here -
 * presumably SPDK zero-initializes ctx_buf so the read counters start at 0;
 * TODO confirm against spdk_get_io_channel(). */
static int
raid1_ioch_create(void *io_device, void *ctx_buf)
{
	return 0;
}
362 
363 static void
364 raid1_io_device_unregister_done(void *io_device)
365 {
366 	struct raid1_info *r1info = io_device;
367 
368 	raid_bdev_module_stop_done(r1info->raid_bdev);
369 
370 	free(r1info);
371 }
372 
373 static int
374 raid1_start(struct raid_bdev *raid_bdev)
375 {
376 	uint64_t min_blockcnt = UINT64_MAX;
377 	struct raid_base_bdev_info *base_info;
378 	struct raid1_info *r1info;
379 	char name[256];
380 
381 	r1info = calloc(1, sizeof(*r1info));
382 	if (!r1info) {
383 		SPDK_ERRLOG("Failed to allocate RAID1 info device structure\n");
384 		return -ENOMEM;
385 	}
386 	r1info->raid_bdev = raid_bdev;
387 
388 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
389 		min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
390 	}
391 
392 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
393 		base_info->data_size = min_blockcnt;
394 	}
395 
396 	raid_bdev->bdev.blockcnt = min_blockcnt;
397 	raid_bdev->module_private = r1info;
398 
399 	snprintf(name, sizeof(name), "raid1_%s", raid_bdev->bdev.name);
400 	spdk_io_device_register(r1info, raid1_ioch_create, raid1_ioch_destroy,
401 				sizeof(struct raid1_io_channel) + raid_bdev->num_base_bdevs * sizeof(uint64_t),
402 				name);
403 
404 	return 0;
405 }
406 
/* Stop a raid1 bdev.  raid_bdev_module_stop_done() is called later, from
 * raid1_io_device_unregister_done(), once the io_device is fully gone -
 * presumably the false return signals this asynchronous completion to the
 * raid framework (confirm against the raid_bdev_module.stop contract). */
static bool
raid1_stop(struct raid_bdev *raid_bdev)
{
	struct raid1_info *r1info = raid_bdev->module_private;

	spdk_io_device_unregister(r1info, raid1_io_device_unregister_done);

	return false;
}
416 
/* Get an I/O channel for this raid1 bdev; channels are keyed on the
 * raid1_info io_device registered in raid1_start(). */
static struct spdk_io_channel *
raid1_get_io_channel(struct raid_bdev *raid_bdev)
{
	struct raid1_info *r1info = raid_bdev->module_private;

	return spdk_get_io_channel(r1info);
}
424 
/* Completion of the write phase of a process (e.g. rebuild) request. */
static void
raid1_process_write_completed(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_process_request *process_req = cb_arg;
	int status = success ? 0 : -EIO;

	spdk_bdev_free_io(bdev_io);

	raid_bdev_process_request_complete(process_req, status);
}
434 
435 static void raid1_process_submit_write(struct raid_bdev_process_request *process_req);
436 
/* void * adapter around raid1_process_submit_write() for io_wait retries. */
static void
_raid1_process_submit_write(void *ctx)
{
	raid1_process_submit_write((struct raid_bdev_process_request *)ctx);
}
444 
445 static void
446 raid1_process_submit_write(struct raid_bdev_process_request *process_req)
447 {
448 	struct raid_bdev_io *raid_io = &process_req->raid_io;
449 	struct spdk_bdev_ext_io_opts io_opts;
450 	int ret;
451 
452 	raid1_init_ext_io_opts(&io_opts, raid_io);
453 	ret = raid_bdev_writev_blocks_ext(process_req->target, process_req->target_ch,
454 					  raid_io->iovs, raid_io->iovcnt,
455 					  raid_io->offset_blocks, raid_io->num_blocks,
456 					  raid1_process_write_completed, process_req, &io_opts);
457 	if (spdk_unlikely(ret != 0)) {
458 		if (ret == -ENOMEM) {
459 			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(process_req->target->desc),
460 						process_req->target_ch, _raid1_process_submit_write);
461 		} else {
462 			raid_bdev_process_request_complete(process_req, ret);
463 		}
464 	}
465 }
466 
467 static void
468 raid1_process_read_completed(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status)
469 {
470 	struct raid_bdev_process_request *process_req = SPDK_CONTAINEROF(raid_io,
471 			struct raid_bdev_process_request, raid_io);
472 
473 	if (status != SPDK_BDEV_IO_STATUS_SUCCESS) {
474 		raid_bdev_process_request_complete(process_req, -EIO);
475 		return;
476 	}
477 
478 	raid1_process_submit_write(process_req);
479 }
480 
481 static int
482 raid1_submit_process_request(struct raid_bdev_process_request *process_req,
483 			     struct raid_bdev_io_channel *raid_ch)
484 {
485 	struct raid_bdev_io *raid_io = &process_req->raid_io;
486 	int ret;
487 
488 	raid_bdev_io_init(raid_io, raid_ch, SPDK_BDEV_IO_TYPE_READ,
489 			  process_req->offset_blocks, process_req->num_blocks,
490 			  &process_req->iov, 1, process_req->md_buf, NULL, NULL);
491 	raid_io->completion_cb = raid1_process_read_completed;
492 
493 	ret = raid1_submit_read_request(raid_io);
494 	if (spdk_likely(ret == 0)) {
495 		return process_req->num_blocks;
496 	} else if (ret < 0) {
497 		return ret;
498 	} else {
499 		return -EINVAL;
500 	}
501 }
502 
503 static bool
504 raid1_resize(struct raid_bdev *raid_bdev)
505 {
506 	int rc;
507 	uint64_t min_blockcnt = UINT64_MAX;
508 	struct raid_base_bdev_info *base_info;
509 
510 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
511 		struct spdk_bdev *base_bdev;
512 
513 		if (base_info->desc == NULL) {
514 			continue;
515 		}
516 		base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);
517 		min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt - base_info->data_offset);
518 	}
519 
520 	if (min_blockcnt == raid_bdev->bdev.blockcnt) {
521 		return false;
522 	}
523 
524 	rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, min_blockcnt);
525 	if (rc != 0) {
526 		SPDK_ERRLOG("Failed to notify blockcount change\n");
527 		return false;
528 	}
529 
530 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
531 		base_info->data_size = min_blockcnt;
532 	}
533 	return true;
534 }
535 
/* RAID1 module descriptor: writes are mirrored to all base bdevs, reads are
 * load-balanced across them; per the constraint, at least one base bdev must
 * remain operational. */
static struct raid_bdev_module g_raid1_module = {
	.level = RAID1,
	.base_bdevs_min = 2,
	.base_bdevs_constraint = {CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL, 1},
	.memory_domains_supported = true,
	.start = raid1_start,
	.stop = raid1_stop,
	.submit_rw_request = raid1_submit_rw_request,
	.get_io_channel = raid1_get_io_channel,
	.submit_process_request = raid1_submit_process_request,
	.resize = raid1_resize,
};
RAID_MODULE_REGISTER(&g_raid1_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid1)
551