xref: /spdk/module/bdev/raid/bdev_raid.c (revision ac8071d6f4579bacd4bcec4e0d8b29c8bb5842c4)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2018 Intel Corporation.
3  *   All rights reserved.
4  *   Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "bdev_raid.h"
8 #include "spdk/env.h"
9 #include "spdk/thread.h"
10 #include "spdk/log.h"
11 #include "spdk/string.h"
12 #include "spdk/util.h"
13 #include "spdk/json.h"
14 #include "spdk/likely.h"
15 
/* Set to true by raid_bdev_fini_start() and consulted by _raid_bdev_destruct(). */
static bool g_shutdown_started = false;

/* List of all raid bdevs */
struct raid_all_tailq g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list);

/*
 * List of registered raid modules. Entries are appended by
 * raid_bdev_module_list_add() and looked up by raid level in
 * raid_bdev_module_find().
 */
static TAILQ_HEAD(, raid_bdev_module) g_raid_modules = TAILQ_HEAD_INITIALIZER(g_raid_modules);
22 
/*
 * raid_bdev_io_channel is the context of spdk_io_channel for raid bdev device. It
 * contains the relationship of raid bdev io channel with base bdev io channels.
 * It is filled in by raid_bdev_create_cb() and torn down by raid_bdev_destroy_cb().
 */
struct raid_bdev_io_channel {
	/*
	 * Array of IO channels of base bdevs, indexed like base_bdev_info.
	 * An entry is NULL when the base bdev had no descriptor at the time
	 * the channel was created.
	 */
	struct spdk_io_channel	**base_channel;

	/* Private raid module IO channel; only set if the module provides get_io_channel */
	struct spdk_io_channel	*module_channel;
};
34 
35 static struct raid_bdev_module *
36 raid_bdev_module_find(enum raid_level level)
37 {
38 	struct raid_bdev_module *raid_module;
39 
40 	TAILQ_FOREACH(raid_module, &g_raid_modules, link) {
41 		if (raid_module->level == level) {
42 			return raid_module;
43 		}
44 	}
45 
46 	return NULL;
47 }
48 
49 void
50 raid_bdev_module_list_add(struct raid_bdev_module *raid_module)
51 {
52 	if (raid_bdev_module_find(raid_module->level) != NULL) {
53 		SPDK_ERRLOG("module for raid level '%s' already registered.\n",
54 			    raid_bdev_level_to_str(raid_module->level));
55 		assert(false);
56 	} else {
57 		TAILQ_INSERT_TAIL(&g_raid_modules, raid_module, link);
58 	}
59 }
60 
61 struct spdk_io_channel *
62 raid_bdev_channel_get_base_channel(struct raid_bdev_io_channel *raid_ch, uint8_t idx)
63 {
64 	return raid_ch->base_channel[idx];
65 }
66 
67 void *
68 raid_bdev_channel_get_module_ctx(struct raid_bdev_io_channel *raid_ch)
69 {
70 	assert(raid_ch->module_channel != NULL);
71 
72 	return spdk_io_channel_get_ctx(raid_ch->module_channel);
73 }
74 
/*
 * Function declarations
 * These functions are referenced before their definitions appear in this file.
 */
static void	raid_bdev_examine(struct spdk_bdev *bdev);
static int	raid_bdev_init(void);
static void	raid_bdev_deconfigure(struct raid_bdev *raid_bdev,
				      raid_bdev_destruct_cb cb_fn, void *cb_arg);
80 
81 /*
82  * brief:
83  * raid_bdev_create_cb function is a cb function for raid bdev which creates the
84  * hierarchy from raid bdev to base bdev io channels. It will be called per core
85  * params:
86  * io_device - pointer to raid bdev io device represented by raid_bdev
87  * ctx_buf - pointer to context buffer for raid bdev io channel
88  * returns:
89  * 0 - success
90  * non zero - failure
91  */
92 static int
93 raid_bdev_create_cb(void *io_device, void *ctx_buf)
94 {
95 	struct raid_bdev            *raid_bdev = io_device;
96 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
97 	uint8_t i;
98 	int ret = 0;
99 
100 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_create_cb, %p\n", raid_ch);
101 
102 	assert(raid_bdev != NULL);
103 	assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE);
104 
105 
106 	raid_ch->base_channel = calloc(raid_bdev->num_base_bdevs, sizeof(struct spdk_io_channel *));
107 	if (!raid_ch->base_channel) {
108 		SPDK_ERRLOG("Unable to allocate base bdevs io channel\n");
109 		return -ENOMEM;
110 	}
111 
112 	spdk_spin_lock(&raid_bdev->base_bdev_lock);
113 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
114 		/*
115 		 * Get the spdk_io_channel for all the base bdevs. This is used during
116 		 * split logic to send the respective child bdev ios to respective base
117 		 * bdev io channel.
118 		 */
119 		if (raid_bdev->base_bdev_info[i].desc == NULL) {
120 			continue;
121 		}
122 		raid_ch->base_channel[i] = spdk_bdev_get_io_channel(
123 						   raid_bdev->base_bdev_info[i].desc);
124 		if (!raid_ch->base_channel[i]) {
125 			SPDK_ERRLOG("Unable to create io channel for base bdev\n");
126 			ret = -ENOMEM;
127 			break;
128 		}
129 	}
130 	spdk_spin_unlock(&raid_bdev->base_bdev_lock);
131 
132 	if (!ret && raid_bdev->module->get_io_channel) {
133 		raid_ch->module_channel = raid_bdev->module->get_io_channel(raid_bdev);
134 		if (!raid_ch->module_channel) {
135 			SPDK_ERRLOG("Unable to create io channel for raid module\n");
136 			ret = -ENOMEM;
137 		}
138 	}
139 
140 	if (ret) {
141 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
142 			if (raid_ch->base_channel[i] != NULL) {
143 				spdk_put_io_channel(raid_ch->base_channel[i]);
144 			}
145 		}
146 		free(raid_ch->base_channel);
147 		raid_ch->base_channel = NULL;
148 	}
149 	return ret;
150 }
151 
152 /*
153  * brief:
154  * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the
155  * hierarchy from raid bdev to base bdev io channels. It will be called per core
156  * params:
157  * io_device - pointer to raid bdev io device represented by raid_bdev
158  * ctx_buf - pointer to context buffer for raid bdev io channel
159  * returns:
160  * none
161  */
162 static void
163 raid_bdev_destroy_cb(void *io_device, void *ctx_buf)
164 {
165 	struct raid_bdev *raid_bdev = io_device;
166 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
167 	uint8_t i;
168 
169 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destroy_cb\n");
170 
171 	assert(raid_ch != NULL);
172 	assert(raid_ch->base_channel);
173 
174 	if (raid_ch->module_channel) {
175 		spdk_put_io_channel(raid_ch->module_channel);
176 	}
177 
178 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
179 		/* Free base bdev channels */
180 		if (raid_ch->base_channel[i] != NULL) {
181 			spdk_put_io_channel(raid_ch->base_channel[i]);
182 		}
183 	}
184 	free(raid_ch->base_channel);
185 	raid_ch->base_channel = NULL;
186 }
187 
188 /*
189  * brief:
190  * raid_bdev_cleanup is used to cleanup raid_bdev related data
191  * structures.
192  * params:
193  * raid_bdev - pointer to raid_bdev
194  * returns:
195  * none
196  */
197 static void
198 raid_bdev_cleanup(struct raid_bdev *raid_bdev)
199 {
200 	struct raid_base_bdev_info *base_info;
201 
202 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_cleanup, %p name %s, state %s\n",
203 		      raid_bdev, raid_bdev->bdev.name, raid_bdev_state_to_str(raid_bdev->state));
204 	assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE);
205 	assert(spdk_get_thread() == spdk_thread_get_app_thread());
206 
207 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
208 		assert(base_info->desc == NULL);
209 		free(base_info->name);
210 	}
211 
212 	TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link);
213 }
214 
215 static void
216 raid_bdev_free(struct raid_bdev *raid_bdev)
217 {
218 	spdk_dma_free(raid_bdev->sb);
219 	spdk_spin_destroy(&raid_bdev->base_bdev_lock);
220 	free(raid_bdev->base_bdev_info);
221 	free(raid_bdev->bdev.name);
222 	free(raid_bdev);
223 }
224 
/*
 * Convenience wrapper: run raid_bdev_cleanup() on the raid bdev and then
 * release it with raid_bdev_free().
 */
static void
raid_bdev_cleanup_and_free(struct raid_bdev *raid_bdev)
{
	/* Cleanup must come first, it still reads the structure. */
	raid_bdev_cleanup(raid_bdev);

	raid_bdev_free(raid_bdev);
}
231 
232 /*
233  * brief:
234  * free resource of base bdev for raid bdev
235  * params:
236  * base_info - raid base bdev info
237  * returns:
238  * none
239  */
240 static void
241 raid_bdev_free_base_bdev_resource(struct raid_base_bdev_info *base_info)
242 {
243 	struct raid_bdev *raid_bdev = base_info->raid_bdev;
244 
245 	assert(spdk_get_thread() == spdk_thread_get_app_thread());
246 
247 	free(base_info->name);
248 	base_info->name = NULL;
249 	if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) {
250 		spdk_uuid_set_null(&base_info->uuid);
251 	}
252 
253 	if (base_info->desc == NULL) {
254 		return;
255 	}
256 
257 	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(base_info->desc));
258 	spdk_bdev_close(base_info->desc);
259 	base_info->desc = NULL;
260 	spdk_put_io_channel(base_info->app_thread_ch);
261 	base_info->app_thread_ch = NULL;
262 
263 	if (base_info->is_configured) {
264 		assert(raid_bdev->num_base_bdevs_discovered);
265 		raid_bdev->num_base_bdevs_discovered--;
266 		base_info->is_configured = false;
267 	}
268 }
269 
270 static void
271 raid_bdev_io_device_unregister_cb(void *io_device)
272 {
273 	struct raid_bdev *raid_bdev = io_device;
274 
275 	if (raid_bdev->num_base_bdevs_discovered == 0) {
276 		/* Free raid_bdev when there are no base bdevs left */
277 		SPDK_DEBUGLOG(bdev_raid, "raid bdev base bdevs is 0, going to free all in destruct\n");
278 		raid_bdev_cleanup(raid_bdev);
279 		spdk_bdev_destruct_done(&raid_bdev->bdev, 0);
280 		raid_bdev_free(raid_bdev);
281 	} else {
282 		spdk_bdev_destruct_done(&raid_bdev->bdev, 0);
283 	}
284 }
285 
286 void
287 raid_bdev_module_stop_done(struct raid_bdev *raid_bdev)
288 {
289 	if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) {
290 		spdk_io_device_unregister(raid_bdev, raid_bdev_io_device_unregister_cb);
291 	}
292 }
293 
294 static void
295 _raid_bdev_destruct(void *ctxt)
296 {
297 	struct raid_bdev *raid_bdev = ctxt;
298 	struct raid_base_bdev_info *base_info;
299 
300 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destruct\n");
301 
302 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
303 		/*
304 		 * Close all base bdev descriptors for which call has come from below
305 		 * layers.  Also close the descriptors if we have started shutdown.
306 		 */
307 		if (g_shutdown_started || base_info->remove_scheduled == true) {
308 			raid_bdev_free_base_bdev_resource(base_info);
309 		}
310 	}
311 
312 	if (g_shutdown_started) {
313 		raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
314 	}
315 
316 	if (raid_bdev->module->stop != NULL) {
317 		if (raid_bdev->module->stop(raid_bdev) == false) {
318 			return;
319 		}
320 	}
321 
322 	raid_bdev_module_stop_done(raid_bdev);
323 }
324 
325 static int
326 raid_bdev_destruct(void *ctx)
327 {
328 	spdk_thread_exec_msg(spdk_thread_get_app_thread(), _raid_bdev_destruct, ctx);
329 
330 	return 1;
331 }
332 
333 void
334 raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status)
335 {
336 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
337 
338 	if (spdk_unlikely(raid_io->completion_cb != NULL)) {
339 		raid_io->completion_cb(raid_io, status);
340 	} else {
341 		spdk_bdev_io_complete(bdev_io, status);
342 	}
343 }
344 
345 /*
346  * brief:
347  * raid_bdev_io_complete_part - signal the completion of a part of the expected
348  * base bdev IOs and complete the raid_io if this is the final expected IO.
349  * The caller should first set raid_io->base_bdev_io_remaining. This function
350  * will decrement this counter by the value of the 'completed' parameter and
351  * complete the raid_io if the counter reaches 0. The caller is free to
352  * interpret the 'base_bdev_io_remaining' and 'completed' values as needed,
353  * it can represent e.g. blocks or IOs.
354  * params:
355  * raid_io - pointer to raid_bdev_io
356  * completed - the part of the raid_io that has been completed
357  * status - status of the base IO
358  * returns:
359  * true - if the raid_io is completed
360  * false - otherwise
361  */
362 bool
363 raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed,
364 			   enum spdk_bdev_io_status status)
365 {
366 	assert(raid_io->base_bdev_io_remaining >= completed);
367 	raid_io->base_bdev_io_remaining -= completed;
368 
369 	if (status != SPDK_BDEV_IO_STATUS_SUCCESS) {
370 		raid_io->base_bdev_io_status = status;
371 	}
372 
373 	if (raid_io->base_bdev_io_remaining == 0) {
374 		raid_bdev_io_complete(raid_io, raid_io->base_bdev_io_status);
375 		return true;
376 	} else {
377 		return false;
378 	}
379 }
380 
381 /*
382  * brief:
383  * raid_bdev_queue_io_wait function processes the IO which failed to submit.
384  * It will try to queue the IOs after storing the context to bdev wait queue logic.
385  * params:
386  * raid_io - pointer to raid_bdev_io
387  * bdev - the block device that the IO is submitted to
388  * ch - io channel
389  * cb_fn - callback when the spdk_bdev_io for bdev becomes available
390  * returns:
391  * none
392  */
393 void
394 raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev,
395 			struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn)
396 {
397 	raid_io->waitq_entry.bdev = bdev;
398 	raid_io->waitq_entry.cb_fn = cb_fn;
399 	raid_io->waitq_entry.cb_arg = raid_io;
400 	spdk_bdev_queue_io_wait(bdev, ch, &raid_io->waitq_entry);
401 }
402 
403 static void
404 raid_base_bdev_reset_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
405 {
406 	struct raid_bdev_io *raid_io = cb_arg;
407 
408 	spdk_bdev_free_io(bdev_io);
409 
410 	raid_bdev_io_complete_part(raid_io, 1, success ?
411 				   SPDK_BDEV_IO_STATUS_SUCCESS :
412 				   SPDK_BDEV_IO_STATUS_FAILED);
413 }
414 
static void raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io);

/*
 * void-pointer adapter around raid_bdev_submit_reset_request(), used as the
 * wait queue callback when reset submission has to be retried.
 */
static void
_raid_bdev_submit_reset_request(void *_raid_io)
{
	raid_bdev_submit_reset_request(_raid_io);
}
424 
425 /*
426  * brief:
427  * raid_bdev_submit_reset_request function submits reset requests
428  * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in
429  * which case it will queue it for later submission
430  * params:
431  * raid_io
432  * returns:
433  * none
434  */
435 static void
436 raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io)
437 {
438 	struct raid_bdev		*raid_bdev;
439 	int				ret;
440 	uint8_t				i;
441 	struct raid_base_bdev_info	*base_info;
442 	struct spdk_io_channel		*base_ch;
443 
444 	raid_bdev = raid_io->raid_bdev;
445 
446 	if (raid_io->base_bdev_io_remaining == 0) {
447 		raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;
448 	}
449 
450 	for (i = raid_io->base_bdev_io_submitted; i < raid_bdev->num_base_bdevs; i++) {
451 		base_info = &raid_bdev->base_bdev_info[i];
452 		base_ch = raid_io->raid_ch->base_channel[i];
453 		if (base_ch == NULL) {
454 			raid_io->base_bdev_io_submitted++;
455 			raid_bdev_io_complete_part(raid_io, 1, SPDK_BDEV_IO_STATUS_SUCCESS);
456 			continue;
457 		}
458 		ret = spdk_bdev_reset(base_info->desc, base_ch,
459 				      raid_base_bdev_reset_complete, raid_io);
460 		if (ret == 0) {
461 			raid_io->base_bdev_io_submitted++;
462 		} else if (ret == -ENOMEM) {
463 			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
464 						base_ch, _raid_bdev_submit_reset_request);
465 			return;
466 		} else {
467 			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
468 			assert(false);
469 			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
470 			return;
471 		}
472 	}
473 }
474 
475 /*
476  * brief:
477  * Callback function to spdk_bdev_io_get_buf.
478  * params:
479  * ch - pointer to raid bdev io channel
480  * bdev_io - pointer to parent bdev_io on raid bdev device
481  * success - True if buffer is allocated or false otherwise.
482  * returns:
483  * none
484  */
485 static void
486 raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
487 		     bool success)
488 {
489 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
490 
491 	if (!success) {
492 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
493 		return;
494 	}
495 
496 	raid_io->raid_bdev->module->submit_rw_request(raid_io);
497 }
498 
499 void
500 raid_bdev_io_init(struct raid_bdev_io *raid_io, struct raid_bdev_io_channel *raid_ch,
501 		  enum spdk_bdev_io_type type, uint64_t offset_blocks,
502 		  uint64_t num_blocks, struct iovec *iovs, int iovcnt, void *md_buf,
503 		  struct spdk_memory_domain *memory_domain, void *memory_domain_ctx)
504 {
505 	struct spdk_io_channel *ch = spdk_io_channel_from_ctx(raid_ch);
506 	struct raid_bdev *raid_bdev = spdk_io_channel_get_io_device(ch);
507 
508 	raid_io->type = type;
509 	raid_io->offset_blocks = offset_blocks;
510 	raid_io->num_blocks = num_blocks;
511 	raid_io->iovs = iovs;
512 	raid_io->iovcnt = iovcnt;
513 	raid_io->memory_domain = memory_domain;
514 	raid_io->memory_domain_ctx = memory_domain_ctx;
515 	raid_io->md_buf = md_buf;
516 
517 	raid_io->raid_bdev = raid_bdev;
518 	raid_io->raid_ch = raid_ch;
519 	raid_io->base_bdev_io_remaining = 0;
520 	raid_io->base_bdev_io_submitted = 0;
521 	raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
522 	raid_io->completion_cb = NULL;
523 }
524 
525 /*
526  * brief:
527  * raid_bdev_submit_request function is the submit_request function pointer of
528  * raid bdev function table. This is used to submit the io on raid_bdev to below
529  * layers.
530  * params:
531  * ch - pointer to raid bdev io channel
532  * bdev_io - pointer to parent bdev_io on raid bdev device
533  * returns:
534  * none
535  */
536 static void
537 raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
538 {
539 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
540 
541 	raid_bdev_io_init(raid_io, spdk_io_channel_get_ctx(ch), bdev_io->type,
542 			  bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
543 			  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.md_buf,
544 			  bdev_io->u.bdev.memory_domain, bdev_io->u.bdev.memory_domain_ctx);
545 
546 	switch (bdev_io->type) {
547 	case SPDK_BDEV_IO_TYPE_READ:
548 		spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb,
549 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
550 		break;
551 	case SPDK_BDEV_IO_TYPE_WRITE:
552 		raid_io->raid_bdev->module->submit_rw_request(raid_io);
553 		break;
554 
555 	case SPDK_BDEV_IO_TYPE_RESET:
556 		raid_bdev_submit_reset_request(raid_io);
557 		break;
558 
559 	case SPDK_BDEV_IO_TYPE_FLUSH:
560 	case SPDK_BDEV_IO_TYPE_UNMAP:
561 		raid_io->raid_bdev->module->submit_null_payload_request(raid_io);
562 		break;
563 
564 	default:
565 		SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type);
566 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
567 		break;
568 	}
569 }
570 
571 /*
572  * brief:
573  * _raid_bdev_io_type_supported checks whether io_type is supported in
574  * all base bdev modules of raid bdev module. If anyone among the base_bdevs
575  * doesn't support, the raid device doesn't supports.
576  *
577  * params:
578  * raid_bdev - pointer to raid bdev context
579  * io_type - io type
580  * returns:
581  * true - io_type is supported
582  * false - io_type is not supported
583  */
584 inline static bool
585 _raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type)
586 {
587 	struct raid_base_bdev_info *base_info;
588 
589 	if (io_type == SPDK_BDEV_IO_TYPE_FLUSH ||
590 	    io_type == SPDK_BDEV_IO_TYPE_UNMAP) {
591 		if (raid_bdev->module->submit_null_payload_request == NULL) {
592 			return false;
593 		}
594 	}
595 
596 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
597 		if (base_info->desc == NULL) {
598 			continue;
599 		}
600 
601 		if (spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(base_info->desc), io_type) == false) {
602 			return false;
603 		}
604 	}
605 
606 	return true;
607 }
608 
609 /*
610  * brief:
611  * raid_bdev_io_type_supported is the io_supported function for bdev function
612  * table which returns whether the particular io type is supported or not by
613  * raid bdev module
614  * params:
615  * ctx - pointer to raid bdev context
616  * type - io type
617  * returns:
618  * true - io_type is supported
619  * false - io_type is not supported
620  */
621 static bool
622 raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
623 {
624 	switch (io_type) {
625 	case SPDK_BDEV_IO_TYPE_READ:
626 	case SPDK_BDEV_IO_TYPE_WRITE:
627 		return true;
628 
629 	case SPDK_BDEV_IO_TYPE_FLUSH:
630 	case SPDK_BDEV_IO_TYPE_RESET:
631 	case SPDK_BDEV_IO_TYPE_UNMAP:
632 		return _raid_bdev_io_type_supported(ctx, io_type);
633 
634 	default:
635 		return false;
636 	}
637 
638 	return false;
639 }
640 
/*
 * brief:
 * raid_bdev_get_io_channel is the get_io_channel function table pointer for
 * raid bdev. The raid_bdev pointer itself is the io device handed to
 * spdk_get_io_channel().
 * params:
 * ctxt - pointer to raid_bdev
 * returns:
 * pointer to io channel for raid bdev
 */
static struct spdk_io_channel *
raid_bdev_get_io_channel(void *ctxt)
{
	return spdk_get_io_channel(ctxt);
}
657 
658 void
659 raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w)
660 {
661 	struct raid_base_bdev_info *base_info;
662 	char uuid_str[SPDK_UUID_STRING_LEN];
663 
664 	assert(raid_bdev != NULL);
665 	assert(spdk_get_thread() == spdk_thread_get_app_thread());
666 
667 	spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &raid_bdev->bdev.uuid);
668 	spdk_json_write_named_string(w, "uuid", uuid_str);
669 	spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb);
670 	spdk_json_write_named_string(w, "state", raid_bdev_state_to_str(raid_bdev->state));
671 	spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level));
672 	spdk_json_write_named_bool(w, "superblock", raid_bdev->sb != NULL);
673 	spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs);
674 	spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered);
675 	spdk_json_write_named_uint32(w, "num_base_bdevs_operational",
676 				     raid_bdev->num_base_bdevs_operational);
677 	spdk_json_write_name(w, "base_bdevs_list");
678 	spdk_json_write_array_begin(w);
679 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
680 		spdk_json_write_object_begin(w);
681 		spdk_json_write_name(w, "name");
682 		if (base_info->name) {
683 			spdk_json_write_string(w, base_info->name);
684 		} else {
685 			spdk_json_write_null(w);
686 		}
687 		spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &base_info->uuid);
688 		spdk_json_write_named_string(w, "uuid", uuid_str);
689 		spdk_json_write_named_bool(w, "is_configured", base_info->is_configured);
690 		spdk_json_write_named_uint64(w, "data_offset", base_info->data_offset);
691 		spdk_json_write_named_uint64(w, "data_size", base_info->data_size);
692 		spdk_json_write_object_end(w);
693 	}
694 	spdk_json_write_array_end(w);
695 }
696 
697 /*
698  * brief:
699  * raid_bdev_dump_info_json is the function table pointer for raid bdev
700  * params:
701  * ctx - pointer to raid_bdev
702  * w - pointer to json context
703  * returns:
704  * 0 - success
705  * non zero - failure
706  */
707 static int
708 raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
709 {
710 	struct raid_bdev *raid_bdev = ctx;
711 
712 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_dump_config_json\n");
713 
714 	/* Dump the raid bdev configuration related information */
715 	spdk_json_write_named_object_begin(w, "raid");
716 	raid_bdev_write_info_json(raid_bdev, w);
717 	spdk_json_write_object_end(w);
718 
719 	return 0;
720 }
721 
722 /*
723  * brief:
724  * raid_bdev_write_config_json is the function table pointer for raid bdev
725  * params:
726  * bdev - pointer to spdk_bdev
727  * w - pointer to json context
728  * returns:
729  * none
730  */
731 static void
732 raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
733 {
734 	struct raid_bdev *raid_bdev = bdev->ctxt;
735 	struct raid_base_bdev_info *base_info;
736 	char uuid_str[SPDK_UUID_STRING_LEN];
737 
738 	assert(spdk_get_thread() == spdk_thread_get_app_thread());
739 
740 	if (raid_bdev->sb != NULL) {
741 		/* raid bdev configuration is stored in the superblock */
742 		return;
743 	}
744 
745 	spdk_json_write_object_begin(w);
746 
747 	spdk_json_write_named_string(w, "method", "bdev_raid_create");
748 
749 	spdk_json_write_named_object_begin(w, "params");
750 	spdk_json_write_named_string(w, "name", bdev->name);
751 	spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &raid_bdev->bdev.uuid);
752 	spdk_json_write_named_string(w, "uuid", uuid_str);
753 	spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb);
754 	spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level));
755 	spdk_json_write_named_bool(w, "superblock", raid_bdev->sb != NULL);
756 
757 	spdk_json_write_named_array_begin(w, "base_bdevs");
758 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
759 		if (base_info->desc) {
760 			spdk_json_write_string(w, spdk_bdev_desc_get_bdev(base_info->desc)->name);
761 		}
762 	}
763 	spdk_json_write_array_end(w);
764 	spdk_json_write_object_end(w);
765 
766 	spdk_json_write_object_end(w);
767 }
768 
769 static int
770 raid_bdev_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
771 {
772 	struct raid_bdev *raid_bdev = ctx;
773 	struct raid_base_bdev_info *base_info;
774 	int domains_count = 0, rc = 0;
775 
776 	if (raid_bdev->module->memory_domains_supported == false) {
777 		return 0;
778 	}
779 
780 	spdk_spin_lock(&raid_bdev->base_bdev_lock);
781 
782 	/* First loop to get the number of memory domains */
783 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
784 		if (base_info->desc == NULL) {
785 			continue;
786 		}
787 		rc = spdk_bdev_get_memory_domains(spdk_bdev_desc_get_bdev(base_info->desc), NULL, 0);
788 		if (rc < 0) {
789 			goto out;
790 		}
791 		domains_count += rc;
792 	}
793 
794 	if (!domains || array_size < domains_count) {
795 		goto out;
796 	}
797 
798 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
799 		if (base_info->desc == NULL) {
800 			continue;
801 		}
802 		rc = spdk_bdev_get_memory_domains(spdk_bdev_desc_get_bdev(base_info->desc), domains, array_size);
803 		if (rc < 0) {
804 			goto out;
805 		}
806 		domains += rc;
807 		array_size -= rc;
808 	}
809 out:
810 	spdk_spin_unlock(&raid_bdev->base_bdev_lock);
811 
812 	if (rc < 0) {
813 		return rc;
814 	}
815 
816 	return domains_count;
817 }
818 
/*
 * g_raid_bdev_fn_table is the function table for raid bdev. Every entry
 * refers to the static callback of the same purpose defined in this file.
 */
static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = {
	.destruct		= raid_bdev_destruct,
	.submit_request		= raid_bdev_submit_request,
	.io_type_supported	= raid_bdev_io_type_supported,
	.get_io_channel		= raid_bdev_get_io_channel,
	.dump_info_json		= raid_bdev_dump_info_json,
	.write_config_json	= raid_bdev_write_config_json,
	.get_memory_domains	= raid_bdev_get_memory_domains,
};
829 
830 struct raid_bdev *
831 raid_bdev_find_by_name(const char *name)
832 {
833 	struct raid_bdev *raid_bdev;
834 
835 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
836 		if (strcmp(raid_bdev->bdev.name, name) == 0) {
837 			return raid_bdev;
838 		}
839 	}
840 
841 	return NULL;
842 }
843 
844 static struct {
845 	const char *name;
846 	enum raid_level value;
847 } g_raid_level_names[] = {
848 	{ "raid0", RAID0 },
849 	{ "0", RAID0 },
850 	{ "raid1", RAID1 },
851 	{ "1", RAID1 },
852 	{ "raid5f", RAID5F },
853 	{ "5f", RAID5F },
854 	{ "concat", CONCAT },
855 	{ }
856 };
857 
858 static struct {
859 	const char *name;
860 	enum raid_bdev_state value;
861 } g_raid_state_names[] = {
862 	{ "online", RAID_BDEV_STATE_ONLINE },
863 	{ "configuring", RAID_BDEV_STATE_CONFIGURING },
864 	{ "offline", RAID_BDEV_STATE_OFFLINE },
865 	{ }
866 };
867 
/*
 * We have to use the typedef in the function declaration to appease astyle.
 * The aliases are used as return types of raid_bdev_str_to_level() and
 * raid_bdev_str_to_state() below.
 */
typedef enum raid_level raid_level_t;
typedef enum raid_bdev_state raid_bdev_state_t;
871 
872 raid_level_t
873 raid_bdev_str_to_level(const char *str)
874 {
875 	unsigned int i;
876 
877 	assert(str != NULL);
878 
879 	for (i = 0; g_raid_level_names[i].name != NULL; i++) {
880 		if (strcasecmp(g_raid_level_names[i].name, str) == 0) {
881 			return g_raid_level_names[i].value;
882 		}
883 	}
884 
885 	return INVALID_RAID_LEVEL;
886 }
887 
888 const char *
889 raid_bdev_level_to_str(enum raid_level level)
890 {
891 	unsigned int i;
892 
893 	for (i = 0; g_raid_level_names[i].name != NULL; i++) {
894 		if (g_raid_level_names[i].value == level) {
895 			return g_raid_level_names[i].name;
896 		}
897 	}
898 
899 	return "";
900 }
901 
902 raid_bdev_state_t
903 raid_bdev_str_to_state(const char *str)
904 {
905 	unsigned int i;
906 
907 	assert(str != NULL);
908 
909 	for (i = 0; g_raid_state_names[i].name != NULL; i++) {
910 		if (strcasecmp(g_raid_state_names[i].name, str) == 0) {
911 			return g_raid_state_names[i].value;
912 		}
913 	}
914 
915 	return RAID_BDEV_STATE_MAX;
916 }
917 
918 const char *
919 raid_bdev_state_to_str(enum raid_bdev_state state)
920 {
921 	unsigned int i;
922 
923 	for (i = 0; g_raid_state_names[i].name != NULL; i++) {
924 		if (g_raid_state_names[i].value == state) {
925 			return g_raid_state_names[i].name;
926 		}
927 	}
928 
929 	assert(false);
930 	return "";
931 }
932 
933 /*
934  * brief:
935  * raid_bdev_fini_start is called when bdev layer is starting the
936  * shutdown process
937  * params:
938  * none
939  * returns:
940  * none
941  */
942 static void
943 raid_bdev_fini_start(void)
944 {
945 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_fini_start\n");
946 	g_shutdown_started = true;
947 }
948 
949 /*
950  * brief:
951  * raid_bdev_exit is called on raid bdev module exit time by bdev layer
952  * params:
953  * none
954  * returns:
955  * none
956  */
957 static void
958 raid_bdev_exit(void)
959 {
960 	struct raid_bdev *raid_bdev, *tmp;
961 
962 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_exit\n");
963 
964 	TAILQ_FOREACH_SAFE(raid_bdev, &g_raid_bdev_list, global_link, tmp) {
965 		raid_bdev_cleanup_and_free(raid_bdev);
966 	}
967 }
968 
969 /*
970  * brief:
971  * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid
972  * module
973  * params:
974  * none
975  * returns:
976  * size of spdk_bdev_io context for raid
977  */
978 static int
979 raid_bdev_get_ctx_size(void)
980 {
981 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_get_ctx_size\n");
982 	return sizeof(struct raid_bdev_io);
983 }
984 
/*
 * Module descriptor of the raid bdev module. It wires the module level
 * callbacks defined in this file and is registered under the name "raid"
 * through SPDK_BDEV_MODULE_REGISTER below.
 */
static struct spdk_bdev_module g_raid_if = {
	.name = "raid",
	.module_init = raid_bdev_init,
	.fini_start = raid_bdev_fini_start,
	.module_fini = raid_bdev_exit,
	.get_ctx_size = raid_bdev_get_ctx_size,
	.examine_disk = raid_bdev_examine,
	.async_init = false,
	.async_fini = false,
};
SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if)
996 
/*
 * brief:
 * raid_bdev_init is the module_init callback of the raid bdev module. There
 * is nothing to set up, so it only reports success.
 * params:
 * none
 * returns:
 * 0 - always
 */
static int
raid_bdev_init(void)
{
	return 0;
}
1011 
1012 static int
1013 _raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs,
1014 		  enum raid_level level, bool superblock_enabled, const struct spdk_uuid *uuid,
1015 		  struct raid_bdev **raid_bdev_out)
1016 {
1017 	struct raid_bdev *raid_bdev;
1018 	struct spdk_bdev *raid_bdev_gen;
1019 	struct raid_bdev_module *module;
1020 	struct raid_base_bdev_info *base_info;
1021 	uint8_t min_operational;
1022 
1023 	if (strnlen(name, RAID_BDEV_SB_NAME_SIZE) == RAID_BDEV_SB_NAME_SIZE) {
1024 		SPDK_ERRLOG("Raid bdev name '%s' exceeds %d characters\n", name, RAID_BDEV_SB_NAME_SIZE - 1);
1025 		return -EINVAL;
1026 	}
1027 
1028 	if (raid_bdev_find_by_name(name) != NULL) {
1029 		SPDK_ERRLOG("Duplicate raid bdev name found: %s\n", name);
1030 		return -EEXIST;
1031 	}
1032 
1033 	if (level == RAID1) {
1034 		if (strip_size != 0) {
1035 			SPDK_ERRLOG("Strip size is not supported by raid1\n");
1036 			return -EINVAL;
1037 		}
1038 	} else if (spdk_u32_is_pow2(strip_size) == false) {
1039 		SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size);
1040 		return -EINVAL;
1041 	}
1042 
1043 	module = raid_bdev_module_find(level);
1044 	if (module == NULL) {
1045 		SPDK_ERRLOG("Unsupported raid level '%d'\n", level);
1046 		return -EINVAL;
1047 	}
1048 
1049 	assert(module->base_bdevs_min != 0);
1050 	if (num_base_bdevs < module->base_bdevs_min) {
1051 		SPDK_ERRLOG("At least %u base devices required for %s\n",
1052 			    module->base_bdevs_min,
1053 			    raid_bdev_level_to_str(level));
1054 		return -EINVAL;
1055 	}
1056 
1057 	switch (module->base_bdevs_constraint.type) {
1058 	case CONSTRAINT_MAX_BASE_BDEVS_REMOVED:
1059 		min_operational = num_base_bdevs - module->base_bdevs_constraint.value;
1060 		break;
1061 	case CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL:
1062 		min_operational = module->base_bdevs_constraint.value;
1063 		break;
1064 	case CONSTRAINT_UNSET:
1065 		if (module->base_bdevs_constraint.value != 0) {
1066 			SPDK_ERRLOG("Unexpected constraint value '%u' provided for raid bdev '%s'.\n",
1067 				    (uint8_t)module->base_bdevs_constraint.value, name);
1068 			return -EINVAL;
1069 		}
1070 		min_operational = num_base_bdevs;
1071 		break;
1072 	default:
1073 		SPDK_ERRLOG("Unrecognised constraint type '%u' in module for raid level '%s'.\n",
1074 			    (uint8_t)module->base_bdevs_constraint.type,
1075 			    raid_bdev_level_to_str(module->level));
1076 		return -EINVAL;
1077 	};
1078 
1079 	if (min_operational == 0 || min_operational > num_base_bdevs) {
1080 		SPDK_ERRLOG("Wrong constraint value for raid level '%s'.\n",
1081 			    raid_bdev_level_to_str(module->level));
1082 		return -EINVAL;
1083 	}
1084 
1085 	raid_bdev = calloc(1, sizeof(*raid_bdev));
1086 	if (!raid_bdev) {
1087 		SPDK_ERRLOG("Unable to allocate memory for raid bdev\n");
1088 		return -ENOMEM;
1089 	}
1090 
1091 	spdk_spin_init(&raid_bdev->base_bdev_lock);
1092 	raid_bdev->module = module;
1093 	raid_bdev->num_base_bdevs = num_base_bdevs;
1094 	raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs,
1095 					   sizeof(struct raid_base_bdev_info));
1096 	if (!raid_bdev->base_bdev_info) {
1097 		SPDK_ERRLOG("Unable able to allocate base bdev info\n");
1098 		raid_bdev_free(raid_bdev);
1099 		return -ENOMEM;
1100 	}
1101 
1102 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
1103 		base_info->raid_bdev = raid_bdev;
1104 	}
1105 
1106 	/* strip_size_kb is from the rpc param.  strip_size is in blocks and used
1107 	 * internally and set later.
1108 	 */
1109 	raid_bdev->strip_size = 0;
1110 	raid_bdev->strip_size_kb = strip_size;
1111 	raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1112 	raid_bdev->level = level;
1113 	raid_bdev->min_base_bdevs_operational = min_operational;
1114 
1115 	if (superblock_enabled) {
1116 		raid_bdev->sb = spdk_dma_zmalloc(RAID_BDEV_SB_MAX_LENGTH, 0x1000, NULL);
1117 		if (!raid_bdev->sb) {
1118 			SPDK_ERRLOG("Failed to allocate raid bdev sb buffer\n");
1119 			raid_bdev_free(raid_bdev);
1120 			return -ENOMEM;
1121 		}
1122 	}
1123 
1124 	raid_bdev_gen = &raid_bdev->bdev;
1125 
1126 	raid_bdev_gen->name = strdup(name);
1127 	if (!raid_bdev_gen->name) {
1128 		SPDK_ERRLOG("Unable to allocate name for raid\n");
1129 		raid_bdev_free(raid_bdev);
1130 		return -ENOMEM;
1131 	}
1132 
1133 	raid_bdev_gen->product_name = "Raid Volume";
1134 	raid_bdev_gen->ctxt = raid_bdev;
1135 	raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
1136 	raid_bdev_gen->module = &g_raid_if;
1137 	raid_bdev_gen->write_cache = 0;
1138 	spdk_uuid_copy(&raid_bdev_gen->uuid, uuid);
1139 
1140 	TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link);
1141 
1142 	*raid_bdev_out = raid_bdev;
1143 
1144 	return 0;
1145 }
1146 
1147 /*
1148  * brief:
1149  * raid_bdev_create allocates raid bdev based on passed configuration
1150  * params:
1151  * name - name for raid bdev
1152  * strip_size - strip size in KB
1153  * num_base_bdevs - number of base bdevs
1154  * level - raid level
1155  * superblock_enabled - true if raid should have superblock
1156  * uuid - uuid to set for the bdev
1157  * raid_bdev_out - the created raid bdev
1158  * returns:
1159  * 0 - success
1160  * non zero - failure
1161  */
1162 int
1163 raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs,
1164 		 enum raid_level level, bool superblock_enabled, const struct spdk_uuid *uuid,
1165 		 struct raid_bdev **raid_bdev_out)
1166 {
1167 	struct raid_bdev *raid_bdev;
1168 	int rc;
1169 
1170 	assert(uuid != NULL);
1171 
1172 	rc = _raid_bdev_create(name, strip_size, num_base_bdevs, level, superblock_enabled, uuid,
1173 			       &raid_bdev);
1174 	if (rc != 0) {
1175 		return rc;
1176 	}
1177 
1178 	if (superblock_enabled && spdk_uuid_is_null(uuid)) {
1179 		/* we need to have the uuid to store in the superblock before the bdev is registered */
1180 		spdk_uuid_generate(&raid_bdev->bdev.uuid);
1181 	}
1182 
1183 	raid_bdev->num_base_bdevs_operational = num_base_bdevs;
1184 
1185 	*raid_bdev_out = raid_bdev;
1186 
1187 	return 0;
1188 }
1189 
1190 /*
1191  * brief:
1192  * Check underlying block devices against support for metadata. Do not configure
1193  * md support when parameters from block devices are inconsistent.
1194  * params:
1195  * raid_bdev - pointer to raid bdev
1196  * returns:
1197  * 0 - The raid bdev md parameters were successfully configured.
1198  * non zero - Failed to configure md.
1199  */
1200 static int
1201 raid_bdev_configure_md(struct raid_bdev *raid_bdev)
1202 {
1203 	struct spdk_bdev *base_bdev;
1204 	uint8_t i;
1205 
1206 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1207 		if (raid_bdev->base_bdev_info[i].desc == NULL) {
1208 			continue;
1209 		}
1210 		base_bdev = spdk_bdev_desc_get_bdev(raid_bdev->base_bdev_info[i].desc);
1211 
1212 		/* Currently, RAID bdevs do not support DIF or DIX, so a RAID bdev cannot
1213 		 * be created on top of any bdev which supports it */
1214 		if (spdk_bdev_get_dif_type(base_bdev) != SPDK_DIF_DISABLE) {
1215 			SPDK_ERRLOG("at least one base bdev has DIF or DIX enabled "
1216 				    "- unsupported RAID configuration\n");
1217 			return -EPERM;
1218 		}
1219 
1220 		if (i == 0) {
1221 			raid_bdev->bdev.md_len = spdk_bdev_get_md_size(base_bdev);
1222 			raid_bdev->bdev.md_interleave = spdk_bdev_is_md_interleaved(base_bdev);
1223 			continue;
1224 		}
1225 
1226 		if (raid_bdev->bdev.md_len != spdk_bdev_get_md_size(base_bdev) ||
1227 		    raid_bdev->bdev.md_interleave != spdk_bdev_is_md_interleaved(base_bdev)) {
1228 			SPDK_ERRLOG("base bdevs are configured with different metadata formats\n");
1229 			return -EPERM;
1230 		}
1231 	}
1232 
1233 	return 0;
1234 }
1235 
1236 static void
1237 raid_bdev_configure_cont(struct raid_bdev *raid_bdev)
1238 {
1239 	struct spdk_bdev *raid_bdev_gen = &raid_bdev->bdev;
1240 	int rc;
1241 
1242 	raid_bdev->state = RAID_BDEV_STATE_ONLINE;
1243 	SPDK_DEBUGLOG(bdev_raid, "io device register %p\n", raid_bdev);
1244 	SPDK_DEBUGLOG(bdev_raid, "blockcnt %" PRIu64 ", blocklen %u\n",
1245 		      raid_bdev_gen->blockcnt, raid_bdev_gen->blocklen);
1246 	spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb,
1247 				sizeof(struct raid_bdev_io_channel),
1248 				raid_bdev_gen->name);
1249 	rc = spdk_bdev_register(raid_bdev_gen);
1250 	if (rc != 0) {
1251 		SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n");
1252 		if (raid_bdev->module->stop != NULL) {
1253 			raid_bdev->module->stop(raid_bdev);
1254 		}
1255 		spdk_io_device_unregister(raid_bdev, NULL);
1256 		raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1257 		return;
1258 	}
1259 	SPDK_DEBUGLOG(bdev_raid, "raid bdev generic %p\n", raid_bdev_gen);
1260 	SPDK_DEBUGLOG(bdev_raid, "raid bdev is created with name %s, raid_bdev %p\n",
1261 		      raid_bdev_gen->name, raid_bdev);
1262 }
1263 
1264 static void
1265 raid_bdev_configure_write_sb_cb(int status, struct raid_bdev *raid_bdev, void *ctx)
1266 {
1267 	if (status == 0) {
1268 		raid_bdev_configure_cont(raid_bdev);
1269 	} else {
1270 		SPDK_ERRLOG("Failed to write raid bdev '%s' superblock: %s\n",
1271 			    raid_bdev->bdev.name, spdk_strerror(-status));
1272 		if (raid_bdev->module->stop != NULL) {
1273 			raid_bdev->module->stop(raid_bdev);
1274 		}
1275 	}
1276 }
1277 
1278 /*
1279  * brief:
1280  * If raid bdev config is complete, then only register the raid bdev to
1281  * bdev layer and remove this raid bdev from configuring list and
1282  * insert the raid bdev to configured list
1283  * params:
1284  * raid_bdev - pointer to raid bdev
1285  * returns:
1286  * 0 - success
1287  * non zero - failure
1288  */
1289 static int
1290 raid_bdev_configure(struct raid_bdev *raid_bdev)
1291 {
1292 	uint32_t blocklen = 0;
1293 	struct raid_base_bdev_info *base_info;
1294 	struct spdk_bdev *base_bdev;
1295 	int rc = 0;
1296 
1297 	assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING);
1298 	assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational);
1299 
1300 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
1301 		if (base_info->desc == NULL) {
1302 			continue;
1303 		}
1304 		base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);
1305 
1306 		/* Check blocklen for all base bdevs that it should be same */
1307 		if (blocklen == 0) {
1308 			blocklen = base_bdev->blocklen;
1309 		} else if (blocklen != base_bdev->blocklen) {
1310 			/*
1311 			 * Assumption is that all the base bdevs for any raid bdev should
1312 			 * have same blocklen
1313 			 */
1314 			SPDK_ERRLOG("Blocklen of various bdevs not matching\n");
1315 			return -EINVAL;
1316 		}
1317 	}
1318 	assert(blocklen > 0);
1319 
1320 	/* The strip_size_kb is read in from user in KB. Convert to blocks here for
1321 	 * internal use.
1322 	 */
1323 	raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen;
1324 	if (raid_bdev->strip_size == 0 && raid_bdev->level != RAID1) {
1325 		SPDK_ERRLOG("Strip size cannot be smaller than the device block size\n");
1326 		return -EINVAL;
1327 	}
1328 	raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
1329 	raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
1330 	raid_bdev->bdev.blocklen = blocklen;
1331 
1332 	rc = raid_bdev_configure_md(raid_bdev);
1333 	if (rc != 0) {
1334 		SPDK_ERRLOG("raid metadata configuration failed\n");
1335 		return rc;
1336 	}
1337 
1338 	rc = raid_bdev->module->start(raid_bdev);
1339 	if (rc != 0) {
1340 		SPDK_ERRLOG("raid module startup callback failed\n");
1341 		return rc;
1342 	}
1343 
1344 	if (raid_bdev->sb != NULL) {
1345 		if (spdk_uuid_is_null(&raid_bdev->sb->uuid)) {
1346 			/* NULL UUID is not valid in the sb so it means that we are creating a new
1347 			 * raid bdev and should initialize the superblock.
1348 			 */
1349 			raid_bdev_init_superblock(raid_bdev);
1350 		} else {
1351 			assert(spdk_uuid_compare(&raid_bdev->sb->uuid, &raid_bdev->bdev.uuid) == 0);
1352 			if (raid_bdev->sb->block_size != blocklen) {
1353 				SPDK_ERRLOG("blocklen does not match value in superblock\n");
1354 				rc = -EINVAL;
1355 			}
1356 			if (raid_bdev->sb->raid_size != raid_bdev->bdev.blockcnt) {
1357 				SPDK_ERRLOG("blockcnt does not match value in superblock\n");
1358 				rc = -EINVAL;
1359 			}
1360 			if (rc != 0) {
1361 				if (raid_bdev->module->stop != NULL) {
1362 					raid_bdev->module->stop(raid_bdev);
1363 				}
1364 				return rc;
1365 			}
1366 		}
1367 
1368 		raid_bdev_write_superblock(raid_bdev, raid_bdev_configure_write_sb_cb, NULL);
1369 	} else {
1370 		raid_bdev_configure_cont(raid_bdev);
1371 	}
1372 
1373 	return 0;
1374 }
1375 
1376 /*
1377  * brief:
1378  * If raid bdev is online and registered, change the bdev state to
1379  * configuring and unregister this raid device. Queue this raid device
1380  * in configuring list
1381  * params:
1382  * raid_bdev - pointer to raid bdev
1383  * cb_fn - callback function
1384  * cb_arg - argument to callback function
1385  * returns:
1386  * none
1387  */
1388 static void
1389 raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn,
1390 		      void *cb_arg)
1391 {
1392 	if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) {
1393 		if (cb_fn) {
1394 			cb_fn(cb_arg, 0);
1395 		}
1396 		return;
1397 	}
1398 
1399 	raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
1400 	assert(raid_bdev->num_base_bdevs_discovered);
1401 	SPDK_DEBUGLOG(bdev_raid, "raid bdev state changing from online to offline\n");
1402 
1403 	spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg);
1404 }
1405 
1406 /*
1407  * brief:
1408  * raid_bdev_find_base_info_by_bdev function finds the base bdev info by bdev.
1409  * params:
1410  * base_bdev - pointer to base bdev
1411  * returns:
1412  * base bdev info if found, otherwise NULL.
1413  */
1414 static struct raid_base_bdev_info *
1415 raid_bdev_find_base_info_by_bdev(struct spdk_bdev *base_bdev)
1416 {
1417 	struct raid_bdev *raid_bdev;
1418 	struct raid_base_bdev_info *base_info;
1419 
1420 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
1421 		RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
1422 			if (base_info->desc != NULL &&
1423 			    spdk_bdev_desc_get_bdev(base_info->desc) == base_bdev) {
1424 				return base_info;
1425 			}
1426 		}
1427 	}
1428 
1429 	return NULL;
1430 }
1431 
1432 static void
1433 raid_bdev_remove_base_bdev_done(struct raid_base_bdev_info *base_info, int status)
1434 {
1435 	assert(base_info->remove_scheduled);
1436 
1437 	base_info->remove_scheduled = false;
1438 	if (base_info->remove_cb != NULL) {
1439 		base_info->remove_cb(base_info->remove_cb_ctx, status);
1440 	}
1441 }
1442 
1443 static void
1444 raid_bdev_remove_base_bdev_write_sb_cb(int status, struct raid_bdev *raid_bdev, void *ctx)
1445 {
1446 	struct raid_base_bdev_info *base_info = ctx;
1447 
1448 	if (status != 0) {
1449 		SPDK_ERRLOG("Failed to write raid bdev '%s' superblock: %s\n",
1450 			    raid_bdev->bdev.name, spdk_strerror(-status));
1451 	}
1452 
1453 	raid_bdev_remove_base_bdev_done(base_info, status);
1454 }
1455 
1456 static void
1457 raid_bdev_remove_base_bdev_on_unquiesced(void *ctx, int status)
1458 {
1459 	struct raid_base_bdev_info *base_info = ctx;
1460 	struct raid_bdev *raid_bdev = base_info->raid_bdev;
1461 
1462 	if (status != 0) {
1463 		SPDK_ERRLOG("Failed to unquiesce raid bdev %s: %s\n",
1464 			    raid_bdev->bdev.name, spdk_strerror(-status));
1465 		goto out;
1466 	}
1467 
1468 	spdk_spin_lock(&raid_bdev->base_bdev_lock);
1469 	raid_bdev_free_base_bdev_resource(base_info);
1470 	spdk_spin_unlock(&raid_bdev->base_bdev_lock);
1471 
1472 	if (raid_bdev->sb) {
1473 		struct raid_bdev_superblock *sb = raid_bdev->sb;
1474 		struct raid_bdev_sb_base_bdev *sb_base_bdev = NULL;
1475 		uint8_t slot = raid_bdev_base_bdev_slot(base_info);
1476 		uint8_t i;
1477 
1478 		for (i = 0; i < sb->base_bdevs_size; i++) {
1479 			sb_base_bdev = &sb->base_bdevs[i];
1480 
1481 			if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED &&
1482 			    sb_base_bdev->slot == slot) {
1483 				break;
1484 			}
1485 		}
1486 
1487 		assert(i < sb->base_bdevs_size);
1488 
1489 		/* TODO: distinguish between failure and intentional removal */
1490 		sb_base_bdev->state = RAID_SB_BASE_BDEV_FAILED;
1491 
1492 		raid_bdev_write_superblock(raid_bdev, raid_bdev_remove_base_bdev_write_sb_cb, base_info);
1493 		return;
1494 	}
1495 out:
1496 	raid_bdev_remove_base_bdev_done(base_info, status);
1497 }
1498 
1499 static void
1500 raid_bdev_channel_remove_base_bdev(struct spdk_io_channel_iter *i)
1501 {
1502 	struct raid_base_bdev_info *base_info = spdk_io_channel_iter_get_ctx(i);
1503 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
1504 	struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(ch);
1505 	uint8_t idx = raid_bdev_base_bdev_slot(base_info);
1506 
1507 	SPDK_DEBUGLOG(bdev_raid, "slot: %u raid_ch: %p\n", idx, raid_ch);
1508 
1509 	if (raid_ch->base_channel[idx] != NULL) {
1510 		spdk_put_io_channel(raid_ch->base_channel[idx]);
1511 		raid_ch->base_channel[idx] = NULL;
1512 	}
1513 
1514 	spdk_for_each_channel_continue(i, 0);
1515 }
1516 
1517 static void
1518 raid_bdev_channels_remove_base_bdev_done(struct spdk_io_channel_iter *i, int status)
1519 {
1520 	struct raid_base_bdev_info *base_info = spdk_io_channel_iter_get_ctx(i);
1521 	struct raid_bdev *raid_bdev = base_info->raid_bdev;
1522 
1523 	spdk_bdev_unquiesce(&raid_bdev->bdev, &g_raid_if, raid_bdev_remove_base_bdev_on_unquiesced,
1524 			    base_info);
1525 }
1526 
1527 static void
1528 raid_bdev_remove_base_bdev_on_quiesced(void *ctx, int status)
1529 {
1530 	struct raid_base_bdev_info *base_info = ctx;
1531 	struct raid_bdev *raid_bdev = base_info->raid_bdev;
1532 
1533 	if (status != 0) {
1534 		SPDK_ERRLOG("Failed to quiesce raid bdev %s: %s\n",
1535 			    raid_bdev->bdev.name, spdk_strerror(-status));
1536 		raid_bdev_remove_base_bdev_done(base_info, status);
1537 		return;
1538 	}
1539 
1540 	spdk_for_each_channel(raid_bdev, raid_bdev_channel_remove_base_bdev, base_info,
1541 			      raid_bdev_channels_remove_base_bdev_done);
1542 }
1543 
1544 static int
1545 _raid_bdev_remove_base_bdev(struct raid_base_bdev_info *base_info,
1546 			    raid_bdev_remove_base_bdev_cb cb_fn, void *cb_ctx)
1547 {
1548 	struct raid_bdev *raid_bdev = base_info->raid_bdev;
1549 
1550 	SPDK_DEBUGLOG(bdev_raid, "%s\n", base_info->name);
1551 
1552 	assert(spdk_get_thread() == spdk_thread_get_app_thread());
1553 
1554 	if (base_info->remove_scheduled) {
1555 		return 0;
1556 	}
1557 
1558 	assert(base_info->desc);
1559 	base_info->remove_scheduled = true;
1560 	base_info->remove_cb = cb_fn;
1561 	base_info->remove_cb_ctx = cb_ctx;
1562 
1563 	if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) {
1564 		/*
1565 		 * As raid bdev is not registered yet or already unregistered,
1566 		 * so cleanup should be done here itself.
1567 		 *
1568 		 * Removing a base bdev at this stage does not change the number of operational
1569 		 * base bdevs, only the number of discovered base bdevs.
1570 		 */
1571 		raid_bdev_free_base_bdev_resource(base_info);
1572 		if (raid_bdev->num_base_bdevs_discovered == 0) {
1573 			/* There is no base bdev for this raid, so free the raid device. */
1574 			raid_bdev_cleanup_and_free(raid_bdev);
1575 		}
1576 	} else if (raid_bdev->num_base_bdevs_operational-- == raid_bdev->min_base_bdevs_operational) {
1577 		/*
1578 		 * After this base bdev is removed there will not be enough base bdevs
1579 		 * to keep the raid bdev operational.
1580 		 */
1581 		raid_bdev_deconfigure(raid_bdev, cb_fn, cb_ctx);
1582 	} else {
1583 		int ret;
1584 
1585 		ret = spdk_bdev_quiesce(&raid_bdev->bdev, &g_raid_if,
1586 					raid_bdev_remove_base_bdev_on_quiesced, base_info);
1587 		if (ret != 0) {
1588 			base_info->remove_scheduled = false;
1589 		}
1590 	}
1591 
1592 	return 0;
1593 }
1594 
1595 /*
1596  * brief:
1597  * raid_bdev_remove_base_bdev function is called by below layers when base_bdev
1598  * is removed. This function checks if this base bdev is part of any raid bdev
1599  * or not. If yes, it takes necessary action on that particular raid bdev.
1600  * params:
1601  * base_bdev - pointer to base bdev which got removed
1602  * cb_fn - callback function
1603  * cb_arg - argument to callback function
1604  * returns:
1605  * 0 - success
1606  * non zero - failure
1607  */
1608 int
1609 raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_bdev_remove_base_bdev_cb cb_fn,
1610 			   void *cb_ctx)
1611 {
1612 	struct raid_base_bdev_info *base_info;
1613 
1614 	/* Find the raid_bdev which has claimed this base_bdev */
1615 	base_info = raid_bdev_find_base_info_by_bdev(base_bdev);
1616 	if (!base_info) {
1617 		SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name);
1618 		return -ENODEV;
1619 	}
1620 
1621 	return _raid_bdev_remove_base_bdev(base_info, cb_fn, cb_ctx);
1622 }
1623 
1624 /*
1625  * brief:
1626  * raid_bdev_resize_base_bdev function is called by below layers when base_bdev
1627  * is resized. This function checks if the smallest size of the base_bdevs is changed.
1628  * If yes, call module handler to resize the raid_bdev if implemented.
1629  * params:
1630  * base_bdev - pointer to base bdev which got resized.
1631  * returns:
1632  * none
1633  */
1634 static void
1635 raid_bdev_resize_base_bdev(struct spdk_bdev *base_bdev)
1636 {
1637 	struct raid_bdev *raid_bdev;
1638 	struct raid_base_bdev_info *base_info;
1639 
1640 	SPDK_DEBUGLOG(bdev_raid, "raid_bdev_resize_base_bdev\n");
1641 
1642 	base_info = raid_bdev_find_base_info_by_bdev(base_bdev);
1643 
1644 	/* Find the raid_bdev which has claimed this base_bdev */
1645 	if (!base_info) {
1646 		SPDK_ERRLOG("raid_bdev whose base_bdev '%s' not found\n", base_bdev->name);
1647 		return;
1648 	}
1649 	raid_bdev = base_info->raid_bdev;
1650 
1651 	assert(spdk_get_thread() == spdk_thread_get_app_thread());
1652 
1653 	SPDK_NOTICELOG("base_bdev '%s' was resized: old size %" PRIu64 ", new size %" PRIu64 "\n",
1654 		       base_bdev->name, base_info->blockcnt, base_bdev->blockcnt);
1655 
1656 	if (raid_bdev->module->resize) {
1657 		raid_bdev->module->resize(raid_bdev);
1658 	}
1659 }
1660 
1661 /*
1662  * brief:
1663  * raid_bdev_event_base_bdev function is called by below layers when base_bdev
1664  * triggers asynchronous event.
1665  * params:
1666  * type - event details.
1667  * bdev - bdev that triggered event.
1668  * event_ctx - context for event.
1669  * returns:
1670  * none
1671  */
1672 static void
1673 raid_bdev_event_base_bdev(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
1674 			  void *event_ctx)
1675 {
1676 	int rc;
1677 
1678 	switch (type) {
1679 	case SPDK_BDEV_EVENT_REMOVE:
1680 		rc = raid_bdev_remove_base_bdev(bdev, NULL, NULL);
1681 		if (rc != 0) {
1682 			SPDK_ERRLOG("Failed to remove base bdev %s: %s\n",
1683 				    spdk_bdev_get_name(bdev), spdk_strerror(-rc));
1684 		}
1685 		break;
1686 	case SPDK_BDEV_EVENT_RESIZE:
1687 		raid_bdev_resize_base_bdev(bdev);
1688 		break;
1689 	default:
1690 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
1691 		break;
1692 	}
1693 }
1694 
1695 /*
1696  * brief:
1697  * Deletes the specified raid bdev
1698  * params:
1699  * raid_bdev - pointer to raid bdev
1700  * cb_fn - callback function
1701  * cb_arg - argument to callback function
1702  */
1703 void
1704 raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_arg)
1705 {
1706 	struct raid_base_bdev_info *base_info;
1707 
1708 	SPDK_DEBUGLOG(bdev_raid, "delete raid bdev: %s\n", raid_bdev->bdev.name);
1709 
1710 	if (raid_bdev->destroy_started) {
1711 		SPDK_DEBUGLOG(bdev_raid, "destroying raid bdev %s is already started\n",
1712 			      raid_bdev->bdev.name);
1713 		if (cb_fn) {
1714 			cb_fn(cb_arg, -EALREADY);
1715 		}
1716 		return;
1717 	}
1718 
1719 	raid_bdev->destroy_started = true;
1720 
1721 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
1722 		base_info->remove_scheduled = true;
1723 
1724 		if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) {
1725 			/*
1726 			 * As raid bdev is not registered yet or already unregistered,
1727 			 * so cleanup should be done here itself.
1728 			 */
1729 			raid_bdev_free_base_bdev_resource(base_info);
1730 		}
1731 	}
1732 
1733 	if (raid_bdev->num_base_bdevs_discovered == 0) {
1734 		/* There is no base bdev for this raid, so free the raid device. */
1735 		raid_bdev_cleanup_and_free(raid_bdev);
1736 		if (cb_fn) {
1737 			cb_fn(cb_arg, 0);
1738 		}
1739 	} else {
1740 		raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg);
1741 	}
1742 }
1743 
1744 static void
1745 raid_bdev_configure_base_bdev_cont(struct raid_base_bdev_info *base_info)
1746 {
1747 	struct raid_bdev *raid_bdev = base_info->raid_bdev;
1748 	int rc;
1749 
1750 	base_info->is_configured = true;
1751 
1752 	raid_bdev->num_base_bdevs_discovered++;
1753 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1754 	assert(raid_bdev->num_base_bdevs_operational <= raid_bdev->num_base_bdevs);
1755 	assert(raid_bdev->num_base_bdevs_operational >= raid_bdev->min_base_bdevs_operational);
1756 
1757 	/*
1758 	 * Configure the raid bdev when the number of discovered base bdevs reaches the number
1759 	 * of base bdevs we know to be operational members of the array. Usually this is equal
1760 	 * to the total number of base bdevs (num_base_bdevs) but can be less - when the array is
1761 	 * degraded.
1762 	 */
1763 	if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational) {
1764 		rc = raid_bdev_configure(raid_bdev);
1765 		if (rc != 0) {
1766 			SPDK_ERRLOG("Failed to configure raid bdev: %s\n", spdk_strerror(-rc));
1767 		}
1768 	}
1769 }
1770 
1771 static void
1772 raid_bdev_configure_base_bdev_check_sb_cb(const struct raid_bdev_superblock *sb, int status,
1773 		void *ctx)
1774 {
1775 	struct raid_base_bdev_info *base_info = ctx;
1776 
1777 	switch (status) {
1778 	case 0:
1779 		/* valid superblock found */
1780 		SPDK_ERRLOG("Existing raid superblock found on bdev %s\n", base_info->name);
1781 		raid_bdev_free_base_bdev_resource(base_info);
1782 		break;
1783 	case -EINVAL:
1784 		/* no valid superblock */
1785 		raid_bdev_configure_base_bdev_cont(base_info);
1786 		break;
1787 	default:
1788 		SPDK_ERRLOG("Failed to examine bdev %s: %s\n",
1789 			    base_info->name, spdk_strerror(-status));
1790 		break;
1791 	}
1792 }
1793 
1794 static int
1795 raid_bdev_configure_base_bdev(struct raid_base_bdev_info *base_info, bool existing)
1796 {
1797 	struct raid_bdev *raid_bdev = base_info->raid_bdev;
1798 	struct spdk_bdev_desc *desc;
1799 	struct spdk_bdev *bdev;
1800 	const struct spdk_uuid *bdev_uuid;
1801 	int rc;
1802 
1803 	assert(spdk_get_thread() == spdk_thread_get_app_thread());
1804 	assert(base_info->desc == NULL);
1805 
1806 	/*
1807 	 * Base bdev can be added by name or uuid. Here we assure both properties are set and valid
1808 	 * before claiming the bdev.
1809 	 */
1810 
1811 	if (!spdk_uuid_is_null(&base_info->uuid)) {
1812 		char uuid_str[SPDK_UUID_STRING_LEN];
1813 		const char *bdev_name;
1814 
1815 		spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &base_info->uuid);
1816 
1817 		/* UUID of a bdev is registered as its alias */
1818 		bdev = spdk_bdev_get_by_name(uuid_str);
1819 		if (bdev == NULL) {
1820 			return -ENODEV;
1821 		}
1822 
1823 		bdev_name = spdk_bdev_get_name(bdev);
1824 
1825 		if (base_info->name == NULL) {
1826 			assert(existing == true);
1827 			base_info->name = strdup(bdev_name);
1828 			if (base_info->name == NULL) {
1829 				return -ENOMEM;
1830 			}
1831 		} else if (strcmp(base_info->name, bdev_name) != 0) {
1832 			SPDK_ERRLOG("Name mismatch for base bdev '%s' - expected '%s'\n",
1833 				    bdev_name, base_info->name);
1834 			return -EINVAL;
1835 		}
1836 	}
1837 
1838 	assert(base_info->name != NULL);
1839 
1840 	rc = spdk_bdev_open_ext(base_info->name, true, raid_bdev_event_base_bdev, NULL, &desc);
1841 	if (rc != 0) {
1842 		if (rc != -ENODEV) {
1843 			SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", base_info->name);
1844 		}
1845 		return rc;
1846 	}
1847 
1848 	bdev = spdk_bdev_desc_get_bdev(desc);
1849 	bdev_uuid = spdk_bdev_get_uuid(bdev);
1850 
1851 	if (spdk_uuid_is_null(&base_info->uuid)) {
1852 		spdk_uuid_copy(&base_info->uuid, bdev_uuid);
1853 	} else if (spdk_uuid_compare(&base_info->uuid, bdev_uuid) != 0) {
1854 		SPDK_ERRLOG("UUID mismatch for base bdev '%s'\n", base_info->name);
1855 		spdk_bdev_close(desc);
1856 		return -EINVAL;
1857 	}
1858 
1859 	rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if);
1860 	if (rc != 0) {
1861 		SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n");
1862 		spdk_bdev_close(desc);
1863 		return rc;
1864 	}
1865 
1866 	SPDK_DEBUGLOG(bdev_raid, "bdev %s is claimed\n", bdev->name);
1867 
1868 	assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE);
1869 
1870 	base_info->app_thread_ch = spdk_bdev_get_io_channel(desc);
1871 	if (base_info->app_thread_ch == NULL) {
1872 		SPDK_ERRLOG("Failed to get io channel\n");
1873 		spdk_bdev_module_release_bdev(bdev);
1874 		spdk_bdev_close(desc);
1875 		return -ENOMEM;
1876 	}
1877 
1878 	base_info->desc = desc;
1879 	base_info->blockcnt = bdev->blockcnt;
1880 
1881 	if (raid_bdev->sb != NULL) {
1882 		uint64_t data_offset;
1883 
1884 		if (base_info->data_offset == 0) {
1885 			assert((RAID_BDEV_MIN_DATA_OFFSET_SIZE % bdev->blocklen) == 0);
1886 			data_offset = RAID_BDEV_MIN_DATA_OFFSET_SIZE / bdev->blocklen;
1887 		} else {
1888 			data_offset = base_info->data_offset;
1889 		}
1890 
1891 		if (bdev->optimal_io_boundary != 0) {
1892 			data_offset = spdk_divide_round_up(data_offset,
1893 							   bdev->optimal_io_boundary) * bdev->optimal_io_boundary;
1894 			if (base_info->data_offset != 0 && base_info->data_offset != data_offset) {
1895 				SPDK_WARNLOG("Data offset %lu on bdev '%s' is different than optimal value %lu\n",
1896 					     base_info->data_offset, base_info->name, data_offset);
1897 				data_offset = base_info->data_offset;
1898 			}
1899 		}
1900 
1901 		base_info->data_offset = data_offset;
1902 	}
1903 
1904 	if (base_info->data_offset >= bdev->blockcnt) {
1905 		SPDK_ERRLOG("Data offset %lu exceeds base bdev capacity %lu on bdev '%s'\n",
1906 			    base_info->data_offset, bdev->blockcnt, base_info->name);
1907 		rc = -EINVAL;
1908 		goto out;
1909 	}
1910 
1911 	if (base_info->data_size == 0) {
1912 		base_info->data_size = bdev->blockcnt - base_info->data_offset;
1913 	} else if (base_info->data_offset + base_info->data_size > bdev->blockcnt) {
1914 		SPDK_ERRLOG("Data offset and size exceeds base bdev capacity %lu on bdev '%s'\n",
1915 			    bdev->blockcnt, base_info->name);
1916 		rc = -EINVAL;
1917 		goto out;
1918 	}
1919 
1920 	if (existing) {
1921 		raid_bdev_configure_base_bdev_cont(base_info);
1922 	} else {
1923 		/* check for existing superblock when using a new bdev */
1924 		rc = raid_bdev_load_base_bdev_superblock(desc, base_info->app_thread_ch,
1925 				raid_bdev_configure_base_bdev_check_sb_cb, base_info);
1926 		if (rc) {
1927 			SPDK_ERRLOG("Failed to read bdev %s superblock: %s\n",
1928 				    bdev->name, spdk_strerror(-rc));
1929 		}
1930 	}
1931 out:
1932 	if (rc != 0) {
1933 		raid_bdev_free_base_bdev_resource(base_info);
1934 	}
1935 	return rc;
1936 }
1937 
1938 /*
1939  * brief:
1940  * raid_bdev_add_base_device function is the actual function which either adds
1941  * the nvme base device to existing raid bdev or create a new raid bdev. It also claims
1942  * the base device and keep the open descriptor.
1943  * params:
1944  * raid_bdev - pointer to raid bdev
1945  * name - name of the base bdev
1946  * slot - position to add base bdev
1947  * returns:
1948  * 0 - success
1949  * non zero - failure
1950  */
1951 int
1952 raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot)
1953 {
1954 	struct raid_base_bdev_info *base_info;
1955 	int rc;
1956 
1957 	if (slot >= raid_bdev->num_base_bdevs) {
1958 		return -EINVAL;
1959 	}
1960 
1961 	base_info = &raid_bdev->base_bdev_info[slot];
1962 
1963 	if (base_info->name != NULL) {
1964 		SPDK_ERRLOG("Slot %u on raid bdev '%s' already assigned to bdev '%s'\n",
1965 			    slot, raid_bdev->bdev.name, base_info->name);
1966 		return -EBUSY;
1967 	}
1968 
1969 	if (!spdk_uuid_is_null(&base_info->uuid)) {
1970 		char uuid_str[SPDK_UUID_STRING_LEN];
1971 
1972 		spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &base_info->uuid);
1973 		SPDK_ERRLOG("Slot %u on raid bdev '%s' already assigned to bdev with uuid %s\n",
1974 			    slot, raid_bdev->bdev.name, uuid_str);
1975 		return -EBUSY;
1976 	}
1977 
1978 	base_info->name = strdup(name);
1979 	if (base_info->name == NULL) {
1980 		return -ENOMEM;
1981 	}
1982 
1983 	rc = raid_bdev_configure_base_bdev(base_info, false);
1984 	if (rc != 0) {
1985 		if (rc != -ENODEV) {
1986 			SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", name);
1987 		}
1988 		return rc;
1989 	}
1990 
1991 	return 0;
1992 }
1993 
1994 static int
1995 raid_bdev_create_from_sb(const struct raid_bdev_superblock *sb, struct raid_bdev **raid_bdev_out)
1996 {
1997 	struct raid_bdev *raid_bdev;
1998 	uint8_t i;
1999 	int rc;
2000 
2001 	rc = _raid_bdev_create(sb->name, (sb->strip_size * sb->block_size) / 1024, sb->num_base_bdevs,
2002 			       sb->level, true, &sb->uuid, &raid_bdev);
2003 	if (rc != 0) {
2004 		return rc;
2005 	}
2006 
2007 	assert(sb->length <= RAID_BDEV_SB_MAX_LENGTH);
2008 	memcpy(raid_bdev->sb, sb, sb->length);
2009 
2010 	for (i = 0; i < sb->base_bdevs_size; i++) {
2011 		const struct raid_bdev_sb_base_bdev *sb_base_bdev = &sb->base_bdevs[i];
2012 		struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[sb_base_bdev->slot];
2013 
2014 		if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED) {
2015 			spdk_uuid_copy(&base_info->uuid, &sb_base_bdev->uuid);
2016 			raid_bdev->num_base_bdevs_operational++;
2017 		}
2018 
2019 		base_info->data_offset = sb_base_bdev->data_offset;
2020 		base_info->data_size = sb_base_bdev->data_size;
2021 	}
2022 
2023 	*raid_bdev_out = raid_bdev;
2024 	return 0;
2025 }
2026 
2027 static void
2028 raid_bdev_examine_no_sb(struct spdk_bdev *bdev)
2029 {
2030 	struct raid_bdev *raid_bdev;
2031 	struct raid_base_bdev_info *base_info;
2032 
2033 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
2034 		RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
2035 			if (base_info->desc == NULL && base_info->name != NULL &&
2036 			    strcmp(bdev->name, base_info->name) == 0) {
2037 				raid_bdev_configure_base_bdev(base_info, true);
2038 				break;
2039 			}
2040 		}
2041 	}
2042 }
2043 
2044 static void
2045 raid_bdev_examine_sb(const struct raid_bdev_superblock *sb, struct spdk_bdev *bdev)
2046 {
2047 	const struct raid_bdev_sb_base_bdev *sb_base_bdev;
2048 	struct raid_bdev *raid_bdev;
2049 	struct raid_base_bdev_info *iter, *base_info;
2050 	uint8_t i;
2051 	int rc;
2052 
2053 	if (sb->block_size != bdev->blocklen) {
2054 		SPDK_WARNLOG("Bdev %s block size (%u) does not match the value in superblock (%u)\n",
2055 			     bdev->name, sb->block_size, bdev->blocklen);
2056 		return;
2057 	}
2058 
2059 	if (spdk_uuid_is_null(&sb->uuid)) {
2060 		SPDK_WARNLOG("NULL raid bdev UUID in superblock on bdev %s\n", bdev->name);
2061 		return;
2062 	}
2063 
2064 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
2065 		if (spdk_uuid_compare(&raid_bdev->bdev.uuid, &sb->uuid) == 0) {
2066 			break;
2067 		}
2068 	}
2069 
2070 	if (raid_bdev) {
2071 		if (sb->seq_number > raid_bdev->sb->seq_number) {
2072 			SPDK_DEBUGLOG(bdev_raid,
2073 				      "raid superblock seq_number on bdev %s (%lu) greater than existing raid bdev %s (%lu)\n",
2074 				      bdev->name, sb->seq_number, raid_bdev->bdev.name, raid_bdev->sb->seq_number);
2075 
2076 			if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) {
2077 				SPDK_WARNLOG("Newer version of raid bdev %s superblock found on bdev %s but raid bdev is not in configuring state.\n",
2078 					     raid_bdev->bdev.name, bdev->name);
2079 				return;
2080 			}
2081 
2082 			/* remove and then recreate the raid bdev using the newer superblock */
2083 			raid_bdev_delete(raid_bdev, NULL, NULL);
2084 			raid_bdev = NULL;
2085 		} else if (sb->seq_number < raid_bdev->sb->seq_number) {
2086 			SPDK_DEBUGLOG(bdev_raid,
2087 				      "raid superblock seq_number on bdev %s (%lu) smaller than existing raid bdev %s (%lu)\n",
2088 				      bdev->name, sb->seq_number, raid_bdev->bdev.name, raid_bdev->sb->seq_number);
2089 			/* use the current raid bdev superblock */
2090 			sb = raid_bdev->sb;
2091 		}
2092 	}
2093 
2094 	for (i = 0; i < sb->base_bdevs_size; i++) {
2095 		sb_base_bdev = &sb->base_bdevs[i];
2096 
2097 		assert(spdk_uuid_is_null(&sb_base_bdev->uuid) == false);
2098 
2099 		if (spdk_uuid_compare(&sb_base_bdev->uuid, spdk_bdev_get_uuid(bdev)) == 0) {
2100 			break;
2101 		}
2102 	}
2103 
2104 	if (i == sb->base_bdevs_size) {
2105 		SPDK_DEBUGLOG(bdev_raid, "raid superblock does not contain this bdev's uuid\n");
2106 		return;
2107 	}
2108 
2109 	if (!raid_bdev) {
2110 		rc = raid_bdev_create_from_sb(sb, &raid_bdev);
2111 		if (rc != 0) {
2112 			SPDK_ERRLOG("Failed to create raid bdev %s: %s\n",
2113 				    sb->name, spdk_strerror(-rc));
2114 			return;
2115 		}
2116 	}
2117 
2118 	if (sb_base_bdev->state != RAID_SB_BASE_BDEV_CONFIGURED) {
2119 		SPDK_NOTICELOG("Bdev %s is not an active member of raid bdev %s. Ignoring.\n",
2120 			       bdev->name, raid_bdev->bdev.name);
2121 		return;
2122 	}
2123 
2124 	base_info = NULL;
2125 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, iter) {
2126 		if (spdk_uuid_compare(&iter->uuid, spdk_bdev_get_uuid(bdev)) == 0) {
2127 			base_info = iter;
2128 			break;
2129 		}
2130 	}
2131 
2132 	if (base_info == NULL) {
2133 		SPDK_ERRLOG("Bdev %s is not a member of raid bdev %s\n",
2134 			    bdev->name, raid_bdev->bdev.name);
2135 		return;
2136 	}
2137 
2138 	rc = raid_bdev_configure_base_bdev(base_info, true);
2139 	if (rc != 0) {
2140 		SPDK_ERRLOG("Failed to configure bdev %s as base bdev of raid %s: %s\n",
2141 			    bdev->name, raid_bdev->bdev.name, spdk_strerror(-rc));
2142 	}
2143 }
2144 
/*
 * raid_bdev_examine_ctx holds the resources that are acquired while a bdev is
 * examined and that have to be released again once the examination is done.
 */
struct raid_bdev_examine_ctx {
	/* Descriptor of the examined bdev, closed when the context is freed */
	struct spdk_bdev_desc	*desc;

	/* IO channel obtained from the descriptor, put when the context is freed */
	struct spdk_io_channel	*ch;
};
2149 
2150 static void
2151 raid_bdev_examine_ctx_free(struct raid_bdev_examine_ctx *ctx)
2152 {
2153 	if (!ctx) {
2154 		return;
2155 	}
2156 
2157 	if (ctx->ch) {
2158 		spdk_put_io_channel(ctx->ch);
2159 	}
2160 
2161 	if (ctx->desc) {
2162 		spdk_bdev_close(ctx->desc);
2163 	}
2164 
2165 	free(ctx);
2166 }
2167 
2168 static void
2169 raid_bdev_examine_load_sb_cb(const struct raid_bdev_superblock *sb, int status, void *_ctx)
2170 {
2171 	struct raid_bdev_examine_ctx *ctx = _ctx;
2172 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(ctx->desc);
2173 
2174 	switch (status) {
2175 	case 0:
2176 		/* valid superblock found */
2177 		SPDK_DEBUGLOG(bdev_raid, "raid superblock found on bdev %s\n", bdev->name);
2178 		raid_bdev_examine_sb(sb, bdev);
2179 		break;
2180 	case -EINVAL:
2181 		/* no valid superblock, check if it can be claimed anyway */
2182 		raid_bdev_examine_no_sb(bdev);
2183 		break;
2184 	default:
2185 		SPDK_ERRLOG("Failed to examine bdev %s: %s\n",
2186 			    bdev->name, spdk_strerror(-status));
2187 		break;
2188 	}
2189 
2190 	raid_bdev_examine_ctx_free(ctx);
2191 	spdk_bdev_module_examine_done(&g_raid_if);
2192 }
2193 
2194 static void
2195 raid_bdev_examine_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
2196 {
2197 }
2198 
2199 /*
2200  * brief:
2201  * raid_bdev_examine function is the examine function call by the below layers
2202  * like bdev_nvme layer. This function will check if this base bdev can be
2203  * claimed by this raid bdev or not.
2204  * params:
2205  * bdev - pointer to base bdev
2206  * returns:
2207  * none
2208  */
2209 static void
2210 raid_bdev_examine(struct spdk_bdev *bdev)
2211 {
2212 	struct raid_bdev_examine_ctx *ctx;
2213 	int rc;
2214 
2215 	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
2216 		raid_bdev_examine_no_sb(bdev);
2217 		spdk_bdev_module_examine_done(&g_raid_if);
2218 		return;
2219 	}
2220 
2221 	ctx = calloc(1, sizeof(*ctx));
2222 	if (!ctx) {
2223 		SPDK_ERRLOG("Failed to examine bdev %s: %s\n",
2224 			    bdev->name, spdk_strerror(ENOMEM));
2225 		goto err;
2226 	}
2227 
2228 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, raid_bdev_examine_event_cb, NULL,
2229 				&ctx->desc);
2230 	if (rc) {
2231 		SPDK_ERRLOG("Failed to open bdev %s: %s\n",
2232 			    bdev->name, spdk_strerror(-rc));
2233 		goto err;
2234 	}
2235 
2236 	ctx->ch = spdk_bdev_get_io_channel(ctx->desc);
2237 	if (!ctx->ch) {
2238 		SPDK_ERRLOG("Failed to get io channel for bdev %s\n", bdev->name);
2239 		goto err;
2240 	}
2241 
2242 	rc = raid_bdev_load_base_bdev_superblock(ctx->desc, ctx->ch, raid_bdev_examine_load_sb_cb, ctx);
2243 	if (rc) {
2244 		SPDK_ERRLOG("Failed to read bdev %s superblock: %s\n",
2245 			    bdev->name, spdk_strerror(-rc));
2246 		goto err;
2247 	}
2248 
2249 	return;
2250 err:
2251 	raid_bdev_examine_ctx_free(ctx);
2252 	spdk_bdev_module_examine_done(&g_raid_if);
2253 }
2254 
2255 /* Log component for bdev raid bdev module */
2256 SPDK_LOG_REGISTER_COMPONENT(bdev_raid)
2257