xref: /spdk/module/bdev/raid/bdev_raid.c (revision a1bda4e1aa244bccfa3a6d3d69d4856b2c32b61d)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_raid.h"
35 #include "spdk/env.h"
36 #include "spdk/io_channel.h"
37 #include "spdk/conf.h"
38 #include "spdk_internal/log.h"
39 #include "spdk/string.h"
40 #include "spdk/util.h"
41 #include "spdk/json.h"
42 #include "spdk/string.h"
43 
/*
 * Shutdown flag. raid_bdev_destruct() reads it: when it is true, destruct
 * releases the base bdevs of the raid bdev and moves the raid bdev from the
 * configured list to the offline list.
 */
static bool g_shutdown_started = false;

/* raid bdev config as read from config file */
struct raid_config	g_raid_config = {
	.raid_bdev_config_head = TAILQ_HEAD_INITIALIZER(g_raid_config.raid_bdev_config_head),
};

/*
 * List of raid bdevs in the configured state; these raid bdevs are registered
 * with the bdev layer
 */
struct raid_configured_tailq	g_raid_bdev_configured_list = TAILQ_HEAD_INITIALIZER(
			g_raid_bdev_configured_list);

/* List of raid bdevs in the configuring state (RAID_BDEV_STATE_CONFIGURING) */
struct raid_configuring_tailq	g_raid_bdev_configuring_list = TAILQ_HEAD_INITIALIZER(
			g_raid_bdev_configuring_list);

/* List of all raid bdevs, regardless of state */
struct raid_all_tailq		g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list);

/* List of all raid bdevs that are offline (RAID_BDEV_STATE_OFFLINE) */
struct raid_offline_tailq	g_raid_bdev_offline_list = TAILQ_HEAD_INITIALIZER(
			g_raid_bdev_offline_list);

/* Forward declarations of static functions */
static void	raid_bdev_examine(struct spdk_bdev *bdev);
static int	raid_bdev_init(void);
static void	raid_bdev_waitq_io_process(void *ctx);
static void	raid_bdev_deconfigure(struct raid_bdev *raid_bdev,
				      raid_bdev_destruct_cb cb_fn, void *cb_arg);
static void	raid_bdev_remove_base_bdev(void *ctx);
76 
77 /*
78  * brief:
79  * raid_bdev_create_cb function is a cb function for raid bdev which creates the
80  * hierarchy from raid bdev to base bdev io channels. It will be called per core
81  * params:
82  * io_device - pointer to raid bdev io device represented by raid_bdev
83  * ctx_buf - pointer to context buffer for raid bdev io channel
84  * returns:
85  * 0 - success
86  * non zero - failure
87  */
88 static int
89 raid_bdev_create_cb(void *io_device, void *ctx_buf)
90 {
91 	struct raid_bdev            *raid_bdev = io_device;
92 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
93 
94 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_create_cb, %p\n", raid_ch);
95 
96 	assert(raid_bdev != NULL);
97 	assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE);
98 
99 	raid_ch->num_channels = raid_bdev->num_base_bdevs;
100 
101 	raid_ch->base_channel = calloc(raid_ch->num_channels,
102 				       sizeof(struct spdk_io_channel *));
103 	if (!raid_ch->base_channel) {
104 		SPDK_ERRLOG("Unable to allocate base bdevs io channel\n");
105 		return -ENOMEM;
106 	}
107 	for (uint8_t i = 0; i < raid_ch->num_channels; i++) {
108 		/*
109 		 * Get the spdk_io_channel for all the base bdevs. This is used during
110 		 * split logic to send the respective child bdev ios to respective base
111 		 * bdev io channel.
112 		 */
113 		raid_ch->base_channel[i] = spdk_bdev_get_io_channel(
114 						   raid_bdev->base_bdev_info[i].desc);
115 		if (!raid_ch->base_channel[i]) {
116 			for (uint8_t j = 0; j < i; j++) {
117 				spdk_put_io_channel(raid_ch->base_channel[j]);
118 			}
119 			free(raid_ch->base_channel);
120 			raid_ch->base_channel = NULL;
121 			SPDK_ERRLOG("Unable to create io channel for base bdev\n");
122 			return -ENOMEM;
123 		}
124 	}
125 
126 	return 0;
127 }
128 
129 /*
130  * brief:
131  * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the
132  * hierarchy from raid bdev to base bdev io channels. It will be called per core
133  * params:
134  * io_device - pointer to raid bdev io device represented by raid_bdev
135  * ctx_buf - pointer to context buffer for raid bdev io channel
136  * returns:
137  * none
138  */
139 static void
140 raid_bdev_destroy_cb(void *io_device, void *ctx_buf)
141 {
142 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
143 
144 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destroy_cb\n");
145 
146 	assert(raid_ch != NULL);
147 	assert(raid_ch->base_channel);
148 	for (uint8_t i = 0; i < raid_ch->num_channels; i++) {
149 		/* Free base bdev channels */
150 		assert(raid_ch->base_channel[i] != NULL);
151 		spdk_put_io_channel(raid_ch->base_channel[i]);
152 	}
153 	free(raid_ch->base_channel);
154 	raid_ch->base_channel = NULL;
155 }
156 
157 /*
158  * brief:
159  * raid_bdev_cleanup is used to cleanup and free raid_bdev related data
160  * structures.
161  * params:
162  * raid_bdev - pointer to raid_bdev
163  * returns:
164  * none
165  */
166 static void
167 raid_bdev_cleanup(struct raid_bdev *raid_bdev)
168 {
169 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_cleanup, %p name %s, state %u, config %p\n",
170 		      raid_bdev,
171 		      raid_bdev->bdev.name, raid_bdev->state, raid_bdev->config);
172 	if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
173 		TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
174 	} else if (raid_bdev->state == RAID_BDEV_STATE_OFFLINE) {
175 		TAILQ_REMOVE(&g_raid_bdev_offline_list, raid_bdev, state_link);
176 	} else {
177 		assert(0);
178 	}
179 	TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link);
180 	free(raid_bdev->bdev.name);
181 	free(raid_bdev->base_bdev_info);
182 	if (raid_bdev->config) {
183 		raid_bdev->config->raid_bdev = NULL;
184 	}
185 	free(raid_bdev);
186 }
187 
188 /*
189  * brief:
190  * free resource of base bdev for raid bdev
191  * params:
192  * raid_bdev - pointer to raid bdev
193  * base_bdev_slot - position to base bdev in raid bdev
194  * returns:
195  * 0 - success
196  * non zero - failure
197  */
198 static void
199 raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, uint8_t base_bdev_slot)
200 {
201 	struct raid_base_bdev_info *info;
202 
203 	info = &raid_bdev->base_bdev_info[base_bdev_slot];
204 
205 	spdk_bdev_module_release_bdev(info->bdev);
206 	spdk_bdev_close(info->desc);
207 	info->desc = NULL;
208 	info->bdev = NULL;
209 
210 	assert(raid_bdev->num_base_bdevs_discovered);
211 	raid_bdev->num_base_bdevs_discovered--;
212 }
213 
214 /*
215  * brief:
216  * raid_bdev_destruct is the destruct function table pointer for raid bdev
217  * params:
218  * ctxt - pointer to raid_bdev
219  * returns:
220  * 0 - success
221  * non zero - failure
222  */
223 static int
224 raid_bdev_destruct(void *ctxt)
225 {
226 	struct raid_bdev *raid_bdev = ctxt;
227 
228 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destruct\n");
229 
230 	raid_bdev->destruct_called = true;
231 	for (uint8_t i = 0; i < raid_bdev->num_base_bdevs; i++) {
232 		/*
233 		 * Close all base bdev descriptors for which call has come from below
234 		 * layers.  Also close the descriptors if we have started shutdown.
235 		 */
236 		if (g_shutdown_started ||
237 		    ((raid_bdev->base_bdev_info[i].remove_scheduled == true) &&
238 		     (raid_bdev->base_bdev_info[i].bdev != NULL))) {
239 			raid_bdev_free_base_bdev_resource(raid_bdev, i);
240 		}
241 	}
242 
243 	if (g_shutdown_started) {
244 		TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
245 		raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
246 		TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
247 	}
248 
249 	spdk_io_device_unregister(raid_bdev, NULL);
250 
251 	if (raid_bdev->num_base_bdevs_discovered == 0) {
252 		/* Free raid_bdev when there are no base bdevs left */
253 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev base bdevs is 0, going to free all in destruct\n");
254 		raid_bdev_cleanup(raid_bdev);
255 	}
256 
257 	return 0;
258 }
259 
260 /*
261  * brief:
262  * raid_bdev_io_completion function is called by lower layers to notify raid
263  * module that particular bdev_io is completed.
264  * params:
265  * bdev_io - pointer to bdev io submitted to lower layers, like child io
266  * success - bdev_io status
267  * cb_arg - function callback context, like parent io pointer
268  * returns:
269  * none
270  */
271 static void
272 raid_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
273 {
274 	struct spdk_bdev_io         *parent_io = cb_arg;
275 
276 	spdk_bdev_free_io(bdev_io);
277 
278 	if (success) {
279 		spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_SUCCESS);
280 	} else {
281 		spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_FAILED);
282 	}
283 }
284 
285 /*
286  * brief:
287  * raid_bdev_submit_rw_request function is used to submit I/O to the correct
288  * member disk
289  * params:
290  * bdev_io - parent bdev io
291  * start_strip - start strip number of this io
292  * returns:
293  * 0 - success
294  * non zero - failure
295  */
296 static int
297 raid_bdev_submit_rw_request(struct spdk_bdev_io *bdev_io, uint64_t start_strip)
298 {
299 	struct raid_bdev_io		*raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
300 	struct raid_bdev_io_channel	*raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
301 	struct raid_bdev		*raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
302 	uint64_t			pd_strip;
303 	uint32_t			offset_in_strip;
304 	uint64_t			pd_lba;
305 	uint64_t			pd_blocks;
306 	uint8_t				pd_idx;
307 	int				ret = 0;
308 
309 	pd_strip = start_strip / raid_bdev->num_base_bdevs;
310 	pd_idx = start_strip % raid_bdev->num_base_bdevs;
311 	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
312 	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
313 	pd_blocks = bdev_io->u.bdev.num_blocks;
314 	if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
315 		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
316 		assert(0);
317 	}
318 
319 	/*
320 	 * Submit child io to bdev layer with using base bdev descriptors, base
321 	 * bdev lba, base bdev child io length in blocks, buffer, completion
322 	 * function and function callback context
323 	 */
324 	assert(raid_ch != NULL);
325 	assert(raid_ch->base_channel);
326 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
327 		ret = spdk_bdev_readv_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
328 					     raid_ch->base_channel[pd_idx],
329 					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
330 					     pd_lba, pd_blocks, raid_bdev_io_completion,
331 					     bdev_io);
332 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
333 		ret = spdk_bdev_writev_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
334 					      raid_ch->base_channel[pd_idx],
335 					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
336 					      pd_lba, pd_blocks, raid_bdev_io_completion,
337 					      bdev_io);
338 	} else {
339 		SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
340 		assert(0);
341 	}
342 
343 	return ret;
344 }
345 
346 /*
347  * brief:
348  * get_curr_base_bdev_index function calculates the base bdev index
349  * params:
350  * raid_bdev - pointer to raid bdev
351  * raid_io - pointer to parent io context
352  * returns:
353  * base bdev index
354  */
355 static uint8_t
356 get_curr_base_bdev_index(struct raid_bdev *raid_bdev, struct raid_bdev_io *raid_io)
357 {
358 	struct spdk_bdev_io	*bdev_io;
359 	uint64_t		start_strip;
360 
361 	bdev_io = SPDK_CONTAINEROF(raid_io, struct spdk_bdev_io, driver_ctx);
362 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
363 
364 	return (start_strip % raid_bdev->num_base_bdevs);
365 }
366 
367 /*
368  * brief:
369  * raid_bdev_io_submit_fail_process function processes the IO which failed to submit.
370  * It will try to queue the IOs after storing the context to bdev wait queue logic.
371  * params:
372  * bdev_io - pointer to bdev_io
373  * raid_io - pointer to raid bdev io
374  * ret - return code
375  * returns:
376  * none
377  */
378 static void
379 raid_bdev_io_submit_fail_process(struct raid_bdev *raid_bdev, struct spdk_bdev_io *bdev_io,
380 				 struct raid_bdev_io *raid_io, int ret)
381 {
382 	struct raid_bdev_io_channel	*raid_ch;
383 	uint8_t				pd_idx;
384 
385 	if (ret != -ENOMEM) {
386 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
387 	} else {
388 		/* Queue the IO to bdev layer wait queue */
389 		pd_idx = get_curr_base_bdev_index(raid_bdev, raid_io);
390 		raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[pd_idx].bdev;
391 		raid_io->waitq_entry.cb_fn = raid_bdev_waitq_io_process;
392 		raid_io->waitq_entry.cb_arg = raid_io;
393 		raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
394 		if (spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[pd_idx].bdev,
395 					    raid_ch->base_channel[pd_idx],
396 					    &raid_io->waitq_entry) != 0) {
397 			SPDK_ERRLOG("bdev io waitq error, it should not happen\n");
398 			assert(0);
399 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
400 		}
401 	}
402 }
403 
404 /*
405  * brief:
406  * raid_bdev_waitq_io_process function is the callback function
407  * registered by raid bdev module to bdev when bdev_io was unavailable.
408  * params:
409  * ctx - pointer to raid_bdev_io
410  * returns:
411  * none
412  */
413 static void
414 raid_bdev_waitq_io_process(void *ctx)
415 {
416 	struct raid_bdev_io	*raid_io = ctx;
417 	struct spdk_bdev_io	*bdev_io;
418 	struct raid_bdev	*raid_bdev;
419 	int			ret;
420 	uint64_t		start_strip;
421 
422 	bdev_io = SPDK_CONTAINEROF(raid_io, struct spdk_bdev_io, driver_ctx);
423 	/*
424 	 * Try to submit childs of parent bdev io. If failed due to resource
425 	 * crunch then break the loop and don't try to process other queued IOs.
426 	 */
427 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
428 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
429 	ret = raid_bdev_submit_rw_request(bdev_io, start_strip);
430 	if (ret != 0) {
431 		raid_bdev_io_submit_fail_process(raid_bdev, bdev_io, raid_io, ret);
432 	}
433 }
434 
435 /*
436  * brief:
437  * raid_bdev_start_rw_request function is the submit_request function for
438  * read/write requests
439  * params:
440  * ch - pointer to raid bdev io channel
441  * bdev_io - pointer to parent bdev_io on raid bdev device
442  * returns:
443  * none
444  */
445 static void
446 raid_bdev_start_rw_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
447 {
448 	struct raid_bdev_io		*raid_io;
449 	struct raid_bdev		*raid_bdev;
450 	uint64_t			start_strip = 0;
451 	uint64_t			end_strip = 0;
452 	int				ret;
453 
454 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
455 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
456 	raid_io->ch = ch;
457 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
458 	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
459 		    raid_bdev->strip_size_shift;
460 	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
461 		assert(false);
462 		SPDK_ERRLOG("I/O spans strip boundary!\n");
463 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
464 		return;
465 	}
466 	ret = raid_bdev_submit_rw_request(bdev_io, start_strip);
467 	if (ret != 0) {
468 		raid_bdev_io_submit_fail_process(raid_bdev, bdev_io, raid_io, ret);
469 	}
470 }
471 
472 /*
473  * brief:
474  * raid_bdev_base_io_completion is the completion callback for member disk requests
475  * params:
476  * bdev_io - pointer to member disk requested bdev_io
477  * success - true if successful, false if unsuccessful
478  * cb_arg - callback argument (parent raid bdev_io)
479  * returns:
480  * none
481  */
482 static void
483 raid_bdev_base_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
484 {
485 	struct spdk_bdev_io *parent_io = cb_arg;
486 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)parent_io->driver_ctx;
487 
488 	spdk_bdev_free_io(bdev_io);
489 
490 	if (!success) {
491 		raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED;
492 	}
493 
494 	raid_io->base_bdev_io_completed++;
495 	if (raid_io->base_bdev_io_completed == raid_io->base_bdev_io_expected) {
496 		spdk_bdev_io_complete(parent_io, raid_io->base_bdev_io_status);
497 	}
498 }
499 
500 /*
501  * brief:
502  * raid_bdev_base_io_submit_fail_process processes IO requests for member disk
503  * which failed to submit
504  * params:
505  * raid_bdev_io - pointer to raid bdev_io
506  * pd_idx - base_dev index in raid_bdev
507  * cb_fn - callback when the spdk_bdev_io for base_bdev becomes available
508  * ret - return code
509  * returns:
510  * none
511  */
512 static void
513 raid_bdev_base_io_submit_fail_process(struct spdk_bdev_io *raid_bdev_io, uint8_t pd_idx,
514 				      spdk_bdev_io_wait_cb cb_fn, int ret)
515 {
516 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)raid_bdev_io->driver_ctx;
517 	struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
518 	struct raid_bdev *raid_bdev = (struct raid_bdev *)raid_bdev_io->bdev->ctxt;
519 
520 	assert(ret != 0);
521 
522 	if (ret == -ENOMEM) {
523 		raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[pd_idx].bdev;
524 		raid_io->waitq_entry.cb_fn = cb_fn;
525 		raid_io->waitq_entry.cb_arg = raid_bdev_io;
526 		spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[pd_idx].bdev,
527 					raid_ch->base_channel[pd_idx],
528 					&raid_io->waitq_entry);
529 		return;
530 	}
531 
532 	SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
533 	assert(false);
534 	spdk_bdev_io_complete(raid_bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
535 }
536 
537 /*
538  * brief:
539  * _raid_bdev_submit_reset_request_next function submits the next batch of reset requests
540  * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in
541  * which case it will queue it for later submission
542  * params:
543  * bdev_io - pointer to parent bdev_io on raid bdev device
544  * returns:
545  * none
546  */
547 static void
548 _raid_bdev_submit_reset_request_next(void *_bdev_io)
549 {
550 	struct spdk_bdev_io		*bdev_io = _bdev_io;
551 	struct raid_bdev_io		*raid_io;
552 	struct raid_bdev		*raid_bdev;
553 	struct raid_bdev_io_channel	*raid_ch;
554 	int				ret;
555 	uint8_t				i;
556 
557 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
558 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
559 	raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
560 
561 	while (raid_io->base_bdev_io_submitted < raid_bdev->num_base_bdevs) {
562 		i = raid_io->base_bdev_io_submitted;
563 		ret = spdk_bdev_reset(raid_bdev->base_bdev_info[i].desc,
564 				      raid_ch->base_channel[i],
565 				      raid_bdev_base_io_completion, bdev_io);
566 		if (ret == 0) {
567 			raid_io->base_bdev_io_submitted++;
568 		} else {
569 			raid_bdev_base_io_submit_fail_process(bdev_io, i,
570 							      _raid_bdev_submit_reset_request_next, ret);
571 			return;
572 		}
573 	}
574 }
575 
576 /*
577  * brief:
578  * _raid_bdev_submit_reset_request function is the submit_request function for
579  * reset requests
580  * params:
581  * ch - pointer to raid bdev io channel
582  * bdev_io - pointer to parent bdev_io on raid bdev device
583  * returns:
584  * none
585  */
586 static void
587 _raid_bdev_submit_reset_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
588 {
589 	struct raid_bdev_io		*raid_io;
590 	struct raid_bdev		*raid_bdev;
591 
592 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
593 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
594 	raid_io->ch = ch;
595 	raid_io->base_bdev_io_submitted = 0;
596 	raid_io->base_bdev_io_completed = 0;
597 	raid_io->base_bdev_io_expected = raid_bdev->num_base_bdevs;
598 	raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
599 	_raid_bdev_submit_reset_request_next(bdev_io);
600 }
601 
602 static inline void
603 _raid_bdev_get_io_range(struct raid_bdev_io_range *io_range,
604 			uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
605 			uint64_t offset_blocks, uint64_t num_blocks)
606 {
607 	uint64_t	start_strip;
608 	uint64_t	end_strip;
609 
610 	io_range->strip_size = strip_size;
611 
612 	/* The start and end strip index in raid0 bdev scope */
613 	start_strip = offset_blocks >> strip_size_shift;
614 	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
615 	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
616 	io_range->end_strip_in_disk = end_strip / num_base_bdevs;
617 
618 	/* The first strip may have unaligned start LBA offset.
619 	 * The end strip may have unaligned end LBA offset.
620 	 * Strips between them certainly have aligned offset and length to boundaries.
621 	 */
622 	io_range->start_offset_in_strip = offset_blocks % strip_size;
623 	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;
624 
625 	/* The base bdev indexes in which start and end strips are located */
626 	io_range->start_disk = start_strip % num_base_bdevs;
627 	io_range->end_disk = end_strip % num_base_bdevs;
628 
629 	/* Calculate how many base_bdevs are involved in io operation.
630 	 * Number of base bdevs involved is between 1 and num_base_bdevs.
631 	 * It will be 1 if the first strip and last strip are the same one.
632 	 */
633 	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
634 }
635 
636 static inline void
637 _raid_bdev_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
638 			  uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
639 {
640 	uint64_t n_strips_in_disk;
641 	uint64_t start_offset_in_disk;
642 	uint64_t end_offset_in_disk;
643 	uint64_t offset_in_disk;
644 	uint64_t nblocks_in_disk;
645 	uint64_t start_strip_in_disk;
646 	uint64_t end_strip_in_disk;
647 
648 	start_strip_in_disk = io_range->start_strip_in_disk;
649 	if (disk_idx < io_range->start_disk) {
650 		start_strip_in_disk += 1;
651 	}
652 
653 	end_strip_in_disk = io_range->end_strip_in_disk;
654 	if (disk_idx > io_range->end_disk) {
655 		end_strip_in_disk -= 1;
656 	}
657 
658 	assert(end_strip_in_disk >= start_strip_in_disk);
659 	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;
660 
661 	if (disk_idx == io_range->start_disk) {
662 		start_offset_in_disk = io_range->start_offset_in_strip;
663 	} else {
664 		start_offset_in_disk = 0;
665 	}
666 
667 	if (disk_idx == io_range->end_disk) {
668 		end_offset_in_disk = io_range->end_offset_in_strip;
669 	} else {
670 		end_offset_in_disk = io_range->strip_size - 1;
671 	}
672 
673 	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
674 	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
675 			  + end_offset_in_disk - start_offset_in_disk + 1;
676 
677 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID,
678 		      "raid_bdev (strip_size 0x%lx) splits IO to base_bdev (%u) at (0x%lx, 0x%lx).\n",
679 		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);
680 
681 	*_offset_in_disk = offset_in_disk;
682 	*_nblocks_in_disk = nblocks_in_disk;
683 }
684 
685 /*
686  * brief:
687  * _raid_bdev_submit_null_payload_request_next function submits the next batch of
688  * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
689  * it will submit as many as possible unless one base io request fails with -ENOMEM,
690  * in which case it will queue itself for later submission.
691  * params:
692  * bdev_io - pointer to parent bdev_io on raid bdev device
693  * returns:
694  * none
695  */
696 static void
697 _raid_bdev_submit_null_payload_request_next(void *_bdev_io)
698 {
699 	struct spdk_bdev_io		*bdev_io = _bdev_io;
700 	struct raid_bdev_io		*raid_io;
701 	struct raid_bdev		*raid_bdev;
702 	struct raid_bdev_io_channel	*raid_ch;
703 	struct raid_bdev_io_range	io_range;
704 	int				ret;
705 
706 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
707 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
708 	raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
709 
710 	_raid_bdev_get_io_range(&io_range, raid_bdev->num_base_bdevs,
711 				raid_bdev->strip_size, raid_bdev->strip_size_shift,
712 				bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
713 
714 	raid_io->base_bdev_io_expected = io_range.n_disks_involved;
715 
716 	while (raid_io->base_bdev_io_submitted < raid_io->base_bdev_io_expected) {
717 		uint8_t disk_idx;
718 		uint64_t offset_in_disk;
719 		uint64_t nblocks_in_disk;
720 
721 		/* base_bdev is started from start_disk to end_disk.
722 		 * It is possible that index of start_disk is larger than end_disk's.
723 		 */
724 		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
725 
726 		_raid_bdev_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
727 
728 		switch (bdev_io->type) {
729 		case SPDK_BDEV_IO_TYPE_UNMAP:
730 			ret = spdk_bdev_unmap_blocks(raid_bdev->base_bdev_info[disk_idx].desc,
731 						     raid_ch->base_channel[disk_idx],
732 						     offset_in_disk, nblocks_in_disk,
733 						     raid_bdev_base_io_completion, bdev_io);
734 			break;
735 
736 		case SPDK_BDEV_IO_TYPE_FLUSH:
737 			ret = spdk_bdev_flush_blocks(raid_bdev->base_bdev_info[disk_idx].desc,
738 						     raid_ch->base_channel[disk_idx],
739 						     offset_in_disk, nblocks_in_disk,
740 						     raid_bdev_base_io_completion, bdev_io);
741 			break;
742 
743 		default:
744 			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
745 			assert(false);
746 			ret = -EIO;
747 		}
748 
749 		if (ret == 0) {
750 			raid_io->base_bdev_io_submitted++;
751 		} else {
752 			raid_bdev_base_io_submit_fail_process(bdev_io, disk_idx,
753 							      _raid_bdev_submit_null_payload_request_next, ret);
754 			return;
755 		}
756 	}
757 }
758 
759 /*
760  * brief:
761  * _raid_bdev_submit_null_payload_request function is the submit_request function
762  * for io requests with range but without payload, like UNMAP and FLUSH.
763  * params:
764  * ch - pointer to raid bdev io channel
765  * bdev_io - pointer to parent bdev_io on raid bdev device
766  * returns:
767  * none
768  */
769 static void
770 _raid_bdev_submit_null_payload_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
771 {
772 	struct raid_bdev_io		*raid_io;
773 
774 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
775 	raid_io->ch = ch;
776 	raid_io->base_bdev_io_submitted = 0;
777 	raid_io->base_bdev_io_completed = 0;
778 	raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
779 
780 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev: type %d, range (0x%lx, 0x%lx)\n",
781 		      bdev_io->type, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
782 
783 	_raid_bdev_submit_null_payload_request_next(bdev_io);
784 }
785 
786 /*
787  * brief:
788  * Callback function to spdk_bdev_io_get_buf.
789  * params:
790  * ch - pointer to raid bdev io channel
791  * bdev_io - pointer to parent bdev_io on raid bdev device
792  * success - True if buffer is allocated or false otherwise.
793  * returns:
794  * none
795  */
796 static void
797 raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
798 		     bool success)
799 {
800 	if (!success) {
801 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
802 		return;
803 	}
804 
805 	raid_bdev_start_rw_request(ch, bdev_io);
806 }
807 
808 /*
809  * brief:
810  * raid_bdev_submit_request function is the submit_request function pointer of
811  * raid bdev function table. This is used to submit the io on raid_bdev to below
812  * layers.
813  * params:
814  * ch - pointer to raid bdev io channel
815  * bdev_io - pointer to parent bdev_io on raid bdev device
816  * returns:
817  * none
818  */
819 static void
820 raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
821 {
822 	switch (bdev_io->type) {
823 	case SPDK_BDEV_IO_TYPE_READ:
824 		spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb,
825 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
826 		break;
827 	case SPDK_BDEV_IO_TYPE_WRITE:
828 		raid_bdev_start_rw_request(ch, bdev_io);
829 		break;
830 
831 	case SPDK_BDEV_IO_TYPE_RESET:
832 		_raid_bdev_submit_reset_request(ch, bdev_io);
833 		break;
834 
835 	case SPDK_BDEV_IO_TYPE_FLUSH:
836 	case SPDK_BDEV_IO_TYPE_UNMAP:
837 		_raid_bdev_submit_null_payload_request(ch, bdev_io);
838 		break;
839 
840 	default:
841 		SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type);
842 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
843 		break;
844 	}
845 
846 }
847 
848 /*
849  * brief:
850  * _raid_bdev_io_type_supported checks whether io_type is supported in
851  * all base bdev modules of raid bdev module. If anyone among the base_bdevs
852  * doesn't support, the raid device doesn't supports.
853  *
854  * params:
855  * raid_bdev - pointer to raid bdev context
856  * io_type - io type
857  * returns:
858  * true - io_type is supported
859  * false - io_type is not supported
860  */
861 inline static bool
862 _raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type)
863 {
864 	uint8_t i;
865 
866 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
867 		if (raid_bdev->base_bdev_info[i].bdev == NULL) {
868 			assert(false);
869 			continue;
870 		}
871 
872 		if (spdk_bdev_io_type_supported(raid_bdev->base_bdev_info[i].bdev,
873 						io_type) == false) {
874 			return false;
875 		}
876 	}
877 
878 	return true;
879 }
880 
881 /*
882  * brief:
883  * raid_bdev_io_type_supported is the io_supported function for bdev function
884  * table which returns whether the particular io type is supported or not by
885  * raid bdev module
886  * params:
887  * ctx - pointer to raid bdev context
888  * type - io type
889  * returns:
890  * true - io_type is supported
891  * false - io_type is not supported
892  */
893 static bool
894 raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
895 {
896 	switch (io_type) {
897 	case SPDK_BDEV_IO_TYPE_READ:
898 	case SPDK_BDEV_IO_TYPE_WRITE:
899 		return true;
900 
901 	case SPDK_BDEV_IO_TYPE_FLUSH:
902 	case SPDK_BDEV_IO_TYPE_RESET:
903 	case SPDK_BDEV_IO_TYPE_UNMAP:
904 		return _raid_bdev_io_type_supported(ctx, io_type);
905 
906 	default:
907 		return false;
908 	}
909 
910 	return false;
911 }
912 
/*
 * brief:
 * raid_bdev_get_io_channel is the get_io_channel function table pointer for
 * raid bdev. It returns the io channel of the io device identified by the
 * raid_bdev pointer.
 * params:
 * ctxt - pointer to raid_bdev
 * returns:
 * pointer to io channel for raid bdev
 */
static struct spdk_io_channel *
raid_bdev_get_io_channel(void *ctxt)
{
	/* The raid_bdev pointer itself is the io device handle */
	return spdk_get_io_channel(ctxt);
}
929 
930 /*
931  * brief:
932  * raid_bdev_dump_info_json is the function table pointer for raid bdev
933  * params:
934  * ctx - pointer to raid_bdev
935  * w - pointer to json context
936  * returns:
937  * 0 - success
938  * non zero - failure
939  */
940 static int
941 raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
942 {
943 	struct raid_bdev *raid_bdev = ctx;
944 
945 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_dump_config_json\n");
946 	assert(raid_bdev != NULL);
947 
948 	/* Dump the raid bdev configuration related information */
949 	spdk_json_write_named_object_begin(w, "raid");
950 	spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size);
951 	spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb);
952 	spdk_json_write_named_uint32(w, "state", raid_bdev->state);
953 	spdk_json_write_named_uint32(w, "raid_level", raid_bdev->raid_level);
954 	spdk_json_write_named_uint32(w, "destruct_called", raid_bdev->destruct_called);
955 	spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs);
956 	spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered);
957 	spdk_json_write_name(w, "base_bdevs_list");
958 	spdk_json_write_array_begin(w);
959 	for (uint8_t i = 0; i < raid_bdev->num_base_bdevs; i++) {
960 		if (raid_bdev->base_bdev_info[i].bdev) {
961 			spdk_json_write_string(w, raid_bdev->base_bdev_info[i].bdev->name);
962 		} else {
963 			spdk_json_write_null(w);
964 		}
965 	}
966 	spdk_json_write_array_end(w);
967 	spdk_json_write_object_end(w);
968 
969 	return 0;
970 }
971 
972 /*
973  * brief:
974  * raid_bdev_write_config_json is the function table pointer for raid bdev
975  * params:
976  * bdev - pointer to spdk_bdev
977  * w - pointer to json context
978  * returns:
979  * none
980  */
981 static void
982 raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
983 {
984 	struct raid_bdev *raid_bdev = bdev->ctxt;
985 	struct spdk_bdev *base;
986 	uint8_t i;
987 
988 	spdk_json_write_object_begin(w);
989 
990 	spdk_json_write_named_string(w, "method", "construct_raid_bdev");
991 
992 	spdk_json_write_named_object_begin(w, "params");
993 	spdk_json_write_named_string(w, "name", bdev->name);
994 	spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size_kb);
995 	spdk_json_write_named_uint32(w, "raid_level", raid_bdev->raid_level);
996 
997 	spdk_json_write_named_array_begin(w, "base_bdevs");
998 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
999 		base = raid_bdev->base_bdev_info[i].bdev;
1000 		if (base) {
1001 			spdk_json_write_string(w, base->name);
1002 		}
1003 	}
1004 	spdk_json_write_array_end(w);
1005 	spdk_json_write_object_end(w);
1006 
1007 	spdk_json_write_object_end(w);
1008 }
1009 
1010 /* g_raid_bdev_fn_table is the function table for raid bdev */
1011 static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = {
1012 	.destruct		= raid_bdev_destruct,
1013 	.submit_request		= raid_bdev_submit_request,
1014 	.io_type_supported	= raid_bdev_io_type_supported,
1015 	.get_io_channel		= raid_bdev_get_io_channel,
1016 	.dump_info_json		= raid_bdev_dump_info_json,
1017 	.write_config_json	= raid_bdev_write_config_json,
1018 };
1019 
1020 /*
1021  * brief:
1022  * raid_bdev_config_cleanup function is used to free memory for one raid_bdev in configuration
1023  * params:
1024  * raid_cfg - pointer to raid_bdev_config structure
1025  * returns:
1026  * none
1027  */
1028 void
1029 raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg)
1030 {
1031 	uint8_t i;
1032 
1033 	TAILQ_REMOVE(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
1034 	g_raid_config.total_raid_bdev--;
1035 
1036 	if (raid_cfg->base_bdev) {
1037 		for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1038 			free(raid_cfg->base_bdev[i].name);
1039 		}
1040 		free(raid_cfg->base_bdev);
1041 	}
1042 	free(raid_cfg->name);
1043 	free(raid_cfg);
1044 }
1045 
1046 /*
1047  * brief:
1048  * raid_bdev_free is the raid bdev function table function pointer. This is
1049  * called on bdev free path
1050  * params:
1051  * none
1052  * returns:
1053  * none
1054  */
1055 static void
1056 raid_bdev_free(void)
1057 {
1058 	struct raid_bdev_config *raid_cfg, *tmp;
1059 
1060 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_free\n");
1061 	TAILQ_FOREACH_SAFE(raid_cfg, &g_raid_config.raid_bdev_config_head, link, tmp) {
1062 		raid_bdev_config_cleanup(raid_cfg);
1063 	}
1064 }
1065 
1066 /* brief
1067  * raid_bdev_config_find_by_name is a helper function to find raid bdev config
1068  * by name as key.
1069  *
1070  * params:
1071  * raid_name - name for raid bdev.
1072  */
1073 struct raid_bdev_config *
1074 raid_bdev_config_find_by_name(const char *raid_name)
1075 {
1076 	struct raid_bdev_config *raid_cfg;
1077 
1078 	TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
1079 		if (!strcmp(raid_cfg->name, raid_name)) {
1080 			return raid_cfg;
1081 		}
1082 	}
1083 
1084 	return raid_cfg;
1085 }
1086 
1087 /*
1088  * brief
1089  * raid_bdev_config_add function adds config for newly created raid bdev.
1090  *
1091  * params:
1092  * raid_name - name for raid bdev.
1093  * strip_size - strip size in KB
1094  * num_base_bdevs - number of base bdevs.
1095  * raid_level - raid level, only raid level 0 is supported.
1096  * _raid_cfg - Pointer to newly added configuration
1097  */
1098 int
1099 raid_bdev_config_add(const char *raid_name, uint32_t strip_size, uint8_t num_base_bdevs,
1100 		     uint8_t raid_level, struct raid_bdev_config **_raid_cfg)
1101 {
1102 	struct raid_bdev_config *raid_cfg;
1103 
1104 	raid_cfg = raid_bdev_config_find_by_name(raid_name);
1105 	if (raid_cfg != NULL) {
1106 		SPDK_ERRLOG("Duplicate raid bdev name found in config file %s\n",
1107 			    raid_name);
1108 		return -EEXIST;
1109 	}
1110 
1111 	if (spdk_u32_is_pow2(strip_size) == false) {
1112 		SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size);
1113 		return -EINVAL;
1114 	}
1115 
1116 	if (num_base_bdevs == 0) {
1117 		SPDK_ERRLOG("Invalid base device count %u\n", num_base_bdevs);
1118 		return -EINVAL;
1119 	}
1120 
1121 	if (raid_level != 0) {
1122 		SPDK_ERRLOG("invalid raid level %u, only raid level 0 is supported\n",
1123 			    raid_level);
1124 		return -EINVAL;
1125 	}
1126 
1127 	raid_cfg = calloc(1, sizeof(*raid_cfg));
1128 	if (raid_cfg == NULL) {
1129 		SPDK_ERRLOG("unable to allocate memory\n");
1130 		return -ENOMEM;
1131 	}
1132 
1133 	raid_cfg->name = strdup(raid_name);
1134 	if (!raid_cfg->name) {
1135 		free(raid_cfg);
1136 		SPDK_ERRLOG("unable to allocate memory\n");
1137 		return -ENOMEM;
1138 	}
1139 	raid_cfg->strip_size = strip_size;
1140 	raid_cfg->num_base_bdevs = num_base_bdevs;
1141 	raid_cfg->raid_level = raid_level;
1142 
1143 	raid_cfg->base_bdev = calloc(num_base_bdevs, sizeof(*raid_cfg->base_bdev));
1144 	if (raid_cfg->base_bdev == NULL) {
1145 		free(raid_cfg->name);
1146 		free(raid_cfg);
1147 		SPDK_ERRLOG("unable to allocate memory\n");
1148 		return -ENOMEM;
1149 	}
1150 
1151 	TAILQ_INSERT_TAIL(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
1152 	g_raid_config.total_raid_bdev++;
1153 
1154 	*_raid_cfg = raid_cfg;
1155 	return 0;
1156 }
1157 
1158 /*
1159  * brief:
1160  * raid_bdev_config_add_base_bdev function add base bdev to raid bdev config.
1161  *
1162  * params:
1163  * raid_cfg - pointer to raid bdev configuration
1164  * base_bdev_name - name of base bdev
1165  * slot - Position to add base bdev
1166  */
1167 int
1168 raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, const char *base_bdev_name,
1169 			       uint8_t slot)
1170 {
1171 	uint8_t i;
1172 	struct raid_bdev_config *tmp;
1173 
1174 	if (slot >= raid_cfg->num_base_bdevs) {
1175 		return -EINVAL;
1176 	}
1177 
1178 	TAILQ_FOREACH(tmp, &g_raid_config.raid_bdev_config_head, link) {
1179 		for (i = 0; i < tmp->num_base_bdevs; i++) {
1180 			if (tmp->base_bdev[i].name != NULL) {
1181 				if (!strcmp(tmp->base_bdev[i].name, base_bdev_name)) {
1182 					SPDK_ERRLOG("duplicate base bdev name %s mentioned\n",
1183 						    base_bdev_name);
1184 					return -EEXIST;
1185 				}
1186 			}
1187 		}
1188 	}
1189 
1190 	raid_cfg->base_bdev[slot].name = strdup(base_bdev_name);
1191 	if (raid_cfg->base_bdev[slot].name == NULL) {
1192 		SPDK_ERRLOG("unable to allocate memory\n");
1193 		return -ENOMEM;
1194 	}
1195 
1196 	return 0;
1197 }
1198 /*
1199  * brief:
1200  * raid_bdev_parse_raid is used to parse the raid bdev from config file based on
1201  * pre-defined raid bdev format in config file.
1202  * Format of config file:
1203  *   [RAID1]
1204  *   Name raid1
1205  *   StripSize 64
1206  *   NumDevices 2
1207  *   RaidLevel 0
1208  *   Devices Nvme0n1 Nvme1n1
1209  *
1210  *   [RAID2]
1211  *   Name raid2
1212  *   StripSize 64
1213  *   NumDevices 3
1214  *   RaidLevel 0
1215  *   Devices Nvme2n1 Nvme3n1 Nvme4n1
1216  *
1217  * params:
1218  * conf_section - pointer to config section
1219  * returns:
1220  * 0 - success
1221  * non zero - failure
1222  */
1223 static int
1224 raid_bdev_parse_raid(struct spdk_conf_section *conf_section)
1225 {
1226 	const char *raid_name;
1227 	uint32_t strip_size;
1228 	uint8_t num_base_bdevs, raid_level;
1229 	const char *base_bdev_name;
1230 	struct raid_bdev_config *raid_cfg;
1231 	int rc, i, val;
1232 
1233 	raid_name = spdk_conf_section_get_val(conf_section, "Name");
1234 	if (raid_name == NULL) {
1235 		SPDK_ERRLOG("raid_name is null\n");
1236 		return -EINVAL;
1237 	}
1238 
1239 	val = spdk_conf_section_get_intval(conf_section, "StripSize");
1240 	if (val < 0) {
1241 		return -EINVAL;
1242 	}
1243 	strip_size = val;
1244 
1245 	val = spdk_conf_section_get_intval(conf_section, "NumDevices");
1246 	if (val < 0) {
1247 		return -EINVAL;
1248 	}
1249 	num_base_bdevs = val;
1250 
1251 	val = spdk_conf_section_get_intval(conf_section, "RaidLevel");
1252 	if (val < 0) {
1253 		return -EINVAL;
1254 	}
1255 	raid_level = val;
1256 
1257 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "%s %" PRIu32 " %u %u\n",
1258 		      raid_name, strip_size, num_base_bdevs, raid_level);
1259 
1260 	rc = raid_bdev_config_add(raid_name, strip_size, num_base_bdevs, raid_level,
1261 				  &raid_cfg);
1262 	if (rc != 0) {
1263 		SPDK_ERRLOG("Failed to add raid bdev config\n");
1264 		return rc;
1265 	}
1266 
1267 	for (i = 0; true; i++) {
1268 		base_bdev_name = spdk_conf_section_get_nmval(conf_section, "Devices", 0, i);
1269 		if (base_bdev_name == NULL) {
1270 			break;
1271 		}
1272 		if (i >= num_base_bdevs) {
1273 			raid_bdev_config_cleanup(raid_cfg);
1274 			SPDK_ERRLOG("Number of devices mentioned is more than count\n");
1275 			return -EINVAL;
1276 		}
1277 
1278 		rc = raid_bdev_config_add_base_bdev(raid_cfg, base_bdev_name, i);
1279 		if (rc != 0) {
1280 			raid_bdev_config_cleanup(raid_cfg);
1281 			SPDK_ERRLOG("Failed to add base bdev to raid bdev config\n");
1282 			return rc;
1283 		}
1284 	}
1285 
1286 	if (i != raid_cfg->num_base_bdevs) {
1287 		raid_bdev_config_cleanup(raid_cfg);
1288 		SPDK_ERRLOG("Number of devices mentioned is less than count\n");
1289 		return -EINVAL;
1290 	}
1291 
1292 	rc = raid_bdev_create(raid_cfg);
1293 	if (rc != 0) {
1294 		raid_bdev_config_cleanup(raid_cfg);
1295 		SPDK_ERRLOG("Failed to create raid bdev\n");
1296 		return rc;
1297 	}
1298 
1299 	rc = raid_bdev_add_base_devices(raid_cfg);
1300 	if (rc != 0) {
1301 		SPDK_ERRLOG("Failed to add any base bdev to raid bdev\n");
1302 		/* Config is not removed in this case. */
1303 	}
1304 
1305 	return 0;
1306 }
1307 
/*
 * brief:
 * raid_bdev_parse_config walks all sections of the configuration and parses
 * every section whose name starts with "RAID". Parsing stops at the first
 * section that fails.
 * params:
 * none
 * returns:
 * 0 - success
 * non zero - failure
 */
static int
raid_bdev_parse_config(void)
{
	struct spdk_conf_section *section;
	int rc;

	for (section = spdk_conf_first_section(NULL); section != NULL;
	     section = spdk_conf_next_section(section)) {
		if (!spdk_conf_section_match_prefix(section, "RAID")) {
			continue;
		}

		rc = raid_bdev_parse_raid(section);
		if (rc < 0) {
			SPDK_ERRLOG("Unable to parse raid bdev section\n");
			return rc;
		}
	}

	return 0;
}
1338 
1339 /*
1340  * brief:
1341  * raid_bdev_fini_start is called when bdev layer is starting the
1342  * shutdown process
1343  * params:
1344  * none
1345  * returns:
1346  * none
1347  */
1348 static void
1349 raid_bdev_fini_start(void)
1350 {
1351 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_fini_start\n");
1352 	g_shutdown_started = true;
1353 }
1354 
1355 /*
1356  * brief:
1357  * raid_bdev_exit is called on raid bdev module exit time by bdev layer
1358  * params:
1359  * none
1360  * returns:
1361  * none
1362  */
1363 static void
1364 raid_bdev_exit(void)
1365 {
1366 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_exit\n");
1367 	raid_bdev_free();
1368 }
1369 
1370 /*
1371  * brief:
1372  * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid
1373  * module
1374  * params:
1375  * none
1376  * returns:
1377  * size of spdk_bdev_io context for raid
1378  */
1379 static int
1380 raid_bdev_get_ctx_size(void)
1381 {
1382 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_get_ctx_size\n");
1383 	return sizeof(struct raid_bdev_io);
1384 }
1385 
1386 /*
1387  * brief:
1388  * raid_bdev_get_running_config is used to get the configuration options.
1389  *
1390  * params:
1391  * fp - The pointer to a file that will be written to the configuration options.
1392  * returns:
1393  * none
1394  */
1395 static void
1396 raid_bdev_get_running_config(FILE *fp)
1397 {
1398 	struct raid_bdev *raid_bdev;
1399 	struct spdk_bdev *base;
1400 	int index = 1;
1401 	uint8_t i;
1402 
1403 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configured_list, state_link) {
1404 		fprintf(fp,
1405 			"\n"
1406 			"[RAID%d]\n"
1407 			"  Name %s\n"
1408 			"  StripSize %" PRIu32 "\n"
1409 			"  NumDevices %u\n"
1410 			"  RaidLevel %hhu\n",
1411 			index, raid_bdev->bdev.name, raid_bdev->strip_size_kb,
1412 			raid_bdev->num_base_bdevs, raid_bdev->raid_level);
1413 		fprintf(fp,
1414 			"  Devices ");
1415 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1416 			base = raid_bdev->base_bdev_info[i].bdev;
1417 			if (base) {
1418 				fprintf(fp,
1419 					"%s ",
1420 					base->name);
1421 			}
1422 		}
1423 		fprintf(fp,
1424 			"\n");
1425 		index++;
1426 	}
1427 }
1428 
1429 /*
1430  * brief:
1431  * raid_bdev_can_claim_bdev is the function to check if this base_bdev can be
1432  * claimed by raid bdev or not.
1433  * params:
1434  * bdev_name - represents base bdev name
1435  * _raid_cfg - pointer to raid bdev config parsed from config file
1436  * base_bdev_slot - if bdev can be claimed, it represents the base_bdev correct
1437  * slot. This field is only valid if return value of this function is true
1438  * returns:
1439  * true - if bdev can be claimed
1440  * false - if bdev can't be claimed
1441  */
1442 static bool
1443 raid_bdev_can_claim_bdev(const char *bdev_name, struct raid_bdev_config **_raid_cfg,
1444 			 uint8_t *base_bdev_slot)
1445 {
1446 	struct raid_bdev_config *raid_cfg;
1447 	uint8_t i;
1448 
1449 	TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
1450 		for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1451 			/*
1452 			 * Check if the base bdev name is part of raid bdev configuration.
1453 			 * If match is found then return true and the slot information where
1454 			 * this base bdev should be inserted in raid bdev
1455 			 */
1456 			if (!strcmp(bdev_name, raid_cfg->base_bdev[i].name)) {
1457 				*_raid_cfg = raid_cfg;
1458 				*base_bdev_slot = i;
1459 				return true;
1460 			}
1461 		}
1462 	}
1463 
1464 	return false;
1465 }
1466 
1467 
1468 static struct spdk_bdev_module g_raid_if = {
1469 	.name = "raid",
1470 	.module_init = raid_bdev_init,
1471 	.fini_start = raid_bdev_fini_start,
1472 	.module_fini = raid_bdev_exit,
1473 	.get_ctx_size = raid_bdev_get_ctx_size,
1474 	.examine_config = raid_bdev_examine,
1475 	.config_text = raid_bdev_get_running_config,
1476 	.async_init = false,
1477 	.async_fini = false,
1478 };
1479 SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if)
1480 
1481 /*
1482  * brief:
1483  * raid_bdev_init is the initialization function for raid bdev module
1484  * params:
1485  * none
1486  * returns:
1487  * 0 - success
1488  * non zero - failure
1489  */
1490 static int
1491 raid_bdev_init(void)
1492 {
1493 	int ret;
1494 
1495 	/* Parse config file for raids */
1496 	ret = raid_bdev_parse_config();
1497 	if (ret < 0) {
1498 		SPDK_ERRLOG("raid bdev init failed parsing\n");
1499 		raid_bdev_free();
1500 		return ret;
1501 	}
1502 
1503 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_init completed successfully\n");
1504 
1505 	return 0;
1506 }
1507 
1508 /*
1509  * brief:
1510  * raid_bdev_create allocates raid bdev based on passed configuration
1511  * params:
1512  * raid_cfg - configuration of raid bdev
1513  * returns:
1514  * 0 - success
1515  * non zero - failure
1516  */
1517 int
1518 raid_bdev_create(struct raid_bdev_config *raid_cfg)
1519 {
1520 	struct raid_bdev *raid_bdev;
1521 	struct spdk_bdev *raid_bdev_gen;
1522 
1523 	raid_bdev = calloc(1, sizeof(*raid_bdev));
1524 	if (!raid_bdev) {
1525 		SPDK_ERRLOG("Unable to allocate memory for raid bdev\n");
1526 		return -ENOMEM;
1527 	}
1528 
1529 	assert(raid_cfg->num_base_bdevs != 0);
1530 	raid_bdev->num_base_bdevs = raid_cfg->num_base_bdevs;
1531 	raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs,
1532 					   sizeof(struct raid_base_bdev_info));
1533 	if (!raid_bdev->base_bdev_info) {
1534 		SPDK_ERRLOG("Unable able to allocate base bdev info\n");
1535 		free(raid_bdev);
1536 		return -ENOMEM;
1537 	}
1538 
1539 	/* strip_size_kb is from the rpc param.  strip_size is in blocks and used
1540 	 * intnerally and set later.
1541 	 */
1542 	raid_bdev->strip_size = 0;
1543 	raid_bdev->strip_size_kb = raid_cfg->strip_size;
1544 	raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1545 	raid_bdev->config = raid_cfg;
1546 
1547 	raid_bdev_gen = &raid_bdev->bdev;
1548 
1549 	raid_bdev_gen->name = strdup(raid_cfg->name);
1550 	if (!raid_bdev_gen->name) {
1551 		SPDK_ERRLOG("Unable to allocate name for raid\n");
1552 		free(raid_bdev->base_bdev_info);
1553 		free(raid_bdev);
1554 		return -ENOMEM;
1555 	}
1556 
1557 	raid_bdev_gen->product_name = "Raid Volume";
1558 	raid_bdev_gen->ctxt = raid_bdev;
1559 	raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
1560 	raid_bdev_gen->module = &g_raid_if;
1561 	raid_bdev_gen->write_cache = 0;
1562 
1563 	TAILQ_INSERT_TAIL(&g_raid_bdev_configuring_list, raid_bdev, state_link);
1564 	TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link);
1565 
1566 	raid_cfg->raid_bdev = raid_bdev;
1567 
1568 	return 0;
1569 }
1570 
1571 /*
1572  * brief
1573  * raid_bdev_alloc_base_bdev_resource allocates resource of base bdev.
1574  * params:
1575  * raid_bdev - pointer to raid bdev
1576  * bdev - pointer to base bdev
1577  * base_bdev_slot - position to add base bdev
1578  * returns:
1579  * 0 - success
1580  * non zero - failure
1581  */
1582 static int
1583 raid_bdev_alloc_base_bdev_resource(struct raid_bdev *raid_bdev, struct spdk_bdev *bdev,
1584 				   uint8_t base_bdev_slot)
1585 {
1586 	struct spdk_bdev_desc *desc;
1587 	int rc;
1588 
1589 	rc = spdk_bdev_open(bdev, true, raid_bdev_remove_base_bdev, bdev, &desc);
1590 	if (rc != 0) {
1591 		SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", bdev->name);
1592 		return rc;
1593 	}
1594 
1595 	rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if);
1596 	if (rc != 0) {
1597 		SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n");
1598 		spdk_bdev_close(desc);
1599 		return rc;
1600 	}
1601 
1602 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s is claimed\n", bdev->name);
1603 
1604 	assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE);
1605 	assert(base_bdev_slot < raid_bdev->num_base_bdevs);
1606 
1607 	raid_bdev->base_bdev_info[base_bdev_slot].bdev = bdev;
1608 	raid_bdev->base_bdev_info[base_bdev_slot].desc = desc;
1609 	raid_bdev->num_base_bdevs_discovered++;
1610 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1611 
1612 	return 0;
1613 }
1614 
1615 /*
1616  * brief:
1617  * If raid bdev config is complete, then only register the raid bdev to
1618  * bdev layer and remove this raid bdev from configuring list and
1619  * insert the raid bdev to configured list
1620  * params:
1621  * raid_bdev - pointer to raid bdev
1622  * returns:
1623  * 0 - success
1624  * non zero - failure
1625  */
1626 static int
1627 raid_bdev_configure(struct raid_bdev *raid_bdev)
1628 {
1629 	uint32_t		blocklen;
1630 	uint64_t		min_blockcnt;
1631 	struct spdk_bdev	*raid_bdev_gen;
1632 	int rc = 0;
1633 
1634 	blocklen = raid_bdev->base_bdev_info[0].bdev->blocklen;
1635 	min_blockcnt = raid_bdev->base_bdev_info[0].bdev->blockcnt;
1636 	for (uint8_t i = 1; i < raid_bdev->num_base_bdevs; i++) {
1637 		/* Calculate minimum block count from all base bdevs */
1638 		if (raid_bdev->base_bdev_info[i].bdev->blockcnt < min_blockcnt) {
1639 			min_blockcnt = raid_bdev->base_bdev_info[i].bdev->blockcnt;
1640 		}
1641 
1642 		/* Check blocklen for all base bdevs that it should be same */
1643 		if (blocklen != raid_bdev->base_bdev_info[i].bdev->blocklen) {
1644 			/*
1645 			 * Assumption is that all the base bdevs for any raid bdev should
1646 			 * have same blocklen
1647 			 */
1648 			SPDK_ERRLOG("Blocklen of various bdevs not matching\n");
1649 			return -EINVAL;
1650 		}
1651 	}
1652 
1653 	/* The strip_size_kb is read in from user in KB. Convert to blocks here for
1654 	 * internal use.
1655 	 */
1656 	raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen;
1657 	raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
1658 	raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
1659 
1660 	raid_bdev_gen = &raid_bdev->bdev;
1661 	raid_bdev_gen->blocklen = blocklen;
1662 	if (raid_bdev->num_base_bdevs > 1) {
1663 		raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
1664 		raid_bdev_gen->split_on_optimal_io_boundary = true;
1665 	} else {
1666 		/* Do not need to split reads/writes on single bdev RAID modules. */
1667 		raid_bdev_gen->optimal_io_boundary = 0;
1668 		raid_bdev_gen->split_on_optimal_io_boundary = false;
1669 	}
1670 
1671 	/*
1672 	 * RAID bdev logic is for striping so take the minimum block count based
1673 	 * approach where total block count of raid bdev is the number of base
1674 	 * bdev times the minimum block count of any base bdev
1675 	 */
1676 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "min blockcount %lu,  numbasedev %u, strip size shift %u\n",
1677 		      min_blockcnt,
1678 		      raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
1679 	raid_bdev_gen->blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
1680 				   raid_bdev->strip_size_shift)  * raid_bdev->num_base_bdevs;
1681 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "io device register %p\n", raid_bdev);
1682 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "blockcnt %lu, blocklen %u\n", raid_bdev_gen->blockcnt,
1683 		      raid_bdev_gen->blocklen);
1684 	if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1685 		raid_bdev->state = RAID_BDEV_STATE_ONLINE;
1686 		spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb,
1687 					sizeof(struct raid_bdev_io_channel),
1688 					raid_bdev->bdev.name);
1689 		rc = spdk_bdev_register(raid_bdev_gen);
1690 		if (rc != 0) {
1691 			SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n");
1692 			spdk_io_device_unregister(raid_bdev, NULL);
1693 			raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1694 			return rc;
1695 		}
1696 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev generic %p\n", raid_bdev_gen);
1697 		TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
1698 		TAILQ_INSERT_TAIL(&g_raid_bdev_configured_list, raid_bdev, state_link);
1699 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev is created with name %s, raid_bdev %p\n",
1700 			      raid_bdev_gen->name, raid_bdev);
1701 	}
1702 
1703 	return 0;
1704 }
1705 
1706 /*
1707  * brief:
1708  * If raid bdev is online and registered, change the bdev state to
1709  * configuring and unregister this raid device. Queue this raid device
1710  * in configuring list
1711  * params:
1712  * raid_bdev - pointer to raid bdev
1713  * cb_fn - callback function
1714  * cb_arg - argument to callback function
1715  * returns:
1716  * none
1717  */
1718 static void
1719 raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn,
1720 		      void *cb_arg)
1721 {
1722 	if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) {
1723 		if (cb_fn) {
1724 			cb_fn(cb_arg, 0);
1725 		}
1726 		return;
1727 	}
1728 
1729 	assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered);
1730 	TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
1731 	raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
1732 	assert(raid_bdev->num_base_bdevs_discovered);
1733 	TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
1734 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev state chaning from online to offline\n");
1735 
1736 	spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg);
1737 }
1738 
1739 /*
1740  * brief:
1741  * raid_bdev_find_by_base_bdev function finds the raid bdev which has
1742  *  claimed the base bdev.
1743  * params:
1744  * base_bdev - pointer to base bdev pointer
1745  * _raid_bdev - Referenct to pointer to raid bdev
1746  * _base_bdev_slot - Reference to the slot of the base bdev.
1747  * returns:
1748  * true - if the raid bdev is found.
1749  * false - if the raid bdev is not found.
1750  */
1751 static bool
1752 raid_bdev_find_by_base_bdev(struct spdk_bdev *base_bdev, struct raid_bdev **_raid_bdev,
1753 			    uint8_t *_base_bdev_slot)
1754 {
1755 	struct raid_bdev	*raid_bdev;
1756 	uint8_t			i;
1757 
1758 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
1759 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1760 			if (raid_bdev->base_bdev_info[i].bdev == base_bdev) {
1761 				*_raid_bdev = raid_bdev;
1762 				*_base_bdev_slot = i;
1763 				return true;
1764 			}
1765 		}
1766 	}
1767 
1768 	return false;
1769 }
1770 
1771 /*
1772  * brief:
1773  * raid_bdev_remove_base_bdev function is called by below layers when base_bdev
1774  * is removed. This function checks if this base bdev is part of any raid bdev
1775  * or not. If yes, it takes necessary action on that particular raid bdev.
1776  * params:
1777  * ctx - pointer to base bdev pointer which got removed
1778  * returns:
1779  * none
1780  */
1781 static void
1782 raid_bdev_remove_base_bdev(void *ctx)
1783 {
1784 	struct spdk_bdev	*base_bdev = ctx;
1785 	struct raid_bdev	*raid_bdev = NULL;
1786 	uint8_t			base_bdev_slot = 0;
1787 
1788 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_bdev\n");
1789 
1790 	/* Find the raid_bdev which has claimed this base_bdev */
1791 	if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_bdev_slot)) {
1792 		SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name);
1793 		return;
1794 	}
1795 
1796 	assert(raid_bdev->base_bdev_info[base_bdev_slot].desc);
1797 	raid_bdev->base_bdev_info[base_bdev_slot].remove_scheduled = true;
1798 
1799 	if (raid_bdev->destruct_called == true ||
1800 	    raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1801 		/*
1802 		 * As raid bdev is not registered yet or already unregistered,
1803 		 * so cleanup should be done here itself.
1804 		 */
1805 		raid_bdev_free_base_bdev_resource(raid_bdev, base_bdev_slot);
1806 		if (raid_bdev->num_base_bdevs_discovered == 0) {
1807 			/* There is no base bdev for this raid, so free the raid device. */
1808 			raid_bdev_cleanup(raid_bdev);
1809 			return;
1810 		}
1811 	}
1812 
1813 	raid_bdev_deconfigure(raid_bdev, NULL, NULL);
1814 }
1815 
1816 /*
1817  * brief:
1818  * Remove base bdevs from the raid bdev one by one.  Skip any base bdev which
1819  *  doesn't exist.
1820  * params:
1821  * raid_cfg - pointer to raid bdev config.
1822  * cb_fn - callback function
1823  * cb_ctx - argument to callback function
1824  */
1825 void
1826 raid_bdev_remove_base_devices(struct raid_bdev_config *raid_cfg,
1827 			      raid_bdev_destruct_cb cb_fn, void *cb_arg)
1828 {
1829 	struct raid_bdev		*raid_bdev;
1830 	struct raid_base_bdev_info	*info;
1831 	uint8_t				i;
1832 
1833 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_devices\n");
1834 
1835 	raid_bdev = raid_cfg->raid_bdev;
1836 	if (raid_bdev == NULL) {
1837 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev %s doesn't exist now\n", raid_cfg->name);
1838 		if (cb_fn) {
1839 			cb_fn(cb_arg, 0);
1840 		}
1841 		return;
1842 	}
1843 
1844 	if (raid_bdev->destroy_started) {
1845 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "destroying raid bdev %s is already started\n",
1846 			      raid_cfg->name);
1847 		if (cb_fn) {
1848 			cb_fn(cb_arg, -EALREADY);
1849 		}
1850 		return;
1851 	}
1852 
1853 	raid_bdev->destroy_started = true;
1854 
1855 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1856 		info = &raid_bdev->base_bdev_info[i];
1857 
1858 		if (info->bdev == NULL) {
1859 			continue;
1860 		}
1861 
1862 		assert(info->desc);
1863 		info->remove_scheduled = true;
1864 
1865 		if (raid_bdev->destruct_called == true ||
1866 		    raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1867 			/*
1868 			 * As raid bdev is not registered yet or already unregistered,
1869 			 * so cleanup should be done here itself.
1870 			 */
1871 			raid_bdev_free_base_bdev_resource(raid_bdev, i);
1872 			if (raid_bdev->num_base_bdevs_discovered == 0) {
1873 				/* There is no base bdev for this raid, so free the raid device. */
1874 				raid_bdev_cleanup(raid_bdev);
1875 				if (cb_fn) {
1876 					cb_fn(cb_arg, 0);
1877 				}
1878 				return;
1879 			}
1880 		}
1881 	}
1882 
1883 	raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg);
1884 }
1885 
1886 /*
1887  * brief:
1888  * raid_bdev_add_base_device function is the actual function which either adds
1889  * the nvme base device to existing raid bdev or create a new raid bdev. It also claims
1890  * the base device and keep the open descriptor.
1891  * params:
1892  * raid_cfg - pointer to raid bdev config
1893  * bdev - pointer to base bdev
1894  * base_bdev_slot - position to add base bdev
1895  * returns:
1896  * 0 - success
1897  * non zero - failure
1898  */
1899 static int
1900 raid_bdev_add_base_device(struct raid_bdev_config *raid_cfg, struct spdk_bdev *bdev,
1901 			  uint8_t base_bdev_slot)
1902 {
1903 	struct raid_bdev	*raid_bdev;
1904 	int			rc;
1905 
1906 	raid_bdev = raid_cfg->raid_bdev;
1907 	if (!raid_bdev) {
1908 		SPDK_ERRLOG("Raid bdev '%s' is not created yet\n", raid_cfg->name);
1909 		return -ENODEV;
1910 	}
1911 
1912 	rc = raid_bdev_alloc_base_bdev_resource(raid_bdev, bdev, base_bdev_slot);
1913 	if (rc != 0) {
1914 		SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", bdev->name);
1915 		return rc;
1916 	}
1917 
1918 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1919 
1920 	if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) {
1921 		rc = raid_bdev_configure(raid_bdev);
1922 		if (rc != 0) {
1923 			SPDK_ERRLOG("Failed to configure raid bdev\n");
1924 			return rc;
1925 		}
1926 	}
1927 
1928 	return 0;
1929 }
1930 
1931 /*
1932  * brief:
1933  * Add base bdevs to the raid bdev one by one.  Skip any base bdev which doesn't
1934  *  exist or fails to add. If all base bdevs are successfully added, the raid bdev
1935  *  moves to the configured state and becomes available. Otherwise, the raid bdev
1936  *  stays at the configuring state with added base bdevs.
1937  * params:
1938  * raid_cfg - pointer to raid bdev config
1939  * returns:
1940  * 0 - The raid bdev moves to the configured state or stays at the configuring
1941  *     state with added base bdevs due to any nonexistent base bdev.
1942  * non zero - Failed to add any base bdev and stays at the configuring state with
1943  *            added base bdevs.
1944  */
1945 int
1946 raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg)
1947 {
1948 	struct spdk_bdev	*base_bdev;
1949 	uint8_t			i;
1950 	int			rc = 0, _rc;
1951 
1952 	for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1953 		base_bdev = spdk_bdev_get_by_name(raid_cfg->base_bdev[i].name);
1954 		if (base_bdev == NULL) {
1955 			SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "base bdev %s doesn't exist now\n",
1956 				      raid_cfg->base_bdev[i].name);
1957 			continue;
1958 		}
1959 
1960 		_rc = raid_bdev_add_base_device(raid_cfg, base_bdev, i);
1961 		if (_rc != 0) {
1962 			SPDK_ERRLOG("Failed to add base bdev %s to RAID bdev %s: %s\n",
1963 				    raid_cfg->base_bdev[i].name, raid_cfg->name,
1964 				    spdk_strerror(-_rc));
1965 			if (rc == 0) {
1966 				rc = _rc;
1967 			}
1968 		}
1969 	}
1970 
1971 	return rc;
1972 }
1973 
1974 /*
1975  * brief:
1976  * raid_bdev_examine function is the examine function call by the below layers
1977  * like bdev_nvme layer. This function will check if this base bdev can be
1978  * claimed by this raid bdev or not.
1979  * params:
1980  * bdev - pointer to base bdev
1981  * returns:
1982  * none
1983  */
1984 static void
1985 raid_bdev_examine(struct spdk_bdev *bdev)
1986 {
1987 	struct raid_bdev_config	*raid_cfg;
1988 	uint8_t			base_bdev_slot;
1989 
1990 	if (raid_bdev_can_claim_bdev(bdev->name, &raid_cfg, &base_bdev_slot)) {
1991 		raid_bdev_add_base_device(raid_cfg, bdev, base_bdev_slot);
1992 	} else {
1993 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s can't be claimed\n",
1994 			      bdev->name);
1995 	}
1996 
1997 	spdk_bdev_module_examine_done(&g_raid_if);
1998 }
1999 
2000 /* Register the "bdev_raid" log component; SPDK_LOG_BDEV_RAID is the flag passed to SPDK_DEBUGLOG() in this file. */
2001 SPDK_LOG_REGISTER_COMPONENT("bdev_raid", SPDK_LOG_BDEV_RAID)
2002