xref: /spdk/module/bdev/raid/bdev_raid.c (revision 6b6dfea6c704a049e553024aa7e44ae916948e20)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_raid.h"
35 #include "spdk/env.h"
36 #include "spdk/io_channel.h"
37 #include "spdk/conf.h"
38 #include "spdk_internal/log.h"
39 #include "spdk/string.h"
40 #include "spdk/util.h"
41 #include "spdk/json.h"
42 #include "spdk/string.h"
43 
44 static bool g_shutdown_started = false;
45 
46 /* raid bdev config as read from config file */
47 struct raid_config	g_raid_config = {
48 	.raid_bdev_config_head = TAILQ_HEAD_INITIALIZER(g_raid_config.raid_bdev_config_head),
49 };
50 
51 /*
52  * List of raid bdev in configured list, these raid bdevs are registered with
53  * bdev layer
54  */
55 struct raid_configured_tailq	g_raid_bdev_configured_list = TAILQ_HEAD_INITIALIZER(
56 			g_raid_bdev_configured_list);
57 
58 /* List of raid bdev in configuring list */
59 struct raid_configuring_tailq	g_raid_bdev_configuring_list = TAILQ_HEAD_INITIALIZER(
60 			g_raid_bdev_configuring_list);
61 
62 /* List of all raid bdevs */
63 struct raid_all_tailq		g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list);
64 
65 /* List of all raid bdevs that are offline */
66 struct raid_offline_tailq	g_raid_bdev_offline_list = TAILQ_HEAD_INITIALIZER(
67 			g_raid_bdev_offline_list);
68 
69 /* Function declarations */
70 static void	raid_bdev_examine(struct spdk_bdev *bdev);
71 static int	raid_bdev_init(void);
72 static void	raid0_waitq_io_process(void *ctx);
73 static void	raid_bdev_deconfigure(struct raid_bdev *raid_bdev,
74 				      raid_bdev_destruct_cb cb_fn, void *cb_arg);
75 static void	raid_bdev_remove_base_bdev(void *ctx);
76 
77 /*
78  * brief:
79  * raid_bdev_create_cb function is a cb function for raid bdev which creates the
80  * hierarchy from raid bdev to base bdev io channels. It will be called per core
81  * params:
82  * io_device - pointer to raid bdev io device represented by raid_bdev
83  * ctx_buf - pointer to context buffer for raid bdev io channel
84  * returns:
85  * 0 - success
86  * non zero - failure
87  */
88 static int
89 raid_bdev_create_cb(void *io_device, void *ctx_buf)
90 {
91 	struct raid_bdev            *raid_bdev = io_device;
92 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
93 
94 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_create_cb, %p\n", raid_ch);
95 
96 	assert(raid_bdev != NULL);
97 	assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE);
98 
99 	raid_ch->num_channels = raid_bdev->num_base_bdevs;
100 
101 	raid_ch->base_channel = calloc(raid_ch->num_channels,
102 				       sizeof(struct spdk_io_channel *));
103 	if (!raid_ch->base_channel) {
104 		SPDK_ERRLOG("Unable to allocate base bdevs io channel\n");
105 		return -ENOMEM;
106 	}
107 	for (uint8_t i = 0; i < raid_ch->num_channels; i++) {
108 		/*
109 		 * Get the spdk_io_channel for all the base bdevs. This is used during
110 		 * split logic to send the respective child bdev ios to respective base
111 		 * bdev io channel.
112 		 */
113 		raid_ch->base_channel[i] = spdk_bdev_get_io_channel(
114 						   raid_bdev->base_bdev_info[i].desc);
115 		if (!raid_ch->base_channel[i]) {
116 			for (uint8_t j = 0; j < i; j++) {
117 				spdk_put_io_channel(raid_ch->base_channel[j]);
118 			}
119 			free(raid_ch->base_channel);
120 			raid_ch->base_channel = NULL;
121 			SPDK_ERRLOG("Unable to create io channel for base bdev\n");
122 			return -ENOMEM;
123 		}
124 	}
125 
126 	return 0;
127 }
128 
129 /*
130  * brief:
131  * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the
132  * hierarchy from raid bdev to base bdev io channels. It will be called per core
133  * params:
134  * io_device - pointer to raid bdev io device represented by raid_bdev
135  * ctx_buf - pointer to context buffer for raid bdev io channel
136  * returns:
137  * none
138  */
139 static void
140 raid_bdev_destroy_cb(void *io_device, void *ctx_buf)
141 {
142 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
143 
144 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destroy_cb\n");
145 
146 	assert(raid_ch != NULL);
147 	assert(raid_ch->base_channel);
148 	for (uint8_t i = 0; i < raid_ch->num_channels; i++) {
149 		/* Free base bdev channels */
150 		assert(raid_ch->base_channel[i] != NULL);
151 		spdk_put_io_channel(raid_ch->base_channel[i]);
152 	}
153 	free(raid_ch->base_channel);
154 	raid_ch->base_channel = NULL;
155 }
156 
157 /*
158  * brief:
159  * raid_bdev_cleanup is used to cleanup and free raid_bdev related data
160  * structures.
161  * params:
162  * raid_bdev - pointer to raid_bdev
163  * returns:
164  * none
165  */
166 static void
167 raid_bdev_cleanup(struct raid_bdev *raid_bdev)
168 {
169 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_cleanup, %p name %s, state %u, config %p\n",
170 		      raid_bdev,
171 		      raid_bdev->bdev.name, raid_bdev->state, raid_bdev->config);
172 	if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
173 		TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
174 	} else if (raid_bdev->state == RAID_BDEV_STATE_OFFLINE) {
175 		TAILQ_REMOVE(&g_raid_bdev_offline_list, raid_bdev, state_link);
176 	} else {
177 		assert(0);
178 	}
179 	TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link);
180 	free(raid_bdev->bdev.name);
181 	free(raid_bdev->base_bdev_info);
182 	if (raid_bdev->config) {
183 		raid_bdev->config->raid_bdev = NULL;
184 	}
185 	free(raid_bdev);
186 }
187 
188 /*
189  * brief:
190  * free resource of base bdev for raid bdev
191  * params:
192  * raid_bdev - pointer to raid bdev
193  * base_bdev_slot - position to base bdev in raid bdev
194  * returns:
195  * 0 - success
196  * non zero - failure
197  */
198 static void
199 raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, uint8_t base_bdev_slot)
200 {
201 	struct raid_base_bdev_info *info;
202 
203 	info = &raid_bdev->base_bdev_info[base_bdev_slot];
204 
205 	spdk_bdev_module_release_bdev(info->bdev);
206 	spdk_bdev_close(info->desc);
207 	info->desc = NULL;
208 	info->bdev = NULL;
209 
210 	assert(raid_bdev->num_base_bdevs_discovered);
211 	raid_bdev->num_base_bdevs_discovered--;
212 }
213 
214 /*
215  * brief:
216  * raid_bdev_destruct is the destruct function table pointer for raid bdev
217  * params:
218  * ctxt - pointer to raid_bdev
219  * returns:
220  * 0 - success
221  * non zero - failure
222  */
223 static int
224 raid_bdev_destruct(void *ctxt)
225 {
226 	struct raid_bdev *raid_bdev = ctxt;
227 
228 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destruct\n");
229 
230 	raid_bdev->destruct_called = true;
231 	for (uint8_t i = 0; i < raid_bdev->num_base_bdevs; i++) {
232 		/*
233 		 * Close all base bdev descriptors for which call has come from below
234 		 * layers.  Also close the descriptors if we have started shutdown.
235 		 */
236 		if (g_shutdown_started ||
237 		    ((raid_bdev->base_bdev_info[i].remove_scheduled == true) &&
238 		     (raid_bdev->base_bdev_info[i].bdev != NULL))) {
239 			raid_bdev_free_base_bdev_resource(raid_bdev, i);
240 		}
241 	}
242 
243 	if (g_shutdown_started) {
244 		TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
245 		raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
246 		TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
247 	}
248 
249 	spdk_io_device_unregister(raid_bdev, NULL);
250 
251 	if (raid_bdev->num_base_bdevs_discovered == 0) {
252 		/* Free raid_bdev when there are no base bdevs left */
253 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev base bdevs is 0, going to free all in destruct\n");
254 		raid_bdev_cleanup(raid_bdev);
255 	}
256 
257 	return 0;
258 }
259 
260 /*
261  * brief:
262  * raid_bdev_io_completion function is called by lower layers to notify raid
263  * module that particular bdev_io is completed.
264  * params:
265  * bdev_io - pointer to bdev io submitted to lower layers, like child io
266  * success - bdev_io status
267  * cb_arg - function callback context, like parent io pointer
268  * returns:
269  * none
270  */
271 static void
272 raid_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
273 {
274 	struct spdk_bdev_io         *parent_io = cb_arg;
275 
276 	spdk_bdev_free_io(bdev_io);
277 
278 	if (success) {
279 		spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_SUCCESS);
280 	} else {
281 		spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_FAILED);
282 	}
283 }
284 
285 /*
286  * brief:
287  * raid0_submit_rw_request function is used to submit I/O to the correct
288  * member disk for raid0 bdevs.
289  * params:
290  * bdev_io - parent bdev io
291  * start_strip - start strip number of this io
292  * returns:
293  * 0 - success
294  * non zero - failure
295  */
296 static int
297 raid0_submit_rw_request(struct spdk_bdev_io *bdev_io, uint64_t start_strip)
298 {
299 	struct raid_bdev_io		*raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
300 	struct raid_bdev_io_channel	*raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
301 	struct raid_bdev		*raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
302 	uint64_t			pd_strip;
303 	uint32_t			offset_in_strip;
304 	uint64_t			pd_lba;
305 	uint64_t			pd_blocks;
306 	uint8_t				pd_idx;
307 	int				ret = 0;
308 
309 	pd_strip = start_strip / raid_bdev->num_base_bdevs;
310 	pd_idx = start_strip % raid_bdev->num_base_bdevs;
311 	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
312 	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
313 	pd_blocks = bdev_io->u.bdev.num_blocks;
314 	if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
315 		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
316 		assert(0);
317 	}
318 
319 	/*
320 	 * Submit child io to bdev layer with using base bdev descriptors, base
321 	 * bdev lba, base bdev child io length in blocks, buffer, completion
322 	 * function and function callback context
323 	 */
324 	assert(raid_ch != NULL);
325 	assert(raid_ch->base_channel);
326 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
327 		ret = spdk_bdev_readv_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
328 					     raid_ch->base_channel[pd_idx],
329 					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
330 					     pd_lba, pd_blocks, raid_bdev_io_completion,
331 					     bdev_io);
332 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
333 		ret = spdk_bdev_writev_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
334 					      raid_ch->base_channel[pd_idx],
335 					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
336 					      pd_lba, pd_blocks, raid_bdev_io_completion,
337 					      bdev_io);
338 	} else {
339 		SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
340 		assert(0);
341 	}
342 
343 	return ret;
344 }
345 
346 /*
347  * brief:
348  * raid0_get_curr_base_bdev_index function calculates the base bdev index
349  * for raid0 bdevs.
350  * params:
351  * raid_bdev - pointer to raid bdev
352  * raid_io - pointer to parent io context
353  * returns:
354  * base bdev index
355  */
356 static uint8_t
357 raid0_get_curr_base_bdev_index(struct raid_bdev *raid_bdev, struct raid_bdev_io *raid_io)
358 {
359 	struct spdk_bdev_io	*bdev_io;
360 	uint64_t		start_strip;
361 
362 	bdev_io = SPDK_CONTAINEROF(raid_io, struct spdk_bdev_io, driver_ctx);
363 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
364 
365 	return (start_strip % raid_bdev->num_base_bdevs);
366 }
367 
368 /*
369  * brief:
370  * raid_bdev_io_submit_fail_process function processes the IO which failed to submit.
371  * It will try to queue the IOs after storing the context to bdev wait queue logic.
372  * params:
373  * bdev_io - pointer to bdev_io
374  * raid_io - pointer to raid bdev io
375  * ret - return code
376  * returns:
377  * none
378  */
379 static void
380 raid_bdev_io_submit_fail_process(struct raid_bdev *raid_bdev, struct spdk_bdev_io *bdev_io,
381 				 struct raid_bdev_io *raid_io, int ret)
382 {
383 	struct raid_bdev_io_channel	*raid_ch;
384 	uint8_t				pd_idx;
385 
386 	if (ret != -ENOMEM) {
387 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
388 	} else {
389 		/* Queue the IO to bdev layer wait queue */
390 		pd_idx = raid_bdev->fn_table->get_curr_base_index(raid_bdev, raid_io);
391 		raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[pd_idx].bdev;
392 		raid_io->waitq_entry.cb_fn = raid_bdev->fn_table->waitq_io_process;
393 		raid_io->waitq_entry.cb_arg = raid_io;
394 		raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
395 		if (spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[pd_idx].bdev,
396 					    raid_ch->base_channel[pd_idx],
397 					    &raid_io->waitq_entry) != 0) {
398 			SPDK_ERRLOG("bdev io waitq error, it should not happen\n");
399 			assert(0);
400 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
401 		}
402 	}
403 }
404 
405 /*
406  * brief:
407  * raid0_waitq_io_process function is the callback function
408  * registered by raid bdev module to bdev when bdev_io was unavailable
409  * for raid0 bdevs.
410  * params:
411  * ctx - pointer to raid_bdev_io
412  * returns:
413  * none
414  */
415 static void
416 raid0_waitq_io_process(void *ctx)
417 {
418 	struct raid_bdev_io	*raid_io = ctx;
419 	struct spdk_bdev_io	*bdev_io;
420 	struct raid_bdev	*raid_bdev;
421 	int			ret;
422 	uint64_t		start_strip;
423 
424 	bdev_io = SPDK_CONTAINEROF(raid_io, struct spdk_bdev_io, driver_ctx);
425 	/*
426 	 * Try to submit childs of parent bdev io. If failed due to resource
427 	 * crunch then break the loop and don't try to process other queued IOs.
428 	 */
429 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
430 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
431 	ret = raid0_submit_rw_request(bdev_io, start_strip);
432 	if (ret != 0) {
433 		raid_bdev_io_submit_fail_process(raid_bdev, bdev_io, raid_io, ret);
434 	}
435 }
436 
437 /*
438  * brief:
439  * raid0_start_rw_request function is the submit_request function for
440  * read/write requests for raid0 bdevs.
441  * params:
442  * ch - pointer to raid bdev io channel
443  * bdev_io - pointer to parent bdev_io on raid bdev device
444  * returns:
445  * none
446  */
447 static void
448 raid0_start_rw_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
449 {
450 	struct raid_bdev_io		*raid_io;
451 	struct raid_bdev		*raid_bdev;
452 	uint64_t			start_strip = 0;
453 	uint64_t			end_strip = 0;
454 	int				ret;
455 
456 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
457 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
458 	raid_io->ch = ch;
459 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
460 	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
461 		    raid_bdev->strip_size_shift;
462 	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
463 		assert(false);
464 		SPDK_ERRLOG("I/O spans strip boundary!\n");
465 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
466 		return;
467 	}
468 	ret = raid0_submit_rw_request(bdev_io, start_strip);
469 	if (ret != 0) {
470 		raid_bdev_io_submit_fail_process(raid_bdev, bdev_io, raid_io, ret);
471 	}
472 }
473 
474 /*
475  * brief:
476  * raid_bdev_base_io_completion is the completion callback for member disk requests
477  * params:
478  * bdev_io - pointer to member disk requested bdev_io
479  * success - true if successful, false if unsuccessful
480  * cb_arg - callback argument (parent raid bdev_io)
481  * returns:
482  * none
483  */
484 static void
485 raid_bdev_base_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
486 {
487 	struct spdk_bdev_io *parent_io = cb_arg;
488 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)parent_io->driver_ctx;
489 
490 	spdk_bdev_free_io(bdev_io);
491 
492 	if (!success) {
493 		raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED;
494 	}
495 
496 	raid_io->base_bdev_io_completed++;
497 	if (raid_io->base_bdev_io_completed == raid_io->base_bdev_io_expected) {
498 		spdk_bdev_io_complete(parent_io, raid_io->base_bdev_io_status);
499 	}
500 }
501 
502 /*
503  * brief:
504  * raid_bdev_base_io_submit_fail_process processes IO requests for member disk
505  * which failed to submit
506  * params:
507  * raid_bdev_io - pointer to raid bdev_io
508  * pd_idx - base_dev index in raid_bdev
509  * cb_fn - callback when the spdk_bdev_io for base_bdev becomes available
510  * ret - return code
511  * returns:
512  * none
513  */
514 static void
515 raid_bdev_base_io_submit_fail_process(struct spdk_bdev_io *raid_bdev_io, uint8_t pd_idx,
516 				      spdk_bdev_io_wait_cb cb_fn, int ret)
517 {
518 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)raid_bdev_io->driver_ctx;
519 	struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
520 	struct raid_bdev *raid_bdev = (struct raid_bdev *)raid_bdev_io->bdev->ctxt;
521 
522 	assert(ret != 0);
523 
524 	if (ret == -ENOMEM) {
525 		raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[pd_idx].bdev;
526 		raid_io->waitq_entry.cb_fn = cb_fn;
527 		raid_io->waitq_entry.cb_arg = raid_bdev_io;
528 		spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[pd_idx].bdev,
529 					raid_ch->base_channel[pd_idx],
530 					&raid_io->waitq_entry);
531 		return;
532 	}
533 
534 	SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
535 	assert(false);
536 	spdk_bdev_io_complete(raid_bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
537 }
538 
539 /*
540  * brief:
541  * _raid_bdev_submit_reset_request_next function submits the next batch of reset requests
542  * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in
543  * which case it will queue it for later submission
544  * params:
545  * bdev_io - pointer to parent bdev_io on raid bdev device
546  * returns:
547  * none
548  */
549 static void
550 _raid_bdev_submit_reset_request_next(void *_bdev_io)
551 {
552 	struct spdk_bdev_io		*bdev_io = _bdev_io;
553 	struct raid_bdev_io		*raid_io;
554 	struct raid_bdev		*raid_bdev;
555 	struct raid_bdev_io_channel	*raid_ch;
556 	int				ret;
557 	uint8_t				i;
558 
559 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
560 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
561 	raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
562 
563 	while (raid_io->base_bdev_io_submitted < raid_bdev->num_base_bdevs) {
564 		i = raid_io->base_bdev_io_submitted;
565 		ret = spdk_bdev_reset(raid_bdev->base_bdev_info[i].desc,
566 				      raid_ch->base_channel[i],
567 				      raid_bdev_base_io_completion, bdev_io);
568 		if (ret == 0) {
569 			raid_io->base_bdev_io_submitted++;
570 		} else {
571 			raid_bdev_base_io_submit_fail_process(bdev_io, i,
572 							      _raid_bdev_submit_reset_request_next, ret);
573 			return;
574 		}
575 	}
576 }
577 
578 /*
579  * brief:
580  * _raid_bdev_submit_reset_request function is the submit_request function for
581  * reset requests
582  * params:
583  * ch - pointer to raid bdev io channel
584  * bdev_io - pointer to parent bdev_io on raid bdev device
585  * returns:
586  * none
587  */
588 static void
589 _raid_bdev_submit_reset_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
590 {
591 	struct raid_bdev_io		*raid_io;
592 	struct raid_bdev		*raid_bdev;
593 
594 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
595 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
596 	raid_io->ch = ch;
597 	raid_io->base_bdev_io_submitted = 0;
598 	raid_io->base_bdev_io_completed = 0;
599 	raid_io->base_bdev_io_expected = raid_bdev->num_base_bdevs;
600 	raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
601 	_raid_bdev_submit_reset_request_next(bdev_io);
602 }
603 
604 static inline void
605 _raid0_get_io_range(struct raid_bdev_io_range *io_range,
606 		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
607 		    uint64_t offset_blocks, uint64_t num_blocks)
608 {
609 	uint64_t	start_strip;
610 	uint64_t	end_strip;
611 
612 	io_range->strip_size = strip_size;
613 
614 	/* The start and end strip index in raid0 bdev scope */
615 	start_strip = offset_blocks >> strip_size_shift;
616 	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
617 	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
618 	io_range->end_strip_in_disk = end_strip / num_base_bdevs;
619 
620 	/* The first strip may have unaligned start LBA offset.
621 	 * The end strip may have unaligned end LBA offset.
622 	 * Strips between them certainly have aligned offset and length to boundaries.
623 	 */
624 	io_range->start_offset_in_strip = offset_blocks % strip_size;
625 	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;
626 
627 	/* The base bdev indexes in which start and end strips are located */
628 	io_range->start_disk = start_strip % num_base_bdevs;
629 	io_range->end_disk = end_strip % num_base_bdevs;
630 
631 	/* Calculate how many base_bdevs are involved in io operation.
632 	 * Number of base bdevs involved is between 1 and num_base_bdevs.
633 	 * It will be 1 if the first strip and last strip are the same one.
634 	 */
635 	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
636 }
637 
638 static inline void
639 _raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
640 		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
641 {
642 	uint64_t n_strips_in_disk;
643 	uint64_t start_offset_in_disk;
644 	uint64_t end_offset_in_disk;
645 	uint64_t offset_in_disk;
646 	uint64_t nblocks_in_disk;
647 	uint64_t start_strip_in_disk;
648 	uint64_t end_strip_in_disk;
649 
650 	start_strip_in_disk = io_range->start_strip_in_disk;
651 	if (disk_idx < io_range->start_disk) {
652 		start_strip_in_disk += 1;
653 	}
654 
655 	end_strip_in_disk = io_range->end_strip_in_disk;
656 	if (disk_idx > io_range->end_disk) {
657 		end_strip_in_disk -= 1;
658 	}
659 
660 	assert(end_strip_in_disk >= start_strip_in_disk);
661 	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;
662 
663 	if (disk_idx == io_range->start_disk) {
664 		start_offset_in_disk = io_range->start_offset_in_strip;
665 	} else {
666 		start_offset_in_disk = 0;
667 	}
668 
669 	if (disk_idx == io_range->end_disk) {
670 		end_offset_in_disk = io_range->end_offset_in_strip;
671 	} else {
672 		end_offset_in_disk = io_range->strip_size - 1;
673 	}
674 
675 	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
676 	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
677 			  + end_offset_in_disk - start_offset_in_disk + 1;
678 
679 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID,
680 		      "raid_bdev (strip_size 0x%lx) splits IO to base_bdev (%u) at (0x%lx, 0x%lx).\n",
681 		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);
682 
683 	*_offset_in_disk = offset_in_disk;
684 	*_nblocks_in_disk = nblocks_in_disk;
685 }
686 
687 /*
688  * brief:
689  * _raid_bdev_submit_null_payload_request_next function submits the next batch of
690  * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
691  * it will submit as many as possible unless one base io request fails with -ENOMEM,
692  * in which case it will queue itself for later submission.
693  * params:
694  * bdev_io - pointer to parent bdev_io on raid bdev device
695  * returns:
696  * none
697  */
698 static void
699 _raid_bdev_submit_null_payload_request_next(void *_bdev_io)
700 {
701 	struct spdk_bdev_io		*bdev_io = _bdev_io;
702 	struct raid_bdev_io		*raid_io;
703 	struct raid_bdev		*raid_bdev;
704 	struct raid_bdev_io_channel	*raid_ch;
705 	struct raid_bdev_io_range	io_range;
706 	int				ret;
707 
708 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
709 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
710 	raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
711 
712 	raid_bdev->fn_table->get_io_range(&io_range, raid_bdev->num_base_bdevs,
713 					  raid_bdev->strip_size, raid_bdev->strip_size_shift,
714 					  bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
715 
716 	raid_io->base_bdev_io_expected = io_range.n_disks_involved;
717 
718 	while (raid_io->base_bdev_io_submitted < raid_io->base_bdev_io_expected) {
719 		uint8_t disk_idx;
720 		uint64_t offset_in_disk;
721 		uint64_t nblocks_in_disk;
722 
723 		/* base_bdev is started from start_disk to end_disk.
724 		 * It is possible that index of start_disk is larger than end_disk's.
725 		 */
726 		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
727 
728 		raid_bdev->fn_table->split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
729 
730 		switch (bdev_io->type) {
731 		case SPDK_BDEV_IO_TYPE_UNMAP:
732 			ret = spdk_bdev_unmap_blocks(raid_bdev->base_bdev_info[disk_idx].desc,
733 						     raid_ch->base_channel[disk_idx],
734 						     offset_in_disk, nblocks_in_disk,
735 						     raid_bdev_base_io_completion, bdev_io);
736 			break;
737 
738 		case SPDK_BDEV_IO_TYPE_FLUSH:
739 			ret = spdk_bdev_flush_blocks(raid_bdev->base_bdev_info[disk_idx].desc,
740 						     raid_ch->base_channel[disk_idx],
741 						     offset_in_disk, nblocks_in_disk,
742 						     raid_bdev_base_io_completion, bdev_io);
743 			break;
744 
745 		default:
746 			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
747 			assert(false);
748 			ret = -EIO;
749 		}
750 
751 		if (ret == 0) {
752 			raid_io->base_bdev_io_submitted++;
753 		} else {
754 			raid_bdev_base_io_submit_fail_process(bdev_io, disk_idx,
755 							      _raid_bdev_submit_null_payload_request_next, ret);
756 			return;
757 		}
758 	}
759 }
760 
761 /*
762  * brief:
763  * _raid_bdev_submit_null_payload_request function is the submit_request function
764  * for io requests with range but without payload, like UNMAP and FLUSH.
765  * params:
766  * ch - pointer to raid bdev io channel
767  * bdev_io - pointer to parent bdev_io on raid bdev device
768  * returns:
769  * none
770  */
771 static void
772 _raid_bdev_submit_null_payload_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
773 {
774 	struct raid_bdev_io		*raid_io;
775 
776 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
777 	raid_io->ch = ch;
778 	raid_io->base_bdev_io_submitted = 0;
779 	raid_io->base_bdev_io_completed = 0;
780 	raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
781 
782 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev: type %d, range (0x%lx, 0x%lx)\n",
783 		      bdev_io->type, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
784 
785 	_raid_bdev_submit_null_payload_request_next(bdev_io);
786 }
787 
788 /*
789  * brief:
790  * Callback function to spdk_bdev_io_get_buf.
791  * params:
792  * ch - pointer to raid bdev io channel
793  * bdev_io - pointer to parent bdev_io on raid bdev device
794  * success - True if buffer is allocated or false otherwise.
795  * returns:
796  * none
797  */
798 static void
799 raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
800 		     bool success)
801 {
802 	struct raid_bdev		*raid_bdev;
803 
804 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
805 
806 	if (!success) {
807 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
808 		return;
809 	}
810 
811 	raid_bdev->fn_table->start_rw_request(ch, bdev_io);
812 }
813 
814 /*
815  * brief:
816  * raid_bdev_submit_request function is the submit_request function pointer of
817  * raid bdev function table. This is used to submit the io on raid_bdev to below
818  * layers.
819  * params:
820  * ch - pointer to raid bdev io channel
821  * bdev_io - pointer to parent bdev_io on raid bdev device
822  * returns:
823  * none
824  */
825 static void
826 raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
827 {
828 	struct raid_bdev		*raid_bdev;
829 
830 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
831 
832 	switch (bdev_io->type) {
833 	case SPDK_BDEV_IO_TYPE_READ:
834 		spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb,
835 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
836 		break;
837 	case SPDK_BDEV_IO_TYPE_WRITE:
838 		raid_bdev->fn_table->start_rw_request(ch, bdev_io);
839 		break;
840 
841 	case SPDK_BDEV_IO_TYPE_RESET:
842 		_raid_bdev_submit_reset_request(ch, bdev_io);
843 		break;
844 
845 	case SPDK_BDEV_IO_TYPE_FLUSH:
846 	case SPDK_BDEV_IO_TYPE_UNMAP:
847 		_raid_bdev_submit_null_payload_request(ch, bdev_io);
848 		break;
849 
850 	default:
851 		SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type);
852 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
853 		break;
854 	}
855 
856 }
857 
858 /*
859  * brief:
860  * _raid_bdev_io_type_supported checks whether io_type is supported in
861  * all base bdev modules of raid bdev module. If anyone among the base_bdevs
862  * doesn't support, the raid device doesn't supports.
863  *
864  * params:
865  * raid_bdev - pointer to raid bdev context
866  * io_type - io type
867  * returns:
868  * true - io_type is supported
869  * false - io_type is not supported
870  */
871 inline static bool
872 _raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type)
873 {
874 	uint8_t i;
875 
876 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
877 		if (raid_bdev->base_bdev_info[i].bdev == NULL) {
878 			assert(false);
879 			continue;
880 		}
881 
882 		if (spdk_bdev_io_type_supported(raid_bdev->base_bdev_info[i].bdev,
883 						io_type) == false) {
884 			return false;
885 		}
886 	}
887 
888 	return true;
889 }
890 
891 /*
892  * brief:
893  * raid_bdev_io_type_supported is the io_supported function for bdev function
894  * table which returns whether the particular io type is supported or not by
895  * raid bdev module
896  * params:
897  * ctx - pointer to raid bdev context
898  * type - io type
899  * returns:
900  * true - io_type is supported
901  * false - io_type is not supported
902  */
903 static bool
904 raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
905 {
906 	switch (io_type) {
907 	case SPDK_BDEV_IO_TYPE_READ:
908 	case SPDK_BDEV_IO_TYPE_WRITE:
909 		return true;
910 
911 	case SPDK_BDEV_IO_TYPE_FLUSH:
912 	case SPDK_BDEV_IO_TYPE_RESET:
913 	case SPDK_BDEV_IO_TYPE_UNMAP:
914 		return _raid_bdev_io_type_supported(ctx, io_type);
915 
916 	default:
917 		return false;
918 	}
919 
920 	return false;
921 }
922 
/*
 * brief:
 * raid_bdev_get_io_channel is the get_io_channel entry of the raid bdev
 * function table. It requests an io channel using the raid_bdev pointer as
 * the io device.
 * params:
 * ctxt - pointer to raid_bdev
 * returns:
 * pointer to io channel for raid bdev
 */
static struct spdk_io_channel *
raid_bdev_get_io_channel(void *ctxt)
{
	/* The raid_bdev pointer itself is what gets passed to spdk_get_io_channel. */
	return spdk_get_io_channel((struct raid_bdev *)ctxt);
}
939 
940 /*
941  * brief:
942  * raid_bdev_dump_info_json is the function table pointer for raid bdev
943  * params:
944  * ctx - pointer to raid_bdev
945  * w - pointer to json context
946  * returns:
947  * 0 - success
948  * non zero - failure
949  */
950 static int
951 raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
952 {
953 	struct raid_bdev *raid_bdev = ctx;
954 
955 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_dump_config_json\n");
956 	assert(raid_bdev != NULL);
957 
958 	/* Dump the raid bdev configuration related information */
959 	spdk_json_write_named_object_begin(w, "raid");
960 	spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size);
961 	spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb);
962 	spdk_json_write_named_uint32(w, "state", raid_bdev->state);
963 	spdk_json_write_named_uint32(w, "raid_level", raid_bdev->raid_level);
964 	spdk_json_write_named_uint32(w, "destruct_called", raid_bdev->destruct_called);
965 	spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs);
966 	spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered);
967 	spdk_json_write_name(w, "base_bdevs_list");
968 	spdk_json_write_array_begin(w);
969 	for (uint8_t i = 0; i < raid_bdev->num_base_bdevs; i++) {
970 		if (raid_bdev->base_bdev_info[i].bdev) {
971 			spdk_json_write_string(w, raid_bdev->base_bdev_info[i].bdev->name);
972 		} else {
973 			spdk_json_write_null(w);
974 		}
975 	}
976 	spdk_json_write_array_end(w);
977 	spdk_json_write_object_end(w);
978 
979 	return 0;
980 }
981 
982 /*
983  * brief:
984  * raid_bdev_write_config_json is the function table pointer for raid bdev
985  * params:
986  * bdev - pointer to spdk_bdev
987  * w - pointer to json context
988  * returns:
989  * none
990  */
991 static void
992 raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
993 {
994 	struct raid_bdev *raid_bdev = bdev->ctxt;
995 	struct spdk_bdev *base;
996 	uint8_t i;
997 
998 	spdk_json_write_object_begin(w);
999 
1000 	spdk_json_write_named_string(w, "method", "bdev_raid_create");
1001 
1002 	spdk_json_write_named_object_begin(w, "params");
1003 	spdk_json_write_named_string(w, "name", bdev->name);
1004 	spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size_kb);
1005 	spdk_json_write_named_uint32(w, "raid_level", raid_bdev->raid_level);
1006 
1007 	spdk_json_write_named_array_begin(w, "base_bdevs");
1008 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1009 		base = raid_bdev->base_bdev_info[i].bdev;
1010 		if (base) {
1011 			spdk_json_write_string(w, base->name);
1012 		}
1013 	}
1014 	spdk_json_write_array_end(w);
1015 	spdk_json_write_object_end(w);
1016 
1017 	spdk_json_write_object_end(w);
1018 }
1019 
1020 /* g_raid_bdev_fn_table is the function table for raid bdev */
1021 static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = {
1022 	.destruct		= raid_bdev_destruct,
1023 	.submit_request		= raid_bdev_submit_request,
1024 	.io_type_supported	= raid_bdev_io_type_supported,
1025 	.get_io_channel		= raid_bdev_get_io_channel,
1026 	.dump_info_json		= raid_bdev_dump_info_json,
1027 	.write_config_json	= raid_bdev_write_config_json,
1028 };
1029 
1030 /*
1031  * brief:
1032  * raid_bdev_config_cleanup function is used to free memory for one raid_bdev in configuration
1033  * params:
1034  * raid_cfg - pointer to raid_bdev_config structure
1035  * returns:
1036  * none
1037  */
1038 void
1039 raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg)
1040 {
1041 	uint8_t i;
1042 
1043 	TAILQ_REMOVE(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
1044 	g_raid_config.total_raid_bdev--;
1045 
1046 	if (raid_cfg->base_bdev) {
1047 		for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1048 			free(raid_cfg->base_bdev[i].name);
1049 		}
1050 		free(raid_cfg->base_bdev);
1051 	}
1052 	free(raid_cfg->name);
1053 	free(raid_cfg);
1054 }
1055 
1056 /*
1057  * brief:
1058  * raid_bdev_free is the raid bdev function table function pointer. This is
1059  * called on bdev free path
1060  * params:
1061  * none
1062  * returns:
1063  * none
1064  */
1065 static void
1066 raid_bdev_free(void)
1067 {
1068 	struct raid_bdev_config *raid_cfg, *tmp;
1069 
1070 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_free\n");
1071 	TAILQ_FOREACH_SAFE(raid_cfg, &g_raid_config.raid_bdev_config_head, link, tmp) {
1072 		raid_bdev_config_cleanup(raid_cfg);
1073 	}
1074 }
1075 
1076 /* brief
1077  * raid_bdev_config_find_by_name is a helper function to find raid bdev config
1078  * by name as key.
1079  *
1080  * params:
1081  * raid_name - name for raid bdev.
1082  */
1083 struct raid_bdev_config *
1084 raid_bdev_config_find_by_name(const char *raid_name)
1085 {
1086 	struct raid_bdev_config *raid_cfg;
1087 
1088 	TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
1089 		if (!strcmp(raid_cfg->name, raid_name)) {
1090 			return raid_cfg;
1091 		}
1092 	}
1093 
1094 	return raid_cfg;
1095 }
1096 
1097 /*
1098  * brief
1099  * raid_bdev_config_add function adds config for newly created raid bdev.
1100  *
1101  * params:
1102  * raid_name - name for raid bdev.
1103  * strip_size - strip size in KB
1104  * num_base_bdevs - number of base bdevs.
1105  * raid_level - raid level, only raid level 0 is supported.
1106  * _raid_cfg - Pointer to newly added configuration
1107  */
1108 int
1109 raid_bdev_config_add(const char *raid_name, uint32_t strip_size, uint8_t num_base_bdevs,
1110 		     uint8_t raid_level, struct raid_bdev_config **_raid_cfg)
1111 {
1112 	struct raid_bdev_config *raid_cfg;
1113 
1114 	raid_cfg = raid_bdev_config_find_by_name(raid_name);
1115 	if (raid_cfg != NULL) {
1116 		SPDK_ERRLOG("Duplicate raid bdev name found in config file %s\n",
1117 			    raid_name);
1118 		return -EEXIST;
1119 	}
1120 
1121 	if (spdk_u32_is_pow2(strip_size) == false) {
1122 		SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size);
1123 		return -EINVAL;
1124 	}
1125 
1126 	if (num_base_bdevs == 0) {
1127 		SPDK_ERRLOG("Invalid base device count %u\n", num_base_bdevs);
1128 		return -EINVAL;
1129 	}
1130 
1131 	if (raid_level != 0) {
1132 		SPDK_ERRLOG("invalid raid level %u, only raid level 0 is supported\n",
1133 			    raid_level);
1134 		return -EINVAL;
1135 	}
1136 
1137 	raid_cfg = calloc(1, sizeof(*raid_cfg));
1138 	if (raid_cfg == NULL) {
1139 		SPDK_ERRLOG("unable to allocate memory\n");
1140 		return -ENOMEM;
1141 	}
1142 
1143 	raid_cfg->name = strdup(raid_name);
1144 	if (!raid_cfg->name) {
1145 		free(raid_cfg);
1146 		SPDK_ERRLOG("unable to allocate memory\n");
1147 		return -ENOMEM;
1148 	}
1149 	raid_cfg->strip_size = strip_size;
1150 	raid_cfg->num_base_bdevs = num_base_bdevs;
1151 	raid_cfg->raid_level = raid_level;
1152 
1153 	raid_cfg->base_bdev = calloc(num_base_bdevs, sizeof(*raid_cfg->base_bdev));
1154 	if (raid_cfg->base_bdev == NULL) {
1155 		free(raid_cfg->name);
1156 		free(raid_cfg);
1157 		SPDK_ERRLOG("unable to allocate memory\n");
1158 		return -ENOMEM;
1159 	}
1160 
1161 	TAILQ_INSERT_TAIL(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
1162 	g_raid_config.total_raid_bdev++;
1163 
1164 	*_raid_cfg = raid_cfg;
1165 	return 0;
1166 }
1167 
1168 /*
1169  * brief:
1170  * raid_bdev_config_add_base_bdev function add base bdev to raid bdev config.
1171  *
1172  * params:
1173  * raid_cfg - pointer to raid bdev configuration
1174  * base_bdev_name - name of base bdev
1175  * slot - Position to add base bdev
1176  */
1177 int
1178 raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, const char *base_bdev_name,
1179 			       uint8_t slot)
1180 {
1181 	uint8_t i;
1182 	struct raid_bdev_config *tmp;
1183 
1184 	if (slot >= raid_cfg->num_base_bdevs) {
1185 		return -EINVAL;
1186 	}
1187 
1188 	TAILQ_FOREACH(tmp, &g_raid_config.raid_bdev_config_head, link) {
1189 		for (i = 0; i < tmp->num_base_bdevs; i++) {
1190 			if (tmp->base_bdev[i].name != NULL) {
1191 				if (!strcmp(tmp->base_bdev[i].name, base_bdev_name)) {
1192 					SPDK_ERRLOG("duplicate base bdev name %s mentioned\n",
1193 						    base_bdev_name);
1194 					return -EEXIST;
1195 				}
1196 			}
1197 		}
1198 	}
1199 
1200 	raid_cfg->base_bdev[slot].name = strdup(base_bdev_name);
1201 	if (raid_cfg->base_bdev[slot].name == NULL) {
1202 		SPDK_ERRLOG("unable to allocate memory\n");
1203 		return -ENOMEM;
1204 	}
1205 
1206 	return 0;
1207 }
1208 /*
1209  * brief:
1210  * raid_bdev_parse_raid is used to parse the raid bdev from config file based on
1211  * pre-defined raid bdev format in config file.
1212  * Format of config file:
1213  *   [RAID1]
1214  *   Name raid1
1215  *   StripSize 64
1216  *   NumDevices 2
1217  *   RaidLevel 0
1218  *   Devices Nvme0n1 Nvme1n1
1219  *
1220  *   [RAID2]
1221  *   Name raid2
1222  *   StripSize 64
1223  *   NumDevices 3
1224  *   RaidLevel 0
1225  *   Devices Nvme2n1 Nvme3n1 Nvme4n1
1226  *
1227  * params:
1228  * conf_section - pointer to config section
1229  * returns:
1230  * 0 - success
1231  * non zero - failure
1232  */
1233 static int
1234 raid_bdev_parse_raid(struct spdk_conf_section *conf_section)
1235 {
1236 	const char *raid_name;
1237 	uint32_t strip_size;
1238 	uint8_t num_base_bdevs, raid_level;
1239 	const char *base_bdev_name;
1240 	struct raid_bdev_config *raid_cfg;
1241 	int rc, i, val;
1242 
1243 	raid_name = spdk_conf_section_get_val(conf_section, "Name");
1244 	if (raid_name == NULL) {
1245 		SPDK_ERRLOG("raid_name is null\n");
1246 		return -EINVAL;
1247 	}
1248 
1249 	val = spdk_conf_section_get_intval(conf_section, "StripSize");
1250 	if (val < 0) {
1251 		return -EINVAL;
1252 	}
1253 	strip_size = val;
1254 
1255 	val = spdk_conf_section_get_intval(conf_section, "NumDevices");
1256 	if (val < 0) {
1257 		return -EINVAL;
1258 	}
1259 	num_base_bdevs = val;
1260 
1261 	val = spdk_conf_section_get_intval(conf_section, "RaidLevel");
1262 	if (val < 0) {
1263 		return -EINVAL;
1264 	}
1265 	raid_level = val;
1266 
1267 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "%s %" PRIu32 " %u %u\n",
1268 		      raid_name, strip_size, num_base_bdevs, raid_level);
1269 
1270 	rc = raid_bdev_config_add(raid_name, strip_size, num_base_bdevs, raid_level,
1271 				  &raid_cfg);
1272 	if (rc != 0) {
1273 		SPDK_ERRLOG("Failed to add raid bdev config\n");
1274 		return rc;
1275 	}
1276 
1277 	for (i = 0; true; i++) {
1278 		base_bdev_name = spdk_conf_section_get_nmval(conf_section, "Devices", 0, i);
1279 		if (base_bdev_name == NULL) {
1280 			break;
1281 		}
1282 		if (i >= num_base_bdevs) {
1283 			raid_bdev_config_cleanup(raid_cfg);
1284 			SPDK_ERRLOG("Number of devices mentioned is more than count\n");
1285 			return -EINVAL;
1286 		}
1287 
1288 		rc = raid_bdev_config_add_base_bdev(raid_cfg, base_bdev_name, i);
1289 		if (rc != 0) {
1290 			raid_bdev_config_cleanup(raid_cfg);
1291 			SPDK_ERRLOG("Failed to add base bdev to raid bdev config\n");
1292 			return rc;
1293 		}
1294 	}
1295 
1296 	if (i != raid_cfg->num_base_bdevs) {
1297 		raid_bdev_config_cleanup(raid_cfg);
1298 		SPDK_ERRLOG("Number of devices mentioned is less than count\n");
1299 		return -EINVAL;
1300 	}
1301 
1302 	rc = raid_bdev_create(raid_cfg);
1303 	if (rc != 0) {
1304 		raid_bdev_config_cleanup(raid_cfg);
1305 		SPDK_ERRLOG("Failed to create raid bdev\n");
1306 		return rc;
1307 	}
1308 
1309 	rc = raid_bdev_add_base_devices(raid_cfg);
1310 	if (rc != 0) {
1311 		SPDK_ERRLOG("Failed to add any base bdev to raid bdev\n");
1312 		/* Config is not removed in this case. */
1313 	}
1314 
1315 	return 0;
1316 }
1317 
/*
 * brief:
 * raid_bdev_parse_config walks all config sections and parses each one whose
 * name matches the "RAID" prefix. It stops at the first section that fails
 * to parse.
 * params:
 * none
 * returns:
 * 0 - success
 * non zero - failure
 */
static int
raid_bdev_parse_config(void)
{
	struct spdk_conf_section *section;
	int rc;

	for (section = spdk_conf_first_section(NULL); section != NULL;
	     section = spdk_conf_next_section(section)) {
		if (!spdk_conf_section_match_prefix(section, "RAID")) {
			continue;
		}

		rc = raid_bdev_parse_raid(section);
		if (rc < 0) {
			SPDK_ERRLOG("Unable to parse raid bdev section\n");
			return rc;
		}
	}

	return 0;
}
1348 
1349 /*
1350  * brief:
1351  * raid_bdev_fini_start is called when bdev layer is starting the
1352  * shutdown process
1353  * params:
1354  * none
1355  * returns:
1356  * none
1357  */
1358 static void
1359 raid_bdev_fini_start(void)
1360 {
1361 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_fini_start\n");
1362 	g_shutdown_started = true;
1363 }
1364 
1365 /*
1366  * brief:
1367  * raid_bdev_exit is called on raid bdev module exit time by bdev layer
1368  * params:
1369  * none
1370  * returns:
1371  * none
1372  */
1373 static void
1374 raid_bdev_exit(void)
1375 {
1376 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_exit\n");
1377 	raid_bdev_free();
1378 }
1379 
1380 /*
1381  * brief:
1382  * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid
1383  * module
1384  * params:
1385  * none
1386  * returns:
1387  * size of spdk_bdev_io context for raid
1388  */
1389 static int
1390 raid_bdev_get_ctx_size(void)
1391 {
1392 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_get_ctx_size\n");
1393 	return sizeof(struct raid_bdev_io);
1394 }
1395 
1396 /*
1397  * brief:
1398  * raid_bdev_get_running_config is used to get the configuration options.
1399  *
1400  * params:
1401  * fp - The pointer to a file that will be written to the configuration options.
1402  * returns:
1403  * none
1404  */
1405 static void
1406 raid_bdev_get_running_config(FILE *fp)
1407 {
1408 	struct raid_bdev *raid_bdev;
1409 	struct spdk_bdev *base;
1410 	int index = 1;
1411 	uint8_t i;
1412 
1413 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configured_list, state_link) {
1414 		fprintf(fp,
1415 			"\n"
1416 			"[RAID%d]\n"
1417 			"  Name %s\n"
1418 			"  StripSize %" PRIu32 "\n"
1419 			"  NumDevices %u\n"
1420 			"  RaidLevel %hhu\n",
1421 			index, raid_bdev->bdev.name, raid_bdev->strip_size_kb,
1422 			raid_bdev->num_base_bdevs, raid_bdev->raid_level);
1423 		fprintf(fp,
1424 			"  Devices ");
1425 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1426 			base = raid_bdev->base_bdev_info[i].bdev;
1427 			if (base) {
1428 				fprintf(fp,
1429 					"%s ",
1430 					base->name);
1431 			}
1432 		}
1433 		fprintf(fp,
1434 			"\n");
1435 		index++;
1436 	}
1437 }
1438 
1439 /*
1440  * brief:
1441  * raid_bdev_can_claim_bdev is the function to check if this base_bdev can be
1442  * claimed by raid bdev or not.
1443  * params:
1444  * bdev_name - represents base bdev name
1445  * _raid_cfg - pointer to raid bdev config parsed from config file
1446  * base_bdev_slot - if bdev can be claimed, it represents the base_bdev correct
1447  * slot. This field is only valid if return value of this function is true
1448  * returns:
1449  * true - if bdev can be claimed
1450  * false - if bdev can't be claimed
1451  */
1452 static bool
1453 raid_bdev_can_claim_bdev(const char *bdev_name, struct raid_bdev_config **_raid_cfg,
1454 			 uint8_t *base_bdev_slot)
1455 {
1456 	struct raid_bdev_config *raid_cfg;
1457 	uint8_t i;
1458 
1459 	TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
1460 		for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1461 			/*
1462 			 * Check if the base bdev name is part of raid bdev configuration.
1463 			 * If match is found then return true and the slot information where
1464 			 * this base bdev should be inserted in raid bdev
1465 			 */
1466 			if (!strcmp(bdev_name, raid_cfg->base_bdev[i].name)) {
1467 				*_raid_cfg = raid_cfg;
1468 				*base_bdev_slot = i;
1469 				return true;
1470 			}
1471 		}
1472 	}
1473 
1474 	return false;
1475 }
1476 
1477 
1478 static struct spdk_bdev_module g_raid_if = {
1479 	.name = "raid",
1480 	.module_init = raid_bdev_init,
1481 	.fini_start = raid_bdev_fini_start,
1482 	.module_fini = raid_bdev_exit,
1483 	.get_ctx_size = raid_bdev_get_ctx_size,
1484 	.examine_config = raid_bdev_examine,
1485 	.config_text = raid_bdev_get_running_config,
1486 	.async_init = false,
1487 	.async_fini = false,
1488 };
1489 SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if)
1490 
1491 /*
1492  * brief:
1493  * raid_bdev_init is the initialization function for raid bdev module
1494  * params:
1495  * none
1496  * returns:
1497  * 0 - success
1498  * non zero - failure
1499  */
1500 static int
1501 raid_bdev_init(void)
1502 {
1503 	int ret;
1504 
1505 	/* Parse config file for raids */
1506 	ret = raid_bdev_parse_config();
1507 	if (ret < 0) {
1508 		SPDK_ERRLOG("raid bdev init failed parsing\n");
1509 		raid_bdev_free();
1510 		return ret;
1511 	}
1512 
1513 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_init completed successfully\n");
1514 
1515 	return 0;
1516 }
1517 
1518 static const struct raid_fn_table g_raid0_fn_table = {
1519 	.start_rw_request	= raid0_start_rw_request,
1520 	.get_curr_base_index	= raid0_get_curr_base_bdev_index,
1521 	.waitq_io_process	= raid0_waitq_io_process,
1522 	.get_io_range		= _raid0_get_io_range,
1523 	.split_io_range		= _raid0_split_io_range,
1524 };
1525 
1526 /*
1527  * brief:
1528  * raid_bdev_create allocates raid bdev based on passed configuration
1529  * params:
1530  * raid_cfg - configuration of raid bdev
1531  * returns:
1532  * 0 - success
1533  * non zero - failure
1534  */
1535 int
1536 raid_bdev_create(struct raid_bdev_config *raid_cfg)
1537 {
1538 	struct raid_bdev *raid_bdev;
1539 	struct spdk_bdev *raid_bdev_gen;
1540 
1541 	raid_bdev = calloc(1, sizeof(*raid_bdev));
1542 	if (!raid_bdev) {
1543 		SPDK_ERRLOG("Unable to allocate memory for raid bdev\n");
1544 		return -ENOMEM;
1545 	}
1546 
1547 	assert(raid_cfg->num_base_bdevs != 0);
1548 	raid_bdev->num_base_bdevs = raid_cfg->num_base_bdevs;
1549 	raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs,
1550 					   sizeof(struct raid_base_bdev_info));
1551 	if (!raid_bdev->base_bdev_info) {
1552 		SPDK_ERRLOG("Unable able to allocate base bdev info\n");
1553 		free(raid_bdev);
1554 		return -ENOMEM;
1555 	}
1556 
1557 	/* strip_size_kb is from the rpc param.  strip_size is in blocks and used
1558 	 * intnerally and set later.
1559 	 */
1560 	raid_bdev->strip_size = 0;
1561 	raid_bdev->strip_size_kb = raid_cfg->strip_size;
1562 	raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1563 	raid_bdev->config = raid_cfg;
1564 	raid_bdev->raid_level = raid_cfg->raid_level;
1565 
1566 	switch (raid_bdev->raid_level) {
1567 	case 0:
1568 		raid_bdev->fn_table = &g_raid0_fn_table;
1569 		break;
1570 	default:
1571 		SPDK_ERRLOG("invalid raid level %u\n", raid_bdev->raid_level);
1572 		free(raid_bdev);
1573 		return -EINVAL;
1574 	}
1575 
1576 	raid_bdev_gen = &raid_bdev->bdev;
1577 
1578 	raid_bdev_gen->name = strdup(raid_cfg->name);
1579 	if (!raid_bdev_gen->name) {
1580 		SPDK_ERRLOG("Unable to allocate name for raid\n");
1581 		free(raid_bdev->base_bdev_info);
1582 		free(raid_bdev);
1583 		return -ENOMEM;
1584 	}
1585 
1586 	raid_bdev_gen->product_name = "Raid Volume";
1587 	raid_bdev_gen->ctxt = raid_bdev;
1588 	raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
1589 	raid_bdev_gen->module = &g_raid_if;
1590 	raid_bdev_gen->write_cache = 0;
1591 
1592 	TAILQ_INSERT_TAIL(&g_raid_bdev_configuring_list, raid_bdev, state_link);
1593 	TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link);
1594 
1595 	raid_cfg->raid_bdev = raid_bdev;
1596 
1597 	return 0;
1598 }
1599 
1600 /*
1601  * brief
1602  * raid_bdev_alloc_base_bdev_resource allocates resource of base bdev.
1603  * params:
1604  * raid_bdev - pointer to raid bdev
1605  * bdev - pointer to base bdev
1606  * base_bdev_slot - position to add base bdev
1607  * returns:
1608  * 0 - success
1609  * non zero - failure
1610  */
1611 static int
1612 raid_bdev_alloc_base_bdev_resource(struct raid_bdev *raid_bdev, struct spdk_bdev *bdev,
1613 				   uint8_t base_bdev_slot)
1614 {
1615 	struct spdk_bdev_desc *desc;
1616 	int rc;
1617 
1618 	rc = spdk_bdev_open(bdev, true, raid_bdev_remove_base_bdev, bdev, &desc);
1619 	if (rc != 0) {
1620 		SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", bdev->name);
1621 		return rc;
1622 	}
1623 
1624 	rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if);
1625 	if (rc != 0) {
1626 		SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n");
1627 		spdk_bdev_close(desc);
1628 		return rc;
1629 	}
1630 
1631 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s is claimed\n", bdev->name);
1632 
1633 	assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE);
1634 	assert(base_bdev_slot < raid_bdev->num_base_bdevs);
1635 
1636 	raid_bdev->base_bdev_info[base_bdev_slot].bdev = bdev;
1637 	raid_bdev->base_bdev_info[base_bdev_slot].desc = desc;
1638 	raid_bdev->num_base_bdevs_discovered++;
1639 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1640 
1641 	return 0;
1642 }
1643 
1644 /*
1645  * brief:
1646  * If raid bdev config is complete, then only register the raid bdev to
1647  * bdev layer and remove this raid bdev from configuring list and
1648  * insert the raid bdev to configured list
1649  * params:
1650  * raid_bdev - pointer to raid bdev
1651  * returns:
1652  * 0 - success
1653  * non zero - failure
1654  */
1655 static int
1656 raid_bdev_configure(struct raid_bdev *raid_bdev)
1657 {
1658 	uint32_t		blocklen;
1659 	uint64_t		min_blockcnt;
1660 	struct spdk_bdev	*raid_bdev_gen;
1661 	int rc = 0;
1662 
1663 	blocklen = raid_bdev->base_bdev_info[0].bdev->blocklen;
1664 	min_blockcnt = raid_bdev->base_bdev_info[0].bdev->blockcnt;
1665 	for (uint8_t i = 1; i < raid_bdev->num_base_bdevs; i++) {
1666 		/* Calculate minimum block count from all base bdevs */
1667 		if (raid_bdev->base_bdev_info[i].bdev->blockcnt < min_blockcnt) {
1668 			min_blockcnt = raid_bdev->base_bdev_info[i].bdev->blockcnt;
1669 		}
1670 
1671 		/* Check blocklen for all base bdevs that it should be same */
1672 		if (blocklen != raid_bdev->base_bdev_info[i].bdev->blocklen) {
1673 			/*
1674 			 * Assumption is that all the base bdevs for any raid bdev should
1675 			 * have same blocklen
1676 			 */
1677 			SPDK_ERRLOG("Blocklen of various bdevs not matching\n");
1678 			return -EINVAL;
1679 		}
1680 	}
1681 
1682 	/* The strip_size_kb is read in from user in KB. Convert to blocks here for
1683 	 * internal use.
1684 	 */
1685 	raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen;
1686 	raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
1687 	raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
1688 
1689 	raid_bdev_gen = &raid_bdev->bdev;
1690 	raid_bdev_gen->blocklen = blocklen;
1691 	if (raid_bdev->num_base_bdevs > 1) {
1692 		raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
1693 		raid_bdev_gen->split_on_optimal_io_boundary = true;
1694 	} else {
1695 		/* Do not need to split reads/writes on single bdev RAID modules. */
1696 		raid_bdev_gen->optimal_io_boundary = 0;
1697 		raid_bdev_gen->split_on_optimal_io_boundary = false;
1698 	}
1699 
1700 	/*
1701 	 * RAID bdev logic is for striping so take the minimum block count based
1702 	 * approach where total block count of raid bdev is the number of base
1703 	 * bdev times the minimum block count of any base bdev
1704 	 */
1705 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "min blockcount %lu,  numbasedev %u, strip size shift %u\n",
1706 		      min_blockcnt,
1707 		      raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
1708 	raid_bdev_gen->blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
1709 				   raid_bdev->strip_size_shift)  * raid_bdev->num_base_bdevs;
1710 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "io device register %p\n", raid_bdev);
1711 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "blockcnt %lu, blocklen %u\n", raid_bdev_gen->blockcnt,
1712 		      raid_bdev_gen->blocklen);
1713 	if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1714 		raid_bdev->state = RAID_BDEV_STATE_ONLINE;
1715 		spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb,
1716 					sizeof(struct raid_bdev_io_channel),
1717 					raid_bdev->bdev.name);
1718 		rc = spdk_bdev_register(raid_bdev_gen);
1719 		if (rc != 0) {
1720 			SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n");
1721 			spdk_io_device_unregister(raid_bdev, NULL);
1722 			raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1723 			return rc;
1724 		}
1725 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev generic %p\n", raid_bdev_gen);
1726 		TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
1727 		TAILQ_INSERT_TAIL(&g_raid_bdev_configured_list, raid_bdev, state_link);
1728 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev is created with name %s, raid_bdev %p\n",
1729 			      raid_bdev_gen->name, raid_bdev);
1730 	}
1731 
1732 	return 0;
1733 }
1734 
1735 /*
1736  * brief:
1737  * If raid bdev is online and registered, change the bdev state to
1738  * configuring and unregister this raid device. Queue this raid device
1739  * in configuring list
1740  * params:
1741  * raid_bdev - pointer to raid bdev
1742  * cb_fn - callback function
1743  * cb_arg - argument to callback function
1744  * returns:
1745  * none
1746  */
1747 static void
1748 raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn,
1749 		      void *cb_arg)
1750 {
1751 	if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) {
1752 		if (cb_fn) {
1753 			cb_fn(cb_arg, 0);
1754 		}
1755 		return;
1756 	}
1757 
1758 	assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered);
1759 	TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
1760 	raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
1761 	assert(raid_bdev->num_base_bdevs_discovered);
1762 	TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
1763 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev state chaning from online to offline\n");
1764 
1765 	spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg);
1766 }
1767 
1768 /*
1769  * brief:
1770  * raid_bdev_find_by_base_bdev function finds the raid bdev which has
1771  *  claimed the base bdev.
1772  * params:
1773  * base_bdev - pointer to base bdev pointer
1774  * _raid_bdev - Referenct to pointer to raid bdev
1775  * _base_bdev_slot - Reference to the slot of the base bdev.
1776  * returns:
1777  * true - if the raid bdev is found.
1778  * false - if the raid bdev is not found.
1779  */
1780 static bool
1781 raid_bdev_find_by_base_bdev(struct spdk_bdev *base_bdev, struct raid_bdev **_raid_bdev,
1782 			    uint8_t *_base_bdev_slot)
1783 {
1784 	struct raid_bdev	*raid_bdev;
1785 	uint8_t			i;
1786 
1787 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
1788 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1789 			if (raid_bdev->base_bdev_info[i].bdev == base_bdev) {
1790 				*_raid_bdev = raid_bdev;
1791 				*_base_bdev_slot = i;
1792 				return true;
1793 			}
1794 		}
1795 	}
1796 
1797 	return false;
1798 }
1799 
1800 /*
1801  * brief:
1802  * raid_bdev_remove_base_bdev function is called by below layers when base_bdev
1803  * is removed. This function checks if this base bdev is part of any raid bdev
1804  * or not. If yes, it takes necessary action on that particular raid bdev.
1805  * params:
1806  * ctx - pointer to base bdev pointer which got removed
1807  * returns:
1808  * none
1809  */
1810 static void
1811 raid_bdev_remove_base_bdev(void *ctx)
1812 {
1813 	struct spdk_bdev	*base_bdev = ctx;
1814 	struct raid_bdev	*raid_bdev = NULL;
1815 	uint8_t			base_bdev_slot = 0;
1816 
1817 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_bdev\n");
1818 
1819 	/* Find the raid_bdev which has claimed this base_bdev */
1820 	if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_bdev_slot)) {
1821 		SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name);
1822 		return;
1823 	}
1824 
1825 	assert(raid_bdev->base_bdev_info[base_bdev_slot].desc);
1826 	raid_bdev->base_bdev_info[base_bdev_slot].remove_scheduled = true;
1827 
1828 	if (raid_bdev->destruct_called == true ||
1829 	    raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1830 		/*
1831 		 * As raid bdev is not registered yet or already unregistered,
1832 		 * so cleanup should be done here itself.
1833 		 */
1834 		raid_bdev_free_base_bdev_resource(raid_bdev, base_bdev_slot);
1835 		if (raid_bdev->num_base_bdevs_discovered == 0) {
1836 			/* There is no base bdev for this raid, so free the raid device. */
1837 			raid_bdev_cleanup(raid_bdev);
1838 			return;
1839 		}
1840 	}
1841 
1842 	raid_bdev_deconfigure(raid_bdev, NULL, NULL);
1843 }
1844 
1845 /*
1846  * brief:
1847  * Remove base bdevs from the raid bdev one by one.  Skip any base bdev which
1848  *  doesn't exist.
1849  * params:
1850  * raid_cfg - pointer to raid bdev config.
1851  * cb_fn - callback function
1852  * cb_ctx - argument to callback function
1853  */
1854 void
1855 raid_bdev_remove_base_devices(struct raid_bdev_config *raid_cfg,
1856 			      raid_bdev_destruct_cb cb_fn, void *cb_arg)
1857 {
1858 	struct raid_bdev		*raid_bdev;
1859 	struct raid_base_bdev_info	*info;
1860 	uint8_t				i;
1861 
1862 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_devices\n");
1863 
1864 	raid_bdev = raid_cfg->raid_bdev;
1865 	if (raid_bdev == NULL) {
1866 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev %s doesn't exist now\n", raid_cfg->name);
1867 		if (cb_fn) {
1868 			cb_fn(cb_arg, 0);
1869 		}
1870 		return;
1871 	}
1872 
1873 	if (raid_bdev->destroy_started) {
1874 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "destroying raid bdev %s is already started\n",
1875 			      raid_cfg->name);
1876 		if (cb_fn) {
1877 			cb_fn(cb_arg, -EALREADY);
1878 		}
1879 		return;
1880 	}
1881 
1882 	raid_bdev->destroy_started = true;
1883 
1884 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1885 		info = &raid_bdev->base_bdev_info[i];
1886 
1887 		if (info->bdev == NULL) {
1888 			continue;
1889 		}
1890 
1891 		assert(info->desc);
1892 		info->remove_scheduled = true;
1893 
1894 		if (raid_bdev->destruct_called == true ||
1895 		    raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1896 			/*
1897 			 * As raid bdev is not registered yet or already unregistered,
1898 			 * so cleanup should be done here itself.
1899 			 */
1900 			raid_bdev_free_base_bdev_resource(raid_bdev, i);
1901 			if (raid_bdev->num_base_bdevs_discovered == 0) {
1902 				/* There is no base bdev for this raid, so free the raid device. */
1903 				raid_bdev_cleanup(raid_bdev);
1904 				if (cb_fn) {
1905 					cb_fn(cb_arg, 0);
1906 				}
1907 				return;
1908 			}
1909 		}
1910 	}
1911 
1912 	raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg);
1913 }
1914 
1915 /*
1916  * brief:
1917  * raid_bdev_add_base_device function is the actual function which either adds
1918  * the nvme base device to existing raid bdev or create a new raid bdev. It also claims
1919  * the base device and keep the open descriptor.
1920  * params:
1921  * raid_cfg - pointer to raid bdev config
1922  * bdev - pointer to base bdev
1923  * base_bdev_slot - position to add base bdev
1924  * returns:
1925  * 0 - success
1926  * non zero - failure
1927  */
1928 static int
1929 raid_bdev_add_base_device(struct raid_bdev_config *raid_cfg, struct spdk_bdev *bdev,
1930 			  uint8_t base_bdev_slot)
1931 {
1932 	struct raid_bdev	*raid_bdev;
1933 	int			rc;
1934 
1935 	raid_bdev = raid_cfg->raid_bdev;
1936 	if (!raid_bdev) {
1937 		SPDK_ERRLOG("Raid bdev '%s' is not created yet\n", raid_cfg->name);
1938 		return -ENODEV;
1939 	}
1940 
1941 	rc = raid_bdev_alloc_base_bdev_resource(raid_bdev, bdev, base_bdev_slot);
1942 	if (rc != 0) {
1943 		SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", bdev->name);
1944 		return rc;
1945 	}
1946 
1947 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1948 
1949 	if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) {
1950 		rc = raid_bdev_configure(raid_bdev);
1951 		if (rc != 0) {
1952 			SPDK_ERRLOG("Failed to configure raid bdev\n");
1953 			return rc;
1954 		}
1955 	}
1956 
1957 	return 0;
1958 }
1959 
1960 /*
1961  * brief:
1962  * Add base bdevs to the raid bdev one by one.  Skip any base bdev which doesn't
1963  *  exist or fails to add. If all base bdevs are successfully added, the raid bdev
1964  *  moves to the configured state and becomes available. Otherwise, the raid bdev
1965  *  stays at the configuring state with added base bdevs.
1966  * params:
1967  * raid_cfg - pointer to raid bdev config
1968  * returns:
1969  * 0 - The raid bdev moves to the configured state or stays at the configuring
1970  *     state with added base bdevs due to any nonexistent base bdev.
1971  * non zero - Failed to add any base bdev and stays at the configuring state with
1972  *            added base bdevs.
1973  */
1974 int
1975 raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg)
1976 {
1977 	struct spdk_bdev	*base_bdev;
1978 	uint8_t			i;
1979 	int			rc = 0, _rc;
1980 
1981 	for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1982 		base_bdev = spdk_bdev_get_by_name(raid_cfg->base_bdev[i].name);
1983 		if (base_bdev == NULL) {
1984 			SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "base bdev %s doesn't exist now\n",
1985 				      raid_cfg->base_bdev[i].name);
1986 			continue;
1987 		}
1988 
1989 		_rc = raid_bdev_add_base_device(raid_cfg, base_bdev, i);
1990 		if (_rc != 0) {
1991 			SPDK_ERRLOG("Failed to add base bdev %s to RAID bdev %s: %s\n",
1992 				    raid_cfg->base_bdev[i].name, raid_cfg->name,
1993 				    spdk_strerror(-_rc));
1994 			if (rc == 0) {
1995 				rc = _rc;
1996 			}
1997 		}
1998 	}
1999 
2000 	return rc;
2001 }
2002 
2003 /*
2004  * brief:
2005  * raid_bdev_examine function is the examine function call by the below layers
2006  * like bdev_nvme layer. This function will check if this base bdev can be
2007  * claimed by this raid bdev or not.
2008  * params:
2009  * bdev - pointer to base bdev
2010  * returns:
2011  * none
2012  */
2013 static void
2014 raid_bdev_examine(struct spdk_bdev *bdev)
2015 {
2016 	struct raid_bdev_config	*raid_cfg;
2017 	uint8_t			base_bdev_slot;
2018 
2019 	if (raid_bdev_can_claim_bdev(bdev->name, &raid_cfg, &base_bdev_slot)) {
2020 		raid_bdev_add_base_device(raid_cfg, bdev, base_bdev_slot);
2021 	} else {
2022 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s can't be claimed\n",
2023 			      bdev->name);
2024 	}
2025 
2026 	spdk_bdev_module_examine_done(&g_raid_if);
2027 }
2028 
/*
 * Register the "bdev_raid" log component for the raid bdev module.
 * SPDK_LOG_BDEV_RAID is the flag passed to SPDK_DEBUGLOG() throughout this file.
 */
SPDK_LOG_REGISTER_COMPONENT("bdev_raid", SPDK_LOG_BDEV_RAID)
2031