xref: /spdk/module/bdev/raid/bdev_raid.c (revision 407e88fd2ab020d753e33014cf759353a9901b51)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_raid.h"
35 #include "spdk/env.h"
36 #include "spdk/io_channel.h"
37 #include "spdk/conf.h"
38 #include "spdk_internal/log.h"
39 #include "spdk/string.h"
40 #include "spdk/util.h"
41 #include "spdk/json.h"
42 #include "spdk/string.h"
43 
44 static bool g_shutdown_started = false;
45 
46 /* raid bdev config as read from config file */
47 struct raid_config	g_raid_config = {
48 	.raid_bdev_config_head = TAILQ_HEAD_INITIALIZER(g_raid_config.raid_bdev_config_head),
49 };
50 
51 /*
52  * List of raid bdev in configured list, these raid bdevs are registered with
53  * bdev layer
54  */
55 struct raid_configured_tailq	g_raid_bdev_configured_list = TAILQ_HEAD_INITIALIZER(
56 			g_raid_bdev_configured_list);
57 
58 /* List of raid bdev in configuring list */
59 struct raid_configuring_tailq	g_raid_bdev_configuring_list = TAILQ_HEAD_INITIALIZER(
60 			g_raid_bdev_configuring_list);
61 
62 /* List of all raid bdevs */
63 struct raid_all_tailq		g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list);
64 
65 /* List of all raid bdevs that are offline */
66 struct raid_offline_tailq	g_raid_bdev_offline_list = TAILQ_HEAD_INITIALIZER(
67 			g_raid_bdev_offline_list);
68 
69 /* Function declarations */
70 static void	raid_bdev_examine(struct spdk_bdev *bdev);
71 static int	raid_bdev_init(void);
72 static void	raid_bdev_waitq_io_process(void *ctx);
73 static void	raid_bdev_deconfigure(struct raid_bdev *raid_bdev,
74 				      raid_bdev_destruct_cb cb_fn, void *cb_arg);
75 static void	raid_bdev_remove_base_bdev(void *ctx);
76 
77 /*
78  * brief:
79  * raid_bdev_create_cb function is a cb function for raid bdev which creates the
80  * hierarchy from raid bdev to base bdev io channels. It will be called per core
81  * params:
82  * io_device - pointer to raid bdev io device represented by raid_bdev
83  * ctx_buf - pointer to context buffer for raid bdev io channel
84  * returns:
85  * 0 - success
86  * non zero - failure
87  */
88 static int
89 raid_bdev_create_cb(void *io_device, void *ctx_buf)
90 {
91 	struct raid_bdev            *raid_bdev = io_device;
92 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
93 
94 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_create_cb, %p\n", raid_ch);
95 
96 	assert(raid_bdev != NULL);
97 	assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE);
98 
99 	raid_ch->num_channels = raid_bdev->num_base_bdevs;
100 
101 	raid_ch->base_channel = calloc(raid_ch->num_channels,
102 				       sizeof(struct spdk_io_channel *));
103 	if (!raid_ch->base_channel) {
104 		SPDK_ERRLOG("Unable to allocate base bdevs io channel\n");
105 		return -ENOMEM;
106 	}
107 	for (uint8_t i = 0; i < raid_ch->num_channels; i++) {
108 		/*
109 		 * Get the spdk_io_channel for all the base bdevs. This is used during
110 		 * split logic to send the respective child bdev ios to respective base
111 		 * bdev io channel.
112 		 */
113 		raid_ch->base_channel[i] = spdk_bdev_get_io_channel(
114 						   raid_bdev->base_bdev_info[i].desc);
115 		if (!raid_ch->base_channel[i]) {
116 			for (uint8_t j = 0; j < i; j++) {
117 				spdk_put_io_channel(raid_ch->base_channel[j]);
118 			}
119 			free(raid_ch->base_channel);
120 			raid_ch->base_channel = NULL;
121 			SPDK_ERRLOG("Unable to create io channel for base bdev\n");
122 			return -ENOMEM;
123 		}
124 	}
125 
126 	return 0;
127 }
128 
129 /*
130  * brief:
131  * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the
132  * hierarchy from raid bdev to base bdev io channels. It will be called per core
133  * params:
134  * io_device - pointer to raid bdev io device represented by raid_bdev
135  * ctx_buf - pointer to context buffer for raid bdev io channel
136  * returns:
137  * none
138  */
139 static void
140 raid_bdev_destroy_cb(void *io_device, void *ctx_buf)
141 {
142 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
143 
144 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destroy_cb\n");
145 
146 	assert(raid_ch != NULL);
147 	assert(raid_ch->base_channel);
148 	for (uint8_t i = 0; i < raid_ch->num_channels; i++) {
149 		/* Free base bdev channels */
150 		assert(raid_ch->base_channel[i] != NULL);
151 		spdk_put_io_channel(raid_ch->base_channel[i]);
152 	}
153 	free(raid_ch->base_channel);
154 	raid_ch->base_channel = NULL;
155 }
156 
157 /*
158  * brief:
159  * raid_bdev_cleanup is used to cleanup and free raid_bdev related data
160  * structures.
161  * params:
162  * raid_bdev - pointer to raid_bdev
163  * returns:
164  * none
165  */
166 static void
167 raid_bdev_cleanup(struct raid_bdev *raid_bdev)
168 {
169 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_cleanup, %p name %s, state %u, config %p\n",
170 		      raid_bdev,
171 		      raid_bdev->bdev.name, raid_bdev->state, raid_bdev->config);
172 	if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
173 		TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
174 	} else if (raid_bdev->state == RAID_BDEV_STATE_OFFLINE) {
175 		TAILQ_REMOVE(&g_raid_bdev_offline_list, raid_bdev, state_link);
176 	} else {
177 		assert(0);
178 	}
179 	TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link);
180 	free(raid_bdev->bdev.name);
181 	free(raid_bdev->base_bdev_info);
182 	if (raid_bdev->config) {
183 		raid_bdev->config->raid_bdev = NULL;
184 	}
185 	free(raid_bdev);
186 }
187 
188 /*
189  * brief:
190  * free resource of base bdev for raid bdev
191  * params:
192  * raid_bdev - pointer to raid bdev
193  * base_bdev_slot - position to base bdev in raid bdev
194  * returns:
195  * 0 - success
196  * non zero - failure
197  */
198 static void
199 raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, uint8_t base_bdev_slot)
200 {
201 	struct raid_base_bdev_info *info;
202 
203 	info = &raid_bdev->base_bdev_info[base_bdev_slot];
204 
205 	spdk_bdev_module_release_bdev(info->bdev);
206 	spdk_bdev_close(info->desc);
207 	info->desc = NULL;
208 	info->bdev = NULL;
209 
210 	assert(raid_bdev->num_base_bdevs_discovered);
211 	raid_bdev->num_base_bdevs_discovered--;
212 }
213 
214 /*
215  * brief:
216  * raid_bdev_destruct is the destruct function table pointer for raid bdev
217  * params:
218  * ctxt - pointer to raid_bdev
219  * returns:
220  * 0 - success
221  * non zero - failure
222  */
223 static int
224 raid_bdev_destruct(void *ctxt)
225 {
226 	struct raid_bdev *raid_bdev = ctxt;
227 
228 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destruct\n");
229 
230 	raid_bdev->destruct_called = true;
231 	for (uint8_t i = 0; i < raid_bdev->num_base_bdevs; i++) {
232 		/*
233 		 * Close all base bdev descriptors for which call has come from below
234 		 * layers.  Also close the descriptors if we have started shutdown.
235 		 */
236 		if (g_shutdown_started ||
237 		    ((raid_bdev->base_bdev_info[i].remove_scheduled == true) &&
238 		     (raid_bdev->base_bdev_info[i].bdev != NULL))) {
239 			raid_bdev_free_base_bdev_resource(raid_bdev, i);
240 		}
241 	}
242 
243 	if (g_shutdown_started) {
244 		TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
245 		raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
246 		TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
247 	}
248 
249 	spdk_io_device_unregister(raid_bdev, NULL);
250 
251 	if (raid_bdev->num_base_bdevs_discovered == 0) {
252 		/* Free raid_bdev when there are no base bdevs left */
253 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev base bdevs is 0, going to free all in destruct\n");
254 		raid_bdev_cleanup(raid_bdev);
255 	}
256 
257 	return 0;
258 }
259 
260 /*
261  * brief:
262  * raid_bdev_io_completion function is called by lower layers to notify raid
263  * module that particular bdev_io is completed.
264  * params:
265  * bdev_io - pointer to bdev io submitted to lower layers, like child io
266  * success - bdev_io status
267  * cb_arg - function callback context, like parent io pointer
268  * returns:
269  * none
270  */
271 static void
272 raid_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
273 {
274 	struct spdk_bdev_io         *parent_io = cb_arg;
275 
276 	spdk_bdev_free_io(bdev_io);
277 
278 	if (success) {
279 		spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_SUCCESS);
280 	} else {
281 		spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_FAILED);
282 	}
283 }
284 
285 /*
286  * brief:
287  * raid_bdev_submit_rw_request function is used to submit I/O to the correct
288  * member disk
289  * params:
290  * bdev_io - parent bdev io
291  * start_strip - start strip number of this io
292  * returns:
293  * 0 - success
294  * non zero - failure
295  */
296 static int
297 raid_bdev_submit_rw_request(struct spdk_bdev_io *bdev_io, uint64_t start_strip)
298 {
299 	struct raid_bdev_io		*raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
300 	struct raid_bdev_io_channel	*raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
301 	struct raid_bdev		*raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
302 	uint64_t			pd_strip;
303 	uint32_t			offset_in_strip;
304 	uint64_t			pd_lba;
305 	uint64_t			pd_blocks;
306 	uint8_t				pd_idx;
307 	int				ret = 0;
308 
309 	pd_strip = start_strip / raid_bdev->num_base_bdevs;
310 	pd_idx = start_strip % raid_bdev->num_base_bdevs;
311 	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
312 	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
313 	pd_blocks = bdev_io->u.bdev.num_blocks;
314 	if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) {
315 		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
316 		assert(0);
317 	}
318 
319 	/*
320 	 * Submit child io to bdev layer with using base bdev descriptors, base
321 	 * bdev lba, base bdev child io length in blocks, buffer, completion
322 	 * function and function callback context
323 	 */
324 	assert(raid_ch != NULL);
325 	assert(raid_ch->base_channel);
326 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
327 		ret = spdk_bdev_readv_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
328 					     raid_ch->base_channel[pd_idx],
329 					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
330 					     pd_lba, pd_blocks, raid_bdev_io_completion,
331 					     bdev_io);
332 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
333 		ret = spdk_bdev_writev_blocks(raid_bdev->base_bdev_info[pd_idx].desc,
334 					      raid_ch->base_channel[pd_idx],
335 					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
336 					      pd_lba, pd_blocks, raid_bdev_io_completion,
337 					      bdev_io);
338 	} else {
339 		SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
340 		assert(0);
341 	}
342 
343 	return ret;
344 }
345 
346 /*
347  * brief:
348  * get_curr_base_bdev_index function calculates the base bdev index
349  * params:
350  * raid_bdev - pointer to raid bdev
351  * raid_io - pointer to parent io context
352  * returns:
353  * base bdev index
354  */
355 static uint8_t
356 get_curr_base_bdev_index(struct raid_bdev *raid_bdev, struct raid_bdev_io *raid_io)
357 {
358 	struct spdk_bdev_io	*bdev_io;
359 	uint64_t		start_strip;
360 
361 	bdev_io = SPDK_CONTAINEROF(raid_io, struct spdk_bdev_io, driver_ctx);
362 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
363 
364 	return (start_strip % raid_bdev->num_base_bdevs);
365 }
366 
367 /*
368  * brief:
369  * raid_bdev_io_submit_fail_process function processes the IO which failed to submit.
370  * It will try to queue the IOs after storing the context to bdev wait queue logic.
371  * params:
372  * bdev_io - pointer to bdev_io
373  * raid_io - pointer to raid bdev io
374  * ret - return code
375  * returns:
376  * none
377  */
378 static void
379 raid_bdev_io_submit_fail_process(struct raid_bdev *raid_bdev, struct spdk_bdev_io *bdev_io,
380 				 struct raid_bdev_io *raid_io, int ret)
381 {
382 	struct raid_bdev_io_channel	*raid_ch;
383 	uint8_t				pd_idx;
384 
385 	if (ret != -ENOMEM) {
386 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
387 	} else {
388 		/* Queue the IO to bdev layer wait queue */
389 		pd_idx = get_curr_base_bdev_index(raid_bdev, raid_io);
390 		raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[pd_idx].bdev;
391 		raid_io->waitq_entry.cb_fn = raid_bdev_waitq_io_process;
392 		raid_io->waitq_entry.cb_arg = raid_io;
393 		raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
394 		if (spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[pd_idx].bdev,
395 					    raid_ch->base_channel[pd_idx],
396 					    &raid_io->waitq_entry) != 0) {
397 			SPDK_ERRLOG("bdev io waitq error, it should not happen\n");
398 			assert(0);
399 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
400 		}
401 	}
402 }
403 
404 /*
405  * brief:
406  * raid_bdev_waitq_io_process function is the callback function
407  * registered by raid bdev module to bdev when bdev_io was unavailable.
408  * params:
409  * ctx - pointer to raid_bdev_io
410  * returns:
411  * none
412  */
413 static void
414 raid_bdev_waitq_io_process(void *ctx)
415 {
416 	struct raid_bdev_io	*raid_io = ctx;
417 	struct spdk_bdev_io	*bdev_io;
418 	struct raid_bdev	*raid_bdev;
419 	int			ret;
420 	uint64_t		start_strip;
421 
422 	bdev_io = SPDK_CONTAINEROF(raid_io, struct spdk_bdev_io, driver_ctx);
423 	/*
424 	 * Try to submit childs of parent bdev io. If failed due to resource
425 	 * crunch then break the loop and don't try to process other queued IOs.
426 	 */
427 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
428 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
429 	ret = raid_bdev_submit_rw_request(bdev_io, start_strip);
430 	if (ret != 0) {
431 		raid_bdev_io_submit_fail_process(raid_bdev, bdev_io, raid_io, ret);
432 	}
433 }
434 
435 /*
436  * brief:
437  * raid_bdev_start_rw_request function is the submit_request function for
438  * read/write requests
439  * params:
440  * ch - pointer to raid bdev io channel
441  * bdev_io - pointer to parent bdev_io on raid bdev device
442  * returns:
443  * none
444  */
445 static void
446 raid_bdev_start_rw_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
447 {
448 	struct raid_bdev_io		*raid_io;
449 	struct raid_bdev		*raid_bdev;
450 	uint64_t			start_strip = 0;
451 	uint64_t			end_strip = 0;
452 	int				ret;
453 
454 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
455 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
456 	raid_io->ch = ch;
457 	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
458 	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
459 		    raid_bdev->strip_size_shift;
460 	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
461 		assert(false);
462 		SPDK_ERRLOG("I/O spans strip boundary!\n");
463 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
464 		return;
465 	}
466 	ret = raid_bdev_submit_rw_request(bdev_io, start_strip);
467 	if (ret != 0) {
468 		raid_bdev_io_submit_fail_process(raid_bdev, bdev_io, raid_io, ret);
469 	}
470 }
471 
472 /*
473  * brief:
474  * raid_bdev_base_io_completion is the completion callback for member disk requests
475  * params:
476  * bdev_io - pointer to member disk requested bdev_io
477  * success - true if successful, false if unsuccessful
478  * cb_arg - callback argument (parent raid bdev_io)
479  * returns:
480  * none
481  */
482 static void
483 raid_bdev_base_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
484 {
485 	struct spdk_bdev_io *parent_io = cb_arg;
486 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)parent_io->driver_ctx;
487 
488 	spdk_bdev_free_io(bdev_io);
489 
490 	if (!success) {
491 		raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED;
492 	}
493 
494 	raid_io->base_bdev_io_completed++;
495 	if (raid_io->base_bdev_io_completed == raid_io->base_bdev_io_expected) {
496 		spdk_bdev_io_complete(parent_io, raid_io->base_bdev_io_status);
497 	}
498 }
499 
500 /*
501  * brief:
502  * raid_bdev_base_io_submit_fail_process processes IO requests for member disk
503  * which failed to submit
504  * params:
505  * raid_bdev_io - pointer to raid bdev_io
506  * pd_idx - base_dev index in raid_bdev
507  * cb_fn - callback when the spdk_bdev_io for base_bdev becomes available
508  * ret - return code
509  * returns:
510  * none
511  */
512 static void
513 raid_bdev_base_io_submit_fail_process(struct spdk_bdev_io *raid_bdev_io, uint8_t pd_idx,
514 				      spdk_bdev_io_wait_cb cb_fn, int ret)
515 {
516 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)raid_bdev_io->driver_ctx;
517 	struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
518 	struct raid_bdev *raid_bdev = (struct raid_bdev *)raid_bdev_io->bdev->ctxt;
519 
520 	assert(ret != 0);
521 
522 	if (ret == -ENOMEM) {
523 		raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[pd_idx].bdev;
524 		raid_io->waitq_entry.cb_fn = cb_fn;
525 		raid_io->waitq_entry.cb_arg = raid_bdev_io;
526 		spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[pd_idx].bdev,
527 					raid_ch->base_channel[pd_idx],
528 					&raid_io->waitq_entry);
529 		return;
530 	}
531 
532 	SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
533 	assert(false);
534 	spdk_bdev_io_complete(raid_bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
535 }
536 
537 /*
538  * brief:
539  * _raid_bdev_submit_reset_request_next function submits the next batch of reset requests
540  * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in
541  * which case it will queue it for later submission
542  * params:
543  * bdev_io - pointer to parent bdev_io on raid bdev device
544  * returns:
545  * none
546  */
547 static void
548 _raid_bdev_submit_reset_request_next(void *_bdev_io)
549 {
550 	struct spdk_bdev_io		*bdev_io = _bdev_io;
551 	struct raid_bdev_io		*raid_io;
552 	struct raid_bdev		*raid_bdev;
553 	struct raid_bdev_io_channel	*raid_ch;
554 	int				ret;
555 	uint8_t				i;
556 
557 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
558 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
559 	raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
560 
561 	while (raid_io->base_bdev_io_submitted < raid_bdev->num_base_bdevs) {
562 		i = raid_io->base_bdev_io_submitted;
563 		ret = spdk_bdev_reset(raid_bdev->base_bdev_info[i].desc,
564 				      raid_ch->base_channel[i],
565 				      raid_bdev_base_io_completion, bdev_io);
566 		if (ret == 0) {
567 			raid_io->base_bdev_io_submitted++;
568 		} else {
569 			raid_bdev_base_io_submit_fail_process(bdev_io, i,
570 							      _raid_bdev_submit_reset_request_next, ret);
571 			return;
572 		}
573 	}
574 }
575 
576 /*
577  * brief:
578  * _raid_bdev_submit_reset_request function is the submit_request function for
579  * reset requests
580  * params:
581  * ch - pointer to raid bdev io channel
582  * bdev_io - pointer to parent bdev_io on raid bdev device
583  * returns:
584  * none
585  */
586 static void
587 _raid_bdev_submit_reset_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
588 {
589 	struct raid_bdev_io		*raid_io;
590 	struct raid_bdev		*raid_bdev;
591 
592 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
593 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
594 	raid_io->ch = ch;
595 	raid_io->base_bdev_io_submitted = 0;
596 	raid_io->base_bdev_io_completed = 0;
597 	raid_io->base_bdev_io_expected = raid_bdev->num_base_bdevs;
598 	raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
599 	_raid_bdev_submit_reset_request_next(bdev_io);
600 }
601 
/*
 * raid0 IO range: describes how a block range of the raid bdev maps onto the
 * base bdevs. Strip indexes with the "_in_disk" suffix are relative to one
 * base bdev, offsets are in blocks relative to the start of a strip.
 */
struct raid_bdev_io_range {
	/* Strip size in blocks */
	uint64_t	strip_size;
	/* Per-disk strip index of the first and the last strip of the range */
	uint64_t	start_strip_in_disk;
	uint64_t	end_strip_in_disk;
	/* Offset of the first and the last block inside their strips */
	uint64_t	start_offset_in_strip;
	uint64_t	end_offset_in_strip;
	/* Base bdev indexes that hold the first and the last strip */
	uint8_t		start_disk;
	uint8_t		end_disk;
	/* Number of base bdevs touched by the range */
	uint8_t		n_disks_involved;
};

/*
 * brief:
 * _raid_bdev_get_io_range fills io_range for the block range
 * [offset_blocks, offset_blocks + num_blocks) of a raid bdev.
 * params:
 * io_range - output, range description
 * num_base_bdevs - number of base bdevs of the raid bdev
 * strip_size - strip size in blocks
 * strip_size_shift - shift that converts a block offset to a strip index
 * offset_blocks - first block of the range
 * num_blocks - number of blocks in the range
 * returns:
 * none
 */
static inline void
_raid_bdev_get_io_range(struct raid_bdev_io_range *io_range,
			uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
			uint64_t offset_blocks, uint64_t num_blocks)
{
	const uint64_t	last_block = offset_blocks + num_blocks - 1;
	/* First and last strip index in raid0 bdev scope */
	const uint64_t	first_strip = offset_blocks >> strip_size_shift;
	const uint64_t	last_strip = last_block >> strip_size_shift;
	const uint64_t	strips_spanned = last_strip - first_strip + 1;

	io_range->strip_size = strip_size;

	/* Strips rotate over the base bdevs: quotient is the per-disk strip, remainder the disk */
	io_range->start_strip_in_disk = first_strip / num_base_bdevs;
	io_range->start_disk = first_strip % num_base_bdevs;
	io_range->end_strip_in_disk = last_strip / num_base_bdevs;
	io_range->end_disk = last_strip % num_base_bdevs;

	/*
	 * Only the first and the last strip can be partially covered; all
	 * strips in between are covered from boundary to boundary.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = last_block % strip_size;

	/*
	 * Between 1 and num_base_bdevs disks take part in the operation; it
	 * is 1 when the first and the last strip are the same one.
	 */
	io_range->n_disks_involved = (strips_spanned < num_base_bdevs) ? strips_spanned :
				     num_base_bdevs;
}
647 
648 static inline void
649 _raid_bdev_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
650 			  uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
651 {
652 	uint64_t n_strips_in_disk;
653 	uint64_t start_offset_in_disk;
654 	uint64_t end_offset_in_disk;
655 	uint64_t offset_in_disk;
656 	uint64_t nblocks_in_disk;
657 	uint64_t start_strip_in_disk;
658 	uint64_t end_strip_in_disk;
659 
660 	start_strip_in_disk = io_range->start_strip_in_disk;
661 	if (disk_idx < io_range->start_disk) {
662 		start_strip_in_disk += 1;
663 	}
664 
665 	end_strip_in_disk = io_range->end_strip_in_disk;
666 	if (disk_idx > io_range->end_disk) {
667 		end_strip_in_disk -= 1;
668 	}
669 
670 	assert(end_strip_in_disk >= start_strip_in_disk);
671 	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;
672 
673 	if (disk_idx == io_range->start_disk) {
674 		start_offset_in_disk = io_range->start_offset_in_strip;
675 	} else {
676 		start_offset_in_disk = 0;
677 	}
678 
679 	if (disk_idx == io_range->end_disk) {
680 		end_offset_in_disk = io_range->end_offset_in_strip;
681 	} else {
682 		end_offset_in_disk = io_range->strip_size - 1;
683 	}
684 
685 	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
686 	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
687 			  + end_offset_in_disk - start_offset_in_disk + 1;
688 
689 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID,
690 		      "raid_bdev (strip_size 0x%lx) splits IO to base_bdev (%u) at (0x%lx, 0x%lx).\n",
691 		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);
692 
693 	*_offset_in_disk = offset_in_disk;
694 	*_nblocks_in_disk = nblocks_in_disk;
695 }
696 
697 /*
698  * brief:
699  * _raid_bdev_submit_null_payload_request_next function submits the next batch of
700  * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
701  * it will submit as many as possible unless one base io request fails with -ENOMEM,
702  * in which case it will queue itself for later submission.
703  * params:
704  * bdev_io - pointer to parent bdev_io on raid bdev device
705  * returns:
706  * none
707  */
708 static void
709 _raid_bdev_submit_null_payload_request_next(void *_bdev_io)
710 {
711 	struct spdk_bdev_io		*bdev_io = _bdev_io;
712 	struct raid_bdev_io		*raid_io;
713 	struct raid_bdev		*raid_bdev;
714 	struct raid_bdev_io_channel	*raid_ch;
715 	struct raid_bdev_io_range	io_range;
716 	int				ret;
717 
718 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
719 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
720 	raid_ch = spdk_io_channel_get_ctx(raid_io->ch);
721 
722 	_raid_bdev_get_io_range(&io_range, raid_bdev->num_base_bdevs,
723 				raid_bdev->strip_size, raid_bdev->strip_size_shift,
724 				bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
725 
726 	raid_io->base_bdev_io_expected = io_range.n_disks_involved;
727 
728 	while (raid_io->base_bdev_io_submitted < raid_io->base_bdev_io_expected) {
729 		uint8_t disk_idx;
730 		uint64_t offset_in_disk;
731 		uint64_t nblocks_in_disk;
732 
733 		/* base_bdev is started from start_disk to end_disk.
734 		 * It is possible that index of start_disk is larger than end_disk's.
735 		 */
736 		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
737 
738 		_raid_bdev_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
739 
740 		switch (bdev_io->type) {
741 		case SPDK_BDEV_IO_TYPE_UNMAP:
742 			ret = spdk_bdev_unmap_blocks(raid_bdev->base_bdev_info[disk_idx].desc,
743 						     raid_ch->base_channel[disk_idx],
744 						     offset_in_disk, nblocks_in_disk,
745 						     raid_bdev_base_io_completion, bdev_io);
746 			break;
747 
748 		case SPDK_BDEV_IO_TYPE_FLUSH:
749 			ret = spdk_bdev_flush_blocks(raid_bdev->base_bdev_info[disk_idx].desc,
750 						     raid_ch->base_channel[disk_idx],
751 						     offset_in_disk, nblocks_in_disk,
752 						     raid_bdev_base_io_completion, bdev_io);
753 			break;
754 
755 		default:
756 			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
757 			assert(false);
758 			ret = -EIO;
759 		}
760 
761 		if (ret == 0) {
762 			raid_io->base_bdev_io_submitted++;
763 		} else {
764 			raid_bdev_base_io_submit_fail_process(bdev_io, disk_idx,
765 							      _raid_bdev_submit_null_payload_request_next, ret);
766 			return;
767 		}
768 	}
769 }
770 
771 /*
772  * brief:
773  * _raid_bdev_submit_null_payload_request function is the submit_request function
774  * for io requests with range but without payload, like UNMAP and FLUSH.
775  * params:
776  * ch - pointer to raid bdev io channel
777  * bdev_io - pointer to parent bdev_io on raid bdev device
778  * returns:
779  * none
780  */
781 static void
782 _raid_bdev_submit_null_payload_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
783 {
784 	struct raid_bdev_io		*raid_io;
785 
786 	raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
787 	raid_io->ch = ch;
788 	raid_io->base_bdev_io_submitted = 0;
789 	raid_io->base_bdev_io_completed = 0;
790 	raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
791 
792 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev: type %d, range (0x%lx, 0x%lx)\n",
793 		      bdev_io->type, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
794 
795 	_raid_bdev_submit_null_payload_request_next(bdev_io);
796 }
797 
798 /*
799  * brief:
800  * Callback function to spdk_bdev_io_get_buf.
801  * params:
802  * ch - pointer to raid bdev io channel
803  * bdev_io - pointer to parent bdev_io on raid bdev device
804  * success - True if buffer is allocated or false otherwise.
805  * returns:
806  * none
807  */
808 static void
809 raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
810 		     bool success)
811 {
812 	if (!success) {
813 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
814 		return;
815 	}
816 
817 	raid_bdev_start_rw_request(ch, bdev_io);
818 }
819 
820 /*
821  * brief:
822  * raid_bdev_submit_request function is the submit_request function pointer of
823  * raid bdev function table. This is used to submit the io on raid_bdev to below
824  * layers.
825  * params:
826  * ch - pointer to raid bdev io channel
827  * bdev_io - pointer to parent bdev_io on raid bdev device
828  * returns:
829  * none
830  */
831 static void
832 raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
833 {
834 	switch (bdev_io->type) {
835 	case SPDK_BDEV_IO_TYPE_READ:
836 		if (bdev_io->u.bdev.iovs == NULL || bdev_io->u.bdev.iovs[0].iov_base == NULL) {
837 			spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb,
838 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
839 		} else {
840 			/* Just call it directly if iov_base is already populated. */
841 			raid_bdev_start_rw_request(ch, bdev_io);
842 		}
843 		break;
844 	case SPDK_BDEV_IO_TYPE_WRITE:
845 		raid_bdev_start_rw_request(ch, bdev_io);
846 		break;
847 
848 	case SPDK_BDEV_IO_TYPE_RESET:
849 		_raid_bdev_submit_reset_request(ch, bdev_io);
850 		break;
851 
852 	case SPDK_BDEV_IO_TYPE_FLUSH:
853 	case SPDK_BDEV_IO_TYPE_UNMAP:
854 		_raid_bdev_submit_null_payload_request(ch, bdev_io);
855 		break;
856 
857 	default:
858 		SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type);
859 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
860 		break;
861 	}
862 
863 }
864 
865 /*
866  * brief:
867  * _raid_bdev_io_type_supported checks whether io_type is supported in
868  * all base bdev modules of raid bdev module. If anyone among the base_bdevs
869  * doesn't support, the raid device doesn't supports.
870  *
871  * params:
872  * raid_bdev - pointer to raid bdev context
873  * io_type - io type
874  * returns:
875  * true - io_type is supported
876  * false - io_type is not supported
877  */
878 inline static bool
879 _raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type)
880 {
881 	uint8_t i;
882 
883 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
884 		if (raid_bdev->base_bdev_info[i].bdev == NULL) {
885 			assert(false);
886 			continue;
887 		}
888 
889 		if (spdk_bdev_io_type_supported(raid_bdev->base_bdev_info[i].bdev,
890 						io_type) == false) {
891 			return false;
892 		}
893 	}
894 
895 	return true;
896 }
897 
898 /*
899  * brief:
900  * raid_bdev_io_type_supported is the io_supported function for bdev function
901  * table which returns whether the particular io type is supported or not by
902  * raid bdev module
903  * params:
904  * ctx - pointer to raid bdev context
905  * type - io type
906  * returns:
907  * true - io_type is supported
908  * false - io_type is not supported
909  */
910 static bool
911 raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
912 {
913 	switch (io_type) {
914 	case SPDK_BDEV_IO_TYPE_READ:
915 	case SPDK_BDEV_IO_TYPE_WRITE:
916 		return true;
917 
918 	case SPDK_BDEV_IO_TYPE_FLUSH:
919 	case SPDK_BDEV_IO_TYPE_RESET:
920 	case SPDK_BDEV_IO_TYPE_UNMAP:
921 		return _raid_bdev_io_type_supported(ctx, io_type);
922 
923 	default:
924 		return false;
925 	}
926 
927 	return false;
928 }
929 
/*
 * brief:
 * raid_bdev_get_io_channel is the get_io_channel entry of the raid bdev
 * function table. It asks spdk_get_io_channel for a channel using the
 * raid_bdev pointer as the lookup key.
 * params:
 * ctxt - pointer to raid_bdev
 * returns:
 * pointer to io channel for raid bdev
 */
static struct spdk_io_channel *
raid_bdev_get_io_channel(void *ctxt)
{
	/* ctxt is the raid_bdev pointer; it is handed through unchanged. */
	return spdk_get_io_channel(ctxt);
}
946 
947 /*
948  * brief:
949  * raid_bdev_dump_info_json is the function table pointer for raid bdev
950  * params:
951  * ctx - pointer to raid_bdev
952  * w - pointer to json context
953  * returns:
954  * 0 - success
955  * non zero - failure
956  */
957 static int
958 raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
959 {
960 	struct raid_bdev *raid_bdev = ctx;
961 
962 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_dump_config_json\n");
963 	assert(raid_bdev != NULL);
964 
965 	/* Dump the raid bdev configuration related information */
966 	spdk_json_write_named_object_begin(w, "raid");
967 	spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size);
968 	spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb);
969 	spdk_json_write_named_uint32(w, "state", raid_bdev->state);
970 	spdk_json_write_named_uint32(w, "raid_level", raid_bdev->raid_level);
971 	spdk_json_write_named_uint32(w, "destruct_called", raid_bdev->destruct_called);
972 	spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs);
973 	spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered);
974 	spdk_json_write_name(w, "base_bdevs_list");
975 	spdk_json_write_array_begin(w);
976 	for (uint8_t i = 0; i < raid_bdev->num_base_bdevs; i++) {
977 		if (raid_bdev->base_bdev_info[i].bdev) {
978 			spdk_json_write_string(w, raid_bdev->base_bdev_info[i].bdev->name);
979 		} else {
980 			spdk_json_write_null(w);
981 		}
982 	}
983 	spdk_json_write_array_end(w);
984 	spdk_json_write_object_end(w);
985 
986 	return 0;
987 }
988 
989 /*
990  * brief:
991  * raid_bdev_write_config_json is the function table pointer for raid bdev
992  * params:
993  * bdev - pointer to spdk_bdev
994  * w - pointer to json context
995  * returns:
996  * none
997  */
998 static void
999 raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1000 {
1001 	struct raid_bdev *raid_bdev = bdev->ctxt;
1002 	struct spdk_bdev *base;
1003 	uint8_t i;
1004 
1005 	spdk_json_write_object_begin(w);
1006 
1007 	spdk_json_write_named_string(w, "method", "construct_raid_bdev");
1008 
1009 	spdk_json_write_named_object_begin(w, "params");
1010 	spdk_json_write_named_string(w, "name", bdev->name);
1011 	spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size_kb);
1012 	spdk_json_write_named_uint32(w, "raid_level", raid_bdev->raid_level);
1013 
1014 	spdk_json_write_named_array_begin(w, "base_bdevs");
1015 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1016 		base = raid_bdev->base_bdev_info[i].bdev;
1017 		if (base) {
1018 			spdk_json_write_string(w, base->name);
1019 		}
1020 	}
1021 	spdk_json_write_array_end(w);
1022 	spdk_json_write_object_end(w);
1023 
1024 	spdk_json_write_object_end(w);
1025 }
1026 
1027 /* g_raid_bdev_fn_table is the function table for raid bdev */
1028 static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = {
1029 	.destruct		= raid_bdev_destruct,
1030 	.submit_request		= raid_bdev_submit_request,
1031 	.io_type_supported	= raid_bdev_io_type_supported,
1032 	.get_io_channel		= raid_bdev_get_io_channel,
1033 	.dump_info_json		= raid_bdev_dump_info_json,
1034 	.write_config_json	= raid_bdev_write_config_json,
1035 };
1036 
1037 /*
1038  * brief:
1039  * raid_bdev_config_cleanup function is used to free memory for one raid_bdev in configuration
1040  * params:
1041  * raid_cfg - pointer to raid_bdev_config structure
1042  * returns:
1043  * none
1044  */
1045 void
1046 raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg)
1047 {
1048 	uint8_t i;
1049 
1050 	TAILQ_REMOVE(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
1051 	g_raid_config.total_raid_bdev--;
1052 
1053 	if (raid_cfg->base_bdev) {
1054 		for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1055 			free(raid_cfg->base_bdev[i].name);
1056 		}
1057 		free(raid_cfg->base_bdev);
1058 	}
1059 	free(raid_cfg->name);
1060 	free(raid_cfg);
1061 }
1062 
1063 /*
1064  * brief:
1065  * raid_bdev_free is the raid bdev function table function pointer. This is
1066  * called on bdev free path
1067  * params:
1068  * none
1069  * returns:
1070  * none
1071  */
1072 static void
1073 raid_bdev_free(void)
1074 {
1075 	struct raid_bdev_config *raid_cfg, *tmp;
1076 
1077 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_free\n");
1078 	TAILQ_FOREACH_SAFE(raid_cfg, &g_raid_config.raid_bdev_config_head, link, tmp) {
1079 		raid_bdev_config_cleanup(raid_cfg);
1080 	}
1081 }
1082 
1083 /* brief
1084  * raid_bdev_config_find_by_name is a helper function to find raid bdev config
1085  * by name as key.
1086  *
1087  * params:
1088  * raid_name - name for raid bdev.
1089  */
1090 struct raid_bdev_config *
1091 raid_bdev_config_find_by_name(const char *raid_name)
1092 {
1093 	struct raid_bdev_config *raid_cfg;
1094 
1095 	TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
1096 		if (!strcmp(raid_cfg->name, raid_name)) {
1097 			return raid_cfg;
1098 		}
1099 	}
1100 
1101 	return raid_cfg;
1102 }
1103 
1104 /*
1105  * brief
1106  * raid_bdev_config_add function adds config for newly created raid bdev.
1107  *
1108  * params:
1109  * raid_name - name for raid bdev.
1110  * strip_size - strip size in KB
1111  * num_base_bdevs - number of base bdevs.
1112  * raid_level - raid level, only raid level 0 is supported.
1113  * _raid_cfg - Pointer to newly added configuration
1114  */
1115 int
1116 raid_bdev_config_add(const char *raid_name, uint32_t strip_size, uint8_t num_base_bdevs,
1117 		     uint8_t raid_level, struct raid_bdev_config **_raid_cfg)
1118 {
1119 	struct raid_bdev_config *raid_cfg;
1120 
1121 	raid_cfg = raid_bdev_config_find_by_name(raid_name);
1122 	if (raid_cfg != NULL) {
1123 		SPDK_ERRLOG("Duplicate raid bdev name found in config file %s\n",
1124 			    raid_name);
1125 		return -EEXIST;
1126 	}
1127 
1128 	if (spdk_u32_is_pow2(strip_size) == false) {
1129 		SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size);
1130 		return -EINVAL;
1131 	}
1132 
1133 	if (num_base_bdevs == 0) {
1134 		SPDK_ERRLOG("Invalid base device count %u\n", num_base_bdevs);
1135 		return -EINVAL;
1136 	}
1137 
1138 	if (raid_level != 0) {
1139 		SPDK_ERRLOG("invalid raid level %u, only raid level 0 is supported\n",
1140 			    raid_level);
1141 		return -EINVAL;
1142 	}
1143 
1144 	raid_cfg = calloc(1, sizeof(*raid_cfg));
1145 	if (raid_cfg == NULL) {
1146 		SPDK_ERRLOG("unable to allocate memory\n");
1147 		return -ENOMEM;
1148 	}
1149 
1150 	raid_cfg->name = strdup(raid_name);
1151 	if (!raid_cfg->name) {
1152 		free(raid_cfg);
1153 		SPDK_ERRLOG("unable to allocate memory\n");
1154 		return -ENOMEM;
1155 	}
1156 	raid_cfg->strip_size = strip_size;
1157 	raid_cfg->num_base_bdevs = num_base_bdevs;
1158 	raid_cfg->raid_level = raid_level;
1159 
1160 	raid_cfg->base_bdev = calloc(num_base_bdevs, sizeof(*raid_cfg->base_bdev));
1161 	if (raid_cfg->base_bdev == NULL) {
1162 		free(raid_cfg->name);
1163 		free(raid_cfg);
1164 		SPDK_ERRLOG("unable to allocate memory\n");
1165 		return -ENOMEM;
1166 	}
1167 
1168 	TAILQ_INSERT_TAIL(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
1169 	g_raid_config.total_raid_bdev++;
1170 
1171 	*_raid_cfg = raid_cfg;
1172 	return 0;
1173 }
1174 
1175 /*
1176  * brief:
1177  * raid_bdev_config_add_base_bdev function add base bdev to raid bdev config.
1178  *
1179  * params:
1180  * raid_cfg - pointer to raid bdev configuration
1181  * base_bdev_name - name of base bdev
1182  * slot - Position to add base bdev
1183  */
1184 int
1185 raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, const char *base_bdev_name,
1186 			       uint8_t slot)
1187 {
1188 	uint8_t i;
1189 	struct raid_bdev_config *tmp;
1190 
1191 	if (slot >= raid_cfg->num_base_bdevs) {
1192 		return -EINVAL;
1193 	}
1194 
1195 	TAILQ_FOREACH(tmp, &g_raid_config.raid_bdev_config_head, link) {
1196 		for (i = 0; i < tmp->num_base_bdevs; i++) {
1197 			if (tmp->base_bdev[i].name != NULL) {
1198 				if (!strcmp(tmp->base_bdev[i].name, base_bdev_name)) {
1199 					SPDK_ERRLOG("duplicate base bdev name %s mentioned\n",
1200 						    base_bdev_name);
1201 					return -EEXIST;
1202 				}
1203 			}
1204 		}
1205 	}
1206 
1207 	raid_cfg->base_bdev[slot].name = strdup(base_bdev_name);
1208 	if (raid_cfg->base_bdev[slot].name == NULL) {
1209 		SPDK_ERRLOG("unable to allocate memory\n");
1210 		return -ENOMEM;
1211 	}
1212 
1213 	return 0;
1214 }
1215 /*
1216  * brief:
1217  * raid_bdev_parse_raid is used to parse the raid bdev from config file based on
1218  * pre-defined raid bdev format in config file.
1219  * Format of config file:
1220  *   [RAID1]
1221  *   Name raid1
1222  *   StripSize 64
1223  *   NumDevices 2
1224  *   RaidLevel 0
1225  *   Devices Nvme0n1 Nvme1n1
1226  *
1227  *   [RAID2]
1228  *   Name raid2
1229  *   StripSize 64
1230  *   NumDevices 3
1231  *   RaidLevel 0
1232  *   Devices Nvme2n1 Nvme3n1 Nvme4n1
1233  *
1234  * params:
1235  * conf_section - pointer to config section
1236  * returns:
1237  * 0 - success
1238  * non zero - failure
1239  */
1240 static int
1241 raid_bdev_parse_raid(struct spdk_conf_section *conf_section)
1242 {
1243 	const char *raid_name;
1244 	uint32_t strip_size;
1245 	uint8_t num_base_bdevs, raid_level;
1246 	const char *base_bdev_name;
1247 	struct raid_bdev_config *raid_cfg;
1248 	int rc, i, val;
1249 
1250 	raid_name = spdk_conf_section_get_val(conf_section, "Name");
1251 	if (raid_name == NULL) {
1252 		SPDK_ERRLOG("raid_name is null\n");
1253 		return -EINVAL;
1254 	}
1255 
1256 	val = spdk_conf_section_get_intval(conf_section, "StripSize");
1257 	if (val < 0) {
1258 		return -EINVAL;
1259 	}
1260 	strip_size = val;
1261 
1262 	val = spdk_conf_section_get_intval(conf_section, "NumDevices");
1263 	if (val < 0) {
1264 		return -EINVAL;
1265 	}
1266 	num_base_bdevs = val;
1267 
1268 	val = spdk_conf_section_get_intval(conf_section, "RaidLevel");
1269 	if (val < 0) {
1270 		return -EINVAL;
1271 	}
1272 	raid_level = val;
1273 
1274 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "%s %" PRIu32 " %u %u\n",
1275 		      raid_name, strip_size, num_base_bdevs, raid_level);
1276 
1277 	rc = raid_bdev_config_add(raid_name, strip_size, num_base_bdevs, raid_level,
1278 				  &raid_cfg);
1279 	if (rc != 0) {
1280 		SPDK_ERRLOG("Failed to add raid bdev config\n");
1281 		return rc;
1282 	}
1283 
1284 	for (i = 0; true; i++) {
1285 		base_bdev_name = spdk_conf_section_get_nmval(conf_section, "Devices", 0, i);
1286 		if (base_bdev_name == NULL) {
1287 			break;
1288 		}
1289 		if (i >= num_base_bdevs) {
1290 			raid_bdev_config_cleanup(raid_cfg);
1291 			SPDK_ERRLOG("Number of devices mentioned is more than count\n");
1292 			return -EINVAL;
1293 		}
1294 
1295 		rc = raid_bdev_config_add_base_bdev(raid_cfg, base_bdev_name, i);
1296 		if (rc != 0) {
1297 			raid_bdev_config_cleanup(raid_cfg);
1298 			SPDK_ERRLOG("Failed to add base bdev to raid bdev config\n");
1299 			return rc;
1300 		}
1301 	}
1302 
1303 	if (i != raid_cfg->num_base_bdevs) {
1304 		raid_bdev_config_cleanup(raid_cfg);
1305 		SPDK_ERRLOG("Number of devices mentioned is less than count\n");
1306 		return -EINVAL;
1307 	}
1308 
1309 	rc = raid_bdev_create(raid_cfg);
1310 	if (rc != 0) {
1311 		raid_bdev_config_cleanup(raid_cfg);
1312 		SPDK_ERRLOG("Failed to create raid bdev\n");
1313 		return rc;
1314 	}
1315 
1316 	rc = raid_bdev_add_base_devices(raid_cfg);
1317 	if (rc != 0) {
1318 		SPDK_ERRLOG("Failed to add any base bdev to raid bdev\n");
1319 		/* Config is not removed in this case. */
1320 	}
1321 
1322 	return 0;
1323 }
1324 
/*
 * brief:
 * raid_bdev_parse_config walks all config sections and parses every section
 * whose name matches the "RAID" prefix with raid_bdev_parse_raid. Parsing
 * stops at the first section which fails.
 * params:
 * none
 * returns:
 * 0 - success
 * non zero - failure
 */
static int
raid_bdev_parse_config(void)
{
	struct spdk_conf_section *section;
	int rc;

	for (section = spdk_conf_first_section(NULL); section != NULL;
	     section = spdk_conf_next_section(section)) {
		if (!spdk_conf_section_match_prefix(section, "RAID")) {
			continue;
		}

		rc = raid_bdev_parse_raid(section);
		if (rc < 0) {
			SPDK_ERRLOG("Unable to parse raid bdev section\n");
			return rc;
		}
	}

	return 0;
}
1355 
1356 /*
1357  * brief:
1358  * raid_bdev_fini_start is called when bdev layer is starting the
1359  * shutdown process
1360  * params:
1361  * none
1362  * returns:
1363  * none
1364  */
1365 static void
1366 raid_bdev_fini_start(void)
1367 {
1368 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_fini_start\n");
1369 	g_shutdown_started = true;
1370 }
1371 
1372 /*
1373  * brief:
1374  * raid_bdev_exit is called on raid bdev module exit time by bdev layer
1375  * params:
1376  * none
1377  * returns:
1378  * none
1379  */
1380 static void
1381 raid_bdev_exit(void)
1382 {
1383 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_exit\n");
1384 	raid_bdev_free();
1385 }
1386 
1387 /*
1388  * brief:
1389  * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid
1390  * module
1391  * params:
1392  * none
1393  * returns:
1394  * size of spdk_bdev_io context for raid
1395  */
1396 static int
1397 raid_bdev_get_ctx_size(void)
1398 {
1399 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_get_ctx_size\n");
1400 	return sizeof(struct raid_bdev_io);
1401 }
1402 
1403 /*
1404  * brief:
1405  * raid_bdev_get_running_config is used to get the configuration options.
1406  *
1407  * params:
1408  * fp - The pointer to a file that will be written to the configuration options.
1409  * returns:
1410  * none
1411  */
1412 static void
1413 raid_bdev_get_running_config(FILE *fp)
1414 {
1415 	struct raid_bdev *raid_bdev;
1416 	struct spdk_bdev *base;
1417 	int index = 1;
1418 	uint8_t i;
1419 
1420 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configured_list, state_link) {
1421 		fprintf(fp,
1422 			"\n"
1423 			"[RAID%d]\n"
1424 			"  Name %s\n"
1425 			"  StripSize %" PRIu32 "\n"
1426 			"  NumDevices %u\n"
1427 			"  RaidLevel %hhu\n",
1428 			index, raid_bdev->bdev.name, raid_bdev->strip_size_kb,
1429 			raid_bdev->num_base_bdevs, raid_bdev->raid_level);
1430 		fprintf(fp,
1431 			"  Devices ");
1432 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1433 			base = raid_bdev->base_bdev_info[i].bdev;
1434 			if (base) {
1435 				fprintf(fp,
1436 					"%s ",
1437 					base->name);
1438 			}
1439 		}
1440 		fprintf(fp,
1441 			"\n");
1442 		index++;
1443 	}
1444 }
1445 
1446 /*
1447  * brief:
1448  * raid_bdev_can_claim_bdev is the function to check if this base_bdev can be
1449  * claimed by raid bdev or not.
1450  * params:
1451  * bdev_name - represents base bdev name
1452  * _raid_cfg - pointer to raid bdev config parsed from config file
1453  * base_bdev_slot - if bdev can be claimed, it represents the base_bdev correct
1454  * slot. This field is only valid if return value of this function is true
1455  * returns:
1456  * true - if bdev can be claimed
1457  * false - if bdev can't be claimed
1458  */
1459 static bool
1460 raid_bdev_can_claim_bdev(const char *bdev_name, struct raid_bdev_config **_raid_cfg,
1461 			 uint8_t *base_bdev_slot)
1462 {
1463 	struct raid_bdev_config *raid_cfg;
1464 	uint8_t i;
1465 
1466 	TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
1467 		for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1468 			/*
1469 			 * Check if the base bdev name is part of raid bdev configuration.
1470 			 * If match is found then return true and the slot information where
1471 			 * this base bdev should be inserted in raid bdev
1472 			 */
1473 			if (!strcmp(bdev_name, raid_cfg->base_bdev[i].name)) {
1474 				*_raid_cfg = raid_cfg;
1475 				*base_bdev_slot = i;
1476 				return true;
1477 			}
1478 		}
1479 	}
1480 
1481 	return false;
1482 }
1483 
1484 
1485 static struct spdk_bdev_module g_raid_if = {
1486 	.name = "raid",
1487 	.module_init = raid_bdev_init,
1488 	.fini_start = raid_bdev_fini_start,
1489 	.module_fini = raid_bdev_exit,
1490 	.get_ctx_size = raid_bdev_get_ctx_size,
1491 	.examine_config = raid_bdev_examine,
1492 	.config_text = raid_bdev_get_running_config,
1493 	.async_init = false,
1494 	.async_fini = false,
1495 };
1496 SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if)
1497 
1498 /*
1499  * brief:
1500  * raid_bdev_init is the initialization function for raid bdev module
1501  * params:
1502  * none
1503  * returns:
1504  * 0 - success
1505  * non zero - failure
1506  */
1507 static int
1508 raid_bdev_init(void)
1509 {
1510 	int ret;
1511 
1512 	/* Parse config file for raids */
1513 	ret = raid_bdev_parse_config();
1514 	if (ret < 0) {
1515 		SPDK_ERRLOG("raid bdev init failed parsing\n");
1516 		raid_bdev_free();
1517 		return ret;
1518 	}
1519 
1520 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_init completed successfully\n");
1521 
1522 	return 0;
1523 }
1524 
1525 /*
1526  * brief:
1527  * raid_bdev_create allocates raid bdev based on passed configuration
1528  * params:
1529  * raid_cfg - configuration of raid bdev
1530  * returns:
1531  * 0 - success
1532  * non zero - failure
1533  */
1534 int
1535 raid_bdev_create(struct raid_bdev_config *raid_cfg)
1536 {
1537 	struct raid_bdev *raid_bdev;
1538 	struct spdk_bdev *raid_bdev_gen;
1539 
1540 	raid_bdev = calloc(1, sizeof(*raid_bdev));
1541 	if (!raid_bdev) {
1542 		SPDK_ERRLOG("Unable to allocate memory for raid bdev\n");
1543 		return -ENOMEM;
1544 	}
1545 
1546 	assert(raid_cfg->num_base_bdevs != 0);
1547 	raid_bdev->num_base_bdevs = raid_cfg->num_base_bdevs;
1548 	raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs,
1549 					   sizeof(struct raid_base_bdev_info));
1550 	if (!raid_bdev->base_bdev_info) {
1551 		SPDK_ERRLOG("Unable able to allocate base bdev info\n");
1552 		free(raid_bdev);
1553 		return -ENOMEM;
1554 	}
1555 
1556 	/* strip_size_kb is from the rpc param.  strip_size is in blocks and used
1557 	 * intnerally and set later.
1558 	 */
1559 	raid_bdev->strip_size = 0;
1560 	raid_bdev->strip_size_kb = raid_cfg->strip_size;
1561 	raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1562 	raid_bdev->config = raid_cfg;
1563 
1564 	raid_bdev_gen = &raid_bdev->bdev;
1565 
1566 	raid_bdev_gen->name = strdup(raid_cfg->name);
1567 	if (!raid_bdev_gen->name) {
1568 		SPDK_ERRLOG("Unable to allocate name for raid\n");
1569 		free(raid_bdev->base_bdev_info);
1570 		free(raid_bdev);
1571 		return -ENOMEM;
1572 	}
1573 
1574 	raid_bdev_gen->product_name = "Raid Volume";
1575 	raid_bdev_gen->ctxt = raid_bdev;
1576 	raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
1577 	raid_bdev_gen->module = &g_raid_if;
1578 	raid_bdev_gen->write_cache = 0;
1579 
1580 	TAILQ_INSERT_TAIL(&g_raid_bdev_configuring_list, raid_bdev, state_link);
1581 	TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link);
1582 
1583 	raid_cfg->raid_bdev = raid_bdev;
1584 
1585 	return 0;
1586 }
1587 
1588 /*
1589  * brief
1590  * raid_bdev_alloc_base_bdev_resource allocates resource of base bdev.
1591  * params:
1592  * raid_bdev - pointer to raid bdev
1593  * bdev - pointer to base bdev
1594  * base_bdev_slot - position to add base bdev
1595  * returns:
1596  * 0 - success
1597  * non zero - failure
1598  */
1599 static int
1600 raid_bdev_alloc_base_bdev_resource(struct raid_bdev *raid_bdev, struct spdk_bdev *bdev,
1601 				   uint8_t base_bdev_slot)
1602 {
1603 	struct spdk_bdev_desc *desc;
1604 	int rc;
1605 
1606 	rc = spdk_bdev_open(bdev, true, raid_bdev_remove_base_bdev, bdev, &desc);
1607 	if (rc != 0) {
1608 		SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", bdev->name);
1609 		return rc;
1610 	}
1611 
1612 	rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if);
1613 	if (rc != 0) {
1614 		SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n");
1615 		spdk_bdev_close(desc);
1616 		return rc;
1617 	}
1618 
1619 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s is claimed\n", bdev->name);
1620 
1621 	assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE);
1622 	assert(base_bdev_slot < raid_bdev->num_base_bdevs);
1623 
1624 	raid_bdev->base_bdev_info[base_bdev_slot].bdev = bdev;
1625 	raid_bdev->base_bdev_info[base_bdev_slot].desc = desc;
1626 	raid_bdev->num_base_bdevs_discovered++;
1627 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1628 
1629 	return 0;
1630 }
1631 
1632 /*
1633  * brief:
1634  * If raid bdev config is complete, then only register the raid bdev to
1635  * bdev layer and remove this raid bdev from configuring list and
1636  * insert the raid bdev to configured list
1637  * params:
1638  * raid_bdev - pointer to raid bdev
1639  * returns:
1640  * 0 - success
1641  * non zero - failure
1642  */
1643 static int
1644 raid_bdev_configure(struct raid_bdev *raid_bdev)
1645 {
1646 	uint32_t		blocklen;
1647 	uint64_t		min_blockcnt;
1648 	struct spdk_bdev	*raid_bdev_gen;
1649 	int rc = 0;
1650 
1651 	blocklen = raid_bdev->base_bdev_info[0].bdev->blocklen;
1652 	min_blockcnt = raid_bdev->base_bdev_info[0].bdev->blockcnt;
1653 	for (uint8_t i = 1; i < raid_bdev->num_base_bdevs; i++) {
1654 		/* Calculate minimum block count from all base bdevs */
1655 		if (raid_bdev->base_bdev_info[i].bdev->blockcnt < min_blockcnt) {
1656 			min_blockcnt = raid_bdev->base_bdev_info[i].bdev->blockcnt;
1657 		}
1658 
1659 		/* Check blocklen for all base bdevs that it should be same */
1660 		if (blocklen != raid_bdev->base_bdev_info[i].bdev->blocklen) {
1661 			/*
1662 			 * Assumption is that all the base bdevs for any raid bdev should
1663 			 * have same blocklen
1664 			 */
1665 			SPDK_ERRLOG("Blocklen of various bdevs not matching\n");
1666 			return -EINVAL;
1667 		}
1668 	}
1669 
1670 	/* The strip_size_kb is read in from user in KB. Convert to blocks here for
1671 	 * internal use.
1672 	 */
1673 	raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen;
1674 	raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
1675 	raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
1676 
1677 	raid_bdev_gen = &raid_bdev->bdev;
1678 	raid_bdev_gen->blocklen = blocklen;
1679 	if (raid_bdev->num_base_bdevs > 1) {
1680 		raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
1681 		raid_bdev_gen->split_on_optimal_io_boundary = true;
1682 	} else {
1683 		/* Do not need to split reads/writes on single bdev RAID modules. */
1684 		raid_bdev_gen->optimal_io_boundary = 0;
1685 		raid_bdev_gen->split_on_optimal_io_boundary = false;
1686 	}
1687 
1688 	/*
1689 	 * RAID bdev logic is for striping so take the minimum block count based
1690 	 * approach where total block count of raid bdev is the number of base
1691 	 * bdev times the minimum block count of any base bdev
1692 	 */
1693 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "min blockcount %lu,  numbasedev %u, strip size shift %u\n",
1694 		      min_blockcnt,
1695 		      raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
1696 	raid_bdev_gen->blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
1697 				   raid_bdev->strip_size_shift)  * raid_bdev->num_base_bdevs;
1698 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "io device register %p\n", raid_bdev);
1699 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "blockcnt %lu, blocklen %u\n", raid_bdev_gen->blockcnt,
1700 		      raid_bdev_gen->blocklen);
1701 	if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1702 		raid_bdev->state = RAID_BDEV_STATE_ONLINE;
1703 		spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb,
1704 					sizeof(struct raid_bdev_io_channel),
1705 					raid_bdev->bdev.name);
1706 		rc = spdk_bdev_register(raid_bdev_gen);
1707 		if (rc != 0) {
1708 			SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n");
1709 			spdk_io_device_unregister(raid_bdev, NULL);
1710 			raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1711 			return rc;
1712 		}
1713 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev generic %p\n", raid_bdev_gen);
1714 		TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
1715 		TAILQ_INSERT_TAIL(&g_raid_bdev_configured_list, raid_bdev, state_link);
1716 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev is created with name %s, raid_bdev %p\n",
1717 			      raid_bdev_gen->name, raid_bdev);
1718 	}
1719 
1720 	return 0;
1721 }
1722 
1723 /*
1724  * brief:
1725  * If raid bdev is online and registered, change the bdev state to
1726  * configuring and unregister this raid device. Queue this raid device
1727  * in configuring list
1728  * params:
1729  * raid_bdev - pointer to raid bdev
1730  * cb_fn - callback function
1731  * cb_arg - argument to callback function
1732  * returns:
1733  * none
1734  */
1735 static void
1736 raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn,
1737 		      void *cb_arg)
1738 {
1739 	if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) {
1740 		if (cb_fn) {
1741 			cb_fn(cb_arg, 0);
1742 		}
1743 		return;
1744 	}
1745 
1746 	assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered);
1747 	TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
1748 	raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
1749 	assert(raid_bdev->num_base_bdevs_discovered);
1750 	TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
1751 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev state chaning from online to offline\n");
1752 
1753 	spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg);
1754 }
1755 
1756 /*
1757  * brief:
1758  * raid_bdev_find_by_base_bdev function finds the raid bdev which has
1759  *  claimed the base bdev.
1760  * params:
1761  * base_bdev - pointer to base bdev pointer
1762  * _raid_bdev - Referenct to pointer to raid bdev
1763  * _base_bdev_slot - Reference to the slot of the base bdev.
1764  * returns:
1765  * true - if the raid bdev is found.
1766  * false - if the raid bdev is not found.
1767  */
1768 static bool
1769 raid_bdev_find_by_base_bdev(struct spdk_bdev *base_bdev, struct raid_bdev **_raid_bdev,
1770 			    uint8_t *_base_bdev_slot)
1771 {
1772 	struct raid_bdev	*raid_bdev;
1773 	uint8_t			i;
1774 
1775 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
1776 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1777 			if (raid_bdev->base_bdev_info[i].bdev == base_bdev) {
1778 				*_raid_bdev = raid_bdev;
1779 				*_base_bdev_slot = i;
1780 				return true;
1781 			}
1782 		}
1783 	}
1784 
1785 	return false;
1786 }
1787 
1788 /*
1789  * brief:
1790  * raid_bdev_remove_base_bdev function is called by below layers when base_bdev
1791  * is removed. This function checks if this base bdev is part of any raid bdev
1792  * or not. If yes, it takes necessary action on that particular raid bdev.
1793  * params:
1794  * ctx - pointer to base bdev pointer which got removed
1795  * returns:
1796  * none
1797  */
1798 static void
1799 raid_bdev_remove_base_bdev(void *ctx)
1800 {
1801 	struct spdk_bdev	*base_bdev = ctx;
1802 	struct raid_bdev	*raid_bdev = NULL;
1803 	uint8_t			base_bdev_slot = 0;
1804 
1805 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_bdev\n");
1806 
1807 	/* Find the raid_bdev which has claimed this base_bdev */
1808 	if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_bdev_slot)) {
1809 		SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name);
1810 		return;
1811 	}
1812 
1813 	assert(raid_bdev->base_bdev_info[base_bdev_slot].desc);
1814 	raid_bdev->base_bdev_info[base_bdev_slot].remove_scheduled = true;
1815 
1816 	if (raid_bdev->destruct_called == true ||
1817 	    raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1818 		/*
1819 		 * As raid bdev is not registered yet or already unregistered,
1820 		 * so cleanup should be done here itself.
1821 		 */
1822 		raid_bdev_free_base_bdev_resource(raid_bdev, base_bdev_slot);
1823 		if (raid_bdev->num_base_bdevs_discovered == 0) {
1824 			/* There is no base bdev for this raid, so free the raid device. */
1825 			raid_bdev_cleanup(raid_bdev);
1826 			return;
1827 		}
1828 	}
1829 
1830 	raid_bdev_deconfigure(raid_bdev, NULL, NULL);
1831 }
1832 
1833 /*
1834  * brief:
1835  * Remove base bdevs from the raid bdev one by one.  Skip any base bdev which
1836  *  doesn't exist.
1837  * params:
1838  * raid_cfg - pointer to raid bdev config.
1839  * cb_fn - callback function
1840  * cb_ctx - argument to callback function
1841  */
1842 void
1843 raid_bdev_remove_base_devices(struct raid_bdev_config *raid_cfg,
1844 			      raid_bdev_destruct_cb cb_fn, void *cb_arg)
1845 {
1846 	struct raid_bdev		*raid_bdev;
1847 	struct raid_base_bdev_info	*info;
1848 	uint8_t				i;
1849 
1850 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_devices\n");
1851 
1852 	raid_bdev = raid_cfg->raid_bdev;
1853 	if (raid_bdev == NULL) {
1854 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev %s doesn't exist now\n", raid_cfg->name);
1855 		if (cb_fn) {
1856 			cb_fn(cb_arg, 0);
1857 		}
1858 		return;
1859 	}
1860 
1861 	if (raid_bdev->destroy_started) {
1862 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "destroying raid bdev %s is already started\n",
1863 			      raid_cfg->name);
1864 		if (cb_fn) {
1865 			cb_fn(cb_arg, -EALREADY);
1866 		}
1867 		return;
1868 	}
1869 
1870 	raid_bdev->destroy_started = true;
1871 
1872 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1873 		info = &raid_bdev->base_bdev_info[i];
1874 
1875 		if (info->bdev == NULL) {
1876 			continue;
1877 		}
1878 
1879 		assert(info->desc);
1880 		info->remove_scheduled = true;
1881 
1882 		if (raid_bdev->destruct_called == true ||
1883 		    raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1884 			/*
1885 			 * As raid bdev is not registered yet or already unregistered,
1886 			 * so cleanup should be done here itself.
1887 			 */
1888 			raid_bdev_free_base_bdev_resource(raid_bdev, i);
1889 			if (raid_bdev->num_base_bdevs_discovered == 0) {
1890 				/* There is no base bdev for this raid, so free the raid device. */
1891 				raid_bdev_cleanup(raid_bdev);
1892 				if (cb_fn) {
1893 					cb_fn(cb_arg, 0);
1894 				}
1895 				return;
1896 			}
1897 		}
1898 	}
1899 
1900 	raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg);
1901 }
1902 
1903 /*
1904  * brief:
1905  * raid_bdev_add_base_device function is the actual function which either adds
1906  * the nvme base device to existing raid bdev or create a new raid bdev. It also claims
1907  * the base device and keep the open descriptor.
1908  * params:
1909  * raid_cfg - pointer to raid bdev config
1910  * bdev - pointer to base bdev
1911  * base_bdev_slot - position to add base bdev
1912  * returns:
1913  * 0 - success
1914  * non zero - failure
1915  */
1916 static int
1917 raid_bdev_add_base_device(struct raid_bdev_config *raid_cfg, struct spdk_bdev *bdev,
1918 			  uint8_t base_bdev_slot)
1919 {
1920 	struct raid_bdev	*raid_bdev;
1921 	int			rc;
1922 
1923 	raid_bdev = raid_cfg->raid_bdev;
1924 	if (!raid_bdev) {
1925 		SPDK_ERRLOG("Raid bdev '%s' is not created yet\n", raid_cfg->name);
1926 		return -ENODEV;
1927 	}
1928 
1929 	rc = raid_bdev_alloc_base_bdev_resource(raid_bdev, bdev, base_bdev_slot);
1930 	if (rc != 0) {
1931 		SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", bdev->name);
1932 		return rc;
1933 	}
1934 
1935 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1936 
1937 	if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) {
1938 		rc = raid_bdev_configure(raid_bdev);
1939 		if (rc != 0) {
1940 			SPDK_ERRLOG("Failed to configure raid bdev\n");
1941 			return rc;
1942 		}
1943 	}
1944 
1945 	return 0;
1946 }
1947 
1948 /*
1949  * brief:
1950  * Add base bdevs to the raid bdev one by one.  Skip any base bdev which doesn't
1951  *  exist or fails to add. If all base bdevs are successfully added, the raid bdev
1952  *  moves to the configured state and becomes available. Otherwise, the raid bdev
1953  *  stays at the configuring state with added base bdevs.
1954  * params:
1955  * raid_cfg - pointer to raid bdev config
1956  * returns:
1957  * 0 - The raid bdev moves to the configured state or stays at the configuring
1958  *     state with added base bdevs due to any nonexistent base bdev.
1959  * non zero - Failed to add any base bdev and stays at the configuring state with
1960  *            added base bdevs.
1961  */
1962 int
1963 raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg)
1964 {
1965 	struct spdk_bdev	*base_bdev;
1966 	uint8_t			i;
1967 	int			rc = 0, _rc;
1968 
1969 	for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1970 		base_bdev = spdk_bdev_get_by_name(raid_cfg->base_bdev[i].name);
1971 		if (base_bdev == NULL) {
1972 			SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "base bdev %s doesn't exist now\n",
1973 				      raid_cfg->base_bdev[i].name);
1974 			continue;
1975 		}
1976 
1977 		_rc = raid_bdev_add_base_device(raid_cfg, base_bdev, i);
1978 		if (_rc != 0) {
1979 			SPDK_ERRLOG("Failed to add base bdev %s to RAID bdev %s: %s\n",
1980 				    raid_cfg->base_bdev[i].name, raid_cfg->name,
1981 				    spdk_strerror(-_rc));
1982 			if (rc == 0) {
1983 				rc = _rc;
1984 			}
1985 		}
1986 	}
1987 
1988 	return rc;
1989 }
1990 
1991 /*
1992  * brief:
1993  * raid_bdev_examine function is the examine function call by the below layers
1994  * like bdev_nvme layer. This function will check if this base bdev can be
1995  * claimed by this raid bdev or not.
1996  * params:
1997  * bdev - pointer to base bdev
1998  * returns:
1999  * none
2000  */
2001 static void
2002 raid_bdev_examine(struct spdk_bdev *bdev)
2003 {
2004 	struct raid_bdev_config	*raid_cfg;
2005 	uint8_t			base_bdev_slot;
2006 
2007 	if (raid_bdev_can_claim_bdev(bdev->name, &raid_cfg, &base_bdev_slot)) {
2008 		raid_bdev_add_base_device(raid_cfg, bdev, base_bdev_slot);
2009 	} else {
2010 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s can't be claimed\n",
2011 			      bdev->name);
2012 	}
2013 
2014 	spdk_bdev_module_examine_done(&g_raid_if);
2015 }
2016 
2017 /* Log component for the raid bdev module */
2018 SPDK_LOG_REGISTER_COMPONENT("bdev_raid", SPDK_LOG_BDEV_RAID)
2019