xref: /spdk/module/bdev/raid/bdev_raid.c (revision 2f249ace0c4834ef5351caf86b22079989cd488f)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_raid.h"
35 #include "spdk/env.h"
36 #include "spdk/io_channel.h"
37 #include "spdk/conf.h"
38 #include "spdk_internal/log.h"
39 #include "spdk/string.h"
40 #include "spdk/util.h"
41 #include "spdk/json.h"
42 #include "spdk/string.h"
43 
44 static bool g_shutdown_started = false;
45 
46 /* raid bdev config as read from config file */
47 struct raid_config	g_raid_config = {
48 	.raid_bdev_config_head = TAILQ_HEAD_INITIALIZER(g_raid_config.raid_bdev_config_head),
49 };
50 
51 /*
52  * List of raid bdev in configured list, these raid bdevs are registered with
53  * bdev layer
54  */
55 struct raid_configured_tailq	g_raid_bdev_configured_list = TAILQ_HEAD_INITIALIZER(
56 			g_raid_bdev_configured_list);
57 
58 /* List of raid bdev in configuring list */
59 struct raid_configuring_tailq	g_raid_bdev_configuring_list = TAILQ_HEAD_INITIALIZER(
60 			g_raid_bdev_configuring_list);
61 
62 /* List of all raid bdevs */
63 struct raid_all_tailq		g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list);
64 
65 /* List of all raid bdevs that are offline */
66 struct raid_offline_tailq	g_raid_bdev_offline_list = TAILQ_HEAD_INITIALIZER(
67 			g_raid_bdev_offline_list);
68 
69 static TAILQ_HEAD(, raid_bdev_module) g_raid_modules = TAILQ_HEAD_INITIALIZER(g_raid_modules);
70 
71 static struct raid_bdev_module *raid_bdev_module_find(enum raid_level level)
72 {
73 	struct raid_bdev_module *raid_module;
74 
75 	TAILQ_FOREACH(raid_module, &g_raid_modules, link) {
76 		if (raid_module->level == level) {
77 			return raid_module;
78 		}
79 	}
80 
81 	return NULL;
82 }
83 
84 void raid_bdev_module_list_add(struct raid_bdev_module *raid_module)
85 {
86 	if (raid_bdev_module_find(raid_module->level) != NULL) {
87 		SPDK_ERRLOG("module for raid level '%s' already registered.\n",
88 			    raid_bdev_level_to_str(raid_module->level));
89 		assert(false);
90 	} else {
91 		TAILQ_INSERT_TAIL(&g_raid_modules, raid_module, link);
92 	}
93 }
94 
95 /* Function declarations */
96 static void	raid_bdev_examine(struct spdk_bdev *bdev);
97 static int	raid_bdev_init(void);
98 static void	raid_bdev_deconfigure(struct raid_bdev *raid_bdev,
99 				      raid_bdev_destruct_cb cb_fn, void *cb_arg);
100 static void	raid_bdev_remove_base_bdev(void *ctx);
101 
102 /*
103  * brief:
104  * raid_bdev_create_cb function is a cb function for raid bdev which creates the
105  * hierarchy from raid bdev to base bdev io channels. It will be called per core
106  * params:
107  * io_device - pointer to raid bdev io device represented by raid_bdev
108  * ctx_buf - pointer to context buffer for raid bdev io channel
109  * returns:
110  * 0 - success
111  * non zero - failure
112  */
113 static int
114 raid_bdev_create_cb(void *io_device, void *ctx_buf)
115 {
116 	struct raid_bdev            *raid_bdev = io_device;
117 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
118 
119 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_create_cb, %p\n", raid_ch);
120 
121 	assert(raid_bdev != NULL);
122 	assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE);
123 
124 	raid_ch->num_channels = raid_bdev->num_base_bdevs;
125 
126 	raid_ch->base_channel = calloc(raid_ch->num_channels,
127 				       sizeof(struct spdk_io_channel *));
128 	if (!raid_ch->base_channel) {
129 		SPDK_ERRLOG("Unable to allocate base bdevs io channel\n");
130 		return -ENOMEM;
131 	}
132 	for (uint8_t i = 0; i < raid_ch->num_channels; i++) {
133 		/*
134 		 * Get the spdk_io_channel for all the base bdevs. This is used during
135 		 * split logic to send the respective child bdev ios to respective base
136 		 * bdev io channel.
137 		 */
138 		raid_ch->base_channel[i] = spdk_bdev_get_io_channel(
139 						   raid_bdev->base_bdev_info[i].desc);
140 		if (!raid_ch->base_channel[i]) {
141 			for (uint8_t j = 0; j < i; j++) {
142 				spdk_put_io_channel(raid_ch->base_channel[j]);
143 			}
144 			free(raid_ch->base_channel);
145 			raid_ch->base_channel = NULL;
146 			SPDK_ERRLOG("Unable to create io channel for base bdev\n");
147 			return -ENOMEM;
148 		}
149 	}
150 
151 	return 0;
152 }
153 
154 /*
155  * brief:
156  * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the
157  * hierarchy from raid bdev to base bdev io channels. It will be called per core
158  * params:
159  * io_device - pointer to raid bdev io device represented by raid_bdev
160  * ctx_buf - pointer to context buffer for raid bdev io channel
161  * returns:
162  * none
163  */
164 static void
165 raid_bdev_destroy_cb(void *io_device, void *ctx_buf)
166 {
167 	struct raid_bdev_io_channel *raid_ch = ctx_buf;
168 
169 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destroy_cb\n");
170 
171 	assert(raid_ch != NULL);
172 	assert(raid_ch->base_channel);
173 	for (uint8_t i = 0; i < raid_ch->num_channels; i++) {
174 		/* Free base bdev channels */
175 		assert(raid_ch->base_channel[i] != NULL);
176 		spdk_put_io_channel(raid_ch->base_channel[i]);
177 	}
178 	free(raid_ch->base_channel);
179 	raid_ch->base_channel = NULL;
180 }
181 
182 /*
183  * brief:
184  * raid_bdev_cleanup is used to cleanup and free raid_bdev related data
185  * structures.
186  * params:
187  * raid_bdev - pointer to raid_bdev
188  * returns:
189  * none
190  */
191 static void
192 raid_bdev_cleanup(struct raid_bdev *raid_bdev)
193 {
194 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_cleanup, %p name %s, state %u, config %p\n",
195 		      raid_bdev,
196 		      raid_bdev->bdev.name, raid_bdev->state, raid_bdev->config);
197 	if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
198 		TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
199 	} else if (raid_bdev->state == RAID_BDEV_STATE_OFFLINE) {
200 		TAILQ_REMOVE(&g_raid_bdev_offline_list, raid_bdev, state_link);
201 	} else {
202 		assert(0);
203 	}
204 	TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link);
205 	free(raid_bdev->bdev.name);
206 	free(raid_bdev->base_bdev_info);
207 	if (raid_bdev->config) {
208 		raid_bdev->config->raid_bdev = NULL;
209 	}
210 	free(raid_bdev);
211 }
212 
213 /*
214  * brief:
215  * free resource of base bdev for raid bdev
216  * params:
217  * raid_bdev - pointer to raid bdev
218  * base_bdev_slot - position to base bdev in raid bdev
219  * returns:
220  * 0 - success
221  * non zero - failure
222  */
223 static void
224 raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, uint8_t base_bdev_slot)
225 {
226 	struct raid_base_bdev_info *info;
227 
228 	info = &raid_bdev->base_bdev_info[base_bdev_slot];
229 
230 	spdk_bdev_module_release_bdev(info->bdev);
231 	spdk_bdev_close(info->desc);
232 	info->desc = NULL;
233 	info->bdev = NULL;
234 
235 	assert(raid_bdev->num_base_bdevs_discovered);
236 	raid_bdev->num_base_bdevs_discovered--;
237 }
238 
239 /*
240  * brief:
241  * raid_bdev_destruct is the destruct function table pointer for raid bdev
242  * params:
243  * ctxt - pointer to raid_bdev
244  * returns:
245  * 0 - success
246  * non zero - failure
247  */
248 static int
249 raid_bdev_destruct(void *ctxt)
250 {
251 	struct raid_bdev *raid_bdev = ctxt;
252 
253 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destruct\n");
254 
255 	raid_bdev->destruct_called = true;
256 	for (uint8_t i = 0; i < raid_bdev->num_base_bdevs; i++) {
257 		/*
258 		 * Close all base bdev descriptors for which call has come from below
259 		 * layers.  Also close the descriptors if we have started shutdown.
260 		 */
261 		if (g_shutdown_started ||
262 		    ((raid_bdev->base_bdev_info[i].remove_scheduled == true) &&
263 		     (raid_bdev->base_bdev_info[i].bdev != NULL))) {
264 			raid_bdev_free_base_bdev_resource(raid_bdev, i);
265 		}
266 	}
267 
268 	if (g_shutdown_started) {
269 		TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
270 		raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
271 		TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
272 	}
273 
274 	spdk_io_device_unregister(raid_bdev, NULL);
275 
276 	if (raid_bdev->num_base_bdevs_discovered == 0) {
277 		/* Free raid_bdev when there are no base bdevs left */
278 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev base bdevs is 0, going to free all in destruct\n");
279 		raid_bdev_cleanup(raid_bdev);
280 	}
281 
282 	return 0;
283 }
284 
285 /*
286  * brief:
287  * raid_bdev_base_io_completion is the completion callback for member disk requests
288  * params:
289  * bdev_io - pointer to member disk requested bdev_io
290  * success - true if successful, false if unsuccessful
291  * cb_arg - callback argument (parent raid bdev_io)
292  * returns:
293  * none
294  */
295 void
296 raid_bdev_base_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
297 {
298 	struct spdk_bdev_io *parent_io = cb_arg;
299 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)parent_io->driver_ctx;
300 
301 	spdk_bdev_free_io(bdev_io);
302 
303 	if (!success) {
304 		raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED;
305 	}
306 
307 	raid_io->base_bdev_io_completed++;
308 	if (raid_io->base_bdev_io_completed == raid_io->base_bdev_io_expected) {
309 		spdk_bdev_io_complete(parent_io, raid_io->base_bdev_io_status);
310 	}
311 }
312 
313 /*
314  * brief:
315  * raid_bdev_queue_io_wait function processes the IO which failed to submit.
316  * It will try to queue the IOs after storing the context to bdev wait queue logic.
317  * params:
318  * raid_bdev_io - pointer to raid bdev_io
319  * pd_idx - base_dev index in raid_bdev
320  * cb_fn - callback when the spdk_bdev_io for base_bdev becomes available
321  * ret - return code
322  * returns:
323  * none
324  */
325 void
326 raid_bdev_queue_io_wait(struct spdk_bdev_io *raid_bdev_io, uint8_t pd_idx,
327 			spdk_bdev_io_wait_cb cb_fn, int ret)
328 {
329 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)raid_bdev_io->driver_ctx;
330 	struct raid_bdev *raid_bdev = (struct raid_bdev *)raid_bdev_io->bdev->ctxt;
331 
332 	assert(ret != 0);
333 
334 	if (ret == -ENOMEM) {
335 		raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[pd_idx].bdev;
336 		raid_io->waitq_entry.cb_fn = cb_fn;
337 		raid_io->waitq_entry.cb_arg = raid_bdev_io;
338 		spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[pd_idx].bdev,
339 					raid_io->raid_ch->base_channel[pd_idx],
340 					&raid_io->waitq_entry);
341 		return;
342 	}
343 
344 	SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
345 	assert(false);
346 	spdk_bdev_io_complete(raid_bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
347 }
348 
349 static void
350 raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io);
351 
352 static void
353 _raid_bdev_submit_reset_request(void *_bdev_io)
354 {
355 	struct spdk_bdev_io *bdev_io = _bdev_io;
356 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
357 
358 	raid_bdev_submit_reset_request(raid_io);
359 }
360 
361 /*
362  * brief:
363  * raid_bdev_submit_reset_request function submits reset requests
364  * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in
365  * which case it will queue it for later submission
366  * params:
367  * raid_io
368  * returns:
369  * none
370  */
371 static void
372 raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io)
373 {
374 	struct spdk_bdev_io		*bdev_io;
375 	struct raid_bdev		*raid_bdev;
376 	int				ret;
377 	uint8_t				i;
378 
379 	bdev_io = spdk_bdev_io_from_ctx(raid_io);
380 	raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt;
381 
382 	raid_io->base_bdev_io_expected = raid_bdev->num_base_bdevs;
383 
384 	while (raid_io->base_bdev_io_submitted < raid_bdev->num_base_bdevs) {
385 		i = raid_io->base_bdev_io_submitted;
386 		ret = spdk_bdev_reset(raid_bdev->base_bdev_info[i].desc,
387 				      raid_io->raid_ch->base_channel[i],
388 				      raid_bdev_base_io_completion, bdev_io);
389 		if (ret == 0) {
390 			raid_io->base_bdev_io_submitted++;
391 		} else {
392 			raid_bdev_queue_io_wait(bdev_io, i,
393 						_raid_bdev_submit_reset_request, ret);
394 			return;
395 		}
396 	}
397 }
398 
399 /*
400  * brief:
401  * Callback function to spdk_bdev_io_get_buf.
402  * params:
403  * ch - pointer to raid bdev io channel
404  * bdev_io - pointer to parent bdev_io on raid bdev device
405  * success - True if buffer is allocated or false otherwise.
406  * returns:
407  * none
408  */
409 static void
410 raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
411 		     bool success)
412 {
413 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
414 
415 	if (!success) {
416 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
417 		return;
418 	}
419 
420 	raid0_submit_rw_request(raid_io);
421 }
422 
423 /*
424  * brief:
425  * raid_bdev_submit_request function is the submit_request function pointer of
426  * raid bdev function table. This is used to submit the io on raid_bdev to below
427  * layers.
428  * params:
429  * ch - pointer to raid bdev io channel
430  * bdev_io - pointer to parent bdev_io on raid bdev device
431  * returns:
432  * none
433  */
434 static void
435 raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
436 {
437 	struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
438 
439 	raid_io->raid_ch = spdk_io_channel_get_ctx(ch);
440 	raid_io->base_bdev_io_submitted = 0;
441 	raid_io->base_bdev_io_completed = 0;
442 	raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
443 
444 	switch (bdev_io->type) {
445 	case SPDK_BDEV_IO_TYPE_READ:
446 		spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb,
447 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
448 		break;
449 	case SPDK_BDEV_IO_TYPE_WRITE:
450 		raid0_submit_rw_request(raid_io);
451 		break;
452 
453 	case SPDK_BDEV_IO_TYPE_RESET:
454 		raid_bdev_submit_reset_request(raid_io);
455 		break;
456 
457 	case SPDK_BDEV_IO_TYPE_FLUSH:
458 	case SPDK_BDEV_IO_TYPE_UNMAP:
459 		raid0_submit_null_payload_request(raid_io);
460 		break;
461 
462 	default:
463 		SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type);
464 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
465 		break;
466 	}
467 
468 }
469 
470 /*
471  * brief:
472  * _raid_bdev_io_type_supported checks whether io_type is supported in
473  * all base bdev modules of raid bdev module. If anyone among the base_bdevs
474  * doesn't support, the raid device doesn't supports.
475  *
476  * params:
477  * raid_bdev - pointer to raid bdev context
478  * io_type - io type
479  * returns:
480  * true - io_type is supported
481  * false - io_type is not supported
482  */
483 inline static bool
484 _raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type)
485 {
486 	uint8_t i;
487 
488 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
489 		if (raid_bdev->base_bdev_info[i].bdev == NULL) {
490 			assert(false);
491 			continue;
492 		}
493 
494 		if (spdk_bdev_io_type_supported(raid_bdev->base_bdev_info[i].bdev,
495 						io_type) == false) {
496 			return false;
497 		}
498 	}
499 
500 	return true;
501 }
502 
503 /*
504  * brief:
505  * raid_bdev_io_type_supported is the io_supported function for bdev function
506  * table which returns whether the particular io type is supported or not by
507  * raid bdev module
508  * params:
509  * ctx - pointer to raid bdev context
510  * type - io type
511  * returns:
512  * true - io_type is supported
513  * false - io_type is not supported
514  */
515 static bool
516 raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
517 {
518 	switch (io_type) {
519 	case SPDK_BDEV_IO_TYPE_READ:
520 	case SPDK_BDEV_IO_TYPE_WRITE:
521 		return true;
522 
523 	case SPDK_BDEV_IO_TYPE_FLUSH:
524 	case SPDK_BDEV_IO_TYPE_RESET:
525 	case SPDK_BDEV_IO_TYPE_UNMAP:
526 		return _raid_bdev_io_type_supported(ctx, io_type);
527 
528 	default:
529 		return false;
530 	}
531 
532 	return false;
533 }
534 
/*
 * brief:
 * raid_bdev_get_io_channel is the get_io_channel entry of the raid bdev
 * function table. The raid_bdev pointer itself is used as the io device
 * handle passed to spdk_get_io_channel.
 * params:
 * ctxt - pointer to raid_bdev
 * returns:
 * pointer to io channel for raid bdev
 */
static struct spdk_io_channel *
raid_bdev_get_io_channel(void *ctxt)
{
	return spdk_get_io_channel(ctxt);
}
551 
552 /*
553  * brief:
554  * raid_bdev_dump_info_json is the function table pointer for raid bdev
555  * params:
556  * ctx - pointer to raid_bdev
557  * w - pointer to json context
558  * returns:
559  * 0 - success
560  * non zero - failure
561  */
562 static int
563 raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
564 {
565 	struct raid_bdev *raid_bdev = ctx;
566 
567 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_dump_config_json\n");
568 	assert(raid_bdev != NULL);
569 
570 	/* Dump the raid bdev configuration related information */
571 	spdk_json_write_named_object_begin(w, "raid");
572 	spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size);
573 	spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb);
574 	spdk_json_write_named_uint32(w, "state", raid_bdev->state);
575 	spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level));
576 	spdk_json_write_named_uint32(w, "destruct_called", raid_bdev->destruct_called);
577 	spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs);
578 	spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered);
579 	spdk_json_write_name(w, "base_bdevs_list");
580 	spdk_json_write_array_begin(w);
581 	for (uint8_t i = 0; i < raid_bdev->num_base_bdevs; i++) {
582 		if (raid_bdev->base_bdev_info[i].bdev) {
583 			spdk_json_write_string(w, raid_bdev->base_bdev_info[i].bdev->name);
584 		} else {
585 			spdk_json_write_null(w);
586 		}
587 	}
588 	spdk_json_write_array_end(w);
589 	spdk_json_write_object_end(w);
590 
591 	return 0;
592 }
593 
594 /*
595  * brief:
596  * raid_bdev_write_config_json is the function table pointer for raid bdev
597  * params:
598  * bdev - pointer to spdk_bdev
599  * w - pointer to json context
600  * returns:
601  * none
602  */
603 static void
604 raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
605 {
606 	struct raid_bdev *raid_bdev = bdev->ctxt;
607 	struct spdk_bdev *base;
608 	uint8_t i;
609 
610 	spdk_json_write_object_begin(w);
611 
612 	spdk_json_write_named_string(w, "method", "bdev_raid_create");
613 
614 	spdk_json_write_named_object_begin(w, "params");
615 	spdk_json_write_named_string(w, "name", bdev->name);
616 	spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size_kb);
617 	spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level));
618 
619 	spdk_json_write_named_array_begin(w, "base_bdevs");
620 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
621 		base = raid_bdev->base_bdev_info[i].bdev;
622 		if (base) {
623 			spdk_json_write_string(w, base->name);
624 		}
625 	}
626 	spdk_json_write_array_end(w);
627 	spdk_json_write_object_end(w);
628 
629 	spdk_json_write_object_end(w);
630 }
631 
632 /* g_raid_bdev_fn_table is the function table for raid bdev */
633 static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = {
634 	.destruct		= raid_bdev_destruct,
635 	.submit_request		= raid_bdev_submit_request,
636 	.io_type_supported	= raid_bdev_io_type_supported,
637 	.get_io_channel		= raid_bdev_get_io_channel,
638 	.dump_info_json		= raid_bdev_dump_info_json,
639 	.write_config_json	= raid_bdev_write_config_json,
640 };
641 
642 /*
643  * brief:
644  * raid_bdev_config_cleanup function is used to free memory for one raid_bdev in configuration
645  * params:
646  * raid_cfg - pointer to raid_bdev_config structure
647  * returns:
648  * none
649  */
650 void
651 raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg)
652 {
653 	uint8_t i;
654 
655 	TAILQ_REMOVE(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
656 	g_raid_config.total_raid_bdev--;
657 
658 	if (raid_cfg->base_bdev) {
659 		for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
660 			free(raid_cfg->base_bdev[i].name);
661 		}
662 		free(raid_cfg->base_bdev);
663 	}
664 	free(raid_cfg->name);
665 	free(raid_cfg);
666 }
667 
668 /*
669  * brief:
670  * raid_bdev_free is the raid bdev function table function pointer. This is
671  * called on bdev free path
672  * params:
673  * none
674  * returns:
675  * none
676  */
677 static void
678 raid_bdev_free(void)
679 {
680 	struct raid_bdev_config *raid_cfg, *tmp;
681 
682 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_free\n");
683 	TAILQ_FOREACH_SAFE(raid_cfg, &g_raid_config.raid_bdev_config_head, link, tmp) {
684 		raid_bdev_config_cleanup(raid_cfg);
685 	}
686 }
687 
688 /* brief
689  * raid_bdev_config_find_by_name is a helper function to find raid bdev config
690  * by name as key.
691  *
692  * params:
693  * raid_name - name for raid bdev.
694  */
695 struct raid_bdev_config *
696 raid_bdev_config_find_by_name(const char *raid_name)
697 {
698 	struct raid_bdev_config *raid_cfg;
699 
700 	TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
701 		if (!strcmp(raid_cfg->name, raid_name)) {
702 			return raid_cfg;
703 		}
704 	}
705 
706 	return raid_cfg;
707 }
708 
709 /*
710  * brief
711  * raid_bdev_config_add function adds config for newly created raid bdev.
712  *
713  * params:
714  * raid_name - name for raid bdev.
715  * strip_size - strip size in KB
716  * num_base_bdevs - number of base bdevs.
717  * level - raid level.
718  * _raid_cfg - Pointer to newly added configuration
719  */
720 int
721 raid_bdev_config_add(const char *raid_name, uint32_t strip_size, uint8_t num_base_bdevs,
722 		     enum raid_level level, struct raid_bdev_config **_raid_cfg)
723 {
724 	struct raid_bdev_config *raid_cfg;
725 
726 	raid_cfg = raid_bdev_config_find_by_name(raid_name);
727 	if (raid_cfg != NULL) {
728 		SPDK_ERRLOG("Duplicate raid bdev name found in config file %s\n",
729 			    raid_name);
730 		return -EEXIST;
731 	}
732 
733 	if (spdk_u32_is_pow2(strip_size) == false) {
734 		SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size);
735 		return -EINVAL;
736 	}
737 
738 	if (num_base_bdevs == 0) {
739 		SPDK_ERRLOG("Invalid base device count %u\n", num_base_bdevs);
740 		return -EINVAL;
741 	}
742 
743 	raid_cfg = calloc(1, sizeof(*raid_cfg));
744 	if (raid_cfg == NULL) {
745 		SPDK_ERRLOG("unable to allocate memory\n");
746 		return -ENOMEM;
747 	}
748 
749 	raid_cfg->name = strdup(raid_name);
750 	if (!raid_cfg->name) {
751 		free(raid_cfg);
752 		SPDK_ERRLOG("unable to allocate memory\n");
753 		return -ENOMEM;
754 	}
755 	raid_cfg->strip_size = strip_size;
756 	raid_cfg->num_base_bdevs = num_base_bdevs;
757 	raid_cfg->level = level;
758 
759 	raid_cfg->base_bdev = calloc(num_base_bdevs, sizeof(*raid_cfg->base_bdev));
760 	if (raid_cfg->base_bdev == NULL) {
761 		free(raid_cfg->name);
762 		free(raid_cfg);
763 		SPDK_ERRLOG("unable to allocate memory\n");
764 		return -ENOMEM;
765 	}
766 
767 	TAILQ_INSERT_TAIL(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
768 	g_raid_config.total_raid_bdev++;
769 
770 	*_raid_cfg = raid_cfg;
771 	return 0;
772 }
773 
774 /*
775  * brief:
776  * raid_bdev_config_add_base_bdev function add base bdev to raid bdev config.
777  *
778  * params:
779  * raid_cfg - pointer to raid bdev configuration
780  * base_bdev_name - name of base bdev
781  * slot - Position to add base bdev
782  */
783 int
784 raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, const char *base_bdev_name,
785 			       uint8_t slot)
786 {
787 	uint8_t i;
788 	struct raid_bdev_config *tmp;
789 
790 	if (slot >= raid_cfg->num_base_bdevs) {
791 		return -EINVAL;
792 	}
793 
794 	TAILQ_FOREACH(tmp, &g_raid_config.raid_bdev_config_head, link) {
795 		for (i = 0; i < tmp->num_base_bdevs; i++) {
796 			if (tmp->base_bdev[i].name != NULL) {
797 				if (!strcmp(tmp->base_bdev[i].name, base_bdev_name)) {
798 					SPDK_ERRLOG("duplicate base bdev name %s mentioned\n",
799 						    base_bdev_name);
800 					return -EEXIST;
801 				}
802 			}
803 		}
804 	}
805 
806 	raid_cfg->base_bdev[slot].name = strdup(base_bdev_name);
807 	if (raid_cfg->base_bdev[slot].name == NULL) {
808 		SPDK_ERRLOG("unable to allocate memory\n");
809 		return -ENOMEM;
810 	}
811 
812 	return 0;
813 }
814 
815 static struct {
816 	const char *name;
817 	enum raid_level value;
818 } g_raid_level_names[] = {
819 	{ "raid0", RAID0 },
820 	{ "0", RAID0 },
821 	{ }
822 };
823 
824 enum raid_level raid_bdev_parse_raid_level(const char *str)
825 {
826 	unsigned int i;
827 
828 	for (i = 0; g_raid_level_names[i].name != NULL; i++) {
829 		if (strcasecmp(g_raid_level_names[i].name, str) == 0) {
830 			return g_raid_level_names[i].value;
831 		}
832 	}
833 
834 	return INVALID_RAID_LEVEL;
835 }
836 
837 const char *
838 raid_bdev_level_to_str(enum raid_level level)
839 {
840 	unsigned int i;
841 
842 	for (i = 0; g_raid_level_names[i].name != NULL; i++) {
843 		if (g_raid_level_names[i].value == level) {
844 			return g_raid_level_names[i].name;
845 		}
846 	}
847 
848 	return "";
849 }
850 
851 /*
852  * brief:
853  * raid_bdev_parse_raid is used to parse the raid bdev from config file based on
854  * pre-defined raid bdev format in config file.
855  * Format of config file:
856  *   [RAID1]
857  *   Name raid1
858  *   StripSize 64
859  *   NumDevices 2
860  *   RaidLevel 0
861  *   Devices Nvme0n1 Nvme1n1
862  *
863  *   [RAID2]
864  *   Name raid2
865  *   StripSize 64
866  *   NumDevices 3
867  *   RaidLevel 0
868  *   Devices Nvme2n1 Nvme3n1 Nvme4n1
869  *
870  * params:
871  * conf_section - pointer to config section
872  * returns:
873  * 0 - success
874  * non zero - failure
875  */
876 static int
877 raid_bdev_parse_raid(struct spdk_conf_section *conf_section)
878 {
879 	const char *raid_name;
880 	uint32_t strip_size;
881 	uint8_t num_base_bdevs;
882 	const char *raid_level_str;
883 	enum raid_level level;
884 	const char *base_bdev_name;
885 	struct raid_bdev_config *raid_cfg;
886 	int rc, i, val;
887 
888 	raid_name = spdk_conf_section_get_val(conf_section, "Name");
889 	if (raid_name == NULL) {
890 		SPDK_ERRLOG("raid_name is null\n");
891 		return -EINVAL;
892 	}
893 
894 	val = spdk_conf_section_get_intval(conf_section, "StripSize");
895 	if (val < 0) {
896 		return -EINVAL;
897 	}
898 	strip_size = val;
899 
900 	val = spdk_conf_section_get_intval(conf_section, "NumDevices");
901 	if (val < 0) {
902 		return -EINVAL;
903 	}
904 	num_base_bdevs = val;
905 
906 	raid_level_str = spdk_conf_section_get_val(conf_section, "RaidLevel");
907 	if (raid_level_str == NULL) {
908 		SPDK_ERRLOG("Missing RaidLevel\n");
909 		return -EINVAL;
910 	}
911 	level = raid_bdev_parse_raid_level(raid_level_str);
912 	if (level == INVALID_RAID_LEVEL) {
913 		SPDK_ERRLOG("Invalid RaidLevel\n");
914 		return -EINVAL;
915 	}
916 
917 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "%s %" PRIu32 " %u %u\n",
918 		      raid_name, strip_size, num_base_bdevs, level);
919 
920 	rc = raid_bdev_config_add(raid_name, strip_size, num_base_bdevs, level,
921 				  &raid_cfg);
922 	if (rc != 0) {
923 		SPDK_ERRLOG("Failed to add raid bdev config\n");
924 		return rc;
925 	}
926 
927 	for (i = 0; true; i++) {
928 		base_bdev_name = spdk_conf_section_get_nmval(conf_section, "Devices", 0, i);
929 		if (base_bdev_name == NULL) {
930 			break;
931 		}
932 		if (i >= num_base_bdevs) {
933 			raid_bdev_config_cleanup(raid_cfg);
934 			SPDK_ERRLOG("Number of devices mentioned is more than count\n");
935 			return -EINVAL;
936 		}
937 
938 		rc = raid_bdev_config_add_base_bdev(raid_cfg, base_bdev_name, i);
939 		if (rc != 0) {
940 			raid_bdev_config_cleanup(raid_cfg);
941 			SPDK_ERRLOG("Failed to add base bdev to raid bdev config\n");
942 			return rc;
943 		}
944 	}
945 
946 	if (i != raid_cfg->num_base_bdevs) {
947 		raid_bdev_config_cleanup(raid_cfg);
948 		SPDK_ERRLOG("Number of devices mentioned is less than count\n");
949 		return -EINVAL;
950 	}
951 
952 	rc = raid_bdev_create(raid_cfg);
953 	if (rc != 0) {
954 		raid_bdev_config_cleanup(raid_cfg);
955 		SPDK_ERRLOG("Failed to create raid bdev\n");
956 		return rc;
957 	}
958 
959 	rc = raid_bdev_add_base_devices(raid_cfg);
960 	if (rc != 0) {
961 		SPDK_ERRLOG("Failed to add any base bdev to raid bdev\n");
962 		/* Config is not removed in this case. */
963 	}
964 
965 	return 0;
966 }
967 
/*
 * brief:
 * raid_bdev_parse_config iterates over all config sections and parses each
 * section whose name matches the "RAID" prefix. Parsing stops at the first
 * section that fails.
 * params:
 * none
 * returns:
 * 0 - success
 * non zero - failure (return code of the failing section)
 */
static int
raid_bdev_parse_config(void)
{
	struct spdk_conf_section *section;
	int rc;

	for (section = spdk_conf_first_section(NULL); section != NULL;
	     section = spdk_conf_next_section(section)) {
		if (!spdk_conf_section_match_prefix(section, "RAID")) {
			continue;
		}

		rc = raid_bdev_parse_raid(section);
		if (rc < 0) {
			SPDK_ERRLOG("Unable to parse raid bdev section\n");
			return rc;
		}
	}

	return 0;
}
998 
999 /*
1000  * brief:
1001  * raid_bdev_fini_start is called when bdev layer is starting the
1002  * shutdown process
1003  * params:
1004  * none
1005  * returns:
1006  * none
1007  */
1008 static void
1009 raid_bdev_fini_start(void)
1010 {
1011 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_fini_start\n");
1012 	g_shutdown_started = true;
1013 }
1014 
1015 /*
1016  * brief:
1017  * raid_bdev_exit is called on raid bdev module exit time by bdev layer
1018  * params:
1019  * none
1020  * returns:
1021  * none
1022  */
1023 static void
1024 raid_bdev_exit(void)
1025 {
1026 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_exit\n");
1027 	raid_bdev_free();
1028 }
1029 
1030 /*
1031  * brief:
1032  * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid
1033  * module
1034  * params:
1035  * none
1036  * returns:
1037  * size of spdk_bdev_io context for raid
1038  */
1039 static int
1040 raid_bdev_get_ctx_size(void)
1041 {
1042 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_get_ctx_size\n");
1043 	return sizeof(struct raid_bdev_io);
1044 }
1045 
1046 /*
1047  * brief:
1048  * raid_bdev_get_running_config is used to get the configuration options.
1049  *
1050  * params:
1051  * fp - The pointer to a file that will be written to the configuration options.
1052  * returns:
1053  * none
1054  */
1055 static void
1056 raid_bdev_get_running_config(FILE *fp)
1057 {
1058 	struct raid_bdev *raid_bdev;
1059 	struct spdk_bdev *base;
1060 	int index = 1;
1061 	uint8_t i;
1062 
1063 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configured_list, state_link) {
1064 		fprintf(fp,
1065 			"\n"
1066 			"[RAID%d]\n"
1067 			"  Name %s\n"
1068 			"  StripSize %" PRIu32 "\n"
1069 			"  NumDevices %u\n"
1070 			"  RaidLevel %s\n",
1071 			index, raid_bdev->bdev.name, raid_bdev->strip_size_kb,
1072 			raid_bdev->num_base_bdevs,
1073 			raid_bdev_level_to_str(raid_bdev->level));
1074 		fprintf(fp,
1075 			"  Devices ");
1076 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1077 			base = raid_bdev->base_bdev_info[i].bdev;
1078 			if (base) {
1079 				fprintf(fp,
1080 					"%s ",
1081 					base->name);
1082 			}
1083 		}
1084 		fprintf(fp,
1085 			"\n");
1086 		index++;
1087 	}
1088 }
1089 
1090 /*
1091  * brief:
1092  * raid_bdev_can_claim_bdev is the function to check if this base_bdev can be
1093  * claimed by raid bdev or not.
1094  * params:
1095  * bdev_name - represents base bdev name
1096  * _raid_cfg - pointer to raid bdev config parsed from config file
1097  * base_bdev_slot - if bdev can be claimed, it represents the base_bdev correct
1098  * slot. This field is only valid if return value of this function is true
1099  * returns:
1100  * true - if bdev can be claimed
1101  * false - if bdev can't be claimed
1102  */
1103 static bool
1104 raid_bdev_can_claim_bdev(const char *bdev_name, struct raid_bdev_config **_raid_cfg,
1105 			 uint8_t *base_bdev_slot)
1106 {
1107 	struct raid_bdev_config *raid_cfg;
1108 	uint8_t i;
1109 
1110 	TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
1111 		for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1112 			/*
1113 			 * Check if the base bdev name is part of raid bdev configuration.
1114 			 * If match is found then return true and the slot information where
1115 			 * this base bdev should be inserted in raid bdev
1116 			 */
1117 			if (!strcmp(bdev_name, raid_cfg->base_bdev[i].name)) {
1118 				*_raid_cfg = raid_cfg;
1119 				*base_bdev_slot = i;
1120 				return true;
1121 			}
1122 		}
1123 	}
1124 
1125 	return false;
1126 }
1127 
1128 
1129 static struct spdk_bdev_module g_raid_if = {
1130 	.name = "raid",
1131 	.module_init = raid_bdev_init,
1132 	.fini_start = raid_bdev_fini_start,
1133 	.module_fini = raid_bdev_exit,
1134 	.get_ctx_size = raid_bdev_get_ctx_size,
1135 	.examine_config = raid_bdev_examine,
1136 	.config_text = raid_bdev_get_running_config,
1137 	.async_init = false,
1138 	.async_fini = false,
1139 };
1140 SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if)
1141 
1142 /*
1143  * brief:
1144  * raid_bdev_init is the initialization function for raid bdev module
1145  * params:
1146  * none
1147  * returns:
1148  * 0 - success
1149  * non zero - failure
1150  */
1151 static int
1152 raid_bdev_init(void)
1153 {
1154 	int ret;
1155 
1156 	/* Parse config file for raids */
1157 	ret = raid_bdev_parse_config();
1158 	if (ret < 0) {
1159 		SPDK_ERRLOG("raid bdev init failed parsing\n");
1160 		raid_bdev_free();
1161 		return ret;
1162 	}
1163 
1164 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_init completed successfully\n");
1165 
1166 	return 0;
1167 }
1168 
1169 /*
1170  * brief:
1171  * raid_bdev_create allocates raid bdev based on passed configuration
1172  * params:
1173  * raid_cfg - configuration of raid bdev
1174  * returns:
1175  * 0 - success
1176  * non zero - failure
1177  */
1178 int
1179 raid_bdev_create(struct raid_bdev_config *raid_cfg)
1180 {
1181 	struct raid_bdev *raid_bdev;
1182 	struct spdk_bdev *raid_bdev_gen;
1183 
1184 	raid_bdev = calloc(1, sizeof(*raid_bdev));
1185 	if (!raid_bdev) {
1186 		SPDK_ERRLOG("Unable to allocate memory for raid bdev\n");
1187 		return -ENOMEM;
1188 	}
1189 
1190 	assert(raid_cfg->num_base_bdevs != 0);
1191 	raid_bdev->num_base_bdevs = raid_cfg->num_base_bdevs;
1192 	raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs,
1193 					   sizeof(struct raid_base_bdev_info));
1194 	if (!raid_bdev->base_bdev_info) {
1195 		SPDK_ERRLOG("Unable able to allocate base bdev info\n");
1196 		free(raid_bdev);
1197 		return -ENOMEM;
1198 	}
1199 
1200 	/* strip_size_kb is from the rpc param.  strip_size is in blocks and used
1201 	 * intnerally and set later.
1202 	 */
1203 	raid_bdev->strip_size = 0;
1204 	raid_bdev->strip_size_kb = raid_cfg->strip_size;
1205 	raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1206 	raid_bdev->config = raid_cfg;
1207 	raid_bdev->level = raid_cfg->level;
1208 
1209 	raid_bdev->module = raid_bdev_module_find(raid_bdev->level);
1210 	if (raid_bdev->module == NULL) {
1211 		SPDK_ERRLOG("Unsupported raid level '%d'\n", raid_bdev->level);
1212 		free(raid_bdev->base_bdev_info);
1213 		free(raid_bdev);
1214 		return -EINVAL;
1215 	}
1216 
1217 	raid_bdev_gen = &raid_bdev->bdev;
1218 
1219 	raid_bdev_gen->name = strdup(raid_cfg->name);
1220 	if (!raid_bdev_gen->name) {
1221 		SPDK_ERRLOG("Unable to allocate name for raid\n");
1222 		free(raid_bdev->base_bdev_info);
1223 		free(raid_bdev);
1224 		return -ENOMEM;
1225 	}
1226 
1227 	raid_bdev_gen->product_name = "Raid Volume";
1228 	raid_bdev_gen->ctxt = raid_bdev;
1229 	raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
1230 	raid_bdev_gen->module = &g_raid_if;
1231 	raid_bdev_gen->write_cache = 0;
1232 
1233 	TAILQ_INSERT_TAIL(&g_raid_bdev_configuring_list, raid_bdev, state_link);
1234 	TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link);
1235 
1236 	raid_cfg->raid_bdev = raid_bdev;
1237 
1238 	return 0;
1239 }
1240 
1241 /*
1242  * brief
1243  * raid_bdev_alloc_base_bdev_resource allocates resource of base bdev.
1244  * params:
1245  * raid_bdev - pointer to raid bdev
1246  * bdev - pointer to base bdev
1247  * base_bdev_slot - position to add base bdev
1248  * returns:
1249  * 0 - success
1250  * non zero - failure
1251  */
1252 static int
1253 raid_bdev_alloc_base_bdev_resource(struct raid_bdev *raid_bdev, struct spdk_bdev *bdev,
1254 				   uint8_t base_bdev_slot)
1255 {
1256 	struct spdk_bdev_desc *desc;
1257 	int rc;
1258 
1259 	rc = spdk_bdev_open(bdev, true, raid_bdev_remove_base_bdev, bdev, &desc);
1260 	if (rc != 0) {
1261 		SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", bdev->name);
1262 		return rc;
1263 	}
1264 
1265 	rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if);
1266 	if (rc != 0) {
1267 		SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n");
1268 		spdk_bdev_close(desc);
1269 		return rc;
1270 	}
1271 
1272 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s is claimed\n", bdev->name);
1273 
1274 	assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE);
1275 	assert(base_bdev_slot < raid_bdev->num_base_bdevs);
1276 
1277 	raid_bdev->base_bdev_info[base_bdev_slot].bdev = bdev;
1278 	raid_bdev->base_bdev_info[base_bdev_slot].desc = desc;
1279 	raid_bdev->num_base_bdevs_discovered++;
1280 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1281 
1282 	return 0;
1283 }
1284 
1285 /*
1286  * brief:
1287  * If raid bdev config is complete, then only register the raid bdev to
1288  * bdev layer and remove this raid bdev from configuring list and
1289  * insert the raid bdev to configured list
1290  * params:
1291  * raid_bdev - pointer to raid bdev
1292  * returns:
1293  * 0 - success
1294  * non zero - failure
1295  */
1296 static int
1297 raid_bdev_configure(struct raid_bdev *raid_bdev)
1298 {
1299 	uint32_t		blocklen;
1300 	uint64_t		min_blockcnt;
1301 	struct spdk_bdev	*raid_bdev_gen;
1302 	int rc = 0;
1303 
1304 	blocklen = raid_bdev->base_bdev_info[0].bdev->blocklen;
1305 	min_blockcnt = raid_bdev->base_bdev_info[0].bdev->blockcnt;
1306 	for (uint8_t i = 1; i < raid_bdev->num_base_bdevs; i++) {
1307 		/* Calculate minimum block count from all base bdevs */
1308 		if (raid_bdev->base_bdev_info[i].bdev->blockcnt < min_blockcnt) {
1309 			min_blockcnt = raid_bdev->base_bdev_info[i].bdev->blockcnt;
1310 		}
1311 
1312 		/* Check blocklen for all base bdevs that it should be same */
1313 		if (blocklen != raid_bdev->base_bdev_info[i].bdev->blocklen) {
1314 			/*
1315 			 * Assumption is that all the base bdevs for any raid bdev should
1316 			 * have same blocklen
1317 			 */
1318 			SPDK_ERRLOG("Blocklen of various bdevs not matching\n");
1319 			return -EINVAL;
1320 		}
1321 	}
1322 
1323 	/* The strip_size_kb is read in from user in KB. Convert to blocks here for
1324 	 * internal use.
1325 	 */
1326 	raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen;
1327 	raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
1328 	raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
1329 
1330 	raid_bdev_gen = &raid_bdev->bdev;
1331 	raid_bdev_gen->blocklen = blocklen;
1332 	if (raid_bdev->num_base_bdevs > 1) {
1333 		raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size;
1334 		raid_bdev_gen->split_on_optimal_io_boundary = true;
1335 	} else {
1336 		/* Do not need to split reads/writes on single bdev RAID modules. */
1337 		raid_bdev_gen->optimal_io_boundary = 0;
1338 		raid_bdev_gen->split_on_optimal_io_boundary = false;
1339 	}
1340 
1341 	/*
1342 	 * RAID bdev logic is for striping so take the minimum block count based
1343 	 * approach where total block count of raid bdev is the number of base
1344 	 * bdev times the minimum block count of any base bdev
1345 	 */
1346 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "min blockcount %lu,  numbasedev %u, strip size shift %u\n",
1347 		      min_blockcnt,
1348 		      raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
1349 	raid_bdev_gen->blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
1350 				   raid_bdev->strip_size_shift)  * raid_bdev->num_base_bdevs;
1351 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "io device register %p\n", raid_bdev);
1352 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "blockcnt %lu, blocklen %u\n", raid_bdev_gen->blockcnt,
1353 		      raid_bdev_gen->blocklen);
1354 	if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1355 		raid_bdev->state = RAID_BDEV_STATE_ONLINE;
1356 		spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb,
1357 					sizeof(struct raid_bdev_io_channel),
1358 					raid_bdev->bdev.name);
1359 		rc = spdk_bdev_register(raid_bdev_gen);
1360 		if (rc != 0) {
1361 			SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n");
1362 			spdk_io_device_unregister(raid_bdev, NULL);
1363 			raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
1364 			return rc;
1365 		}
1366 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev generic %p\n", raid_bdev_gen);
1367 		TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
1368 		TAILQ_INSERT_TAIL(&g_raid_bdev_configured_list, raid_bdev, state_link);
1369 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev is created with name %s, raid_bdev %p\n",
1370 			      raid_bdev_gen->name, raid_bdev);
1371 	}
1372 
1373 	return 0;
1374 }
1375 
1376 /*
1377  * brief:
1378  * If raid bdev is online and registered, change the bdev state to
1379  * configuring and unregister this raid device. Queue this raid device
1380  * in configuring list
1381  * params:
1382  * raid_bdev - pointer to raid bdev
1383  * cb_fn - callback function
1384  * cb_arg - argument to callback function
1385  * returns:
1386  * none
1387  */
1388 static void
1389 raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn,
1390 		      void *cb_arg)
1391 {
1392 	if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) {
1393 		if (cb_fn) {
1394 			cb_fn(cb_arg, 0);
1395 		}
1396 		return;
1397 	}
1398 
1399 	assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered);
1400 	TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
1401 	raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
1402 	assert(raid_bdev->num_base_bdevs_discovered);
1403 	TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
1404 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev state chaning from online to offline\n");
1405 
1406 	spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg);
1407 }
1408 
1409 /*
1410  * brief:
1411  * raid_bdev_find_by_base_bdev function finds the raid bdev which has
1412  *  claimed the base bdev.
1413  * params:
1414  * base_bdev - pointer to base bdev pointer
1415  * _raid_bdev - Referenct to pointer to raid bdev
1416  * _base_bdev_slot - Reference to the slot of the base bdev.
1417  * returns:
1418  * true - if the raid bdev is found.
1419  * false - if the raid bdev is not found.
1420  */
1421 static bool
1422 raid_bdev_find_by_base_bdev(struct spdk_bdev *base_bdev, struct raid_bdev **_raid_bdev,
1423 			    uint8_t *_base_bdev_slot)
1424 {
1425 	struct raid_bdev	*raid_bdev;
1426 	uint8_t			i;
1427 
1428 	TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
1429 		for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1430 			if (raid_bdev->base_bdev_info[i].bdev == base_bdev) {
1431 				*_raid_bdev = raid_bdev;
1432 				*_base_bdev_slot = i;
1433 				return true;
1434 			}
1435 		}
1436 	}
1437 
1438 	return false;
1439 }
1440 
1441 /*
1442  * brief:
1443  * raid_bdev_remove_base_bdev function is called by below layers when base_bdev
1444  * is removed. This function checks if this base bdev is part of any raid bdev
1445  * or not. If yes, it takes necessary action on that particular raid bdev.
1446  * params:
1447  * ctx - pointer to base bdev pointer which got removed
1448  * returns:
1449  * none
1450  */
1451 static void
1452 raid_bdev_remove_base_bdev(void *ctx)
1453 {
1454 	struct spdk_bdev	*base_bdev = ctx;
1455 	struct raid_bdev	*raid_bdev = NULL;
1456 	uint8_t			base_bdev_slot = 0;
1457 
1458 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_bdev\n");
1459 
1460 	/* Find the raid_bdev which has claimed this base_bdev */
1461 	if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_bdev_slot)) {
1462 		SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name);
1463 		return;
1464 	}
1465 
1466 	assert(raid_bdev->base_bdev_info[base_bdev_slot].desc);
1467 	raid_bdev->base_bdev_info[base_bdev_slot].remove_scheduled = true;
1468 
1469 	if (raid_bdev->destruct_called == true ||
1470 	    raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1471 		/*
1472 		 * As raid bdev is not registered yet or already unregistered,
1473 		 * so cleanup should be done here itself.
1474 		 */
1475 		raid_bdev_free_base_bdev_resource(raid_bdev, base_bdev_slot);
1476 		if (raid_bdev->num_base_bdevs_discovered == 0) {
1477 			/* There is no base bdev for this raid, so free the raid device. */
1478 			raid_bdev_cleanup(raid_bdev);
1479 			return;
1480 		}
1481 	}
1482 
1483 	raid_bdev_deconfigure(raid_bdev, NULL, NULL);
1484 }
1485 
1486 /*
1487  * brief:
1488  * Remove base bdevs from the raid bdev one by one.  Skip any base bdev which
1489  *  doesn't exist.
1490  * params:
1491  * raid_cfg - pointer to raid bdev config.
1492  * cb_fn - callback function
1493  * cb_ctx - argument to callback function
1494  */
1495 void
1496 raid_bdev_remove_base_devices(struct raid_bdev_config *raid_cfg,
1497 			      raid_bdev_destruct_cb cb_fn, void *cb_arg)
1498 {
1499 	struct raid_bdev		*raid_bdev;
1500 	struct raid_base_bdev_info	*info;
1501 	uint8_t				i;
1502 
1503 	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_devices\n");
1504 
1505 	raid_bdev = raid_cfg->raid_bdev;
1506 	if (raid_bdev == NULL) {
1507 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev %s doesn't exist now\n", raid_cfg->name);
1508 		if (cb_fn) {
1509 			cb_fn(cb_arg, 0);
1510 		}
1511 		return;
1512 	}
1513 
1514 	if (raid_bdev->destroy_started) {
1515 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "destroying raid bdev %s is already started\n",
1516 			      raid_cfg->name);
1517 		if (cb_fn) {
1518 			cb_fn(cb_arg, -EALREADY);
1519 		}
1520 		return;
1521 	}
1522 
1523 	raid_bdev->destroy_started = true;
1524 
1525 	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
1526 		info = &raid_bdev->base_bdev_info[i];
1527 
1528 		if (info->bdev == NULL) {
1529 			continue;
1530 		}
1531 
1532 		assert(info->desc);
1533 		info->remove_scheduled = true;
1534 
1535 		if (raid_bdev->destruct_called == true ||
1536 		    raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
1537 			/*
1538 			 * As raid bdev is not registered yet or already unregistered,
1539 			 * so cleanup should be done here itself.
1540 			 */
1541 			raid_bdev_free_base_bdev_resource(raid_bdev, i);
1542 			if (raid_bdev->num_base_bdevs_discovered == 0) {
1543 				/* There is no base bdev for this raid, so free the raid device. */
1544 				raid_bdev_cleanup(raid_bdev);
1545 				if (cb_fn) {
1546 					cb_fn(cb_arg, 0);
1547 				}
1548 				return;
1549 			}
1550 		}
1551 	}
1552 
1553 	raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg);
1554 }
1555 
1556 /*
1557  * brief:
1558  * raid_bdev_add_base_device function is the actual function which either adds
1559  * the nvme base device to existing raid bdev or create a new raid bdev. It also claims
1560  * the base device and keep the open descriptor.
1561  * params:
1562  * raid_cfg - pointer to raid bdev config
1563  * bdev - pointer to base bdev
1564  * base_bdev_slot - position to add base bdev
1565  * returns:
1566  * 0 - success
1567  * non zero - failure
1568  */
1569 static int
1570 raid_bdev_add_base_device(struct raid_bdev_config *raid_cfg, struct spdk_bdev *bdev,
1571 			  uint8_t base_bdev_slot)
1572 {
1573 	struct raid_bdev	*raid_bdev;
1574 	int			rc;
1575 
1576 	raid_bdev = raid_cfg->raid_bdev;
1577 	if (!raid_bdev) {
1578 		SPDK_ERRLOG("Raid bdev '%s' is not created yet\n", raid_cfg->name);
1579 		return -ENODEV;
1580 	}
1581 
1582 	rc = raid_bdev_alloc_base_bdev_resource(raid_bdev, bdev, base_bdev_slot);
1583 	if (rc != 0) {
1584 		SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", bdev->name);
1585 		return rc;
1586 	}
1587 
1588 	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
1589 
1590 	if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) {
1591 		rc = raid_bdev_configure(raid_bdev);
1592 		if (rc != 0) {
1593 			SPDK_ERRLOG("Failed to configure raid bdev\n");
1594 			return rc;
1595 		}
1596 	}
1597 
1598 	return 0;
1599 }
1600 
1601 /*
1602  * brief:
1603  * Add base bdevs to the raid bdev one by one.  Skip any base bdev which doesn't
1604  *  exist or fails to add. If all base bdevs are successfully added, the raid bdev
1605  *  moves to the configured state and becomes available. Otherwise, the raid bdev
1606  *  stays at the configuring state with added base bdevs.
1607  * params:
1608  * raid_cfg - pointer to raid bdev config
1609  * returns:
1610  * 0 - The raid bdev moves to the configured state or stays at the configuring
1611  *     state with added base bdevs due to any nonexistent base bdev.
1612  * non zero - Failed to add any base bdev and stays at the configuring state with
1613  *            added base bdevs.
1614  */
1615 int
1616 raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg)
1617 {
1618 	struct spdk_bdev	*base_bdev;
1619 	uint8_t			i;
1620 	int			rc = 0, _rc;
1621 
1622 	for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
1623 		base_bdev = spdk_bdev_get_by_name(raid_cfg->base_bdev[i].name);
1624 		if (base_bdev == NULL) {
1625 			SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "base bdev %s doesn't exist now\n",
1626 				      raid_cfg->base_bdev[i].name);
1627 			continue;
1628 		}
1629 
1630 		_rc = raid_bdev_add_base_device(raid_cfg, base_bdev, i);
1631 		if (_rc != 0) {
1632 			SPDK_ERRLOG("Failed to add base bdev %s to RAID bdev %s: %s\n",
1633 				    raid_cfg->base_bdev[i].name, raid_cfg->name,
1634 				    spdk_strerror(-_rc));
1635 			if (rc == 0) {
1636 				rc = _rc;
1637 			}
1638 		}
1639 	}
1640 
1641 	return rc;
1642 }
1643 
1644 /*
1645  * brief:
1646  * raid_bdev_examine function is the examine function call by the below layers
1647  * like bdev_nvme layer. This function will check if this base bdev can be
1648  * claimed by this raid bdev or not.
1649  * params:
1650  * bdev - pointer to base bdev
1651  * returns:
1652  * none
1653  */
1654 static void
1655 raid_bdev_examine(struct spdk_bdev *bdev)
1656 {
1657 	struct raid_bdev_config	*raid_cfg;
1658 	uint8_t			base_bdev_slot;
1659 
1660 	if (raid_bdev_can_claim_bdev(bdev->name, &raid_cfg, &base_bdev_slot)) {
1661 		raid_bdev_add_base_device(raid_cfg, bdev, base_bdev_slot);
1662 	} else {
1663 		SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s can't be claimed\n",
1664 			      bdev->name);
1665 	}
1666 
1667 	spdk_bdev_module_examine_done(&g_raid_if);
1668 }
1669 
/* Register the "bdev_raid" log component behind the SPDK_LOG_BDEV_RAID flag used by this module's debug logs */
SPDK_LOG_REGISTER_COMPONENT("bdev_raid", SPDK_LOG_BDEV_RAID)
1672