xref: /spdk/module/bdev/raid/bdev_raid.h (revision 5558f3f5022a22f65bfbb6e3a9fc67602f6d0ca8)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2018 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #ifndef SPDK_BDEV_RAID_INTERNAL_H
7 #define SPDK_BDEV_RAID_INTERNAL_H
8 
9 #include "spdk/bdev_module.h"
10 #include "spdk/uuid.h"
11 
/*
 * Minimum data offset size in bytes. A static assertion further down in this header
 * requires it to be larger than RAID_BDEV_SB_MAX_LENGTH.
 */
#define RAID_BDEV_MIN_DATA_OFFSET_SIZE	(1024*1024) /* 1 MiB */
13 
/* RAID levels known to the raid bdev code. */
enum raid_level {
	/* Not a valid level */
	INVALID_RAID_LEVEL	= -1,
	RAID0			= 0,
	RAID1			= 1,
	RAID5F			= 95, /* 0x5f */
	CONCAT			= 99,
};
21 
/*
 * Raid state describes the state of the raid. A raid bdev can be either in the
 * configured list or in the configuring list.
 */
enum raid_bdev_state {
	/* raid bdev is ready and is seen by upper layers */
	RAID_BDEV_STATE_ONLINE,

	/*
	 * raid bdev is configuring, not all underlying bdevs are present,
	 * and it can't be seen by upper layers.
	 */
	RAID_BDEV_STATE_CONFIGURING,

	/*
	 * In offline state, raid bdev layer will complete all incoming commands without
	 * submitting to underlying base nvme bdevs
	 */
	RAID_BDEV_STATE_OFFLINE,

	/* raid bdev state max, new states should be added before this */
	RAID_BDEV_STATE_MAX
};
45 
/* Types of background process that can run on a raid bdev. */
enum raid_process_type {
	/* no process */
	RAID_PROCESS_NONE,

	/* rebuild process */
	RAID_PROCESS_REBUILD,

	/* process type max, new types should be added before this */
	RAID_PROCESS_MAX
};
51 
/*
 * Callback type for base bdev operations. In this header it is used for base bdev
 * removal and configuration callbacks and by the add/remove base bdev functions.
 *
 * ctx    - context pointer supplied together with the callback
 * status - result of the operation
 *          NOTE(review): presumably 0 on success, negative errno on failure - confirm
 */
typedef void (*raid_base_bdev_cb)(void *ctx, int status);
53 
54 /*
55  * raid_base_bdev_info contains information for the base bdevs which are part of some
56  * raid. This structure contains the per base bdev information. Whatever is
57  * required per base device for raid bdev will be kept here
58  */
59 struct raid_base_bdev_info {
60 	/* The raid bdev that this base bdev belongs to */
61 	struct raid_bdev	*raid_bdev;
62 
63 	/* name of the bdev */
64 	char			*name;
65 
66 	/* uuid of the bdev */
67 	struct spdk_uuid	uuid;
68 
69 	/*
70 	 * Pointer to base bdev descriptor opened by raid bdev. This is NULL when the bdev for
71 	 * this slot is missing.
72 	 */
73 	struct spdk_bdev_desc	*desc;
74 
75 	/* offset in blocks from the start of the base bdev to the start of the data region */
76 	uint64_t		data_offset;
77 
78 	/* size in blocks of the base bdev's data region */
79 	uint64_t		data_size;
80 
81 	/*
82 	 * When underlying base device calls the hot plug function on drive removal,
83 	 * this flag will be set and later after doing some processing, base device
84 	 * descriptor will be closed
85 	 */
86 	bool			remove_scheduled;
87 
88 	/* callback for base bdev removal */
89 	raid_base_bdev_cb	remove_cb;
90 
91 	/* context of the callback */
92 	void			*remove_cb_ctx;
93 
94 	/* Hold the number of blocks to know how large the base bdev is resized. */
95 	uint64_t		blockcnt;
96 
97 	/* io channel for the app thread */
98 	struct spdk_io_channel	*app_thread_ch;
99 
100 	/* Set to true when base bdev has completed the configuration process */
101 	bool			is_configured;
102 
103 	/* Set to true if this base bdev is the target of a background process */
104 	bool			is_process_target;
105 
106 	/* Set to true to indicate that the base bdev is being removed because of a failure */
107 	bool			is_failed;
108 
109 	/* callback for base bdev configuration */
110 	raid_base_bdev_cb	configure_cb;
111 
112 	/* context of the callback */
113 	void			*configure_cb_ctx;
114 };
115 
struct raid_bdev_io;

/*
 * Custom completion callback for a raid_bdev_io (see raid_bdev_io::completion_cb).
 *
 * raid_io - the raid I/O that has completed
 * status  - completion status of the raid I/O
 */
typedef void (*raid_bdev_io_completion_cb)(struct raid_bdev_io *raid_io,
		enum spdk_bdev_io_status status);
119 
120 /*
121  * raid_bdev_io is the context part of bdev_io. It contains the information
122  * related to bdev_io for a raid bdev
123  */
124 struct raid_bdev_io {
125 	/* The raid bdev associated with this IO */
126 	struct raid_bdev *raid_bdev;
127 
128 	uint64_t offset_blocks;
129 	uint64_t num_blocks;
130 	struct iovec *iovs;
131 	int iovcnt;
132 	enum spdk_bdev_io_type type;
133 	struct spdk_memory_domain *memory_domain;
134 	void *memory_domain_ctx;
135 	void *md_buf;
136 
137 	/* WaitQ entry, used only in waitq logic */
138 	struct spdk_bdev_io_wait_entry	waitq_entry;
139 
140 	/* Context of the original channel for this IO */
141 	struct raid_bdev_io_channel	*raid_ch;
142 
143 	/* Used for tracking progress on io requests sent to member disks. */
144 	uint64_t			base_bdev_io_remaining;
145 	uint8_t				base_bdev_io_submitted;
146 	enum spdk_bdev_io_status	base_bdev_io_status;
147 	/* This will be the raid_io completion status unless any base io's status is different. */
148 	enum spdk_bdev_io_status	base_bdev_io_status_default;
149 
150 	/* Private data for the raid module */
151 	void				*module_private;
152 
153 	/* Custom completion callback. Overrides bdev_io completion if set. */
154 	raid_bdev_io_completion_cb	completion_cb;
155 
156 	struct {
157 		uint64_t		offset;
158 		struct iovec		*iov;
159 		struct iovec		iov_copy;
160 	} split;
161 };
162 
163 struct raid_bdev_process_request {
164 	struct raid_bdev_process *process;
165 	struct raid_base_bdev_info *target;
166 	struct spdk_io_channel *target_ch;
167 	uint64_t offset_blocks;
168 	uint32_t num_blocks;
169 	struct iovec iov;
170 	void *md_buf;
171 	/* bdev_io is raid_io's driver_ctx - don't reorder them!
172 	 * These are needed for re-using raid module I/O functions for process I/O. */
173 	struct spdk_bdev_io bdev_io;
174 	struct raid_bdev_io raid_io;
175 	TAILQ_ENTRY(raid_bdev_process_request) link;
176 };
177 
/*
 * Callback type for raid bdev configuration (see raid_bdev::configure_cb).
 *
 * cb_ctx - context pointer supplied together with the callback
 * rc     - result of the configuration
 *          NOTE(review): presumably 0 on success, negative errno on failure - confirm
 */
typedef void (*raid_bdev_configure_cb)(void *cb_ctx, int rc);
179 
180 /*
181  * raid_bdev is the single entity structure which contains SPDK block device
182  * and the information related to any raid bdev either configured or
183  * in configuring list. io device is created on this.
184  */
185 struct raid_bdev {
186 	/* raid bdev device, this will get registered in bdev layer */
187 	struct spdk_bdev		bdev;
188 
189 	/* the raid bdev descriptor, opened for internal use */
190 	struct spdk_bdev_desc		*self_desc;
191 
192 	/* link of raid bdev to link it to global raid bdev list */
193 	TAILQ_ENTRY(raid_bdev)		global_link;
194 
195 	/* array of base bdev info */
196 	struct raid_base_bdev_info	*base_bdev_info;
197 
198 	/* strip size of raid bdev in blocks */
199 	uint32_t			strip_size;
200 
201 	/* strip size of raid bdev in KB */
202 	uint32_t			strip_size_kb;
203 
204 	/* strip size bit shift for optimized calculation */
205 	uint32_t			strip_size_shift;
206 
207 	/* state of raid bdev */
208 	enum raid_bdev_state		state;
209 
210 	/* number of base bdevs comprising raid bdev  */
211 	uint8_t				num_base_bdevs;
212 
213 	/* number of base bdevs discovered */
214 	uint8_t				num_base_bdevs_discovered;
215 
216 	/*
217 	 * Number of operational base bdevs, i.e. how many we know/expect to be working. This
218 	 * will be less than num_base_bdevs when starting a degraded array.
219 	 */
220 	uint8_t				num_base_bdevs_operational;
221 
222 	/* minimum number of viable base bdevs that are required by array to operate */
223 	uint8_t				min_base_bdevs_operational;
224 
225 	/* Raid Level of this raid bdev */
226 	enum raid_level			level;
227 
228 	/* Set to true if destroy of this raid bdev is started. */
229 	bool				destroy_started;
230 
231 	/* Module for RAID-level specific operations */
232 	struct raid_bdev_module		*module;
233 
234 	/* Private data for the raid module */
235 	void				*module_private;
236 
237 	/* Superblock */
238 	bool				superblock_enabled;
239 	struct raid_bdev_superblock	*sb;
240 
241 	/* Superblock buffer used for I/O */
242 	void				*sb_io_buf;
243 	uint32_t			sb_io_buf_size;
244 
245 	/* Raid bdev background process, e.g. rebuild */
246 	struct raid_bdev_process	*process;
247 
248 	/* Callback and context for raid_bdev configuration */
249 	raid_bdev_configure_cb		configure_cb;
250 	void				*configure_cb_ctx;
251 };
252 
/*
 * Iterate over every entry of a raid bdev's base_bdev_info array.
 *
 * r - pointer to a structure with base_bdev_info and num_base_bdevs members
 *     (struct raid_bdev)
 * i - pointer lvalue of the array's element type, used as the loop cursor
 *
 * Both arguments are parenthesized so that arbitrary expressions can be passed.
 * They are still evaluated more than once, so they must not have side effects.
 */
#define RAID_FOR_EACH_BASE_BDEV(r, i) \
	for ((i) = (r)->base_bdev_info; (i) < (r)->base_bdev_info + (r)->num_base_bdevs; (i)++)
255 
struct raid_bdev_io_channel;

/* TAIL head for raid bdev list */
TAILQ_HEAD(raid_all_tailq, raid_bdev);

/* Global list of raid bdevs; entries are linked through raid_bdev::global_link */
extern struct raid_all_tailq		g_raid_bdev_list;
262 
263 typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc);
264 
265 int raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs,
266 		     enum raid_level level, bool superblock, const struct spdk_uuid *uuid,
267 		     struct raid_bdev **raid_bdev_out);
268 void raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_ctx);
269 int raid_bdev_add_base_bdev(struct raid_bdev *raid_bdev, const char *name,
270 			    raid_base_bdev_cb cb_fn, void *cb_ctx);
271 struct raid_bdev *raid_bdev_find_by_name(const char *name);
272 enum raid_level raid_bdev_str_to_level(const char *str);
273 const char *raid_bdev_level_to_str(enum raid_level level);
274 enum raid_bdev_state raid_bdev_str_to_state(const char *str);
275 const char *raid_bdev_state_to_str(enum raid_bdev_state state);
276 const char *raid_bdev_process_to_str(enum raid_process_type value);
277 void raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w);
278 int raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_base_bdev_cb cb_fn, void *cb_ctx);
279 
280 /*
281  * RAID module descriptor
282  */
283 struct raid_bdev_module {
284 	/* RAID level implemented by this module */
285 	enum raid_level level;
286 
287 	/* Minimum required number of base bdevs. Must be > 0. */
288 	uint8_t base_bdevs_min;
289 
290 	/*
291 	 * RAID constraint. Determines number of base bdevs that can be removed
292 	 * without failing the array.
293 	 */
294 	struct {
295 		enum {
296 			CONSTRAINT_UNSET = 0,
297 			CONSTRAINT_MAX_BASE_BDEVS_REMOVED,
298 			CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL,
299 		} type;
300 		uint8_t value;
301 	} base_bdevs_constraint;
302 
303 	/* Set to true if this module supports memory domains. */
304 	bool memory_domains_supported;
305 
306 	/* Set to true if this module supports DIF/DIX */
307 	bool dif_supported;
308 
309 	/*
310 	 * Called when the raid is starting, right before changing the state to
311 	 * online and registering the bdev. Parameters of the bdev like blockcnt
312 	 * should be set here.
313 	 *
314 	 * Non-zero return value will abort the startup process.
315 	 */
316 	int (*start)(struct raid_bdev *raid_bdev);
317 
318 	/*
319 	 * Called when the raid is stopping, right before changing the state to
320 	 * offline and unregistering the bdev. Optional.
321 	 *
322 	 * The function should return false if it is asynchronous. Then, after
323 	 * the async operation has completed and the module is fully stopped
324 	 * raid_bdev_module_stop_done() must be called.
325 	 */
326 	bool (*stop)(struct raid_bdev *raid_bdev);
327 
328 	/* Handler for R/W requests */
329 	void (*submit_rw_request)(struct raid_bdev_io *raid_io);
330 
331 	/* Handler for requests without payload (flush, unmap). Optional. */
332 	void (*submit_null_payload_request)(struct raid_bdev_io *raid_io);
333 
334 	/*
335 	 * Called when the bdev's IO channel is created to get the module's private IO channel.
336 	 * Optional.
337 	 */
338 	struct spdk_io_channel *(*get_io_channel)(struct raid_bdev *raid_bdev);
339 
340 	/*
341 	 * Called when a base_bdev is resized to resize the raid if the condition
342 	 * is satisfied. Optional.
343 	 *
344 	 * Returns true if the resize was performed.
345 	 */
346 	bool (*resize)(struct raid_bdev *raid_bdev);
347 
348 	/* Handler for raid process requests. Required for raid modules with redundancy. */
349 	int (*submit_process_request)(struct raid_bdev_process_request *process_req,
350 				      struct raid_bdev_io_channel *raid_ch);
351 
352 	TAILQ_ENTRY(raid_bdev_module) link;
353 };
354 
/* Add a raid module descriptor to the raid module list. */
void raid_bdev_module_list_add(struct raid_bdev_module *raid_module);

/*
 * Helpers producing a per-line function name (raid_module_register_<line>).
 * Two levels are needed so that __LINE__ is expanded before token pasting.
 */
#define __RAID_MODULE_REGISTER(line) __RAID_MODULE_REGISTER_(line)
#define __RAID_MODULE_REGISTER_(line) raid_module_register_##line

/*
 * Register a raid module. Expands to a static constructor function that passes
 * _module (a struct raid_bdev_module pointer) to raid_bdev_module_list_add().
 * The function name is derived from __LINE__, so at most one use per source
 * line is possible within a translation unit.
 */
#define RAID_MODULE_REGISTER(_module)					\
__attribute__((constructor)) static void				\
__RAID_MODULE_REGISTER(__LINE__)(void)					\
{									\
	raid_bdev_module_list_add(_module);				\
}
366 
367 bool raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed,
368 				enum spdk_bdev_io_status status);
369 void raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev,
370 			     struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn);
371 void raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status);
372 void raid_bdev_module_stop_done(struct raid_bdev *raid_bdev);
373 struct spdk_io_channel *raid_bdev_channel_get_base_channel(struct raid_bdev_io_channel *raid_ch,
374 		uint8_t idx);
375 void *raid_bdev_channel_get_module_ctx(struct raid_bdev_io_channel *raid_ch);
376 struct raid_base_bdev_info *raid_bdev_channel_get_base_info(struct raid_bdev_io_channel *raid_ch,
377 		struct spdk_bdev *base_bdev);
378 void raid_bdev_process_request_complete(struct raid_bdev_process_request *process_req, int status);
379 void raid_bdev_io_init(struct raid_bdev_io *raid_io, struct raid_bdev_io_channel *raid_ch,
380 		       enum spdk_bdev_io_type type, uint64_t offset_blocks,
381 		       uint64_t num_blocks, struct iovec *iovs, int iovcnt, void *md_buf,
382 		       struct spdk_memory_domain *memory_domain, void *memory_domain_ctx);
383 void raid_bdev_fail_base_bdev(struct raid_base_bdev_info *base_info);
384 
385 static inline uint8_t
386 raid_bdev_base_bdev_slot(struct raid_base_bdev_info *base_info)
387 {
388 	return base_info - base_info->raid_bdev->base_bdev_info;
389 }
390 
391 static inline void
392 raid_bdev_io_set_default_status(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status)
393 {
394 	assert(raid_io->base_bdev_io_submitted == 0);
395 	raid_io->base_bdev_io_status = status;
396 	raid_io->base_bdev_io_status_default = status;
397 }
398 
/*
 * DIX reference tag helpers operating on a separate metadata buffer.
 *
 * NOTE(review): summaries are derived from the names and signatures; confirm
 * return codes and exact behavior against the implementation. Both functions
 * take the block offset as a uint32_t.
 */

/* Remap the reference tags in md_buf for num_blocks blocks to remapped_offset. */
int raid_bdev_remap_dix_reftag(void *md_buf, uint64_t num_blocks,
			       struct spdk_bdev *bdev, uint32_t remapped_offset);
/* Verify the reference tags in md_buf for the data described by iovs/iovcnt. */
int raid_bdev_verify_dix_reftag(struct iovec *iovs, int iovcnt, void *md_buf,
				uint64_t num_blocks, struct spdk_bdev *bdev, uint32_t offset_blocks);
403 
404 /**
405  * Raid bdev I/O read/write wrapper for spdk_bdev_readv_blocks_ext function.
406  */
407 static inline int
408 raid_bdev_readv_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
409 			   struct iovec *iov, int iovcnt, uint64_t offset_blocks,
410 			   uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
411 			   struct spdk_bdev_ext_io_opts *opts)
412 {
413 	return spdk_bdev_readv_blocks_ext(base_info->desc, ch, iov, iovcnt,
414 					  base_info->data_offset + offset_blocks, num_blocks, cb, cb_arg, opts);
415 }
416 
417 /**
418  * Raid bdev I/O read/write wrapper for spdk_bdev_writev_blocks_ext function.
419  */
420 static inline int
421 raid_bdev_writev_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
422 			    struct iovec *iov, int iovcnt, uint64_t offset_blocks,
423 			    uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
424 			    struct spdk_bdev_ext_io_opts *opts)
425 {
426 	int rc;
427 	uint64_t remapped_offset_blocks = base_info->data_offset + offset_blocks;
428 
429 	if (spdk_unlikely(spdk_bdev_get_dif_type(&base_info->raid_bdev->bdev) != SPDK_DIF_DISABLE &&
430 			  (base_info->raid_bdev->bdev.dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK))) {
431 		rc = raid_bdev_remap_dix_reftag(opts->metadata, num_blocks, &base_info->raid_bdev->bdev,
432 						remapped_offset_blocks);
433 		if (rc != 0) {
434 			return rc;
435 		}
436 	}
437 
438 	return spdk_bdev_writev_blocks_ext(base_info->desc, ch, iov, iovcnt,
439 					   remapped_offset_blocks, num_blocks, cb, cb_arg, opts);
440 }
441 
442 /**
443  * Raid bdev I/O read/write wrapper for spdk_bdev_unmap_blocks function.
444  */
445 static inline int
446 raid_bdev_unmap_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
447 		       uint64_t offset_blocks, uint64_t num_blocks,
448 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
449 {
450 	return spdk_bdev_unmap_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks,
451 				      num_blocks, cb, cb_arg);
452 }
453 
454 /**
455  * Raid bdev I/O read/write wrapper for spdk_bdev_flush_blocks function.
456  */
457 static inline int
458 raid_bdev_flush_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
459 		       uint64_t offset_blocks, uint64_t num_blocks,
460 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
461 {
462 	return spdk_bdev_flush_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks,
463 				      num_blocks, cb, cb_arg);
464 }
465 
/*
 * Definitions related to raid bdev superblock
 */

/* Superblock version numbers (see raid_bdev_superblock::version) */
#define RAID_BDEV_SB_VERSION_MAJOR	1
#define RAID_BDEV_SB_VERSION_MINOR	0

/* Size in bytes of the raid bdev name field in the superblock */
#define RAID_BDEV_SB_NAME_SIZE		64
474 
/*
 * State of a base bdev as recorded in the superblock
 * (see raid_bdev_sb_base_bdev::state).
 */
enum raid_bdev_sb_base_bdev_state {
	RAID_SB_BASE_BDEV_MISSING	= 0,
	RAID_SB_BASE_BDEV_CONFIGURED	= 1,
	RAID_SB_BASE_BDEV_FAILED	= 2,
	RAID_SB_BASE_BDEV_SPARE		= 3,
};
481 
482 struct raid_bdev_sb_base_bdev {
483 	/* uuid of the base bdev */
484 	struct spdk_uuid	uuid;
485 	/* offset in blocks from base device start to the start of raid data area */
486 	uint64_t		data_offset;
487 	/* size in blocks of the base device raid data area */
488 	uint64_t		data_size;
489 	/* state of the base bdev */
490 	uint32_t		state;
491 	/* feature/status flags */
492 	uint32_t		flags;
493 	/* slot number of this base bdev in the raid */
494 	uint8_t			slot;
495 
496 	uint8_t			reserved[23];
497 };
498 SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_sb_base_bdev) == 64, "incorrect size");
499 
500 struct raid_bdev_superblock {
501 #define RAID_BDEV_SB_SIG "SPDKRAID"
502 	uint8_t			signature[8];
503 	struct {
504 		/* incremented when a breaking change in the superblock structure is made */
505 		uint16_t	major;
506 		/* incremented for changes in the superblock that are backward compatible */
507 		uint16_t	minor;
508 	} version;
509 	/* length in bytes of the entire superblock */
510 	uint32_t		length;
511 	/* crc32c checksum of the entire superblock */
512 	uint32_t		crc;
513 	/* feature/status flags */
514 	uint32_t		flags;
515 	/* unique id of the raid bdev */
516 	struct spdk_uuid	uuid;
517 	/* name of the raid bdev */
518 	uint8_t			name[RAID_BDEV_SB_NAME_SIZE];
519 	/* size of the raid bdev in blocks */
520 	uint64_t		raid_size;
521 	/* the raid bdev block size - must be the same for all base bdevs */
522 	uint32_t		block_size;
523 	/* the raid level */
524 	uint32_t		level;
525 	/* strip (chunk) size in blocks */
526 	uint32_t		strip_size;
527 	/* state of the raid */
528 	uint32_t		state;
529 	/* sequence number, incremented on every superblock update */
530 	uint64_t		seq_number;
531 	/* number of raid base devices */
532 	uint8_t			num_base_bdevs;
533 
534 	uint8_t			reserved[118];
535 
536 	/* size of the base bdevs array */
537 	uint8_t			base_bdevs_size;
538 	/* array of base bdev descriptors */
539 	struct raid_bdev_sb_base_bdev base_bdevs[];
540 };
541 SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_superblock) == 256, "incorrect size");
542 
543 #define RAID_BDEV_SB_MAX_LENGTH (sizeof(struct raid_bdev_superblock) + UINT8_MAX * sizeof(struct raid_bdev_sb_base_bdev))
544 
545 SPDK_STATIC_ASSERT(RAID_BDEV_SB_MAX_LENGTH < RAID_BDEV_MIN_DATA_OFFSET_SIZE,
546 		   "Incorrect min data offset");
547 
/*
 * Superblock callbacks and functions.
 *
 * NOTE(review): the one-line summaries below are derived from the names and
 * signatures only; confirm details (return codes, buffer ownership) against
 * the implementation.
 */

/* Callback type for raid_bdev_write_superblock(); ctx is the caller's cb_ctx. */
typedef void (*raid_bdev_write_sb_cb)(int status, struct raid_bdev *raid_bdev, void *ctx);
/* Callback type for raid_bdev_load_base_bdev_superblock(); ctx is the caller's cb_ctx. */
typedef void (*raid_bdev_load_sb_cb)(const struct raid_bdev_superblock *sb, int status, void *ctx);

/* Allocate the superblock of a raid bdev for the given block size. */
int raid_bdev_alloc_superblock(struct raid_bdev *raid_bdev, uint32_t block_size);
/* Free the superblock of a raid bdev. */
void raid_bdev_free_superblock(struct raid_bdev *raid_bdev);
/* Initialize the superblock of a raid bdev. */
void raid_bdev_init_superblock(struct raid_bdev *raid_bdev);
/* Write the superblock of a raid bdev; cb is called with cb_ctx. */
void raid_bdev_write_superblock(struct raid_bdev *raid_bdev, raid_bdev_write_sb_cb cb,
				void *cb_ctx);
/* Load the superblock from a base bdev using desc and ch; cb is called with cb_ctx. */
int raid_bdev_load_base_bdev_superblock(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
					raid_bdev_load_sb_cb cb, void *cb_ctx);
558 
/* Options of the raid bdev module, see raid_bdev_get_opts() and raid_bdev_set_opts(). */
struct spdk_raid_bdev_opts {
	/* Size of the background process window in KiB */
	uint32_t process_window_size_kb;
	/* Maximum bandwidth in MiB to process per second */
	uint32_t process_max_bandwidth_mb_sec;
};
565 
/*
 * Get and set the raid bdev module options.
 * NOTE(review): presumably raid_bdev_get_opts() fills *opts with the current
 * options and raid_bdev_set_opts() returns 0 on success - confirm against the
 * implementation.
 */
void raid_bdev_get_opts(struct spdk_raid_bdev_opts *opts);
int raid_bdev_set_opts(const struct spdk_raid_bdev_opts *opts);
568 
569 #endif /* SPDK_BDEV_RAID_INTERNAL_H */
570