/* xref: /spdk/module/bdev/raid/bdev_raid.h (revision ad5fc351dd221a287cce269ad0e50b11253cc48b) */
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2018 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #ifndef SPDK_BDEV_RAID_INTERNAL_H
7 #define SPDK_BDEV_RAID_INTERNAL_H
8 
9 #include "spdk/bdev_module.h"
10 #include "spdk/uuid.h"
11 
/*
 * Minimum data offset size, in bytes (1 MiB). A static assertion further down
 * in this header requires RAID_BDEV_SB_MAX_LENGTH to be smaller than this.
 */
#define RAID_BDEV_MIN_DATA_OFFSET_SIZE	(1024 * 1024)
13 
/*
 * RAID level identifiers. The numeric values are explicit; note that RAID5F
 * is 95 decimal, i.e. 0x5f.
 */
enum raid_level {
	/* Sentinel for "no valid level" */
	INVALID_RAID_LEVEL	= -1,
	RAID0			= 0,
	RAID1			= 1,
	RAID5F			= 95, /* 0x5f */
	CONCAT			= 99,
};
21 
/*
 * State of a raid bdev.
 *
 * RAID_BDEV_STATE_MAX is not a real state; it only marks the number of states.
 */
enum raid_bdev_state {
	/* raid bdev is ready and is seen by upper layers */
	RAID_BDEV_STATE_ONLINE,

	/*
	 * raid bdev is configuring, not all underlying bdevs are present.
	 * It can't be seen by upper layers.
	 */
	RAID_BDEV_STATE_CONFIGURING,

	/*
	 * In offline state, raid bdev layer will complete all incoming commands without
	 * submitting them to the underlying base bdevs
	 */
	RAID_BDEV_STATE_OFFLINE,

	/* raid bdev state max, new states should be added before this */
	RAID_BDEV_STATE_MAX
};
45 
/*
 * Callback type for base bdev removal. It is accepted by
 * raid_bdev_remove_base_bdev() and stored in raid_base_bdev_info.remove_cb.
 *
 * ctx    - caller-supplied context pointer
 * status - result of the removal (presumably 0 on success, confirm in bdev_raid.c)
 */
typedef void (*raid_bdev_remove_base_bdev_cb)(void *ctx, int status);
47 
48 /*
49  * raid_base_bdev_info contains information for the base bdevs which are part of some
50  * raid. This structure contains the per base bdev information. Whatever is
51  * required per base device for raid bdev will be kept here
52  */
53 struct raid_base_bdev_info {
54 	/* The raid bdev that this base bdev belongs to */
55 	struct raid_bdev	*raid_bdev;
56 
57 	/* name of the bdev */
58 	char			*name;
59 
60 	/* uuid of the bdev */
61 	struct spdk_uuid	uuid;
62 
63 	/*
64 	 * Pointer to base bdev descriptor opened by raid bdev. This is NULL when the bdev for
65 	 * this slot is missing.
66 	 */
67 	struct spdk_bdev_desc	*desc;
68 
69 	/* offset in blocks from the start of the base bdev to the start of the data region */
70 	uint64_t		data_offset;
71 
72 	/* size in blocks of the base bdev's data region */
73 	uint64_t		data_size;
74 
75 	/*
76 	 * When underlying base device calls the hot plug function on drive removal,
77 	 * this flag will be set and later after doing some processing, base device
78 	 * descriptor will be closed
79 	 */
80 	bool			remove_scheduled;
81 
82 	/* callback for base bdev removal */
83 	raid_bdev_remove_base_bdev_cb remove_cb;
84 
85 	/* context of the callback */
86 	void			*remove_cb_ctx;
87 
88 	/* Hold the number of blocks to know how large the base bdev is resized. */
89 	uint64_t		blockcnt;
90 
91 	/* io channel for the app thread */
92 	struct spdk_io_channel	*app_thread_ch;
93 
94 	/* Set to true when base bdev has completed the configuration process */
95 	bool			is_configured;
96 };
97 
struct raid_bdev_io;

/*
 * Custom completion callback for a raid_bdev_io. When stored in
 * raid_bdev_io.completion_cb it overrides the bdev_io completion.
 *
 * raid_io - the completed raid I/O
 * status  - completion status of the I/O
 */
typedef void (*raid_bdev_io_completion_cb)(struct raid_bdev_io *raid_io,
		enum spdk_bdev_io_status status);
101 
102 /*
103  * raid_bdev_io is the context part of bdev_io. It contains the information
104  * related to bdev_io for a raid bdev
105  */
106 struct raid_bdev_io {
107 	/* The raid bdev associated with this IO */
108 	struct raid_bdev *raid_bdev;
109 
110 	uint64_t offset_blocks;
111 	uint64_t num_blocks;
112 	struct iovec *iovs;
113 	int iovcnt;
114 	enum spdk_bdev_io_type type;
115 	struct spdk_memory_domain *memory_domain;
116 	void *memory_domain_ctx;
117 	void *md_buf;
118 
119 	/* WaitQ entry, used only in waitq logic */
120 	struct spdk_bdev_io_wait_entry	waitq_entry;
121 
122 	/* Context of the original channel for this IO */
123 	struct raid_bdev_io_channel	*raid_ch;
124 
125 	/* Used for tracking progress on io requests sent to member disks. */
126 	uint64_t			base_bdev_io_remaining;
127 	uint8_t				base_bdev_io_submitted;
128 	enum spdk_bdev_io_status	base_bdev_io_status;
129 
130 	/* Private data for the raid module */
131 	void				*module_private;
132 
133 	/* Custom completion callback. Overrides bdev_io completion if set. */
134 	raid_bdev_io_completion_cb	completion_cb;
135 };
136 
137 /*
138  * raid_bdev is the single entity structure which contains SPDK block device
139  * and the information related to any raid bdev either configured or
140  * in configuring list. io device is created on this.
141  */
142 struct raid_bdev {
143 	/* raid bdev device, this will get registered in bdev layer */
144 	struct spdk_bdev		bdev;
145 
146 	/* link of raid bdev to link it to global raid bdev list */
147 	TAILQ_ENTRY(raid_bdev)		global_link;
148 
149 	/* array of base bdev info */
150 	struct raid_base_bdev_info	*base_bdev_info;
151 
152 	/* lock to protect the base bdev array */
153 	struct spdk_spinlock		base_bdev_lock;
154 
155 	/* strip size of raid bdev in blocks */
156 	uint32_t			strip_size;
157 
158 	/* strip size of raid bdev in KB */
159 	uint32_t			strip_size_kb;
160 
161 	/* strip size bit shift for optimized calculation */
162 	uint32_t			strip_size_shift;
163 
164 	/* block length bit shift for optimized calculation */
165 	uint32_t			blocklen_shift;
166 
167 	/* state of raid bdev */
168 	enum raid_bdev_state		state;
169 
170 	/* number of base bdevs comprising raid bdev  */
171 	uint8_t				num_base_bdevs;
172 
173 	/* number of base bdevs discovered */
174 	uint8_t				num_base_bdevs_discovered;
175 
176 	/*
177 	 * Number of operational base bdevs, i.e. how many we know/expect to be working. This
178 	 * will be less than num_base_bdevs when starting a degraded array.
179 	 */
180 	uint8_t				num_base_bdevs_operational;
181 
182 	/* minimum number of viable base bdevs that are required by array to operate */
183 	uint8_t				min_base_bdevs_operational;
184 
185 	/* Raid Level of this raid bdev */
186 	enum raid_level			level;
187 
188 	/* Set to true if destroy of this raid bdev is started. */
189 	bool				destroy_started;
190 
191 	/* Module for RAID-level specific operations */
192 	struct raid_bdev_module		*module;
193 
194 	/* Private data for the raid module */
195 	void				*module_private;
196 
197 	/* Superblock */
198 	struct raid_bdev_superblock	*sb;
199 };
200 
/*
 * Iterate over all num_base_bdevs entries of a raid bdev's base_bdev_info array.
 *
 * r - pointer to the raid bdev (any pointer expression, e.g. &array[0])
 * i - iterator variable pointing at the current base bdev info entry; after
 *     the loop runs to completion it points one past the last entry
 *
 * Both arguments are evaluated multiple times, so they must not have side
 * effects. They are parenthesized so that compound expressions bind correctly.
 */
#define RAID_FOR_EACH_BASE_BDEV(r, i) \
	for ((i) = (r)->base_bdev_info; (i) < (r)->base_bdev_info + (r)->num_base_bdevs; (i)++)
203 
struct raid_bdev_io_channel;

/* Tail queue head type for the raid bdev list; entries are struct raid_bdev */
TAILQ_HEAD(raid_all_tailq, raid_bdev);

/* Global raid bdev list (defined outside this header) */
extern struct raid_all_tailq		g_raid_bdev_list;
210 
211 typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc);
212 
213 int raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs,
214 		     enum raid_level level, bool superblock, const struct spdk_uuid *uuid,
215 		     struct raid_bdev **raid_bdev_out);
216 void raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_ctx);
217 int raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot);
218 struct raid_bdev *raid_bdev_find_by_name(const char *name);
219 enum raid_level raid_bdev_str_to_level(const char *str);
220 const char *raid_bdev_level_to_str(enum raid_level level);
221 enum raid_bdev_state raid_bdev_str_to_state(const char *str);
222 const char *raid_bdev_state_to_str(enum raid_bdev_state state);
223 void raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w);
224 int raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_bdev_remove_base_bdev_cb cb_fn,
225 			       void *cb_ctx);
226 struct spdk_io_channel *raid_bdev_channel_get_base_channel(struct raid_bdev_io_channel *raid_ch,
227 		uint8_t idx);
228 void *raid_bdev_channel_get_module_ctx(struct raid_bdev_io_channel *raid_ch);
229 
230 /*
231  * RAID module descriptor
232  */
233 struct raid_bdev_module {
234 	/* RAID level implemented by this module */
235 	enum raid_level level;
236 
237 	/* Minimum required number of base bdevs. Must be > 0. */
238 	uint8_t base_bdevs_min;
239 
240 	/*
241 	 * RAID constraint. Determines number of base bdevs that can be removed
242 	 * without failing the array.
243 	 */
244 	struct {
245 		enum {
246 			CONSTRAINT_UNSET = 0,
247 			CONSTRAINT_MAX_BASE_BDEVS_REMOVED,
248 			CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL,
249 		} type;
250 		uint8_t value;
251 	} base_bdevs_constraint;
252 
253 	/* Set to true if this module supports memory domains. */
254 	bool memory_domains_supported;
255 
256 	/*
257 	 * Called when the raid is starting, right before changing the state to
258 	 * online and registering the bdev. Parameters of the bdev like blockcnt
259 	 * should be set here.
260 	 *
261 	 * Non-zero return value will abort the startup process.
262 	 */
263 	int (*start)(struct raid_bdev *raid_bdev);
264 
265 	/*
266 	 * Called when the raid is stopping, right before changing the state to
267 	 * offline and unregistering the bdev. Optional.
268 	 *
269 	 * The function should return false if it is asynchronous. Then, after
270 	 * the async operation has completed and the module is fully stopped
271 	 * raid_bdev_module_stop_done() must be called.
272 	 */
273 	bool (*stop)(struct raid_bdev *raid_bdev);
274 
275 	/* Handler for R/W requests */
276 	void (*submit_rw_request)(struct raid_bdev_io *raid_io);
277 
278 	/* Handler for requests without payload (flush, unmap). Optional. */
279 	void (*submit_null_payload_request)(struct raid_bdev_io *raid_io);
280 
281 	/*
282 	 * Called when the bdev's IO channel is created to get the module's private IO channel.
283 	 * Optional.
284 	 */
285 	struct spdk_io_channel *(*get_io_channel)(struct raid_bdev *raid_bdev);
286 
287 	/*
288 	 * Called when a base_bdev is resized to resize the raid if the condition
289 	 * is satisfied.
290 	 */
291 	void (*resize)(struct raid_bdev *raid_bdev);
292 
293 	TAILQ_ENTRY(raid_bdev_module) link;
294 };
295 
/* Add a raid module descriptor to the module list; called by RAID_MODULE_REGISTER(). */
void raid_bdev_module_list_add(struct raid_bdev_module *raid_module);

/*
 * Helpers building the name raid_module_register_<line>. Two levels are needed
 * so that __LINE__ is expanded to its number before being pasted with ##.
 */
#define __RAID_MODULE_REGISTER(line) __RAID_MODULE_REGISTER_(line)
#define __RAID_MODULE_REGISTER_(line) raid_module_register_##line

/*
 * Define a static constructor function that passes _module (a pointer to a
 * struct raid_bdev_module) to raid_bdev_module_list_add(). The function name
 * embeds the source line number, so at most one use per source line is possible.
 */
#define RAID_MODULE_REGISTER(_module)					\
__attribute__((constructor)) static void				\
__RAID_MODULE_REGISTER(__LINE__)(void)					\
{									\
	raid_bdev_module_list_add(_module);				\
}
307 
308 bool raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed,
309 				enum spdk_bdev_io_status status);
310 void raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev,
311 			     struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn);
312 void raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status);
313 void raid_bdev_module_stop_done(struct raid_bdev *raid_bdev);
314 void raid_bdev_io_init(struct raid_bdev_io *raid_io, struct raid_bdev_io_channel *raid_ch,
315 		       enum spdk_bdev_io_type type, uint64_t offset_blocks,
316 		       uint64_t num_blocks, struct iovec *iovs, int iovcnt, void *md_buf,
317 		       struct spdk_memory_domain *memory_domain, void *memory_domain_ctx);
318 
319 static inline uint8_t
320 raid_bdev_base_bdev_slot(struct raid_base_bdev_info *base_info)
321 {
322 	return base_info - base_info->raid_bdev->base_bdev_info;
323 }
324 
325 /**
326  * Raid bdev I/O read/write wrapper for spdk_bdev_readv_blocks_ext function.
327  */
328 static inline int
329 raid_bdev_readv_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
330 			   struct iovec *iov, int iovcnt, uint64_t offset_blocks,
331 			   uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
332 			   struct spdk_bdev_ext_io_opts *opts)
333 {
334 	return spdk_bdev_readv_blocks_ext(base_info->desc, ch, iov, iovcnt,
335 					  base_info->data_offset + offset_blocks, num_blocks, cb, cb_arg, opts);
336 }
337 
338 /**
339  * Raid bdev I/O read/write wrapper for spdk_bdev_writev_blocks_ext function.
340  */
341 static inline int
342 raid_bdev_writev_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
343 			    struct iovec *iov, int iovcnt, uint64_t offset_blocks,
344 			    uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
345 			    struct spdk_bdev_ext_io_opts *opts)
346 {
347 	return spdk_bdev_writev_blocks_ext(base_info->desc, ch, iov, iovcnt,
348 					   base_info->data_offset + offset_blocks, num_blocks, cb, cb_arg, opts);
349 }
350 
351 /**
352  * Raid bdev I/O read/write wrapper for spdk_bdev_unmap_blocks function.
353  */
354 static inline int
355 raid_bdev_unmap_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
356 		       uint64_t offset_blocks, uint64_t num_blocks,
357 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
358 {
359 	return spdk_bdev_unmap_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks,
360 				      num_blocks, cb, cb_arg);
361 }
362 
363 /**
364  * Raid bdev I/O read/write wrapper for spdk_bdev_flush_blocks function.
365  */
366 static inline int
367 raid_bdev_flush_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
368 		       uint64_t offset_blocks, uint64_t num_blocks,
369 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
370 {
371 	return spdk_bdev_flush_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks,
372 				      num_blocks, cb, cb_arg);
373 }
374 
/*
 * Definitions related to raid bdev superblock
 */

/* Superblock format version, see raid_bdev_superblock.version */
#define RAID_BDEV_SB_VERSION_MAJOR	1
#define RAID_BDEV_SB_VERSION_MINOR	0

/* Size in bytes of the raid bdev name field in the superblock */
#define RAID_BDEV_SB_NAME_SIZE		64
383 
/*
 * State of a base bdev. The values are explicit; presumably they are what is
 * stored in raid_bdev_sb_base_bdev.state (verify against the superblock code),
 * in which case they must not be renumbered.
 */
enum raid_bdev_sb_base_bdev_state {
	RAID_SB_BASE_BDEV_MISSING	= 0,
	RAID_SB_BASE_BDEV_CONFIGURED	= 1,
	RAID_SB_BASE_BDEV_FAILED	= 2,
	RAID_SB_BASE_BDEV_SPARE		= 3,
};
390 
391 struct raid_bdev_sb_base_bdev {
392 	/* uuid of the base bdev */
393 	struct spdk_uuid	uuid;
394 	/* offset in blocks from base device start to the start of raid data area */
395 	uint64_t		data_offset;
396 	/* size in blocks of the base device raid data area */
397 	uint64_t		data_size;
398 	/* state of the base bdev */
399 	uint32_t		state;
400 	/* feature/status flags */
401 	uint32_t		flags;
402 	/* slot number of this base bdev in the raid */
403 	uint8_t			slot;
404 
405 	uint8_t			reserved[23];
406 };
407 SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_sb_base_bdev) == 64, "incorrect size");
408 
409 struct raid_bdev_superblock {
410 #define RAID_BDEV_SB_SIG "SPDKRAID"
411 	uint8_t			signature[8];
412 	struct {
413 		/* incremented when a breaking change in the superblock structure is made */
414 		uint16_t	major;
415 		/* incremented for changes in the superblock that are backward compatible */
416 		uint16_t	minor;
417 	} version;
418 	/* length in bytes of the entire superblock */
419 	uint32_t		length;
420 	/* crc32c checksum of the entire superblock */
421 	uint32_t		crc;
422 	/* feature/status flags */
423 	uint32_t		flags;
424 	/* unique id of the raid bdev */
425 	struct spdk_uuid	uuid;
426 	/* name of the raid bdev */
427 	uint8_t			name[RAID_BDEV_SB_NAME_SIZE];
428 	/* size of the raid bdev in blocks */
429 	uint64_t		raid_size;
430 	/* the raid bdev block size - must be the same for all base bdevs */
431 	uint32_t		block_size;
432 	/* the raid level */
433 	uint32_t		level;
434 	/* strip (chunk) size in blocks */
435 	uint32_t		strip_size;
436 	/* state of the raid */
437 	uint32_t		state;
438 	/* sequence number, incremented on every superblock update */
439 	uint64_t		seq_number;
440 	/* number of raid base devices */
441 	uint8_t			num_base_bdevs;
442 
443 	uint8_t			reserved[118];
444 
445 	/* size of the base bdevs array */
446 	uint8_t			base_bdevs_size;
447 	/* array of base bdev descriptors */
448 	struct raid_bdev_sb_base_bdev base_bdevs[];
449 };
450 SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_superblock) == 256, "incorrect size");
451 
452 #define RAID_BDEV_SB_MAX_LENGTH \
453 	SPDK_ALIGN_CEIL((sizeof(struct raid_bdev_superblock) + UINT8_MAX * sizeof(struct raid_bdev_sb_base_bdev)), 0x1000)
454 
455 SPDK_STATIC_ASSERT(RAID_BDEV_SB_MAX_LENGTH < RAID_BDEV_MIN_DATA_OFFSET_SIZE,
456 		   "Incorrect min data offset");
457 
/* Completion callback type for raid_bdev_write_superblock(). */
typedef void (*raid_bdev_write_sb_cb)(int status, struct raid_bdev *raid_bdev, void *ctx);

/* Completion callback type for raid_bdev_load_base_bdev_superblock(). */
typedef void (*raid_bdev_load_sb_cb)(const struct raid_bdev_superblock *sb, int status, void *ctx);

/* Initialize the superblock of raid_bdev. */
void raid_bdev_init_superblock(struct raid_bdev *raid_bdev);

/* Write the superblock of raid_bdev; cb/cb_ctx are the completion callback and its context. */
void raid_bdev_write_superblock(struct raid_bdev *raid_bdev, raid_bdev_write_sb_cb cb,
				void *cb_ctx);

/*
 * Load the superblock from a base bdev using the given descriptor and io
 * channel; cb/cb_ctx are the completion callback and its context.
 * NOTE(review): whether cb is still called when a non-zero value is returned
 * is not visible in this header; confirm in the superblock implementation.
 */
int raid_bdev_load_base_bdev_superblock(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
					raid_bdev_load_sb_cb cb, void *cb_ctx);
466 
467 #endif /* SPDK_BDEV_RAID_INTERNAL_H */
468