xref: /spdk/module/bdev/raid/raid5f.c (revision 07d28d02f73bbcd7732a5421bcaebfb067b46ca0)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2022 Intel Corporation.
 *   All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/likely.h"
#include "spdk/log.h"
#include "spdk/accel.h"

/* Maximum concurrent full stripe writes per io channel */
#define RAID5F_MAX_STRIPES 32

struct chunk {
	/* Corresponds to base_bdev index */
	uint8_t index;

	/* Array of iovecs */
	struct iovec *iovs;

	/* Number of used iovecs */
	int iovcnt;

	/* Total number of available iovecs in the array */
	int iovcnt_max;

	/* Pointer to buffer with I/O metadata */
	void *md_buf;

	/* Shallow copy of IO request parameters */
	struct spdk_bdev_ext_io_opts ext_opts;
};

struct stripe_request {
	enum stripe_request_type {
		STRIPE_REQ_WRITE,
	} type;

	struct raid5f_io_channel *r5ch;

	/* The associated raid_bdev_io */
	struct raid_bdev_io *raid_io;

	/* The stripe's index in the raid array. */
	uint64_t stripe_index;

	/* The stripe's parity chunk */
	struct chunk *parity_chunk;

	union {
		struct {
			/* Buffer for stripe parity */
			void *parity_buf;

			/* Buffer for stripe io metadata parity */
			void *parity_md_buf;
		} write;
	};

	/* Array of iovec iterators for each chunk */
	struct spdk_ioviter *chunk_iov_iters;

	/* Array of source buffer pointers for parity calculation */
	void **chunk_xor_buffers;

	/* Array of source buffer pointers for parity calculation of io metadata */
	void **chunk_xor_md_buffers;

	struct {
		size_t len;
		size_t remaining;
		size_t remaining_md;
		int status;
	} xor;

	TAILQ_ENTRY(stripe_request) link;

	/* Array of chunks corresponding to base_bdevs */
	struct chunk chunks[0];
};

struct raid5f_info {
	/* The parent raid bdev */
	struct raid_bdev *raid_bdev;

	/* Number of data blocks in a stripe (without parity) */
	uint64_t stripe_blocks;

	/* Number of stripes on this array */
	uint64_t total_stripes;

	/* Alignment for buffer allocation */
	size_t buf_alignment;
};

struct raid5f_io_channel {
	/* All available stripe requests on this channel */
	TAILQ_HEAD(, stripe_request) free_stripe_requests;

	/* accel_fw channel */
	struct spdk_io_channel *accel_ch;

	/* For retrying xor if accel_ch runs out of resources */
	TAILQ_HEAD(, stripe_request) xor_retry_queue;

	/* For iterating over chunk iovecs during xor calculation */
	void **chunk_xor_buffers;
	struct iovec **chunk_xor_iovs;
	size_t *chunk_xor_iovcnt;
};

#define __CHUNK_IN_RANGE(req, c) \
	c < req->chunks + raid5f_ch_to_r5f_info(req->r5ch)->raid_bdev->num_base_bdevs

#define FOR_EACH_CHUNK_FROM(req, c, from) \
	for (c = from; __CHUNK_IN_RANGE(req, c); c++)

#define FOR_EACH_CHUNK(req, c) \
	FOR_EACH_CHUNK_FROM(req, c, req->chunks)

#define __NEXT_DATA_CHUNK(req, c) \
	c == req->parity_chunk ? c+1 : c

#define FOR_EACH_DATA_CHUNK(req, c) \
	for (c = __NEXT_DATA_CHUNK(req, req->chunks); __CHUNK_IN_RANGE(req, c); \
	     c = __NEXT_DATA_CHUNK(req, c+1))

static inline struct raid5f_info *
raid5f_ch_to_r5f_info(struct raid5f_io_channel *r5ch)
{
	return spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(r5ch));
}

static inline struct stripe_request *
raid5f_chunk_stripe_req(struct chunk *chunk)
{
	return SPDK_CONTAINEROF((chunk - chunk->index), struct stripe_request, chunks);
}

static inline uint8_t
raid5f_stripe_data_chunks_num(const struct raid_bdev *raid_bdev)
{
	return raid_bdev->min_base_bdevs_operational;
}

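/*
 * Parity is rotated across the base bdevs. With RAID5F's constraint of at
 * most one missing base bdev, a stripe has num_base_bdevs - 1 data chunks,
 * so the parity chunk index counts down from the last base bdev towards the
 * first as the stripe index increases.
 */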
static inline uint8_t
raid5f_stripe_parity_chunk_index(const struct raid_bdev *raid_bdev, uint64_t stripe_index)
{
	return raid5f_stripe_data_chunks_num(raid_bdev) - stripe_index % raid_bdev->num_base_bdevs;
}

static inline void
raid5f_stripe_request_release(struct stripe_request *stripe_req)
{
	TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests, stripe_req, link);
}

static void raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req);
static void raid5f_xor_stripe_retry(struct stripe_request *stripe_req);

static void
raid5f_xor_stripe_done(struct stripe_request *stripe_req)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;

	if (stripe_req->xor.status != 0) {
		struct raid_bdev_io *raid_io = stripe_req->raid_io;

		SPDK_ERRLOG("stripe xor failed: %s\n", spdk_strerror(-stripe_req->xor.status));
		raid5f_stripe_request_release(stripe_req);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else {
		raid5f_stripe_request_submit_chunks(stripe_req);
	}

	if (!TAILQ_EMPTY(&r5ch->xor_retry_queue)) {
		stripe_req = TAILQ_FIRST(&r5ch->xor_retry_queue);
		TAILQ_REMOVE(&r5ch->xor_retry_queue, stripe_req, link);
		raid5f_xor_stripe_retry(stripe_req);
	}
}

static void raid5f_xor_stripe_continue(struct stripe_request *stripe_req);

static void
_raid5f_xor_stripe_cb(struct stripe_request *stripe_req, int status)
{
	if (status != 0) {
		stripe_req->xor.status = status;
	}

	if (stripe_req->xor.remaining + stripe_req->xor.remaining_md == 0) {
		raid5f_xor_stripe_done(stripe_req);
	}
}

static void
raid5f_xor_stripe_cb(void *_stripe_req, int status)
{
	struct stripe_request *stripe_req = _stripe_req;

	stripe_req->xor.remaining -= stripe_req->xor.len;

	if (stripe_req->xor.remaining > 0) {
		stripe_req->xor.len = spdk_ioviter_nextv(stripe_req->chunk_iov_iters,
				      stripe_req->r5ch->chunk_xor_buffers);
		raid5f_xor_stripe_continue(stripe_req);
	}

	_raid5f_xor_stripe_cb(stripe_req, status);
}

static void
raid5f_xor_stripe_md_cb(void *_stripe_req, int status)
{
	struct stripe_request *stripe_req = _stripe_req;

	stripe_req->xor.remaining_md = 0;

	_raid5f_xor_stripe_cb(stripe_req, status);
}

static void
raid5f_xor_stripe_continue(struct stripe_request *stripe_req)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
	uint8_t i;
	int ret;

	assert(stripe_req->xor.len > 0);

	for (i = 0; i < n_src; i++) {
		stripe_req->chunk_xor_buffers[i] = r5ch->chunk_xor_buffers[i];
	}

	ret = spdk_accel_submit_xor(r5ch->accel_ch, r5ch->chunk_xor_buffers[n_src],
				    stripe_req->chunk_xor_buffers, n_src, stripe_req->xor.len,
				    raid5f_xor_stripe_cb, stripe_req);
	if (spdk_unlikely(ret)) {
		if (ret == -ENOMEM) {
			TAILQ_INSERT_HEAD(&r5ch->xor_retry_queue, stripe_req, link);
		} else {
			stripe_req->xor.status = ret;
			raid5f_xor_stripe_done(stripe_req);
		}
	}
}

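/*
 * Start the parity calculation for a full stripe write. The data chunk
 * iovecs and the parity chunk iovec are walked in lockstep with an
 * spdk_ioviter and the parity is XORed via the accel framework one
 * contiguous segment at a time. xor.remaining and xor.remaining_md track
 * the outstanding data and metadata bytes; once both reach zero the chunk
 * writes are submitted to the base bdevs.
 */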
static void
raid5f_xor_stripe(struct stripe_request *stripe_req)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	void *raid_md = spdk_bdev_io_get_md_buf(bdev_io);
	uint32_t raid_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct chunk *chunk;
	uint8_t c;

	assert(stripe_req->type == STRIPE_REQ_WRITE);

	c = 0;
	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
		r5ch->chunk_xor_iovs[c] = chunk->iovs;
		r5ch->chunk_xor_iovcnt[c] = chunk->iovcnt;
		c++;
	}
	r5ch->chunk_xor_iovs[c] = stripe_req->parity_chunk->iovs;
	r5ch->chunk_xor_iovcnt[c] = stripe_req->parity_chunk->iovcnt;

	stripe_req->xor.len = spdk_ioviter_firstv(stripe_req->chunk_iov_iters,
			      raid_bdev->num_base_bdevs,
			      r5ch->chunk_xor_iovs,
			      r5ch->chunk_xor_iovcnt,
			      r5ch->chunk_xor_buffers);
	stripe_req->xor.remaining = raid_bdev->strip_size << raid_bdev->blocklen_shift;
	stripe_req->xor.status = 0;

	if (raid_md != NULL) {
		uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
		uint64_t len = raid_bdev->strip_size * raid_md_size;
		int ret;

		stripe_req->xor.remaining_md = len;

		c = 0;
		FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
			stripe_req->chunk_xor_md_buffers[c] = chunk->md_buf;
			c++;
		}

		ret = spdk_accel_submit_xor(stripe_req->r5ch->accel_ch, stripe_req->write.parity_md_buf,
					    stripe_req->chunk_xor_md_buffers, n_src, len,
					    raid5f_xor_stripe_md_cb, stripe_req);
		if (spdk_unlikely(ret)) {
			if (ret == -ENOMEM) {
				TAILQ_INSERT_HEAD(&stripe_req->r5ch->xor_retry_queue, stripe_req, link);
			} else {
				stripe_req->xor.status = ret;
				raid5f_xor_stripe_done(stripe_req);
			}
			return;
		}
	}

	raid5f_xor_stripe_continue(stripe_req);
}

static void
raid5f_xor_stripe_retry(struct stripe_request *stripe_req)
{
	if (stripe_req->xor.remaining_md) {
		raid5f_xor_stripe(stripe_req);
	} else {
		raid5f_xor_stripe_continue(stripe_req);
	}
}

static void
raid5f_stripe_request_chunk_write_complete(struct stripe_request *stripe_req,
		enum spdk_bdev_io_status status)
{
	if (raid_bdev_io_complete_part(stripe_req->raid_io, 1, status)) {
		raid5f_stripe_request_release(stripe_req);
	}
}

static void
raid5f_chunk_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct chunk *chunk = cb_arg;
	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
	enum spdk_bdev_io_status status = success ? SPDK_BDEV_IO_STATUS_SUCCESS :
					  SPDK_BDEV_IO_STATUS_FAILED;

	spdk_bdev_free_io(bdev_io);

	if (spdk_likely(stripe_req->type == STRIPE_REQ_WRITE)) {
		raid5f_stripe_request_chunk_write_complete(stripe_req, status);
	} else {
		assert(false);
	}
}

static void
raid5f_chunk_submit_retry(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;
	struct stripe_request *stripe_req = raid_io->module_private;

	raid5f_stripe_request_submit_chunks(stripe_req);
}

static inline void
raid5f_init_ext_io_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
{
	memset(opts, 0, sizeof(*opts));
	opts->size = sizeof(*opts);
	opts->memory_domain = bdev_io->u.bdev.memory_domain;
	opts->memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
	opts->metadata = bdev_io->u.bdev.md_buf;
}

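/*
 * Submit one chunk-sized write to the base bdev backing the chunk. -ENOMEM
 * is retried through the bdev io_wait mechanism; any other error fails the
 * not-yet-submitted remainder of the stripe request.
 */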
static int
raid5f_chunk_submit(struct chunk *chunk)
{
	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk->index];
	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk->index];
	uint64_t base_offset_blocks = (stripe_req->stripe_index << raid_bdev->strip_size_shift);
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &chunk->ext_opts);
	chunk->ext_opts.metadata = chunk->md_buf;

	switch (stripe_req->type) {
	case STRIPE_REQ_WRITE:
		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt,
						  base_offset_blocks, raid_bdev->strip_size,
						  raid5f_chunk_complete_bdev_io, chunk,
						  &chunk->ext_opts);
		break;
	default:
		assert(false);
		ret = -EINVAL;
		break;
	}

	if (spdk_unlikely(ret)) {
		if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, raid5f_chunk_submit_retry);
		} else {
			/*
			 * Implicitly complete any I/Os not yet submitted as FAILED. If completing
			 * these means there are no more to complete for the stripe request, we can
			 * release the stripe request as well.
			 */
			uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs -
							      raid_io->base_bdev_io_submitted;

			if (raid_bdev_io_complete_part(stripe_req->raid_io, base_bdev_io_not_submitted,
						       SPDK_BDEV_IO_STATUS_FAILED)) {
				raid5f_stripe_request_release(stripe_req);
			}
		}
	}

	return ret;
}

static int
raid5f_chunk_set_iovcnt(struct chunk *chunk, int iovcnt)
{
	if (iovcnt > chunk->iovcnt_max) {
		struct iovec *iovs = chunk->iovs;

		iovs = realloc(iovs, iovcnt * sizeof(*iovs));
		if (!iovs) {
			return -ENOMEM;
		}
		chunk->iovs = iovs;
		chunk->iovcnt_max = iovcnt;
	}
	chunk->iovcnt = iovcnt;

	return 0;
}

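/*
 * Split the parent bdev_io's iovecs into per data chunk iovec arrays, one
 * strip worth of data (and metadata) per chunk, and point the parity
 * chunk's single iovec at the preallocated parity buffer.
 */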
static int
raid5f_stripe_request_map_iovecs(struct stripe_request *stripe_req)
{
	struct raid_bdev *raid_bdev = stripe_req->raid_io->raid_bdev;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(stripe_req->raid_io);
	const struct iovec *raid_io_iovs = bdev_io->u.bdev.iovs;
	int raid_io_iovcnt = bdev_io->u.bdev.iovcnt;
	void *raid_io_md = spdk_bdev_io_get_md_buf(bdev_io);
	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct chunk *chunk;
	int raid_io_iov_idx = 0;
	size_t raid_io_offset = 0;
	size_t raid_io_iov_offset = 0;
	int i;

	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
		int chunk_iovcnt = 0;
		uint64_t len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
		size_t off = raid_io_iov_offset;
		int ret;

		for (i = raid_io_iov_idx; i < raid_io_iovcnt; i++) {
			chunk_iovcnt++;
			off += raid_io_iovs[i].iov_len;
			if (off >= raid_io_offset + len) {
				break;
			}
		}

		assert(raid_io_iov_idx + chunk_iovcnt <= raid_io_iovcnt);

		ret = raid5f_chunk_set_iovcnt(chunk, chunk_iovcnt);
		if (ret) {
			return ret;
		}

		if (raid_io_md) {
			chunk->md_buf = raid_io_md +
					(raid_io_offset >> raid_bdev->blocklen_shift) * raid_io_md_size;
		}

		for (i = 0; i < chunk_iovcnt; i++) {
			struct iovec *chunk_iov = &chunk->iovs[i];
			const struct iovec *raid_io_iov = &raid_io_iovs[raid_io_iov_idx];
			size_t chunk_iov_offset = raid_io_offset - raid_io_iov_offset;

			chunk_iov->iov_base = raid_io_iov->iov_base + chunk_iov_offset;
			chunk_iov->iov_len = spdk_min(len, raid_io_iov->iov_len - chunk_iov_offset);
			raid_io_offset += chunk_iov->iov_len;
			len -= chunk_iov->iov_len;

			if (raid_io_offset >= raid_io_iov_offset + raid_io_iov->iov_len) {
				raid_io_iov_idx++;
				raid_io_iov_offset += raid_io_iov->iov_len;
			}
		}

		if (spdk_unlikely(len > 0)) {
			return -EINVAL;
		}
	}

	stripe_req->parity_chunk->iovs[0].iov_base = stripe_req->write.parity_buf;
	stripe_req->parity_chunk->iovs[0].iov_len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
	stripe_req->parity_chunk->iovcnt = 1;
	stripe_req->parity_chunk->md_buf = stripe_req->write.parity_md_buf;

	return 0;
}

static void
raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct chunk *start = &stripe_req->chunks[raid_io->base_bdev_io_submitted];
	struct chunk *chunk;

	FOR_EACH_CHUNK_FROM(stripe_req, chunk, start) {
		if (spdk_unlikely(raid5f_chunk_submit(chunk) != 0)) {
			break;
		}
		raid_io->base_bdev_io_submitted++;
	}
}

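/*
 * Full stripe write: take a preallocated stripe request from the channel's
 * free list, map the parent I/O onto the data chunks, compute the parity
 * and then write one chunk to each base bdev. Returns -ENOMEM if no stripe
 * request is currently available.
 */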
static int
raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel);
	struct stripe_request *stripe_req;
	int ret;

	stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests);
	if (!stripe_req) {
		return -ENOMEM;
	}

	stripe_req->stripe_index = stripe_index;
	stripe_req->parity_chunk = stripe_req->chunks + raid5f_stripe_parity_chunk_index(raid_bdev,
				   stripe_req->stripe_index);
	stripe_req->raid_io = raid_io;

	ret = raid5f_stripe_request_map_iovecs(stripe_req);
	if (spdk_unlikely(ret)) {
		return ret;
	}

	TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link);

	raid_io->module_private = stripe_req;
	raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;

	raid5f_xor_stripe(stripe_req);

	return 0;
}

static void
raid5f_chunk_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	raid_bdev_io_complete(raid_io, success ? SPDK_BDEV_IO_STATUS_SUCCESS :
			      SPDK_BDEV_IO_STATUS_FAILED);
}

static void raid5f_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid5f_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid5f_submit_rw_request(raid_io);
}

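/*
 * Reads never span chunks - the raid bdev is split on the strip_size I/O
 * boundary, so each read maps to a single data chunk and is forwarded to
 * the corresponding base bdev, skipping over the parity chunk.
 */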
static int
raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index,
			   uint64_t stripe_offset)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint8_t chunk_data_idx = stripe_offset >> raid_bdev->strip_size_shift;
	uint8_t p_idx = raid5f_stripe_parity_chunk_index(raid_bdev, stripe_index);
	uint8_t chunk_idx = chunk_data_idx < p_idx ? chunk_data_idx : chunk_data_idx + 1;
	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk_idx];
	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk_idx];
	uint64_t chunk_offset = stripe_offset - (chunk_data_idx << raid_bdev->strip_size_shift);
	uint64_t base_offset_blocks = (stripe_index << raid_bdev->strip_size_shift) + chunk_offset;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct spdk_bdev_ext_io_opts io_opts;
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &io_opts);
	ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs,
					 bdev_io->u.bdev.iovcnt,
					 base_offset_blocks, bdev_io->u.bdev.num_blocks, raid5f_chunk_read_complete, raid_io,
					 &io_opts);

	if (spdk_unlikely(ret == -ENOMEM)) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid5f_submit_rw_request);
		return 0;
	}

	return ret;
}

static void
raid5f_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_info *r5f_info = raid_bdev->module_private;
	uint64_t offset_blocks = bdev_io->u.bdev.offset_blocks;
	uint64_t stripe_index = offset_blocks / r5f_info->stripe_blocks;
	uint64_t stripe_offset = offset_blocks % r5f_info->stripe_blocks;
	int ret;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		assert(bdev_io->u.bdev.num_blocks <= raid_bdev->strip_size);
		ret = raid5f_submit_read_request(raid_io, stripe_index, stripe_offset);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		assert(stripe_offset == 0);
		assert(bdev_io->u.bdev.num_blocks == r5f_info->stripe_blocks);
		ret = raid5f_submit_write_request(raid_io, stripe_index);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (spdk_unlikely(ret)) {
		raid_bdev_io_complete(raid_io, ret == -ENOMEM ? SPDK_BDEV_IO_STATUS_NOMEM :
				      SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid5f_stripe_request_free(struct stripe_request *stripe_req)
{
	struct chunk *chunk;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		free(chunk->iovs);
	}

	if (stripe_req->type == STRIPE_REQ_WRITE) {
		spdk_dma_free(stripe_req->write.parity_buf);
		spdk_dma_free(stripe_req->write.parity_md_buf);
	} else {
		assert(false);
	}

	free(stripe_req->chunk_xor_buffers);
	free(stripe_req->chunk_xor_md_buffers);
	free(stripe_req->chunk_iov_iters);

	free(stripe_req);
}

static struct stripe_request *
raid5f_stripe_request_alloc(struct raid5f_io_channel *r5ch, enum stripe_request_type type)
{
	struct raid5f_info *r5f_info = raid5f_ch_to_r5f_info(r5ch);
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct stripe_request *stripe_req;
	struct chunk *chunk;

	stripe_req = calloc(1, sizeof(*stripe_req) + sizeof(*chunk) * raid_bdev->num_base_bdevs);
	if (!stripe_req) {
		return NULL;
	}

	stripe_req->r5ch = r5ch;
	stripe_req->type = type;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		chunk->index = chunk - stripe_req->chunks;
		chunk->iovcnt_max = 4;
		chunk->iovs = calloc(chunk->iovcnt_max, sizeof(chunk->iovs[0]));
		if (!chunk->iovs) {
			goto err;
		}
	}

	if (type == STRIPE_REQ_WRITE) {
		stripe_req->write.parity_buf = spdk_dma_malloc(raid_bdev->strip_size << raid_bdev->blocklen_shift,
					       r5f_info->buf_alignment, NULL);
		if (!stripe_req->write.parity_buf) {
			goto err;
		}

		if (raid_io_md_size != 0) {
			stripe_req->write.parity_md_buf = spdk_dma_malloc(raid_bdev->strip_size * raid_io_md_size,
							  r5f_info->buf_alignment, NULL);
			if (!stripe_req->write.parity_md_buf) {
				goto err;
			}
		}
	} else {
		assert(false);
		return NULL;
	}

	stripe_req->chunk_iov_iters = malloc(SPDK_IOVITER_SIZE(raid_bdev->num_base_bdevs));
	if (!stripe_req->chunk_iov_iters) {
		goto err;
	}

	stripe_req->chunk_xor_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
					       sizeof(stripe_req->chunk_xor_buffers[0]));
	if (!stripe_req->chunk_xor_buffers) {
		goto err;
	}

	stripe_req->chunk_xor_md_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
					   sizeof(stripe_req->chunk_xor_md_buffers[0]));
	if (!stripe_req->chunk_xor_md_buffers) {
		goto err;
	}

	return stripe_req;
err:
	raid5f_stripe_request_free(stripe_req);
	return NULL;
}

static void
raid5f_ioch_destroy(void *io_device, void *ctx_buf)
{
	struct raid5f_io_channel *r5ch = ctx_buf;
	struct stripe_request *stripe_req;

	assert(TAILQ_EMPTY(&r5ch->xor_retry_queue));

	while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests))) {
		TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link);
		raid5f_stripe_request_free(stripe_req);
	}

	if (r5ch->accel_ch) {
		spdk_put_io_channel(r5ch->accel_ch);
	}

	free(r5ch->chunk_xor_buffers);
	free(r5ch->chunk_xor_iovs);
	free(r5ch->chunk_xor_iovcnt);
}

static int
raid5f_ioch_create(void *io_device, void *ctx_buf)
{
	struct raid5f_io_channel *r5ch = ctx_buf;
	struct raid5f_info *r5f_info = io_device;
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	int i;

	TAILQ_INIT(&r5ch->free_stripe_requests);
	TAILQ_INIT(&r5ch->xor_retry_queue);

	for (i = 0; i < RAID5F_MAX_STRIPES; i++) {
		struct stripe_request *stripe_req;

		stripe_req = raid5f_stripe_request_alloc(r5ch, STRIPE_REQ_WRITE);
		if (!stripe_req) {
			goto err;
		}

		TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests, stripe_req, link);
	}

	r5ch->accel_ch = spdk_accel_get_io_channel();
	if (!r5ch->accel_ch) {
		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
		goto err;
	}

	r5ch->chunk_xor_buffers = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_buffers));
	if (!r5ch->chunk_xor_buffers) {
		goto err;
	}

	r5ch->chunk_xor_iovs = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_iovs));
	if (!r5ch->chunk_xor_iovs) {
		goto err;
	}

	r5ch->chunk_xor_iovcnt = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_iovcnt));
	if (!r5ch->chunk_xor_iovcnt) {
		goto err;
	}

	return 0;
err:
	SPDK_ERRLOG("Failed to initialize io channel\n");
	raid5f_ioch_destroy(r5f_info, r5ch);
	return -ENOMEM;
}

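/*
 * Set up the RAID5F geometry: each stripe spans all base bdevs with
 * strip_size blocks per chunk, one chunk of which holds parity, so the
 * exposed bdev provides strip_size * (num_base_bdevs - 1) data blocks per
 * stripe. write_unit_size and split_on_write_unit force all writes to be
 * full stripe writes.
 */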
static int
raid5f_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;
	struct raid5f_info *r5f_info;
	size_t alignment = 0;

	r5f_info = calloc(1, sizeof(*r5f_info));
	if (!r5f_info) {
		SPDK_ERRLOG("Failed to allocate r5f_info\n");
		return -ENOMEM;
	}
	r5f_info->raid_bdev = raid_bdev;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev;

		base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);
		min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt);
		alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_bdev));
	}

	r5f_info->total_stripes = min_blockcnt / raid_bdev->strip_size;
	r5f_info->stripe_blocks = raid_bdev->strip_size * raid5f_stripe_data_chunks_num(raid_bdev);
	r5f_info->buf_alignment = alignment;

	raid_bdev->bdev.blockcnt = r5f_info->stripe_blocks * r5f_info->total_stripes;
	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;
	raid_bdev->bdev.write_unit_size = r5f_info->stripe_blocks;
	raid_bdev->bdev.split_on_write_unit = true;

	raid_bdev->module_private = r5f_info;

	spdk_io_device_register(r5f_info, raid5f_ioch_create, raid5f_ioch_destroy,
				sizeof(struct raid5f_io_channel), NULL);

	return 0;
}

static void
raid5f_io_device_unregister_done(void *io_device)
{
	struct raid5f_info *r5f_info = io_device;

	raid_bdev_module_stop_done(r5f_info->raid_bdev);

	free(r5f_info);
}

static bool
raid5f_stop(struct raid_bdev *raid_bdev)
{
	struct raid5f_info *r5f_info = raid_bdev->module_private;

	spdk_io_device_unregister(r5f_info, raid5f_io_device_unregister_done);

	return false;
}

static struct spdk_io_channel *
raid5f_get_io_channel(struct raid_bdev *raid_bdev)
{
	struct raid5f_info *r5f_info = raid_bdev->module_private;

	return spdk_get_io_channel(r5f_info);
}

static struct raid_bdev_module g_raid5f_module = {
	.level = RAID5F,
	.base_bdevs_min = 3,
	.base_bdevs_constraint = {CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 1},
	.start = raid5f_start,
	.stop = raid5f_stop,
	.submit_rw_request = raid5f_submit_rw_request,
	.get_io_channel = raid5f_get_io_channel,
};
RAID_MODULE_REGISTER(&g_raid5f_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid5f)