xref: /spdk/module/bdev/raid/raid5f.c (revision 95a367d64eadbe63e59259f0a9f30e525c345140)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2022 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "bdev_raid.h"
7 
8 #include "spdk/env.h"
9 #include "spdk/thread.h"
10 #include "spdk/string.h"
11 #include "spdk/util.h"
12 #include "spdk/likely.h"
13 #include "spdk/log.h"
14 #include "spdk/accel.h"
15 
16 /* Maximum concurrent full stripe writes per io channel */
17 #define RAID5F_MAX_STRIPES 32
18 
19 struct chunk {
20 	/* Corresponds to base_bdev index */
21 	uint8_t index;
22 
23 	/* Array of iovecs */
24 	struct iovec *iovs;
25 
26 	/* Number of used iovecs */
27 	int iovcnt;
28 
29 	/* Total number of available iovecs in the array */
30 	int iovcnt_max;
31 
32 	/* Pointer to buffer with I/O metadata */
33 	void *md_buf;
34 
35 	/* Shallow copy of IO request parameters */
36 	struct spdk_bdev_ext_io_opts ext_opts;
37 };
38 
39 struct stripe_request {
40 	struct raid5f_io_channel *r5ch;
41 
42 	/* The associated raid_bdev_io */
43 	struct raid_bdev_io *raid_io;
44 
45 	/* The stripe's index in the raid array. */
46 	uint64_t stripe_index;
47 
48 	/* The stripe's parity chunk */
49 	struct chunk *parity_chunk;
50 
51 	/* Buffer for stripe parity */
52 	void *parity_buf;
53 
54 	/* Buffer for stripe io metadata parity */
55 	void *parity_md_buf;
56 
57 	/* Array of iovec iterators for each data chunk */
58 	struct iov_iter {
59 		struct iovec *iovs;
60 		int iovcnt;
61 		int index;
62 		size_t offset;
63 	} *chunk_iov_iters;
64 
65 	/* Array of source buffer pointers for parity calculation */
66 	void **chunk_xor_buffers;
67 
68 	/* Array of source buffer pointers for parity calculation of io metadata */
69 	void **chunk_xor_md_buffers;
70 
71 	struct {
72 		void *dest;
73 		size_t len;
74 		size_t remaining;
75 		size_t remaining_md;
76 		int status;
77 	} xor;
78 
79 	TAILQ_ENTRY(stripe_request) link;
80 
81 	/* Array of chunks corresponding to base_bdevs */
82 	struct chunk chunks[0];
83 };
84 
85 struct raid5f_info {
86 	/* The parent raid bdev */
87 	struct raid_bdev *raid_bdev;
88 
89 	/* Number of data blocks in a stripe (without parity) */
90 	uint64_t stripe_blocks;
91 
92 	/* Number of stripes on this array */
93 	uint64_t total_stripes;
94 
95 	/* Alignment for buffer allocation */
96 	size_t buf_alignment;
97 };
98 
99 struct raid5f_io_channel {
100 	/* All available stripe requests on this channel */
101 	TAILQ_HEAD(, stripe_request) free_stripe_requests;
102 
103 	/* accel_fw channel */
104 	struct spdk_io_channel *accel_ch;
105 
106 	/* For retrying xor if accel_ch runs out of resources */
107 	TAILQ_HEAD(, stripe_request) xor_retry_queue;
108 };
109 
110 #define __CHUNK_IN_RANGE(req, c) \
111 	c < req->chunks + raid5f_ch_to_r5f_info(req->r5ch)->raid_bdev->num_base_bdevs
112 
113 #define FOR_EACH_CHUNK_FROM(req, c, from) \
114 	for (c = from; __CHUNK_IN_RANGE(req, c); c++)
115 
116 #define FOR_EACH_CHUNK(req, c) \
117 	FOR_EACH_CHUNK_FROM(req, c, req->chunks)
118 
119 #define __NEXT_DATA_CHUNK(req, c) \
120 	c == req->parity_chunk ? c+1 : c
121 
122 #define FOR_EACH_DATA_CHUNK(req, c) \
123 	for (c = __NEXT_DATA_CHUNK(req, req->chunks); __CHUNK_IN_RANGE(req, c); \
124 	     c = __NEXT_DATA_CHUNK(req, c+1))
125 
126 static inline struct raid5f_info *
127 raid5f_ch_to_r5f_info(struct raid5f_io_channel *r5ch)
128 {
129 	return spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(r5ch));
130 }
131 
132 static inline struct stripe_request *
133 raid5f_chunk_stripe_req(struct chunk *chunk)
134 {
135 	return SPDK_CONTAINEROF((chunk - chunk->index), struct stripe_request, chunks);
136 }
137 
138 static inline uint8_t
139 raid5f_stripe_data_chunks_num(const struct raid_bdev *raid_bdev)
140 {
141 	return raid_bdev->min_base_bdevs_operational;
142 }
143 
144 static inline uint8_t
145 raid5f_stripe_parity_chunk_index(const struct raid_bdev *raid_bdev, uint64_t stripe_index)
146 {
147 	return raid5f_stripe_data_chunks_num(raid_bdev) - stripe_index % raid_bdev->num_base_bdevs;
148 }
149 
150 static inline void
151 raid5f_stripe_request_release(struct stripe_request *stripe_req)
152 {
153 	TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests, stripe_req, link);
154 }
155 
156 static void raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req);
157 static void raid5f_xor_stripe_retry(struct stripe_request *stripe_req);
158 
159 static void
160 raid5f_xor_stripe_done(struct stripe_request *stripe_req)
161 {
162 	if (stripe_req->xor.status != 0) {
163 		SPDK_ERRLOG("stripe xor failed: %s\n", spdk_strerror(-stripe_req->xor.status));
164 		raid_bdev_io_complete(stripe_req->raid_io, SPDK_BDEV_IO_STATUS_FAILED);
165 	} else {
166 		raid5f_stripe_request_submit_chunks(stripe_req);
167 	}
168 
169 	if (!TAILQ_EMPTY(&stripe_req->r5ch->xor_retry_queue)) {
170 		stripe_req = TAILQ_FIRST(&stripe_req->r5ch->xor_retry_queue);
171 		TAILQ_REMOVE(&stripe_req->r5ch->xor_retry_queue, stripe_req, link);
172 		raid5f_xor_stripe_retry(stripe_req);
173 	}
174 }
175 
176 static void raid5f_xor_stripe_continue(struct stripe_request *stripe_req);
177 
178 static void
179 _raid5f_xor_stripe_cb(struct stripe_request *stripe_req, int status)
180 {
181 	if (status != 0) {
182 		stripe_req->xor.status = status;
183 	}
184 
185 	if (stripe_req->xor.remaining + stripe_req->xor.remaining_md == 0) {
186 		raid5f_xor_stripe_done(stripe_req);
187 	}
188 }
189 
190 static void
191 raid5f_xor_stripe_cb(void *_stripe_req, int status)
192 {
193 	struct stripe_request *stripe_req = _stripe_req;
194 	size_t len = stripe_req->xor.len;
195 
196 	stripe_req->xor.remaining -= len;
197 
198 	if (stripe_req->xor.remaining > 0) {
199 		struct raid_bdev_io *raid_io = stripe_req->raid_io;
200 		struct raid_bdev *raid_bdev = raid_io->raid_bdev;
201 		uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
202 		uint8_t i;
203 
204 		for (i = 0; i < n_src; i++) {
205 			struct iov_iter *iov_iter = &stripe_req->chunk_iov_iters[i];
206 			struct iovec *iov = &iov_iter->iovs[iov_iter->index];
207 
208 			iov_iter->offset += len;
209 			if (iov_iter->offset == iov->iov_len) {
210 				iov_iter->offset = 0;
211 				iov_iter->index++;
212 			}
213 		}
214 
215 		stripe_req->xor.dest += len;
216 
217 		raid5f_xor_stripe_continue(stripe_req);
218 	}
219 
220 	_raid5f_xor_stripe_cb(stripe_req, status);
221 }
222 
223 static void
224 raid5f_xor_stripe_md_cb(void *_stripe_req, int status)
225 {
226 	struct stripe_request *stripe_req = _stripe_req;
227 
228 	stripe_req->xor.remaining_md = 0;
229 
230 	_raid5f_xor_stripe_cb(stripe_req, status);
231 }
232 
233 static void
234 raid5f_xor_stripe_continue(struct stripe_request *stripe_req)
235 {
236 	struct raid_bdev_io *raid_io = stripe_req->raid_io;
237 	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
238 	uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
239 	size_t len = stripe_req->xor.remaining;
240 	uint8_t i;
241 	int ret;
242 
243 	assert(stripe_req->xor.remaining > 0);
244 
245 	for (i = 0; i < n_src; i++) {
246 		struct iov_iter *iov_iter = &stripe_req->chunk_iov_iters[i];
247 		struct iovec *iov = &iov_iter->iovs[iov_iter->index];
248 
249 		len = spdk_min(len, iov->iov_len - iov_iter->offset);
250 		stripe_req->chunk_xor_buffers[i] = iov->iov_base + iov_iter->offset;
251 	}
252 
253 	assert(len > 0);
254 	stripe_req->xor.len = len;
255 
256 	ret = spdk_accel_submit_xor(stripe_req->r5ch->accel_ch, stripe_req->xor.dest,
257 				    stripe_req->chunk_xor_buffers, n_src, len,
258 				    raid5f_xor_stripe_cb, stripe_req);
259 	if (spdk_unlikely(ret)) {
260 		if (ret == -ENOMEM) {
261 			TAILQ_INSERT_HEAD(&stripe_req->r5ch->xor_retry_queue, stripe_req, link);
262 		} else {
263 			stripe_req->xor.status = ret;
264 			raid5f_xor_stripe_done(stripe_req);
265 		}
266 		return;
267 	}
268 }
269 
270 static void
271 raid5f_xor_stripe(struct stripe_request *stripe_req)
272 {
273 	struct raid_bdev_io *raid_io = stripe_req->raid_io;
274 	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
275 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
276 	void *raid_md = spdk_bdev_io_get_md_buf(bdev_io);
277 	uint32_t raid_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
278 	struct chunk *chunk;
279 	uint8_t c;
280 
281 	c = 0;
282 	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
283 		struct iov_iter *iov_iter = &stripe_req->chunk_iov_iters[c++];
284 
285 		iov_iter->iovs = chunk->iovs;
286 		iov_iter->iovcnt = chunk->iovcnt;
287 		iov_iter->index = 0;
288 		iov_iter->offset = 0;
289 	}
290 
291 	stripe_req->xor.dest = stripe_req->parity_buf;
292 	stripe_req->xor.remaining = raid_bdev->strip_size << raid_bdev->blocklen_shift;
293 	stripe_req->xor.status = 0;
294 
295 	if (raid_md != NULL) {
296 		uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
297 		uint64_t len = raid_bdev->strip_size * raid_md_size;
298 		int ret;
299 
300 		stripe_req->xor.remaining_md = len;
301 
302 		c = 0;
303 		FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
304 			stripe_req->chunk_xor_md_buffers[c] = chunk->md_buf;
305 			c++;
306 		}
307 
308 		ret = spdk_accel_submit_xor(stripe_req->r5ch->accel_ch, stripe_req->parity_md_buf,
309 					    stripe_req->chunk_xor_md_buffers, n_src, len,
310 					    raid5f_xor_stripe_md_cb, stripe_req);
311 		if (spdk_unlikely(ret)) {
312 			if (ret == -ENOMEM) {
313 				TAILQ_INSERT_HEAD(&stripe_req->r5ch->xor_retry_queue, stripe_req, link);
314 			} else {
315 				stripe_req->xor.status = ret;
316 				raid5f_xor_stripe_done(stripe_req);
317 			}
318 			return;
319 		}
320 	}
321 
322 	raid5f_xor_stripe_continue(stripe_req);
323 }
324 
325 static void
326 raid5f_xor_stripe_retry(struct stripe_request *stripe_req)
327 {
328 	if (stripe_req->xor.remaining_md) {
329 		raid5f_xor_stripe(stripe_req);
330 	} else {
331 		raid5f_xor_stripe_continue(stripe_req);
332 	}
333 }
334 
335 static void
336 raid5f_chunk_write_complete(struct chunk *chunk, enum spdk_bdev_io_status status)
337 {
338 	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
339 
340 	if (raid_bdev_io_complete_part(stripe_req->raid_io, 1, status)) {
341 		raid5f_stripe_request_release(stripe_req);
342 	}
343 }
344 
345 static void
346 raid5f_chunk_write_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
347 {
348 	struct chunk *chunk = cb_arg;
349 
350 	spdk_bdev_free_io(bdev_io);
351 
352 	raid5f_chunk_write_complete(chunk, success ? SPDK_BDEV_IO_STATUS_SUCCESS :
353 				    SPDK_BDEV_IO_STATUS_FAILED);
354 }
355 
356 static void
357 raid5f_chunk_write_retry(void *_raid_io)
358 {
359 	struct raid_bdev_io *raid_io = _raid_io;
360 	struct stripe_request *stripe_req = raid_io->module_private;
361 
362 	raid5f_stripe_request_submit_chunks(stripe_req);
363 }
364 
365 static inline void
366 raid5f_init_ext_io_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
367 {
368 	memset(opts, 0, sizeof(*opts));
369 	opts->size = sizeof(*opts);
370 	opts->memory_domain = bdev_io->u.bdev.memory_domain;
371 	opts->memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
372 	opts->metadata = bdev_io->u.bdev.md_buf;
373 }
374 
375 static int
376 raid5f_chunk_write(struct chunk *chunk)
377 {
378 	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
379 	struct raid_bdev_io *raid_io = stripe_req->raid_io;
380 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
381 	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
382 	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk->index];
383 	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk->index];
384 	uint64_t base_offset_blocks = (stripe_req->stripe_index << raid_bdev->strip_size_shift);
385 	int ret;
386 
387 	raid5f_init_ext_io_opts(bdev_io, &chunk->ext_opts);
388 	chunk->ext_opts.metadata = chunk->md_buf;
389 
390 	ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt,
391 					  base_offset_blocks, raid_bdev->strip_size, raid5f_chunk_write_complete_bdev_io,
392 					  chunk, &chunk->ext_opts);
393 
394 	if (spdk_unlikely(ret)) {
395 		if (ret == -ENOMEM) {
396 			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
397 						raid5f_chunk_write_retry);
398 		} else {
399 			/*
400 			 * Implicitly complete any I/Os not yet submitted as FAILED. If completing
401 			 * these means there are no more to complete for the stripe request, we can
402 			 * release the stripe request as well.
403 			 */
404 			uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs -
405 							      raid_io->base_bdev_io_submitted;
406 
407 			if (raid_bdev_io_complete_part(stripe_req->raid_io, base_bdev_io_not_submitted,
408 						       SPDK_BDEV_IO_STATUS_FAILED)) {
409 				raid5f_stripe_request_release(stripe_req);
410 			}
411 		}
412 	}
413 
414 	return ret;
415 }
416 
417 static int
418 raid5f_stripe_request_map_iovecs(struct stripe_request *stripe_req)
419 {
420 	struct raid_bdev *raid_bdev = stripe_req->raid_io->raid_bdev;
421 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(stripe_req->raid_io);
422 	const struct iovec *raid_io_iovs = bdev_io->u.bdev.iovs;
423 	int raid_io_iovcnt = bdev_io->u.bdev.iovcnt;
424 	void *raid_io_md = spdk_bdev_io_get_md_buf(bdev_io);
425 	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
426 	struct chunk *chunk;
427 	int raid_io_iov_idx = 0;
428 	size_t raid_io_offset = 0;
429 	size_t raid_io_iov_offset = 0;
430 	int i;
431 
432 	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
433 		int chunk_iovcnt = 0;
434 		uint64_t len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
435 		size_t off = raid_io_iov_offset;
436 
437 		for (i = raid_io_iov_idx; i < raid_io_iovcnt; i++) {
438 			chunk_iovcnt++;
439 			off += raid_io_iovs[i].iov_len;
440 			if (off >= raid_io_offset + len) {
441 				break;
442 			}
443 		}
444 
445 		assert(raid_io_iov_idx + chunk_iovcnt <= raid_io_iovcnt);
446 
447 		if (chunk_iovcnt > chunk->iovcnt_max) {
448 			struct iovec *iovs = chunk->iovs;
449 
450 			iovs = realloc(iovs, chunk_iovcnt * sizeof(*iovs));
451 			if (!iovs) {
452 				return -ENOMEM;
453 			}
454 			chunk->iovs = iovs;
455 			chunk->iovcnt_max = chunk_iovcnt;
456 		}
457 		chunk->iovcnt = chunk_iovcnt;
458 
459 		if (raid_io_md) {
460 			chunk->md_buf = raid_io_md +
461 					(raid_io_offset >> raid_bdev->blocklen_shift) * raid_io_md_size;
462 		}
463 
464 		for (i = 0; i < chunk_iovcnt; i++) {
465 			struct iovec *chunk_iov = &chunk->iovs[i];
466 			const struct iovec *raid_io_iov = &raid_io_iovs[raid_io_iov_idx];
467 			size_t chunk_iov_offset = raid_io_offset - raid_io_iov_offset;
468 
469 			chunk_iov->iov_base = raid_io_iov->iov_base + chunk_iov_offset;
470 			chunk_iov->iov_len = spdk_min(len, raid_io_iov->iov_len - chunk_iov_offset);
471 			raid_io_offset += chunk_iov->iov_len;
472 			len -= chunk_iov->iov_len;
473 
474 			if (raid_io_offset >= raid_io_iov_offset + raid_io_iov->iov_len) {
475 				raid_io_iov_idx++;
476 				raid_io_iov_offset += raid_io_iov->iov_len;
477 			}
478 		}
479 
480 		if (spdk_unlikely(len > 0)) {
481 			return -EINVAL;
482 		}
483 	}
484 
485 	stripe_req->parity_chunk->iovs[0].iov_base = stripe_req->parity_buf;
486 	stripe_req->parity_chunk->iovs[0].iov_len = raid_bdev->strip_size <<
487 			raid_bdev->blocklen_shift;
488 	stripe_req->parity_chunk->md_buf = stripe_req->parity_md_buf;
489 	stripe_req->parity_chunk->iovcnt = 1;
490 
491 	return 0;
492 }
493 
494 static void
495 raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req)
496 {
497 	struct raid_bdev_io *raid_io = stripe_req->raid_io;
498 	struct chunk *start = &stripe_req->chunks[raid_io->base_bdev_io_submitted];
499 	struct chunk *chunk;
500 
501 	FOR_EACH_CHUNK_FROM(stripe_req, chunk, start) {
502 		if (spdk_unlikely(raid5f_chunk_write(chunk) != 0)) {
503 			break;
504 		}
505 		raid_io->base_bdev_io_submitted++;
506 	}
507 }
508 
509 static int
510 raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index)
511 {
512 	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
513 	struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel);
514 	struct stripe_request *stripe_req;
515 	int ret;
516 
517 	stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests);
518 	if (!stripe_req) {
519 		return -ENOMEM;
520 	}
521 
522 	stripe_req->stripe_index = stripe_index;
523 	stripe_req->parity_chunk = stripe_req->chunks + raid5f_stripe_parity_chunk_index(raid_bdev,
524 				   stripe_req->stripe_index);
525 	stripe_req->raid_io = raid_io;
526 
527 	ret = raid5f_stripe_request_map_iovecs(stripe_req);
528 	if (spdk_unlikely(ret)) {
529 		return ret;
530 	}
531 
532 	TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link);
533 
534 	raid_io->module_private = stripe_req;
535 	raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;
536 
537 	raid5f_xor_stripe(stripe_req);
538 
539 	return 0;
540 }
541 
542 static void
543 raid5f_chunk_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
544 {
545 	struct raid_bdev_io *raid_io = cb_arg;
546 
547 	spdk_bdev_free_io(bdev_io);
548 
549 	raid_bdev_io_complete(raid_io, success ? SPDK_BDEV_IO_STATUS_SUCCESS :
550 			      SPDK_BDEV_IO_STATUS_FAILED);
551 }
552 
553 static void raid5f_submit_rw_request(struct raid_bdev_io *raid_io);
554 
555 static void
556 _raid5f_submit_rw_request(void *_raid_io)
557 {
558 	struct raid_bdev_io *raid_io = _raid_io;
559 
560 	raid5f_submit_rw_request(raid_io);
561 }
562 
563 static int
564 raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index,
565 			   uint64_t stripe_offset)
566 {
567 	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
568 	uint8_t chunk_data_idx = stripe_offset >> raid_bdev->strip_size_shift;
569 	uint8_t p_idx = raid5f_stripe_parity_chunk_index(raid_bdev, stripe_index);
570 	uint8_t chunk_idx = chunk_data_idx < p_idx ? chunk_data_idx : chunk_data_idx + 1;
571 	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk_idx];
572 	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk_idx];
573 	uint64_t chunk_offset = stripe_offset - (chunk_data_idx << raid_bdev->strip_size_shift);
574 	uint64_t base_offset_blocks = (stripe_index << raid_bdev->strip_size_shift) + chunk_offset;
575 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
576 	struct spdk_bdev_ext_io_opts io_opts;
577 	int ret;
578 
579 	raid5f_init_ext_io_opts(bdev_io, &io_opts);
580 	ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs,
581 					 bdev_io->u.bdev.iovcnt,
582 					 base_offset_blocks, bdev_io->u.bdev.num_blocks, raid5f_chunk_read_complete, raid_io,
583 					 &io_opts);
584 
585 	if (spdk_unlikely(ret == -ENOMEM)) {
586 		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
587 					_raid5f_submit_rw_request);
588 		return 0;
589 	}
590 
591 	return ret;
592 }
593 
594 static void
595 raid5f_submit_rw_request(struct raid_bdev_io *raid_io)
596 {
597 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
598 	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
599 	struct raid5f_info *r5f_info = raid_bdev->module_private;
600 	uint64_t offset_blocks = bdev_io->u.bdev.offset_blocks;
601 	uint64_t stripe_index = offset_blocks / r5f_info->stripe_blocks;
602 	uint64_t stripe_offset = offset_blocks % r5f_info->stripe_blocks;
603 	int ret;
604 
605 	switch (bdev_io->type) {
606 	case SPDK_BDEV_IO_TYPE_READ:
607 		assert(bdev_io->u.bdev.num_blocks <= raid_bdev->strip_size);
608 		ret = raid5f_submit_read_request(raid_io, stripe_index, stripe_offset);
609 		break;
610 	case SPDK_BDEV_IO_TYPE_WRITE:
611 		assert(stripe_offset == 0);
612 		assert(bdev_io->u.bdev.num_blocks == r5f_info->stripe_blocks);
613 		ret = raid5f_submit_write_request(raid_io, stripe_index);
614 		break;
615 	default:
616 		ret = -EINVAL;
617 		break;
618 	}
619 
620 	if (spdk_unlikely(ret)) {
621 		raid_bdev_io_complete(raid_io, ret == -ENOMEM ? SPDK_BDEV_IO_STATUS_NOMEM :
622 				      SPDK_BDEV_IO_STATUS_FAILED);
623 	}
624 }
625 
626 static void
627 raid5f_stripe_request_free(struct stripe_request *stripe_req)
628 {
629 	struct chunk *chunk;
630 
631 	FOR_EACH_CHUNK(stripe_req, chunk) {
632 		free(chunk->iovs);
633 	}
634 
635 	spdk_dma_free(stripe_req->parity_buf);
636 	spdk_dma_free(stripe_req->parity_md_buf);
637 
638 	free(stripe_req->chunk_xor_buffers);
639 	free(stripe_req->chunk_xor_md_buffers);
640 	free(stripe_req->chunk_iov_iters);
641 
642 	free(stripe_req);
643 }
644 
645 static struct stripe_request *
646 raid5f_stripe_request_alloc(struct raid5f_io_channel *r5ch)
647 {
648 	struct raid5f_info *r5f_info = raid5f_ch_to_r5f_info(r5ch);
649 	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
650 	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
651 	struct stripe_request *stripe_req;
652 	struct chunk *chunk;
653 
654 	stripe_req = calloc(1, sizeof(*stripe_req) +
655 			    sizeof(struct chunk) * raid_bdev->num_base_bdevs);
656 	if (!stripe_req) {
657 		return NULL;
658 	}
659 
660 	stripe_req->r5ch = r5ch;
661 
662 	FOR_EACH_CHUNK(stripe_req, chunk) {
663 		chunk->index = chunk - stripe_req->chunks;
664 		chunk->iovcnt_max = 4;
665 		chunk->iovs = calloc(chunk->iovcnt_max, sizeof(chunk->iovs[0]));
666 		if (!chunk->iovs) {
667 			goto err;
668 		}
669 	}
670 
671 	stripe_req->parity_buf = spdk_dma_malloc(raid_bdev->strip_size << raid_bdev->blocklen_shift,
672 				 r5f_info->buf_alignment, NULL);
673 	if (!stripe_req->parity_buf) {
674 		goto err;
675 	}
676 
677 	if (raid_io_md_size != 0) {
678 		stripe_req->parity_md_buf = spdk_dma_malloc(raid_bdev->strip_size * raid_io_md_size,
679 					    r5f_info->buf_alignment, NULL);
680 		if (!stripe_req->parity_md_buf) {
681 			goto err;
682 		}
683 	}
684 
685 	stripe_req->chunk_iov_iters = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
686 					     sizeof(stripe_req->chunk_iov_iters[0]));
687 	if (!stripe_req->chunk_iov_iters) {
688 		goto err;
689 	}
690 
691 	stripe_req->chunk_xor_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
692 					       sizeof(stripe_req->chunk_xor_buffers[0]));
693 	if (!stripe_req->chunk_xor_buffers) {
694 		goto err;
695 	}
696 
697 	stripe_req->chunk_xor_md_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
698 					   sizeof(stripe_req->chunk_xor_md_buffers[0]));
699 	if (!stripe_req->chunk_xor_md_buffers) {
700 		goto err;
701 	}
702 
703 	return stripe_req;
704 err:
705 	raid5f_stripe_request_free(stripe_req);
706 	return NULL;
707 }
708 
709 static void
710 raid5f_ioch_destroy(void *io_device, void *ctx_buf)
711 {
712 	struct raid5f_io_channel *r5ch = ctx_buf;
713 	struct stripe_request *stripe_req;
714 
715 	assert(TAILQ_EMPTY(&r5ch->xor_retry_queue));
716 
717 	while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests))) {
718 		TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link);
719 		raid5f_stripe_request_free(stripe_req);
720 	}
721 
722 	if (r5ch->accel_ch) {
723 		spdk_put_io_channel(r5ch->accel_ch);
724 	}
725 }
726 
727 static int
728 raid5f_ioch_create(void *io_device, void *ctx_buf)
729 {
730 	struct raid5f_io_channel *r5ch = ctx_buf;
731 	struct raid5f_info *r5f_info = io_device;
732 	int status = 0;
733 	int i;
734 
735 	TAILQ_INIT(&r5ch->free_stripe_requests);
736 
737 	for (i = 0; i < RAID5F_MAX_STRIPES; i++) {
738 		struct stripe_request *stripe_req;
739 
740 		stripe_req = raid5f_stripe_request_alloc(r5ch);
741 		if (!stripe_req) {
742 			status = -ENOMEM;
743 			goto out;
744 		}
745 
746 		TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests, stripe_req, link);
747 	}
748 
749 	r5ch->accel_ch = spdk_accel_get_io_channel();
750 	if (!r5ch->accel_ch) {
751 		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
752 		goto out;
753 	}
754 
755 	TAILQ_INIT(&r5ch->xor_retry_queue);
756 out:
757 	if (status) {
758 		SPDK_ERRLOG("Failed to initialize io channel\n");
759 		raid5f_ioch_destroy(r5f_info, r5ch);
760 	}
761 	return status;
762 }
763 
764 static int
765 raid5f_start(struct raid_bdev *raid_bdev)
766 {
767 	uint64_t min_blockcnt = UINT64_MAX;
768 	struct raid_base_bdev_info *base_info;
769 	struct raid5f_info *r5f_info;
770 	size_t alignment = 0;
771 
772 	r5f_info = calloc(1, sizeof(*r5f_info));
773 	if (!r5f_info) {
774 		SPDK_ERRLOG("Failed to allocate r5f_info\n");
775 		return -ENOMEM;
776 	}
777 	r5f_info->raid_bdev = raid_bdev;
778 
779 	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
780 		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
781 		alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev));
782 	}
783 
784 	r5f_info->total_stripes = min_blockcnt / raid_bdev->strip_size;
785 	r5f_info->stripe_blocks = raid_bdev->strip_size * raid5f_stripe_data_chunks_num(raid_bdev);
786 	r5f_info->buf_alignment = alignment;
787 
788 	raid_bdev->bdev.blockcnt = r5f_info->stripe_blocks * r5f_info->total_stripes;
789 	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
790 	raid_bdev->bdev.split_on_optimal_io_boundary = true;
791 	raid_bdev->bdev.write_unit_size = r5f_info->stripe_blocks;
792 	raid_bdev->bdev.split_on_write_unit = true;
793 
794 	raid_bdev->module_private = r5f_info;
795 
796 	spdk_io_device_register(r5f_info, raid5f_ioch_create, raid5f_ioch_destroy,
797 				sizeof(struct raid5f_io_channel), NULL);
798 
799 	return 0;
800 }
801 
802 static void
803 raid5f_io_device_unregister_done(void *io_device)
804 {
805 	struct raid5f_info *r5f_info = io_device;
806 
807 	raid_bdev_module_stop_done(r5f_info->raid_bdev);
808 
809 	free(r5f_info);
810 }
811 
812 static bool
813 raid5f_stop(struct raid_bdev *raid_bdev)
814 {
815 	struct raid5f_info *r5f_info = raid_bdev->module_private;
816 
817 	spdk_io_device_unregister(r5f_info, raid5f_io_device_unregister_done);
818 
819 	return false;
820 }
821 
822 static struct spdk_io_channel *
823 raid5f_get_io_channel(struct raid_bdev *raid_bdev)
824 {
825 	struct raid5f_info *r5f_info = raid_bdev->module_private;
826 
827 	return spdk_get_io_channel(r5f_info);
828 }
829 
830 static struct raid_bdev_module g_raid5f_module = {
831 	.level = RAID5F,
832 	.base_bdevs_min = 3,
833 	.base_bdevs_constraint = {CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 1},
834 	.start = raid5f_start,
835 	.stop = raid5f_stop,
836 	.submit_rw_request = raid5f_submit_rw_request,
837 	.get_io_channel = raid5f_get_io_channel,
838 };
839 RAID_MODULE_REGISTER(&g_raid5f_module)
840 
841 SPDK_LOG_REGISTER_COMPONENT(bdev_raid5f)
842