/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2022 Intel Corporation.
 *   All rights reserved.
 */

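/*
 * raid5f is a RAID level 5 implementation with a fixed I/O pattern: the bdev
 * layer splits reads on the strip size boundary, so a read never spans more
 * than one chunk, and splits writes on the write unit (set to the stripe
 * size), so writes always arrive as full stripes. Parity is distributed
 * across the base bdevs, rotating by one position per stripe. For example,
 * with 4 base bdevs (D = data chunk, P = parity chunk):
 *
 *   stripe 0:  D  D  D  P
 *   stripe 1:  D  D  P  D
 *   stripe 2:  D  P  D  D
 *   stripe 3:  P  D  D  D
 *   stripe 4:  D  D  D  P
 */
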
#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/likely.h"
#include "spdk/log.h"
#include "spdk/xor.h"

/* Maximum concurrent full stripe writes per io channel */
#define RAID5F_MAX_STRIPES 32

struct chunk {
	/* Corresponds to base_bdev index */
	uint8_t index;

	/* Array of iovecs */
	struct iovec *iovs;

	/* Number of used iovecs */
	int iovcnt;

	/* Total number of available iovecs in the array */
	int iovcnt_max;

	/* Pointer to buffer with I/O metadata */
	void *md_buf;

	/* Shallow copy of IO request parameters */
	struct spdk_bdev_ext_io_opts ext_opts;
};

struct stripe_request {
	struct raid5f_io_channel *r5ch;

	/* The associated raid_bdev_io */
	struct raid_bdev_io *raid_io;

	/* The stripe's index in the raid array */
	uint64_t stripe_index;

	/* The stripe's parity chunk */
	struct chunk *parity_chunk;

	/* Buffer for stripe parity */
	void *parity_buf;

	/* Buffer for stripe io metadata parity */
	void *parity_md_buf;

	TAILQ_ENTRY(stripe_request) link;

	/* Array of chunks corresponding to base_bdevs */
	struct chunk chunks[0];
};

struct raid5f_info {
	/* The parent raid bdev */
	struct raid_bdev *raid_bdev;

	/* Number of data blocks in a stripe (without parity) */
	uint64_t stripe_blocks;

	/* Number of stripes on this array */
	uint64_t total_stripes;

	/* Alignment for buffer allocation */
	size_t buf_alignment;
};

struct raid5f_io_channel {
	/* All available stripe requests on this channel */
	TAILQ_HEAD(, stripe_request) free_stripe_requests;

	/* Array of iovec iterators for each data chunk */
	struct iov_iter {
		struct iovec *iovs;
		int iovcnt;
		int index;
		size_t offset;
	} *chunk_iov_iters;

	/* Array of source buffer pointers for parity calculation */
	void **chunk_xor_buffers;

	/* Array of source buffer pointers for parity calculation of io metadata */
	void **chunk_xor_md_buffers;

	/* Bounce buffers for parity calculation in case of unaligned source buffers */
	struct iovec *chunk_xor_bounce_buffers;
};

#define __CHUNK_IN_RANGE(req, c) \
	((c) < (req)->chunks + raid5f_ch_to_r5f_info((req)->r5ch)->raid_bdev->num_base_bdevs)

#define FOR_EACH_CHUNK_FROM(req, c, from) \
	for (c = from; __CHUNK_IN_RANGE(req, c); c++)

#define FOR_EACH_CHUNK(req, c) \
	FOR_EACH_CHUNK_FROM(req, c, (req)->chunks)

#define __NEXT_DATA_CHUNK(req, c) \
	((c) == (req)->parity_chunk ? (c) + 1 : (c))

#define FOR_EACH_DATA_CHUNK(req, c) \
	for (c = __NEXT_DATA_CHUNK(req, req->chunks); __CHUNK_IN_RANGE(req, c); \
	     c = __NEXT_DATA_CHUNK(req, c + 1))

static inline struct raid5f_info *
raid5f_ch_to_r5f_info(struct raid5f_io_channel *r5ch)
{
	return spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(r5ch));
}

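/* Get the stripe request containing a chunk - the chunk's index is also its
 * offset in the stripe request's chunks[] array */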
static inline struct stripe_request *
raid5f_chunk_stripe_req(struct chunk *chunk)
{
	return SPDK_CONTAINEROF((chunk - chunk->index), struct stripe_request, chunks);
}

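/* Number of data chunks in a stripe, i.e. all base bdevs except the one
 * holding parity */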
static inline uint8_t
raid5f_stripe_data_chunks_num(const struct raid_bdev *raid_bdev)
{
	return raid_bdev->min_base_bdevs_operational;
}

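/* Index of the stripe's parity chunk - parity starts on the last base bdev
 * for stripe 0 and shifts one position towards the first bdev with each
 * subsequent stripe, wrapping around */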
static inline uint8_t
raid5f_stripe_parity_chunk_index(const struct raid_bdev *raid_bdev, uint64_t stripe_index)
{
	return raid5f_stripe_data_chunks_num(raid_bdev) - stripe_index % raid_bdev->num_base_bdevs;
}

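/* Return a stripe request to the channel's free list */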
static inline void
raid5f_stripe_request_release(struct stripe_request *stripe_req)
{
	TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests, stripe_req, link);
}

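/* Generate the stripe's parity by XOR-ing all data chunks into parity_buf.
 * Data chunk buffers that don't satisfy the XOR alignment requirements are
 * first copied into the channel's bounce buffers. If the I/O carries
 * metadata, parity of the per-block metadata is generated into parity_md_buf
 * as well. */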
static int
raid5f_xor_stripe(struct stripe_request *stripe_req)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	size_t remaining = raid_bdev->strip_size << raid_bdev->blocklen_shift;
	uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
	void *dest = stripe_req->parity_buf;
	size_t alignment_mask = spdk_xor_get_optimal_alignment() - 1;
	void *raid_md = spdk_bdev_io_get_md_buf(bdev_io);
	uint32_t raid_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct chunk *chunk;
	int ret;
	uint8_t c;

	c = 0;
	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
		struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[c];
		bool aligned = true;
		int i;

		for (i = 0; i < chunk->iovcnt; i++) {
			if (((uintptr_t)chunk->iovs[i].iov_base & alignment_mask) ||
			    (chunk->iovs[i].iov_len & alignment_mask)) {
				aligned = false;
				break;
			}
		}

		if (aligned) {
			iov_iter->iovs = chunk->iovs;
			iov_iter->iovcnt = chunk->iovcnt;
		} else {
			iov_iter->iovs = &r5ch->chunk_xor_bounce_buffers[c];
			iov_iter->iovcnt = 1;
			spdk_iovcpy(chunk->iovs, chunk->iovcnt, iov_iter->iovs, iov_iter->iovcnt);
		}

		iov_iter->index = 0;
		iov_iter->offset = 0;

		c++;
	}

	while (remaining > 0) {
		size_t len = remaining;
		uint8_t i;

		for (i = 0; i < n_src; i++) {
			struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[i];
			struct iovec *iov = &iov_iter->iovs[iov_iter->index];

			len = spdk_min(len, iov->iov_len - iov_iter->offset);
			r5ch->chunk_xor_buffers[i] = iov->iov_base + iov_iter->offset;
		}

		assert(len > 0);

		ret = spdk_xor_gen(dest, r5ch->chunk_xor_buffers, n_src, len);
		if (spdk_unlikely(ret)) {
			SPDK_ERRLOG("stripe xor failed\n");
			return ret;
		}

		for (i = 0; i < n_src; i++) {
			struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[i];
			struct iovec *iov = &iov_iter->iovs[iov_iter->index];

			iov_iter->offset += len;
			if (iov_iter->offset == iov->iov_len) {
				iov_iter->offset = 0;
				iov_iter->index++;
			}
		}
		dest += len;

		remaining -= len;
	}

	if (raid_md != NULL) {
		uint64_t len = raid_bdev->strip_size * raid_md_size;
		c = 0;
		FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
			r5ch->chunk_xor_md_buffers[c] = chunk->md_buf;
			c++;
		}
		ret = spdk_xor_gen(stripe_req->parity_md_buf, r5ch->chunk_xor_md_buffers, n_src, len);
		if (spdk_unlikely(ret)) {
			SPDK_ERRLOG("stripe io metadata xor failed\n");
			return ret;
		}
	}

	return 0;
}

static void
raid5f_chunk_write_complete(struct chunk *chunk, enum spdk_bdev_io_status status)
{
	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);

	if (raid_bdev_io_complete_part(stripe_req->raid_io, 1, status)) {
		raid5f_stripe_request_release(stripe_req);
	}
}

static void
raid5f_chunk_write_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct chunk *chunk = cb_arg;

	spdk_bdev_free_io(bdev_io);

	raid5f_chunk_write_complete(chunk, success ? SPDK_BDEV_IO_STATUS_SUCCESS :
				    SPDK_BDEV_IO_STATUS_FAILED);
}

static void raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req);

static void
raid5f_chunk_write_retry(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;
	struct stripe_request *stripe_req = raid_io->module_private;

	raid5f_stripe_request_submit_chunks(stripe_req);
}

static inline void
raid5f_init_ext_io_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
{
	memset(opts, 0, sizeof(*opts));
	opts->size = sizeof(*opts);
	opts->memory_domain = bdev_io->u.bdev.memory_domain;
	opts->memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
	opts->metadata = bdev_io->u.bdev.md_buf;
}

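/* Submit one chunk's write to its base bdev. On -ENOMEM the raid_io is queued
 * to retry submission of the remaining chunks once bdev_ios are available; on
 * other errors the not-yet-submitted chunks are completed as failed. */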
static int
raid5f_chunk_write(struct chunk *chunk)
{
	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk->index];
	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk->index];
	uint64_t base_offset_blocks = (stripe_req->stripe_index << raid_bdev->strip_size_shift);
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &chunk->ext_opts);
	chunk->ext_opts.metadata = chunk->md_buf;

	ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt,
					  base_offset_blocks, raid_bdev->strip_size, raid5f_chunk_write_complete_bdev_io,
					  chunk, &chunk->ext_opts);

	if (spdk_unlikely(ret)) {
		if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						raid5f_chunk_write_retry);
		} else {
			/*
			 * Implicitly complete any I/Os not yet submitted as FAILED. If completing
			 * these means there are no more to complete for the stripe request, we can
			 * release the stripe request as well.
			 */
			uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs -
							      raid_io->base_bdev_io_submitted;

			if (raid_bdev_io_complete_part(stripe_req->raid_io, base_bdev_io_not_submitted,
						       SPDK_BDEV_IO_STATUS_FAILED)) {
				raid5f_stripe_request_release(stripe_req);
			}
		}
	}

	return ret;
}

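/* Map the parent bdev_io's iovecs onto the stripe's data chunks, growing a
 * chunk's iovec array if needed, and point the parity chunk at the stripe
 * request's parity buffers. The chunk iovecs reference the parent's buffers
 * directly - no data is copied. */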
static int
raid5f_stripe_request_map_iovecs(struct stripe_request *stripe_req)
{
	struct raid_bdev *raid_bdev = stripe_req->raid_io->raid_bdev;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(stripe_req->raid_io);
	const struct iovec *raid_io_iovs = bdev_io->u.bdev.iovs;
	int raid_io_iovcnt = bdev_io->u.bdev.iovcnt;
	void *raid_io_md = spdk_bdev_io_get_md_buf(bdev_io);
	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct chunk *chunk;
	int raid_io_iov_idx = 0;
	size_t raid_io_offset = 0;
	size_t raid_io_iov_offset = 0;
	int i;

	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
		int chunk_iovcnt = 0;
		uint64_t len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
		size_t off = raid_io_iov_offset;

		for (i = raid_io_iov_idx; i < raid_io_iovcnt; i++) {
			chunk_iovcnt++;
			off += raid_io_iovs[i].iov_len;
			if (off >= raid_io_offset + len) {
				break;
			}
		}

		assert(raid_io_iov_idx + chunk_iovcnt <= raid_io_iovcnt);

		if (chunk_iovcnt > chunk->iovcnt_max) {
			struct iovec *iovs = chunk->iovs;

			iovs = realloc(iovs, chunk_iovcnt * sizeof(*iovs));
			if (!iovs) {
				return -ENOMEM;
			}
			chunk->iovs = iovs;
			chunk->iovcnt_max = chunk_iovcnt;
		}
		chunk->iovcnt = chunk_iovcnt;

		if (raid_io_md) {
			chunk->md_buf = raid_io_md +
					(raid_io_offset >> raid_bdev->blocklen_shift) * raid_io_md_size;
		}

		for (i = 0; i < chunk_iovcnt; i++) {
			struct iovec *chunk_iov = &chunk->iovs[i];
			const struct iovec *raid_io_iov = &raid_io_iovs[raid_io_iov_idx];
			size_t chunk_iov_offset = raid_io_offset - raid_io_iov_offset;

			chunk_iov->iov_base = raid_io_iov->iov_base + chunk_iov_offset;
			chunk_iov->iov_len = spdk_min(len, raid_io_iov->iov_len - chunk_iov_offset);
			raid_io_offset += chunk_iov->iov_len;
			len -= chunk_iov->iov_len;

			if (raid_io_offset >= raid_io_iov_offset + raid_io_iov->iov_len) {
				raid_io_iov_idx++;
				raid_io_iov_offset += raid_io_iov->iov_len;
			}
		}

		if (spdk_unlikely(len > 0)) {
			return -EINVAL;
		}
	}

	stripe_req->parity_chunk->iovs[0].iov_base = stripe_req->parity_buf;
	stripe_req->parity_chunk->iovs[0].iov_len = raid_bdev->strip_size <<
			raid_bdev->blocklen_shift;
	stripe_req->parity_chunk->md_buf = stripe_req->parity_md_buf;
	stripe_req->parity_chunk->iovcnt = 1;

	return 0;
}

static void
raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct chunk *start = &stripe_req->chunks[raid_io->base_bdev_io_submitted];
	struct chunk *chunk;

	FOR_EACH_CHUNK_FROM(stripe_req, chunk, start) {
		if (spdk_unlikely(raid5f_chunk_write(chunk) != 0)) {
			break;
		}
		raid_io->base_bdev_io_submitted++;
	}
}

static void
raid5f_submit_stripe_request(struct stripe_request *stripe_req)
{
	if (spdk_unlikely(raid5f_xor_stripe(stripe_req) != 0)) {
		raid_bdev_io_complete(stripe_req->raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	raid5f_stripe_request_submit_chunks(stripe_req);
}

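/* Submit a full stripe write. A stripe request is taken from the channel's
 * free list; if none is available the request fails with -ENOMEM and is
 * completed with NOMEM status, to be retried later. */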
static int
raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel);
	struct stripe_request *stripe_req;
	int ret;

	stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests);
	if (!stripe_req) {
		return -ENOMEM;
	}

	stripe_req->stripe_index = stripe_index;
	stripe_req->parity_chunk = stripe_req->chunks + raid5f_stripe_parity_chunk_index(raid_bdev,
				   stripe_req->stripe_index);
	stripe_req->raid_io = raid_io;

	ret = raid5f_stripe_request_map_iovecs(stripe_req);
	if (spdk_unlikely(ret)) {
		return ret;
	}

	TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link);

	raid_io->module_private = stripe_req;
	raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;

	raid5f_submit_stripe_request(stripe_req);

	return 0;
}

static void
raid5f_chunk_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	raid_bdev_io_complete(raid_io, success ? SPDK_BDEV_IO_STATUS_SUCCESS :
			      SPDK_BDEV_IO_STATUS_FAILED);
}

static void raid5f_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid5f_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid5f_submit_rw_request(raid_io);
}

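/* Submit a read request. Reads never span chunk boundaries (the bdev is split
 * on the strip size), so the read maps directly onto a single base bdev. */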
static int
raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index,
			   uint64_t stripe_offset)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint8_t chunk_data_idx = stripe_offset >> raid_bdev->strip_size_shift;
	uint8_t p_idx = raid5f_stripe_parity_chunk_index(raid_bdev, stripe_index);
	uint8_t chunk_idx = chunk_data_idx < p_idx ? chunk_data_idx : chunk_data_idx + 1;
	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk_idx];
	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk_idx];
	uint64_t chunk_offset = stripe_offset - (chunk_data_idx << raid_bdev->strip_size_shift);
	uint64_t base_offset_blocks = (stripe_index << raid_bdev->strip_size_shift) + chunk_offset;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct spdk_bdev_ext_io_opts io_opts;
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &io_opts);
	ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs,
					 bdev_io->u.bdev.iovcnt,
					 base_offset_blocks, bdev_io->u.bdev.num_blocks, raid5f_chunk_read_complete, raid_io,
					 &io_opts);

	if (spdk_unlikely(ret == -ENOMEM)) {
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_raid5f_submit_rw_request);
		return 0;
	}

	return ret;
}

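/* Module entry point for read/write requests - translates the raid bdev's
 * offset into a stripe index and intra-stripe offset and dispatches to the
 * read or write path */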
static void
raid5f_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_info *r5f_info = raid_bdev->module_private;
	uint64_t offset_blocks = bdev_io->u.bdev.offset_blocks;
	uint64_t stripe_index = offset_blocks / r5f_info->stripe_blocks;
	uint64_t stripe_offset = offset_blocks % r5f_info->stripe_blocks;
	int ret;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		assert(bdev_io->u.bdev.num_blocks <= raid_bdev->strip_size);
		ret = raid5f_submit_read_request(raid_io, stripe_index, stripe_offset);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		assert(stripe_offset == 0);
		assert(bdev_io->u.bdev.num_blocks == r5f_info->stripe_blocks);
		ret = raid5f_submit_write_request(raid_io, stripe_index);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (spdk_unlikely(ret)) {
		raid_bdev_io_complete(raid_io, ret == -ENOMEM ? SPDK_BDEV_IO_STATUS_NOMEM :
				      SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid5f_stripe_request_free(struct stripe_request *stripe_req)
{
	struct chunk *chunk;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		free(chunk->iovs);
	}

	spdk_dma_free(stripe_req->parity_buf);
	spdk_dma_free(stripe_req->parity_md_buf);

	free(stripe_req);
}

static struct stripe_request *
raid5f_stripe_request_alloc(struct raid5f_io_channel *r5ch)
{
	struct raid5f_info *r5f_info = raid5f_ch_to_r5f_info(r5ch);
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct stripe_request *stripe_req;
	struct chunk *chunk;

	stripe_req = calloc(1, sizeof(*stripe_req) +
			    sizeof(struct chunk) * raid_bdev->num_base_bdevs);
	if (!stripe_req) {
		return NULL;
	}

	stripe_req->r5ch = r5ch;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		chunk->index = chunk - stripe_req->chunks;
		chunk->iovcnt_max = 4;
		chunk->iovs = calloc(chunk->iovcnt_max, sizeof(chunk->iovs[0]));
		if (!chunk->iovs) {
			goto err;
		}
	}

	stripe_req->parity_buf = spdk_dma_malloc(raid_bdev->strip_size << raid_bdev->blocklen_shift,
				 r5f_info->buf_alignment, NULL);
	if (!stripe_req->parity_buf) {
		goto err;
	}

	if (raid_io_md_size != 0) {
		stripe_req->parity_md_buf = spdk_dma_malloc(raid_bdev->strip_size * raid_io_md_size,
					    r5f_info->buf_alignment, NULL);
		if (!stripe_req->parity_md_buf) {
			goto err;
		}
	}

	return stripe_req;
err:
	raid5f_stripe_request_free(stripe_req);
	return NULL;
}

static void
raid5f_ioch_destroy(void *io_device, void *ctx_buf)
{
	struct raid5f_io_channel *r5ch = ctx_buf;
	struct raid5f_info *r5f_info = io_device;
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	struct stripe_request *stripe_req;
	int i;

	while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests))) {
		TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link);
		raid5f_stripe_request_free(stripe_req);
	}

	if (r5ch->chunk_xor_bounce_buffers) {
		for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) {
			free(r5ch->chunk_xor_bounce_buffers[i].iov_base);
		}
		free(r5ch->chunk_xor_bounce_buffers);
	}

	free(r5ch->chunk_xor_buffers);
	free(r5ch->chunk_xor_md_buffers);
	free(r5ch->chunk_iov_iters);
}

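/* Allocate the per-channel pool of stripe requests and the scratch state used
 * by raid5f_xor_stripe(): per-chunk iovec iterators, XOR source pointer
 * arrays and bounce buffers for unaligned chunk buffers. */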
static int
raid5f_ioch_create(void *io_device, void *ctx_buf)
{
	struct raid5f_io_channel *r5ch = ctx_buf;
	struct raid5f_info *r5f_info = io_device;
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	size_t chunk_len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
	int status = 0;
	int i;

	TAILQ_INIT(&r5ch->free_stripe_requests);

	for (i = 0; i < RAID5F_MAX_STRIPES; i++) {
		struct stripe_request *stripe_req;

		stripe_req = raid5f_stripe_request_alloc(r5ch);
		if (!stripe_req) {
			status = -ENOMEM;
			goto out;
		}

		TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests, stripe_req, link);
	}

	r5ch->chunk_iov_iters = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
				       sizeof(r5ch->chunk_iov_iters[0]));
	if (!r5ch->chunk_iov_iters) {
		status = -ENOMEM;
		goto out;
	}

	r5ch->chunk_xor_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
					 sizeof(r5ch->chunk_xor_buffers[0]));
	if (!r5ch->chunk_xor_buffers) {
		status = -ENOMEM;
		goto out;
	}

	r5ch->chunk_xor_md_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
					    sizeof(r5ch->chunk_xor_md_buffers[0]));
	if (!r5ch->chunk_xor_md_buffers) {
		status = -ENOMEM;
		goto out;
	}

	r5ch->chunk_xor_bounce_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
						sizeof(r5ch->chunk_xor_bounce_buffers[0]));
	if (!r5ch->chunk_xor_bounce_buffers) {
		status = -ENOMEM;
		goto out;
	}

	for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) {
		status = posix_memalign(&r5ch->chunk_xor_bounce_buffers[i].iov_base,
					spdk_xor_get_optimal_alignment(), chunk_len);
		if (status) {
			goto out;
		}
		r5ch->chunk_xor_bounce_buffers[i].iov_len = chunk_len;
	}
out:
	if (status) {
		SPDK_ERRLOG("Failed to initialize io channel\n");
		raid5f_ioch_destroy(r5f_info, r5ch);
	}
	return status;
}

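/* Calculate the array's geometry and set up the raid bdev: the usable block
 * count is the number of full stripes times the data blocks per stripe, reads
 * are split on the strip size and writes on the full stripe size, so that the
 * module only ever sees single-chunk reads and full-stripe writes. */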
static int
raid5f_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;
	struct raid5f_info *r5f_info;
	size_t alignment;

	r5f_info = calloc(1, sizeof(*r5f_info));
	if (!r5f_info) {
		SPDK_ERRLOG("Failed to allocate r5f_info\n");
		return -ENOMEM;
	}
	r5f_info->raid_bdev = raid_bdev;

	alignment = spdk_xor_get_optimal_alignment();
	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
		alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev));
	}

	r5f_info->total_stripes = min_blockcnt / raid_bdev->strip_size;
	r5f_info->stripe_blocks = raid_bdev->strip_size * raid5f_stripe_data_chunks_num(raid_bdev);
	r5f_info->buf_alignment = alignment;

	raid_bdev->bdev.blockcnt = r5f_info->stripe_blocks * r5f_info->total_stripes;
	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;
	raid_bdev->bdev.write_unit_size = r5f_info->stripe_blocks;
	raid_bdev->bdev.split_on_write_unit = true;

	raid_bdev->module_private = r5f_info;

	spdk_io_device_register(r5f_info, raid5f_ioch_create, raid5f_ioch_destroy,
				sizeof(struct raid5f_io_channel), NULL);

	return 0;
}

static void
raid5f_io_device_unregister_done(void *io_device)
{
	struct raid5f_info *r5f_info = io_device;

	raid_bdev_module_stop_done(r5f_info->raid_bdev);

	free(r5f_info);
}

static bool
raid5f_stop(struct raid_bdev *raid_bdev)
{
	struct raid5f_info *r5f_info = raid_bdev->module_private;

	spdk_io_device_unregister(r5f_info, raid5f_io_device_unregister_done);

	return false;
}

static struct spdk_io_channel *
raid5f_get_io_channel(struct raid_bdev *raid_bdev)
{
	struct raid5f_info *r5f_info = raid_bdev->module_private;

	return spdk_get_io_channel(r5f_info);
}

static struct raid_bdev_module g_raid5f_module = {
	.level = RAID5F,
	.base_bdevs_min = 3,
	.base_bdevs_constraint = {CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 1},
	.start = raid5f_start,
	.stop = raid5f_stop,
	.submit_rw_request = raid5f_submit_rw_request,
	.get_io_channel = raid5f_get_io_channel,
};
RAID_MODULE_REGISTER(&g_raid5f_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid5f)