/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2022 Intel Corporation.
 *   All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/likely.h"
#include "spdk/log.h"
#include "spdk/accel.h"

/* Maximum concurrent full stripe writes per io channel */
#define RAID5F_MAX_STRIPES 32

struct chunk {
	/* Corresponds to base_bdev index */
	uint8_t index;

	/* Array of iovecs */
	struct iovec *iovs;

	/* Number of used iovecs */
	int iovcnt;

	/* Total number of available iovecs in the array */
	int iovcnt_max;

	/* Pointer to buffer with I/O metadata */
	void *md_buf;

	/* Shallow copy of IO request parameters */
	struct spdk_bdev_ext_io_opts ext_opts;
};

struct stripe_request;
typedef void (*stripe_req_xor_cb)(struct stripe_request *stripe_req, int status);

struct stripe_request {
	enum stripe_request_type {
		STRIPE_REQ_WRITE,
	} type;

	struct raid5f_io_channel *r5ch;

	/* The associated raid_bdev_io */
	struct raid_bdev_io *raid_io;

	/* The stripe's index in the raid array */
	uint64_t stripe_index;

	/* The stripe's parity chunk */
	struct chunk *parity_chunk;

	union {
		struct {
			/* Buffer for stripe parity */
			void *parity_buf;

			/* Buffer for stripe io metadata parity */
			void *parity_md_buf;
		} write;
	};

	/* Array of iovec iterators for each chunk */
	struct spdk_ioviter *chunk_iov_iters;

	/* Array of source buffer pointers for parity calculation */
	void **chunk_xor_buffers;

	/* Array of source buffer pointers for parity calculation of io metadata */
	void **chunk_xor_md_buffers;

	struct {
		size_t len;
		size_t remaining;
		size_t remaining_md;
		int status;
		stripe_req_xor_cb cb;
	} xor;

	TAILQ_ENTRY(stripe_request) link;

	/* Array of chunks corresponding to base_bdevs */
	struct chunk chunks[0];
};

struct raid5f_info {
	/* The parent raid bdev */
	struct raid_bdev *raid_bdev;

	/* Number of data blocks in a stripe (without parity) */
	uint64_t stripe_blocks;

	/* Number of stripes on this array */
	uint64_t total_stripes;

	/* Alignment for buffer allocation */
	size_t buf_alignment;
};

struct raid5f_io_channel {
	/* All available stripe requests on this channel */
	struct {
		TAILQ_HEAD(, stripe_request) write;
	} free_stripe_requests;

	/* accel_fw channel */
	struct spdk_io_channel *accel_ch;

	/* For retrying xor if accel_ch runs out of resources */
	TAILQ_HEAD(, stripe_request) xor_retry_queue;

	/* For iterating over chunk iovecs during xor calculation */
	void **chunk_xor_buffers;
	struct iovec **chunk_xor_iovs;
	size_t *chunk_xor_iovcnt;
};

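/*
 * Iteration helpers over a stripe request's chunks array. FOR_EACH_CHUNK
 * visits every chunk, parity included; FOR_EACH_DATA_CHUNK visits only data
 * chunks by stepping over the parity chunk whenever the iterator lands on it.
 */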
#define __CHUNK_IN_RANGE(req, c) \
	c < req->chunks + raid5f_ch_to_r5f_info(req->r5ch)->raid_bdev->num_base_bdevs

#define FOR_EACH_CHUNK_FROM(req, c, from) \
	for (c = from; __CHUNK_IN_RANGE(req, c); c++)

#define FOR_EACH_CHUNK(req, c) \
	FOR_EACH_CHUNK_FROM(req, c, req->chunks)

#define __NEXT_DATA_CHUNK(req, c) \
	c == req->parity_chunk ? c+1 : c

#define FOR_EACH_DATA_CHUNK(req, c) \
	for (c = __NEXT_DATA_CHUNK(req, req->chunks); __CHUNK_IN_RANGE(req, c); \
	     c = __NEXT_DATA_CHUNK(req, c+1))

static inline struct raid5f_info *
raid5f_ch_to_r5f_info(struct raid5f_io_channel *r5ch)
{
	return spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(r5ch));
}

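/*
 * Recover the containing stripe_request from a chunk pointer. The chunks live
 * in a flexible array at the end of the stripe_request, so stepping back
 * chunk->index elements yields the start of the array, from which
 * SPDK_CONTAINEROF computes the address of the enclosing structure.
 */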
static inline struct stripe_request *
raid5f_chunk_stripe_req(struct chunk *chunk)
{
	return SPDK_CONTAINEROF((chunk - chunk->index), struct stripe_request, chunks);
}

static inline uint8_t
raid5f_stripe_data_chunks_num(const struct raid_bdev *raid_bdev)
{
	return raid_bdev->min_base_bdevs_operational;
}

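/*
 * The parity chunk's position rotates across the base bdevs from stripe to
 * stripe so parity I/O is spread evenly. For example, with 4 base bdevs
 * (3 data chunks) this evaluates to: stripe 0 -> chunk 3, stripe 1 -> chunk 2,
 * stripe 2 -> chunk 1, stripe 3 -> chunk 0, stripe 4 -> chunk 3, and so on.
 */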
static inline uint8_t
raid5f_stripe_parity_chunk_index(const struct raid_bdev *raid_bdev, uint64_t stripe_index)
{
	return raid5f_stripe_data_chunks_num(raid_bdev) - stripe_index % raid_bdev->num_base_bdevs;
}

static inline void
raid5f_stripe_request_release(struct stripe_request *stripe_req)
{
	if (spdk_likely(stripe_req->type == STRIPE_REQ_WRITE)) {
		TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests.write, stripe_req, link);
	} else {
		assert(false);
	}
}

static void raid5f_xor_stripe_retry(struct stripe_request *stripe_req);

static void
raid5f_xor_stripe_done(struct stripe_request *stripe_req)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;

	if (stripe_req->xor.status != 0) {
		SPDK_ERRLOG("stripe xor failed: %s\n", spdk_strerror(-stripe_req->xor.status));
	}

	stripe_req->xor.cb(stripe_req, stripe_req->xor.status);

	if (!TAILQ_EMPTY(&r5ch->xor_retry_queue)) {
		stripe_req = TAILQ_FIRST(&r5ch->xor_retry_queue);
		TAILQ_REMOVE(&r5ch->xor_retry_queue, stripe_req, link);
		raid5f_xor_stripe_retry(stripe_req);
	}
}

static void raid5f_xor_stripe_continue(struct stripe_request *stripe_req);

static void
_raid5f_xor_stripe_cb(struct stripe_request *stripe_req, int status)
{
	if (status != 0) {
		stripe_req->xor.status = status;
	}

	if (stripe_req->xor.remaining + stripe_req->xor.remaining_md == 0) {
		raid5f_xor_stripe_done(stripe_req);
	}
}

static void
raid5f_xor_stripe_cb(void *_stripe_req, int status)
{
	struct stripe_request *stripe_req = _stripe_req;

	stripe_req->xor.remaining -= stripe_req->xor.len;

	if (stripe_req->xor.remaining > 0) {
		stripe_req->xor.len = spdk_ioviter_nextv(stripe_req->chunk_iov_iters,
				      stripe_req->r5ch->chunk_xor_buffers);
		raid5f_xor_stripe_continue(stripe_req);
	}

	_raid5f_xor_stripe_cb(stripe_req, status);
}

static void
raid5f_xor_stripe_md_cb(void *_stripe_req, int status)
{
	struct stripe_request *stripe_req = _stripe_req;

	stripe_req->xor.remaining_md = 0;

	_raid5f_xor_stripe_cb(stripe_req, status);
}

static void
raid5f_xor_stripe_continue(struct stripe_request *stripe_req)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
	uint8_t i;
	int ret;

	assert(stripe_req->xor.len > 0);

	for (i = 0; i < n_src; i++) {
		stripe_req->chunk_xor_buffers[i] = r5ch->chunk_xor_buffers[i];
	}

	ret = spdk_accel_submit_xor(r5ch->accel_ch, r5ch->chunk_xor_buffers[n_src],
				    stripe_req->chunk_xor_buffers, n_src, stripe_req->xor.len,
				    raid5f_xor_stripe_cb, stripe_req);
	if (spdk_unlikely(ret)) {
		if (ret == -ENOMEM) {
			TAILQ_INSERT_HEAD(&r5ch->xor_retry_queue, stripe_req, link);
		} else {
			stripe_req->xor.status = ret;
			raid5f_xor_stripe_done(stripe_req);
		}
	}
}

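/*
 * Calculate the stripe's parity by XORing the data chunks into the parity
 * buffer. spdk_ioviter walks the iovecs of all chunks in lockstep, yielding
 * one contiguous run per step; each run is submitted to the accel framework
 * with the data chunks as XOR sources and the parity chunk (the last of the
 * num_base_bdevs iterated buffers) as the destination. Metadata parity, if
 * present, is contiguous and is XORed in a single submission.
 */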
static void
raid5f_xor_stripe(struct stripe_request *stripe_req, stripe_req_xor_cb cb)
{
	struct raid5f_io_channel *r5ch = stripe_req->r5ch;
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	void *raid_md = spdk_bdev_io_get_md_buf(bdev_io);
	uint32_t raid_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct chunk *chunk;
	uint8_t c;

	assert(cb != NULL);
	assert(stripe_req->type == STRIPE_REQ_WRITE);

	c = 0;
	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
		r5ch->chunk_xor_iovs[c] = chunk->iovs;
		r5ch->chunk_xor_iovcnt[c] = chunk->iovcnt;
		c++;
	}
	r5ch->chunk_xor_iovs[c] = stripe_req->parity_chunk->iovs;
	r5ch->chunk_xor_iovcnt[c] = stripe_req->parity_chunk->iovcnt;

	stripe_req->xor.len = spdk_ioviter_firstv(stripe_req->chunk_iov_iters,
			      raid_bdev->num_base_bdevs,
			      r5ch->chunk_xor_iovs,
			      r5ch->chunk_xor_iovcnt,
			      r5ch->chunk_xor_buffers);
	stripe_req->xor.remaining = raid_bdev->strip_size << raid_bdev->blocklen_shift;
	stripe_req->xor.status = 0;
	stripe_req->xor.cb = cb;

	if (raid_md != NULL) {
		uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
		uint64_t len = raid_bdev->strip_size * raid_md_size;
		int ret;

		stripe_req->xor.remaining_md = len;

		c = 0;
		FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
			stripe_req->chunk_xor_md_buffers[c] = chunk->md_buf;
			c++;
		}

		ret = spdk_accel_submit_xor(stripe_req->r5ch->accel_ch, stripe_req->write.parity_md_buf,
					    stripe_req->chunk_xor_md_buffers, n_src, len,
					    raid5f_xor_stripe_md_cb, stripe_req);
		if (spdk_unlikely(ret)) {
			if (ret == -ENOMEM) {
				TAILQ_INSERT_HEAD(&stripe_req->r5ch->xor_retry_queue, stripe_req, link);
			} else {
				stripe_req->xor.status = ret;
				raid5f_xor_stripe_done(stripe_req);
			}
			return;
		}
	}

	raid5f_xor_stripe_continue(stripe_req);
}

static void
raid5f_xor_stripe_retry(struct stripe_request *stripe_req)
{
	if (stripe_req->xor.remaining_md) {
		raid5f_xor_stripe(stripe_req, stripe_req->xor.cb);
	} else {
		raid5f_xor_stripe_continue(stripe_req);
	}
}

static void
raid5f_stripe_request_chunk_write_complete(struct stripe_request *stripe_req,
		enum spdk_bdev_io_status status)
{
	if (raid_bdev_io_complete_part(stripe_req->raid_io, 1, status)) {
		raid5f_stripe_request_release(stripe_req);
	}
}

static void
raid5f_chunk_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct chunk *chunk = cb_arg;
	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
	enum spdk_bdev_io_status status = success ? SPDK_BDEV_IO_STATUS_SUCCESS :
					  SPDK_BDEV_IO_STATUS_FAILED;

	spdk_bdev_free_io(bdev_io);

	if (spdk_likely(stripe_req->type == STRIPE_REQ_WRITE)) {
		raid5f_stripe_request_chunk_write_complete(stripe_req, status);
	} else {
		assert(false);
	}
}

static void raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req);

static void
raid5f_chunk_submit_retry(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;
	struct stripe_request *stripe_req = raid_io->module_private;

	raid5f_stripe_request_submit_chunks(stripe_req);
}

static inline void
raid5f_init_ext_io_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
{
	memset(opts, 0, sizeof(*opts));
	opts->size = sizeof(*opts);
	opts->memory_domain = bdev_io->u.bdev.memory_domain;
	opts->memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
	opts->metadata = bdev_io->u.bdev.md_buf;
}

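/*
 * Submit the write of a single chunk to its base bdev. On -ENOMEM the stripe
 * request waits on the base bdev's queue and later resumes submission from
 * the first unsubmitted chunk; any other error immediately fails all chunks
 * not yet submitted.
 */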
static int
raid5f_chunk_submit(struct chunk *chunk)
{
	struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk);
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk->index];
	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk->index];
	uint64_t base_offset_blocks = (stripe_req->stripe_index << raid_bdev->strip_size_shift);
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &chunk->ext_opts);
	chunk->ext_opts.metadata = chunk->md_buf;

	switch (stripe_req->type) {
	case STRIPE_REQ_WRITE:
		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt,
						  base_offset_blocks, raid_bdev->strip_size,
						  raid5f_chunk_complete_bdev_io, chunk,
						  &chunk->ext_opts);
		break;
	default:
		assert(false);
		ret = -EINVAL;
		break;
	}

	if (spdk_unlikely(ret)) {
		if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, raid5f_chunk_submit_retry);
		} else {
			/*
			 * Implicitly complete any I/Os not yet submitted as FAILED. If completing
			 * these means there are no more to complete for the stripe request, we can
			 * release the stripe request as well.
			 */
			uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs -
							      raid_io->base_bdev_io_submitted;

			if (raid_bdev_io_complete_part(stripe_req->raid_io, base_bdev_io_not_submitted,
						       SPDK_BDEV_IO_STATUS_FAILED)) {
				raid5f_stripe_request_release(stripe_req);
			}
		}
	}

	return ret;
}

static int
raid5f_chunk_set_iovcnt(struct chunk *chunk, int iovcnt)
{
	if (iovcnt > chunk->iovcnt_max) {
		struct iovec *iovs = chunk->iovs;

		iovs = realloc(iovs, iovcnt * sizeof(*iovs));
		if (!iovs) {
			return -ENOMEM;
		}
		chunk->iovs = iovs;
		chunk->iovcnt_max = iovcnt;
	}
	chunk->iovcnt = iovcnt;

	return 0;
}

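/*
 * Split the parent bdev_io's iovecs among the data chunks without copying:
 * each chunk gets a view of strip_size blocks of the parent buffer, growing
 * its iovec array on demand. For example, a full-stripe write described by a
 * single large iovec maps to one iovec per data chunk, each offset by a
 * multiple of the strip size. The parity chunk is pointed at the request's
 * preallocated parity (and parity metadata) buffers.
 */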
452 raid5f_stripe_request_map_iovecs(struct stripe_request *stripe_req)
453 {
454 	struct raid_bdev *raid_bdev = stripe_req->raid_io->raid_bdev;
455 	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(stripe_req->raid_io);
456 	const struct iovec *raid_io_iovs = bdev_io->u.bdev.iovs;
457 	int raid_io_iovcnt = bdev_io->u.bdev.iovcnt;
458 	void *raid_io_md = spdk_bdev_io_get_md_buf(bdev_io);
459 	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
460 	struct chunk *chunk;
461 	int raid_io_iov_idx = 0;
462 	size_t raid_io_offset = 0;
463 	size_t raid_io_iov_offset = 0;
464 	int i;
465 
466 	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
467 		int chunk_iovcnt = 0;
468 		uint64_t len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
469 		size_t off = raid_io_iov_offset;
470 		int ret;
471 
472 		for (i = raid_io_iov_idx; i < raid_io_iovcnt; i++) {
473 			chunk_iovcnt++;
474 			off += raid_io_iovs[i].iov_len;
475 			if (off >= raid_io_offset + len) {
476 				break;
477 			}
478 		}
479 
480 		assert(raid_io_iov_idx + chunk_iovcnt <= raid_io_iovcnt);
481 
482 		ret = raid5f_chunk_set_iovcnt(chunk, chunk_iovcnt);
483 		if (ret) {
484 			return ret;
485 		}
486 
487 		if (raid_io_md) {
488 			chunk->md_buf = raid_io_md +
489 					(raid_io_offset >> raid_bdev->blocklen_shift) * raid_io_md_size;
490 		}
491 
492 		for (i = 0; i < chunk_iovcnt; i++) {
493 			struct iovec *chunk_iov = &chunk->iovs[i];
494 			const struct iovec *raid_io_iov = &raid_io_iovs[raid_io_iov_idx];
495 			size_t chunk_iov_offset = raid_io_offset - raid_io_iov_offset;
496 
497 			chunk_iov->iov_base = raid_io_iov->iov_base + chunk_iov_offset;
498 			chunk_iov->iov_len = spdk_min(len, raid_io_iov->iov_len - chunk_iov_offset);
499 			raid_io_offset += chunk_iov->iov_len;
500 			len -= chunk_iov->iov_len;
501 
502 			if (raid_io_offset >= raid_io_iov_offset + raid_io_iov->iov_len) {
503 				raid_io_iov_idx++;
504 				raid_io_iov_offset += raid_io_iov->iov_len;
505 			}
506 		}
507 
508 		if (spdk_unlikely(len > 0)) {
509 			return -EINVAL;
510 		}
511 	}
512 
513 	stripe_req->parity_chunk->iovs[0].iov_base = stripe_req->write.parity_buf;
514 	stripe_req->parity_chunk->iovs[0].iov_len = raid_bdev->strip_size << raid_bdev->blocklen_shift;
515 	stripe_req->parity_chunk->iovcnt = 1;
516 	stripe_req->parity_chunk->md_buf = stripe_req->write.parity_md_buf;
517 
518 	return 0;
519 }
520 
521 static void
522 raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req)
523 {
524 	struct raid_bdev_io *raid_io = stripe_req->raid_io;
525 	struct chunk *start = &stripe_req->chunks[raid_io->base_bdev_io_submitted];
526 	struct chunk *chunk;
527 
528 	FOR_EACH_CHUNK_FROM(stripe_req, chunk, start) {
529 		if (spdk_unlikely(raid5f_chunk_submit(chunk) != 0)) {
530 			break;
531 		}
532 		raid_io->base_bdev_io_submitted++;
533 	}
534 }
535 
536 static void
537 raid5f_stripe_write_request_xor_done(struct stripe_request *stripe_req, int status)
538 {
539 	struct raid_bdev_io *raid_io = stripe_req->raid_io;
540 
541 	if (status != 0) {
542 		raid5f_stripe_request_release(stripe_req);
543 		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
544 	} else {
545 		raid5f_stripe_request_submit_chunks(stripe_req);
546 	}
547 }
548 
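/*
 * A write request always covers exactly one full stripe (enforced by
 * split_on_write_unit). Take a preallocated stripe request from the channel's
 * free list, map the parent I/O's buffers onto the data chunks, compute
 * parity, then write all num_base_bdevs chunks. Returning -ENOMEM when the
 * free list is empty lets the upper layer queue and retry the I/O.
 */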
static int
raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel);
	struct stripe_request *stripe_req;
	int ret;

	stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests.write);
	if (!stripe_req) {
		return -ENOMEM;
	}

	stripe_req->stripe_index = stripe_index;
	stripe_req->parity_chunk = stripe_req->chunks + raid5f_stripe_parity_chunk_index(raid_bdev,
				   stripe_req->stripe_index);
	stripe_req->raid_io = raid_io;

	ret = raid5f_stripe_request_map_iovecs(stripe_req);
	if (spdk_unlikely(ret)) {
		return ret;
	}

	TAILQ_REMOVE(&r5ch->free_stripe_requests.write, stripe_req, link);

	raid_io->module_private = stripe_req;
	raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;

	raid5f_xor_stripe(stripe_req, raid5f_stripe_write_request_xor_done);

	return 0;
}

static void
raid5f_chunk_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	raid_bdev_io_complete(raid_io, success ? SPDK_BDEV_IO_STATUS_SUCCESS :
			      SPDK_BDEV_IO_STATUS_FAILED);
}

static void raid5f_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid5f_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid5f_submit_rw_request(raid_io);
}

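/*
 * Reads never span chunks (enforced by optimal_io_boundary), so a read maps
 * to a single base bdev: compute which data chunk the stripe offset falls
 * into, skip over the parity chunk's position, and read directly from that
 * base bdev.
 */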
static int
raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index,
			   uint64_t stripe_offset)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint8_t chunk_data_idx = stripe_offset >> raid_bdev->strip_size_shift;
	uint8_t p_idx = raid5f_stripe_parity_chunk_index(raid_bdev, stripe_index);
	uint8_t chunk_idx = chunk_data_idx < p_idx ? chunk_data_idx : chunk_data_idx + 1;
	struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk_idx];
	struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk_idx];
	uint64_t chunk_offset = stripe_offset - (chunk_data_idx << raid_bdev->strip_size_shift);
	uint64_t base_offset_blocks = (stripe_index << raid_bdev->strip_size_shift) + chunk_offset;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct spdk_bdev_ext_io_opts io_opts;
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &io_opts);
	ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs,
					 bdev_io->u.bdev.iovcnt,
					 base_offset_blocks, bdev_io->u.bdev.num_blocks, raid5f_chunk_read_complete, raid_io,
					 &io_opts);

	if (spdk_unlikely(ret == -ENOMEM)) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid5f_submit_rw_request);
		return 0;
	}

	return ret;
}

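/*
 * Entry point for I/O from the generic raid bdev layer. By the time a request
 * reaches here it has already been split: reads to within a single strip,
 * writes to exactly one full stripe, as asserted below.
 */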
static void
raid5f_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_info *r5f_info = raid_bdev->module_private;
	uint64_t offset_blocks = bdev_io->u.bdev.offset_blocks;
	uint64_t stripe_index = offset_blocks / r5f_info->stripe_blocks;
	uint64_t stripe_offset = offset_blocks % r5f_info->stripe_blocks;
	int ret;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		assert(bdev_io->u.bdev.num_blocks <= raid_bdev->strip_size);
		ret = raid5f_submit_read_request(raid_io, stripe_index, stripe_offset);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		assert(stripe_offset == 0);
		assert(bdev_io->u.bdev.num_blocks == r5f_info->stripe_blocks);
		ret = raid5f_submit_write_request(raid_io, stripe_index);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (spdk_unlikely(ret)) {
		raid_bdev_io_complete(raid_io, ret == -ENOMEM ? SPDK_BDEV_IO_STATUS_NOMEM :
				      SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid5f_stripe_request_free(struct stripe_request *stripe_req)
{
	struct chunk *chunk;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		free(chunk->iovs);
	}

	if (stripe_req->type == STRIPE_REQ_WRITE) {
		spdk_dma_free(stripe_req->write.parity_buf);
		spdk_dma_free(stripe_req->write.parity_md_buf);
	} else {
		assert(false);
	}

	free(stripe_req->chunk_xor_buffers);
	free(stripe_req->chunk_xor_md_buffers);
	free(stripe_req->chunk_iov_iters);

	free(stripe_req);
}

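/*
 * Allocate a stripe request and everything it needs at channel creation time
 * so the write hot path itself does no allocations: per-chunk iovec arrays
 * (grown later if a request needs more), a DMA-able parity buffer of one
 * strip, an optional parity metadata buffer, and the iterator and
 * source-pointer arrays used for the XOR.
 */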
static struct stripe_request *
raid5f_stripe_request_alloc(struct raid5f_io_channel *r5ch, enum stripe_request_type type)
{
	struct raid5f_info *r5f_info = raid5f_ch_to_r5f_info(r5ch);
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct stripe_request *stripe_req;
	struct chunk *chunk;

	stripe_req = calloc(1, sizeof(*stripe_req) + sizeof(*chunk) * raid_bdev->num_base_bdevs);
	if (!stripe_req) {
		return NULL;
	}

	stripe_req->r5ch = r5ch;
	stripe_req->type = type;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		chunk->index = chunk - stripe_req->chunks;
		chunk->iovcnt_max = 4;
		chunk->iovs = calloc(chunk->iovcnt_max, sizeof(chunk->iovs[0]));
		if (!chunk->iovs) {
			goto err;
		}
	}

	if (type == STRIPE_REQ_WRITE) {
		stripe_req->write.parity_buf = spdk_dma_malloc(raid_bdev->strip_size << raid_bdev->blocklen_shift,
					       r5f_info->buf_alignment, NULL);
		if (!stripe_req->write.parity_buf) {
			goto err;
		}

		if (raid_io_md_size != 0) {
			stripe_req->write.parity_md_buf = spdk_dma_malloc(raid_bdev->strip_size * raid_io_md_size,
							  r5f_info->buf_alignment, NULL);
			if (!stripe_req->write.parity_md_buf) {
				goto err;
			}
		}
	} else {
		assert(false);
		return NULL;
	}

	stripe_req->chunk_iov_iters = malloc(SPDK_IOVITER_SIZE(raid_bdev->num_base_bdevs));
	if (!stripe_req->chunk_iov_iters) {
		goto err;
	}

	stripe_req->chunk_xor_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
					       sizeof(stripe_req->chunk_xor_buffers[0]));
	if (!stripe_req->chunk_xor_buffers) {
		goto err;
	}

	stripe_req->chunk_xor_md_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev),
					   sizeof(stripe_req->chunk_xor_md_buffers[0]));
	if (!stripe_req->chunk_xor_md_buffers) {
		goto err;
	}

	return stripe_req;
err:
	raid5f_stripe_request_free(stripe_req);
	return NULL;
}

static void
raid5f_ioch_destroy(void *io_device, void *ctx_buf)
{
	struct raid5f_io_channel *r5ch = ctx_buf;
	struct stripe_request *stripe_req;

	assert(TAILQ_EMPTY(&r5ch->xor_retry_queue));

	while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests.write))) {
		TAILQ_REMOVE(&r5ch->free_stripe_requests.write, stripe_req, link);
		raid5f_stripe_request_free(stripe_req);
	}

	if (r5ch->accel_ch) {
		spdk_put_io_channel(r5ch->accel_ch);
	}

	free(r5ch->chunk_xor_buffers);
	free(r5ch->chunk_xor_iovs);
	free(r5ch->chunk_xor_iovcnt);
}

static int
raid5f_ioch_create(void *io_device, void *ctx_buf)
{
	struct raid5f_io_channel *r5ch = ctx_buf;
	struct raid5f_info *r5f_info = io_device;
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	int i;

	TAILQ_INIT(&r5ch->free_stripe_requests.write);
	TAILQ_INIT(&r5ch->xor_retry_queue);

	for (i = 0; i < RAID5F_MAX_STRIPES; i++) {
		struct stripe_request *stripe_req;

		stripe_req = raid5f_stripe_request_alloc(r5ch, STRIPE_REQ_WRITE);
		if (!stripe_req) {
			goto err;
		}

		TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests.write, stripe_req, link);
	}

	r5ch->accel_ch = spdk_accel_get_io_channel();
	if (!r5ch->accel_ch) {
		SPDK_ERRLOG("Failed to get accel framework's IO channel\n");
		goto err;
	}

	r5ch->chunk_xor_buffers = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_buffers));
	if (!r5ch->chunk_xor_buffers) {
		goto err;
	}

	r5ch->chunk_xor_iovs = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_iovs));
	if (!r5ch->chunk_xor_iovs) {
		goto err;
	}

	r5ch->chunk_xor_iovcnt = calloc(raid_bdev->num_base_bdevs, sizeof(*r5ch->chunk_xor_iovcnt));
	if (!r5ch->chunk_xor_iovcnt) {
		goto err;
	}

	return 0;
err:
	SPDK_ERRLOG("Failed to initialize io channel\n");
	raid5f_ioch_destroy(r5f_info, r5ch);
	return -ENOMEM;
}

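/*
 * Compute the array geometry at startup. Capacity ignores any excess blocks
 * beyond the last full stripe on the smallest base bdev. For example, 4 base
 * bdevs of 1000 blocks with a 64-block strip give total_stripes = 15,
 * stripe_blocks = 64 * 3 = 192, and an exposed blockcnt of 15 * 192 = 2880.
 */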
static int
raid5f_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;
	struct raid5f_info *r5f_info;
	size_t alignment = 0;

	r5f_info = calloc(1, sizeof(*r5f_info));
	if (!r5f_info) {
		SPDK_ERRLOG("Failed to allocate r5f_info\n");
		return -ENOMEM;
	}
	r5f_info->raid_bdev = raid_bdev;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev;

		base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);
		min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt);
		alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_bdev));
	}

	r5f_info->total_stripes = min_blockcnt / raid_bdev->strip_size;
	r5f_info->stripe_blocks = raid_bdev->strip_size * raid5f_stripe_data_chunks_num(raid_bdev);
	r5f_info->buf_alignment = alignment;

	raid_bdev->bdev.blockcnt = r5f_info->stripe_blocks * r5f_info->total_stripes;
	raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
	raid_bdev->bdev.split_on_optimal_io_boundary = true;
	raid_bdev->bdev.write_unit_size = r5f_info->stripe_blocks;
	raid_bdev->bdev.split_on_write_unit = true;

	raid_bdev->module_private = r5f_info;

	spdk_io_device_register(r5f_info, raid5f_ioch_create, raid5f_ioch_destroy,
				sizeof(struct raid5f_io_channel), NULL);

	return 0;
}

static void
raid5f_io_device_unregister_done(void *io_device)
{
	struct raid5f_info *r5f_info = io_device;

	raid_bdev_module_stop_done(r5f_info->raid_bdev);

	free(r5f_info);
}

static bool
raid5f_stop(struct raid_bdev *raid_bdev)
{
	struct raid5f_info *r5f_info = raid_bdev->module_private;

	spdk_io_device_unregister(r5f_info, raid5f_io_device_unregister_done);

	return false;
}

static struct spdk_io_channel *
raid5f_get_io_channel(struct raid_bdev *raid_bdev)
{
	struct raid5f_info *r5f_info = raid_bdev->module_private;

	return spdk_get_io_channel(r5f_info);
}

static struct raid_bdev_module g_raid5f_module = {
	.level = RAID5F,
	.base_bdevs_min = 3,
	.base_bdevs_constraint = {CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 1},
	.start = raid5f_start,
	.stop = raid5f_stop,
	.submit_rw_request = raid5f_submit_rw_request,
	.get_io_channel = raid5f_get_io_channel,
};
RAID_MODULE_REGISTER(&g_raid5f_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid5f)