/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2019 Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by the lower layers to notify the
 * raid module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to the bdev io submitted to the lower layers, i.e. the child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;
	int rc;

	if (success) {
		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
				  spdk_bdev_get_dif_type(bdev_io->bdev) != SPDK_DIF_DISABLE &&
				  bdev_io->bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {

			rc = raid_bdev_verify_dix_reftag(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
							 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.num_blocks, bdev_io->bdev,
							 bdev_io->u.bdev.offset_blocks);
			if (rc != 0) {
				SPDK_ERRLOG("Reftag verify failed.\n");
				raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
				return;
			}
		}

		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}

	spdk_bdev_free_io(bdev_io);
}

static void raid0_submit_rw_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}

/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * raid_io - pointer to the parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_ext_io_opts io_opts = {};
	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	uint64_t pd_strip;
	uint32_t offset_in_strip;
	uint64_t pd_lba;
	uint64_t pd_blocks;
	uint8_t pd_idx;
	int ret = 0;
	uint64_t start_strip;
	uint64_t end_strip;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	start_strip = raid_io->offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (raid_io->offset_blocks + raid_io->num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

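	/*
	 * Illustrative example (hypothetical values, not taken from this code):
	 * with strip_size = 8 blocks (strip_size_shift = 3) and
	 * num_base_bdevs = 4, an I/O starting at offset_blocks = 100 falls in
	 * strip 12 (100 >> 3). That strip maps to member disk
	 * pd_idx = 12 % 4 = 0 and is strip pd_strip = 12 / 4 = 3 on that disk,
	 * so pd_lba = (3 << 3) + (100 & 7) = 28 on the base bdev.
	 */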
	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = raid_io->offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = raid_io->num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit the child io to the bdev layer using the base bdev descriptor,
	 * the base bdev LBA, the child io length in blocks, the buffer, the
	 * completion function and the function callback context.
	 */
	assert(raid_ch != NULL);
	base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);

	io_opts.size = sizeof(io_opts);
	io_opts.memory_domain = raid_io->memory_domain;
	io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
	io_opts.metadata = raid_io->md_buf;

	if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
						 raid_io->iovs, raid_io->iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, &io_opts);
	} else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		struct spdk_bdev *bdev = &base_info->raid_bdev->bdev;

		if (spdk_unlikely(spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE &&
				  bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
			ret = raid_bdev_verify_dix_reftag(raid_io->iovs, raid_io->iovcnt, io_opts.metadata,
							  pd_blocks, bdev, raid_io->offset_blocks);
			if (ret != 0) {
				SPDK_ERRLOG("bdev io submit error due to DIX verify failure\n");
				raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
				return;
			}
		}

		ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
						  raid_io->iovs, raid_io->iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, &io_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", raid_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
					base_ch, _raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; this should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t strip_size;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;
	uint64_t start_offset_in_strip;
	uint64_t end_offset_in_strip;
	uint8_t start_disk;
	uint8_t end_disk;
	uint8_t n_disks_involved;
};
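
/*
 * Illustrative example (hypothetical values): for a raid0 bdev with
 * strip_size = 8 blocks (strip_size_shift = 3) and 3 base bdevs, an I/O at
 * offset_blocks = 10 with num_blocks = 30 covers strips 1..4. The resulting
 * range is: start_strip_in_disk = 0, end_strip_in_disk = 1,
 * start_offset_in_strip = 2, end_offset_in_strip = 7, start_disk = 1,
 * end_disk = 1 and n_disks_involved = 3.
 */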

static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t start_strip;
	uint64_t end_strip;
	uint64_t total_blocks;

	io_range->strip_size = strip_size;
	total_blocks = offset_blocks + num_blocks - (num_blocks > 0);

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = total_blocks >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may have an unaligned start LBA offset and the end
	 * strip may have an unaligned end LBA offset. Strips in between are
	 * fully covered, so their offsets and lengths are aligned to strip
	 * boundaries.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = total_blocks % strip_size;

	/* The base bdev indexes in which the start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It is 1 if the first strip and the last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}

static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;
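
	/*
	 * Continuing the illustrative example above (strip_size = 8, 3 base
	 * bdevs, I/O at offset 10 for 30 blocks), the split per member disk
	 * would be: disk 1 -> offset 2, 14 blocks; disk 2 -> offset 0,
	 * 8 blocks; disk 0 -> offset 8, 8 blocks; 30 blocks in total.
	 */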

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
		      ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}

static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests with a range but without a payload, like FLUSH and UNMAP, to the
 * member disks; it submits as many as possible unless a base io request fails
 * with -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to the parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct raid_bdev *raid_bdev;
	struct raid_bdev_io_range io_range;
	int ret;
	struct raid_base_bdev_info *base_info;
	struct spdk_io_channel *base_ch;

	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    raid_io->offset_blocks, raid_io->num_blocks);

	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

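	/*
	 * base_bdev_io_submitted tracks how many base bdev I/Os have been
	 * submitted for this request so far. If the request is re-queued after
	 * an -ENOMEM error below, submission resumes from the next member disk
	 * instead of starting over.
	 */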
	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* The base bdevs are submitted to in order from start_disk to
		 * end_disk. Note that the index of start_disk may be larger
		 * than the index of end_disk, in which case the range wraps
		 * around to the first base bdev.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, disk_idx);

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (raid_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = raid_bdev_unmap_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = raid_bdev_flush_blocks(base_info, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", raid_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
						base_ch, _raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; this should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	uint64_t base_bdev_data_size;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate the minimum block count across all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
	}

	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	/*
	 * Take the minimum block count based approach, where the total block
	 * count of the raid bdev is the number of base bdevs times the minimum
	 * block count of any base bdev, rounded down to a strip boundary.
	 */
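	/*
	 * Illustrative example (hypothetical values): with strip_size = 64
	 * blocks (strip_size_shift = 6), min_blockcnt = 1000 and 4 base bdevs,
	 * base_bdev_data_size = (1000 >> 6) << 6 = 960 and the raid bdev
	 * exposes 960 * 4 = 3840 blocks.
	 */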
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);

	raid_bdev->bdev.blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* There is no need to split reads/writes on a single-bdev RAID module. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

static bool
raid0_resize(struct raid_bdev *raid_bdev)
{
	uint64_t blockcnt;
	int rc;
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;
	uint64_t base_bdev_data_size;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);

		min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt - base_info->data_offset);
	}

	base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
	blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;

	if (blockcnt == raid_bdev->bdev.blockcnt) {
		return false;
	}

	rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to notify blockcount change\n");
		return false;
	}

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		base_info->data_size = base_bdev_data_size;
	}

	return true;
}

static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.memory_domains_supported = true,
	.dif_supported = true,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
	.resize = raid0_resize,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)