xref: /spdk/module/bdev/raid/raid0.c (revision 7506a7aa53d239f533af3bc768f0d2af55e735fe)
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "bdev_raid.h"

#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/log.h"

/*
 * brief:
 * raid0_bdev_io_completion function is called by lower layers to notify the raid
 * module that a particular bdev_io is completed.
 * params:
 * bdev_io - pointer to bdev io submitted to lower layers, like child io
 * success - bdev_io status
 * cb_arg - function callback context (parent raid_bdev_io)
 * returns:
 * none
 */
static void
raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	} else {
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io);

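/*
 * Plain callback wrapper used to re-drive a read/write request after it was
 * queued with raid_bdev_queue_io_wait() on -ENOMEM (see below).
 */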
static void
_raid0_submit_rw_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_rw_request(raid_io);
}

/*
 * brief:
 * raid0_submit_rw_request function is used to submit I/O to the correct
 * member disk for raid0 bdevs.
 * params:
 * raid_io - pointer to parent raid_bdev_io
 * returns:
 * none
 */
static void
raid0_submit_rw_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io = spdk_bdev_io_from_ctx(raid_io);
	struct raid_bdev_io_channel	*raid_ch = raid_io->raid_ch;
	struct raid_bdev		*raid_bdev = raid_io->raid_bdev;
	uint64_t			pd_strip;
	uint32_t			offset_in_strip;
	uint64_t			pd_lba;
	uint64_t			pd_blocks;
	uint8_t				pd_idx;
	int				ret = 0;
	uint64_t			start_strip;
	uint64_t			end_strip;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
		    raid_bdev->strip_size_shift;
	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
		assert(false);
		SPDK_ERRLOG("I/O spans strip boundary!\n");
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

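	/*
	 * Map the raid0 strip onto a member disk: strips are distributed round-robin,
	 * so strip N lives on base bdev (N % num_base_bdevs) at disk-local strip
	 * (N / num_base_bdevs). For example, with 4 base bdevs and strip_size 64
	 * (strip_size_shift 6), offset_blocks 300 gives start_strip 4, so pd_idx 0,
	 * pd_strip 1, offset_in_strip 44 and pd_lba 108.
	 */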
	pd_strip = start_strip / raid_bdev->num_base_bdevs;
	pd_idx = start_strip % raid_bdev->num_base_bdevs;
	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
	pd_blocks = bdev_io->u.bdev.num_blocks;
	base_info = &raid_bdev->base_bdev_info[pd_idx];
	if (base_info->desc == NULL) {
		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
		assert(0);
	}

	/*
	 * Submit child io to the bdev layer using the base bdev descriptor, base
	 * bdev lba, base bdev child io length in blocks, buffer, completion
	 * function and function callback context
	 */
	assert(raid_ch != NULL);
	assert(raid_ch->base_channel);
	base_ch = raid_ch->base_channel[pd_idx];
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch,
						 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						 pd_lba, pd_blocks, raid0_bdev_io_completion,
						 raid_io, bdev_io->u.bdev.ext_opts);
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch,
						  bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						  pd_lba, pd_blocks, raid0_bdev_io_completion,
						  raid_io, bdev_io->u.bdev.ext_opts);
	} else {
		SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
		assert(0);
	}

	if (ret == -ENOMEM) {
		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
					_raid0_submit_rw_request);
	} else if (ret != 0) {
		SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
		assert(false);
		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

/* raid0 IO range */
struct raid_bdev_io_range {
	uint64_t	strip_size;
	uint64_t	start_strip_in_disk;
	uint64_t	end_strip_in_disk;
	uint64_t	start_offset_in_strip;
	uint64_t	end_offset_in_strip;
	uint8_t		start_disk;
	uint8_t		end_disk;
	uint8_t		n_disks_involved;
};

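/*
 * brief:
 * _raid0_get_io_range computes which member disks and which strip ranges a
 * null-payload request touches. For example, with 3 base bdevs, strip_size 8
 * (shift 3), offset_blocks 10 and num_blocks 30, the request covers strips 1-4,
 * so start_disk is 1, end_disk is 1 and n_disks_involved is 3.
 * params:
 * io_range - output range descriptor
 * num_base_bdevs, strip_size, strip_size_shift - raid0 geometry
 * offset_blocks, num_blocks - parent request range in raid bdev blocks
 * returns:
 * none
 */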
static inline void
_raid0_get_io_range(struct raid_bdev_io_range *io_range,
		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
		    uint64_t offset_blocks, uint64_t num_blocks)
{
	uint64_t	start_strip;
	uint64_t	end_strip;

	io_range->strip_size = strip_size;

	/* The start and end strip index in raid0 bdev scope */
	start_strip = offset_blocks >> strip_size_shift;
	end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
	io_range->start_strip_in_disk = start_strip / num_base_bdevs;
	io_range->end_strip_in_disk = end_strip / num_base_bdevs;

	/* The first strip may start at an unaligned LBA offset and the last strip
	 * may end at an unaligned LBA offset. Strips between them are covered
	 * completely, aligned to strip boundaries in both offset and length.
	 */
	io_range->start_offset_in_strip = offset_blocks % strip_size;
	io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;

	/* The base bdev indexes in which start and end strips are located */
	io_range->start_disk = start_strip % num_base_bdevs;
	io_range->end_disk = end_strip % num_base_bdevs;

	/* Calculate how many base_bdevs are involved in the io operation.
	 * The number of base bdevs involved is between 1 and num_base_bdevs.
	 * It will be 1 if the first strip and last strip are the same one.
	 */
	io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
}

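/*
 * brief:
 * _raid0_split_io_range derives the disk-local offset and block count that one
 * member disk contributes to an io_range produced by _raid0_get_io_range.
 * Continuing the example above, disk_idx 2 holds only strip 2 of the range, so
 * it gets offset_in_disk 0 and nblocks_in_disk 8.
 * params:
 * io_range - range descriptor filled by _raid0_get_io_range
 * disk_idx - index of the member disk to split for
 * _offset_in_disk, _nblocks_in_disk - output disk-local range
 * returns:
 * none
 */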
static inline void
_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
{
	uint64_t n_strips_in_disk;
	uint64_t start_offset_in_disk;
	uint64_t end_offset_in_disk;
	uint64_t offset_in_disk;
	uint64_t nblocks_in_disk;
	uint64_t start_strip_in_disk;
	uint64_t end_strip_in_disk;

	start_strip_in_disk = io_range->start_strip_in_disk;
	if (disk_idx < io_range->start_disk) {
		start_strip_in_disk += 1;
	}

	end_strip_in_disk = io_range->end_strip_in_disk;
	if (disk_idx > io_range->end_disk) {
		end_strip_in_disk -= 1;
	}

	assert(end_strip_in_disk >= start_strip_in_disk);
	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;

	if (disk_idx == io_range->start_disk) {
		start_offset_in_disk = io_range->start_offset_in_strip;
	} else {
		start_offset_in_disk = 0;
	}

	if (disk_idx == io_range->end_disk) {
		end_offset_in_disk = io_range->end_offset_in_strip;
	} else {
		end_offset_in_disk = io_range->strip_size - 1;
	}

	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
			  + end_offset_in_disk - start_offset_in_disk + 1;

	SPDK_DEBUGLOG(bdev_raid0,
		      "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
		      ").\n",
		      io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);

	*_offset_in_disk = offset_in_disk;
	*_nblocks_in_disk = nblocks_in_disk;
}

static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

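/*
 * Retry wrapper, analogous to _raid0_submit_rw_request: re-drives the request
 * after it was queued with raid_bdev_queue_io_wait() on -ENOMEM.
 */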
static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	struct raid_bdev_io *raid_io = _raid_io;

	raid0_submit_null_payload_request(raid_io);
}

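/*
 * brief:
 * raid0_base_io_complete is the completion callback for the per-disk IOs
 * submitted by raid0_submit_null_payload_request; it reports one completed
 * part to the parent raid_io, which finishes once every involved disk has
 * completed.
 * params:
 * bdev_io - pointer to child bdev_io
 * success - child io status
 * cb_arg - parent raid_bdev_io
 * returns:
 * none
 */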
static void
raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct raid_bdev_io *raid_io = cb_arg;

	raid_bdev_io_complete_part(raid_io, 1, success ?
				   SPDK_BDEV_IO_STATUS_SUCCESS :
				   SPDK_BDEV_IO_STATUS_FAILED);

	spdk_bdev_free_io(bdev_io);
}

/*
 * brief:
 * raid0_submit_null_payload_request function submits the next batch of
 * io requests with a range but without payload, like FLUSH and UNMAP, to the
 * member disks; it submits as many as possible unless a base io request fails
 * with -ENOMEM, in which case it queues itself for later submission.
 * params:
 * raid_io - pointer to parent raid_bdev_io on the raid bdev device
 * returns:
 * none
 */
static void
raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
{
	struct spdk_bdev_io		*bdev_io;
	struct raid_bdev		*raid_bdev;
	struct raid_bdev_io_range	io_range;
	int				ret;
	struct raid_base_bdev_info	*base_info;
	struct spdk_io_channel		*base_ch;

	bdev_io = spdk_bdev_io_from_ctx(raid_io);
	raid_bdev = raid_io->raid_bdev;

	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);

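	/*
	 * This function may be re-entered after an -ENOMEM retry, so only initialize
	 * base_bdev_io_remaining on the first pass; base_bdev_io_submitted keeps
	 * track of how many per-disk IOs have already been issued.
	 */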
	if (raid_io->base_bdev_io_remaining == 0) {
		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
	}

	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
		uint8_t disk_idx;
		uint64_t offset_in_disk;
		uint64_t nblocks_in_disk;

		/* Member disks are handled from start_disk to end_disk, wrapping around
		 * the array, so start_disk's index may be larger than end_disk's.
		 */
		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
		base_info = &raid_bdev->base_bdev_info[disk_idx];
		base_ch = raid_io->raid_ch->base_channel[disk_idx];

		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);

		switch (bdev_io->type) {
		case SPDK_BDEV_IO_TYPE_UNMAP:
			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		case SPDK_BDEV_IO_TYPE_FLUSH:
			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
						     offset_in_disk, nblocks_in_disk,
						     raid0_base_io_complete, raid_io);
			break;

		default:
			SPDK_ERRLOG("submit request: invalid io type with null payload %u\n", bdev_io->type);
			assert(false);
			ret = -EIO;
		}

		if (ret == 0) {
			raid_io->base_bdev_io_submitted++;
		} else if (ret == -ENOMEM) {
			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
						_raid0_submit_null_payload_request);
			return;
		} else {
			SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
			assert(false);
			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
			return;
		}
	}
}

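/*
 * brief:
 * raid0_start is called when the raid0 bdev is being constructed; it sizes the
 * raid bdev from the smallest base bdev and configures IO splitting.
 * params:
 * raid_bdev - pointer to raid bdev
 * returns:
 * 0 - success
 */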
static int
raid0_start(struct raid_bdev *raid_bdev)
{
	uint64_t min_blockcnt = UINT64_MAX;
	struct raid_base_bdev_info *base_info;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		/* Calculate minimum block count from all base bdevs */
		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
	}

	/*
	 * The total block count of the raid bdev is the number of base bdevs times
	 * the minimum block count of any base bdev, rounded down to a multiple of
	 * the strip size so that every strip is fully backed.
	 */
	SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
		      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
	raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
				    raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;

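	/*
	 * Let the generic bdev layer split reads/writes at strip boundaries so that
	 * raid0_submit_rw_request never sees an IO spanning two strips (see the
	 * start_strip != end_strip check there).
	 */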
	if (raid_bdev->num_base_bdevs > 1) {
		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
		raid_bdev->bdev.split_on_optimal_io_boundary = true;
	} else {
		/* There is no need to split reads/writes on a single-bdev RAID. */
		raid_bdev->bdev.optimal_io_boundary = 0;
		raid_bdev->bdev.split_on_optimal_io_boundary = false;
	}

	return 0;
}

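/*
 * raid_bdev_module descriptor for RAID0: a single base bdev is enough,
 * raid0_start sizes the array, and the submit callbacks route IO to the
 * member disks.
 */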
static struct raid_bdev_module g_raid0_module = {
	.level = RAID0,
	.base_bdevs_min = 1,
	.start = raid0_start,
	.submit_rw_request = raid0_submit_rw_request,
	.submit_null_payload_request = raid0_submit_null_payload_request,
};
RAID_MODULE_REGISTER(&g_raid0_module)

SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)