xref: /spdk/module/bdev/uring/bdev_uring.c (revision 9889ab2dc80e40dae92dcef361d53dcba722043d)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_uring.h"
35 
36 #include "spdk/stdinc.h"
37 
38 #include "spdk/barrier.h"
39 #include "spdk/bdev.h"
40 #include "spdk/conf.h"
41 #include "spdk/env.h"
42 #include "spdk/fd.h"
43 #include "spdk/likely.h"
44 #include "spdk/thread.h"
45 #include "spdk/json.h"
46 #include "spdk/util.h"
47 #include "spdk/string.h"
48 
49 #include "spdk_internal/log.h"
50 
51 #include <liburing.h>
52 
/*
 * Context buffer of a uring bdev's I/O channel (its size is passed to
 * spdk_io_device_register() in create_uring_bdev()).  It only records the
 * group channel obtained in bdev_uring_create_cb().
 */
struct bdev_uring_io_channel {
	struct bdev_uring_group_channel	*group_ch;
};
56 
57 struct bdev_uring_group_channel {
58 	uint64_t				io_inflight;
59 	uint64_t				io_pending;
60 	struct spdk_poller			*poller;
61 	struct io_uring				uring;
62 };
63 
/*
 * Per-I/O driver context (its size is reported by bdev_uring_get_ctx_size()).
 * A pointer to it is attached to the SQE and recovered from the CQE.
 */
struct bdev_uring_task {
	/* Requested transfer size in bytes; compared with cqe->res on completion. */
	uint64_t			len;
	/* Channel the request was queued on; used to reach the group counters. */
	struct bdev_uring_io_channel	*ch;
	/* List linkage; not referenced by the code visible in this file. */
	TAILQ_ENTRY(bdev_uring_task)	link;
};
69 
70 struct bdev_uring {
71 	struct spdk_bdev	bdev;
72 	char			*filename;
73 	int			fd;
74 	TAILQ_ENTRY(bdev_uring)  link;
75 };
76 
/* Forward declarations for functions that are referenced before they are defined. */
static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);

/* Every bdev created by create_uring_bdev() is linked here until it is destructed. */
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head;

/* Number of entries requested from io_uring_queue_init() for each group channel. */
#define SPDK_URING_QUEUE_DEPTH 512
/* NOTE(review): not referenced by the code visible in this file. */
#define MAX_EVENTS_PER_POLL 32
84 
85 static int
86 bdev_uring_get_ctx_size(void)
87 {
88 	return sizeof(struct bdev_uring_task);
89 }
90 
91 static struct spdk_bdev_module uring_if = {
92 	.name		= "uring",
93 	.module_init	= bdev_uring_init,
94 	.module_fini	= bdev_uring_fini,
95 	.config_text	= NULL,
96 	.get_ctx_size	= bdev_uring_get_ctx_size,
97 };
98 
99 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
100 
101 static int
102 bdev_uring_open(struct bdev_uring *bdev)
103 {
104 	int fd;
105 
106 	fd = open(bdev->filename, O_NOATIME | O_DIRECT);
107 	if (fd < 0) {
108 		SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
109 			    bdev->filename, errno, spdk_strerror(errno));
110 		bdev->fd = -1;
111 		return -1;
112 	}
113 
114 	bdev->fd = fd;
115 
116 	return 0;
117 }
118 
119 static int
120 bdev_uring_close(struct bdev_uring *bdev)
121 {
122 	int rc;
123 
124 	if (bdev->fd == -1) {
125 		return 0;
126 	}
127 
128 	rc = close(bdev->fd);
129 	if (rc < 0) {
130 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
131 			    bdev->fd, errno, spdk_strerror(errno));
132 		return -1;
133 	}
134 
135 	bdev->fd = -1;
136 
137 	return 0;
138 }
139 
140 static int64_t
141 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
142 		 struct bdev_uring_task *uring_task,
143 		 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
144 {
145 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
146 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
147 	struct io_uring_sqe *sqe;
148 
149 	sqe = io_uring_get_sqe(&group_ch->uring);
150 	io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
151 	io_uring_sqe_set_data(sqe, uring_task);
152 	uring_task->len = nbytes;
153 	uring_task->ch = uring_ch;
154 
155 	SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n",
156 		      iovcnt, nbytes, offset);
157 
158 	group_ch->io_pending++;
159 	return nbytes;
160 }
161 
162 static int64_t
163 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
164 		  struct bdev_uring_task *uring_task,
165 		  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
166 {
167 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
168 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
169 	struct io_uring_sqe *sqe;
170 
171 	sqe = io_uring_get_sqe(&group_ch->uring);
172 	io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
173 	io_uring_sqe_set_data(sqe, uring_task);
174 	uring_task->len = nbytes;
175 	uring_task->ch = uring_ch;
176 
177 	SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n",
178 		      iovcnt, nbytes, offset);
179 
180 	group_ch->io_pending++;
181 	return nbytes;
182 }
183 
184 static int
185 bdev_uring_destruct(void *ctx)
186 {
187 	struct bdev_uring *uring = ctx;
188 	int rc = 0;
189 
190 	TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
191 	rc = bdev_uring_close(uring);
192 	if (rc < 0) {
193 		SPDK_ERRLOG("bdev_uring_close() failed\n");
194 	}
195 	spdk_io_device_unregister(uring, NULL);
196 	uring_free_bdev(uring);
197 	return rc;
198 }
199 
200 static int
201 bdev_uring_reap(struct io_uring *ring, int max)
202 {
203 	int i, count, ret;
204 	struct io_uring_cqe *cqe;
205 	struct bdev_uring_task *uring_task;
206 	enum spdk_bdev_io_status status;
207 
208 	count = 0;
209 	for (i = 0; i < max; i++) {
210 		ret = io_uring_peek_cqe(ring, &cqe);
211 		if (ret != 0) {
212 			return ret;
213 		}
214 
215 		if (cqe == NULL) {
216 			return count;
217 		}
218 
219 		uring_task = (struct bdev_uring_task *)cqe->user_data;
220 		if (cqe->res != (signed)uring_task->len) {
221 			status = SPDK_BDEV_IO_STATUS_FAILED;
222 		} else {
223 			status = SPDK_BDEV_IO_STATUS_SUCCESS;
224 		}
225 
226 		uring_task->ch->group_ch->io_inflight--;
227 		io_uring_cqe_seen(ring, cqe);
228 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
229 		count++;
230 	}
231 
232 	return count;
233 }
234 
235 static int
236 bdev_uring_group_poll(void *arg)
237 {
238 	struct bdev_uring_group_channel *group_ch = arg;
239 	int to_complete, to_submit;
240 	int count, ret;
241 
242 	to_submit = group_ch->io_pending;
243 	to_complete = group_ch->io_inflight;
244 
245 	ret = 0;
246 	if (to_submit > 0) {
247 		/* If there are I/O to submit, use io_uring_submit here.
248 		 * It will automatically call io_uring_enter appropriately. */
249 		ret = io_uring_submit(&group_ch->uring);
250 		group_ch->io_pending = 0;
251 		group_ch->io_inflight += to_submit;
252 	} else if (to_complete > 0) {
253 		/* If there are I/O in flight but none to submit, we need to
254 		 * call io_uring_enter ourselves. */
255 		ret = io_uring_enter(group_ch->uring.ring_fd, 0, 0,
256 				     IORING_ENTER_GETEVENTS, NULL);
257 	}
258 
259 	if (ret < 0) {
260 		return 1;
261 	}
262 
263 	count = 0;
264 	if (to_complete > 0) {
265 		count = bdev_uring_reap(&group_ch->uring, to_complete);
266 	}
267 
268 	return (count + to_submit);
269 }
270 
271 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
272 				  bool success)
273 {
274 	if (!success) {
275 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
276 		return;
277 	}
278 
279 	switch (bdev_io->type) {
280 	case SPDK_BDEV_IO_TYPE_READ:
281 		bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
282 				 ch,
283 				 (struct bdev_uring_task *)bdev_io->driver_ctx,
284 				 bdev_io->u.bdev.iovs,
285 				 bdev_io->u.bdev.iovcnt,
286 				 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
287 				 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
288 		break;
289 	case SPDK_BDEV_IO_TYPE_WRITE:
290 		bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
291 				  ch,
292 				  (struct bdev_uring_task *)bdev_io->driver_ctx,
293 				  bdev_io->u.bdev.iovs,
294 				  bdev_io->u.bdev.iovcnt,
295 				  bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
296 				  bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
297 		break;
298 	default:
299 		SPDK_ERRLOG("Wrong io type\n");
300 		break;
301 	}
302 }
303 
304 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
305 {
306 	switch (bdev_io->type) {
307 	/* Read and write operations must be performed on buffers aligned to
308 	 * bdev->required_alignment. If user specified unaligned buffers,
309 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
310 	case SPDK_BDEV_IO_TYPE_READ:
311 	case SPDK_BDEV_IO_TYPE_WRITE:
312 		spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
313 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
314 		return 0;
315 	default:
316 		return -1;
317 	}
318 }
319 
320 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
321 {
322 	if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
323 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
324 	}
325 }
326 
327 static bool
328 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
329 {
330 	switch (io_type) {
331 	case SPDK_BDEV_IO_TYPE_READ:
332 	case SPDK_BDEV_IO_TYPE_WRITE:
333 		return true;
334 	default:
335 		return false;
336 	}
337 }
338 
339 static int
340 bdev_uring_create_cb(void *io_device, void *ctx_buf)
341 {
342 	struct bdev_uring_io_channel *ch = ctx_buf;
343 
344 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));
345 
346 	return 0;
347 }
348 
349 static void
350 bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
351 {
352 	struct bdev_uring_io_channel *ch = ctx_buf;
353 
354 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
355 }
356 
/*
 * fn_table get_io_channel callback.  The struct bdev_uring itself is the
 * io_device key registered in create_uring_bdev().
 */
static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	struct bdev_uring *uring_bdev = ctx;
	struct spdk_io_channel *io_ch = spdk_get_io_channel(uring_bdev);

	return io_ch;
}
364 
365 
366 static const struct spdk_bdev_fn_table uring_fn_table = {
367 	.destruct		= bdev_uring_destruct,
368 	.submit_request		= bdev_uring_submit_request,
369 	.io_type_supported	= bdev_uring_io_type_supported,
370 	.get_io_channel		= bdev_uring_get_io_channel,
371 };
372 
373 static void uring_free_bdev(struct bdev_uring *uring)
374 {
375 	if (uring == NULL) {
376 		return;
377 	}
378 	free(uring->filename);
379 	free(uring->bdev.name);
380 	free(uring);
381 }
382 
383 static int
384 bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
385 {
386 	struct bdev_uring_group_channel *ch = ctx_buf;
387 
388 	if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) {
389 		SPDK_ERRLOG("uring I/O context setup failure\n");
390 		return -1;
391 	}
392 
393 	ch->poller = spdk_poller_register(bdev_uring_group_poll, ch, 0);
394 	return 0;
395 }
396 
397 static void
398 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
399 {
400 	struct bdev_uring_group_channel *ch = ctx_buf;
401 
402 	close(ch->uring.ring_fd);
403 	io_uring_queue_exit(&ch->uring);
404 
405 	spdk_poller_unregister(&ch->poller);
406 }
407 
408 struct spdk_bdev *
409 create_uring_bdev(const char *name, const char *filename)
410 {
411 	struct bdev_uring *uring;
412 	uint32_t block_size;
413 	uint64_t bdev_size;
414 	int rc;
415 
416 	uring = calloc(1, sizeof(*uring));
417 	if (!uring) {
418 		SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
419 		return NULL;
420 	}
421 
422 	uring->filename = strdup(filename);
423 	if (!uring->filename) {
424 		goto error_return;
425 	}
426 
427 	if (bdev_uring_open(uring)) {
428 		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno);
429 		goto error_return;
430 	}
431 
432 	bdev_size = spdk_fd_get_size(uring->fd);
433 
434 	uring->bdev.name = strdup(name);
435 	if (!uring->bdev.name) {
436 		goto error_return;
437 	}
438 	uring->bdev.product_name = "URING bdev";
439 	uring->bdev.module = &uring_if;
440 
441 	uring->bdev.write_cache = 1;
442 
443 	block_size = spdk_fd_get_blocklen(uring->fd);
444 	if (block_size == 0) {
445 		SPDK_ERRLOG("Block size could not be auto-detected\n");
446 		goto error_return;
447 	}
448 
449 	if (block_size < 512) {
450 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
451 		goto error_return;
452 	}
453 
454 	if (!spdk_u32_is_pow2(block_size)) {
455 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
456 		goto error_return;
457 	}
458 
459 	uring->bdev.blocklen = block_size;
460 	uring->bdev.required_alignment = spdk_u32log2(block_size);
461 
462 	if (bdev_size % uring->bdev.blocklen != 0) {
463 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
464 			    bdev_size, uring->bdev.blocklen);
465 		goto error_return;
466 	}
467 
468 	uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
469 	uring->bdev.ctxt = uring;
470 
471 	uring->bdev.fn_table = &uring_fn_table;
472 
473 	spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
474 				sizeof(struct bdev_uring_io_channel),
475 				uring->bdev.name);
476 	rc = spdk_bdev_register(&uring->bdev);
477 	if (rc) {
478 		spdk_io_device_unregister(uring, NULL);
479 		goto error_return;
480 	}
481 
482 	TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
483 	return &uring->bdev;
484 
485 error_return:
486 	bdev_uring_close(uring);
487 	uring_free_bdev(uring);
488 	return NULL;
489 }
490 
491 struct delete_uring_bdev_ctx {
492 	spdk_delete_uring_complete cb_fn;
493 	void *cb_arg;
494 };
495 
496 static void
497 uring_bdev_unregister_cb(void *arg, int bdeverrno)
498 {
499 	struct delete_uring_bdev_ctx *ctx = arg;
500 
501 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
502 	free(ctx);
503 }
504 
505 void
506 delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg)
507 {
508 	struct delete_uring_bdev_ctx *ctx;
509 
510 	if (!bdev || bdev->module != &uring_if) {
511 		cb_fn(cb_arg, -ENODEV);
512 		return;
513 	}
514 
515 	ctx = calloc(1, sizeof(*ctx));
516 	if (ctx == NULL) {
517 		cb_fn(cb_arg, -ENOMEM);
518 		return;
519 	}
520 
521 	ctx->cb_fn = cb_fn;
522 	ctx->cb_arg = cb_arg;
523 	spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx);
524 }
525 
526 static int
527 bdev_uring_init(void)
528 {
529 	size_t i;
530 	struct spdk_conf_section *sp;
531 	struct spdk_bdev *bdev;
532 
533 	TAILQ_INIT(&g_uring_bdev_head);
534 	spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
535 				sizeof(struct bdev_uring_group_channel),
536 				"uring_module");
537 
538 	sp = spdk_conf_find_section(NULL, "URING");
539 	if (!sp) {
540 		return 0;
541 	}
542 
543 	i = 0;
544 	while (true) {
545 		const char *file;
546 		const char *name;
547 
548 		file = spdk_conf_section_get_nmval(sp, "URING", i, 0);
549 		if (!file) {
550 			break;
551 		}
552 
553 		name = spdk_conf_section_get_nmval(sp, "URING", i, 1);
554 		if (!name) {
555 			SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file);
556 			i++;
557 			continue;
558 		}
559 
560 		bdev = create_uring_bdev(name, file);
561 		if (!bdev) {
562 			SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file);
563 			i++;
564 			continue;
565 		}
566 
567 		i++;
568 	}
569 
570 	return 0;
571 }
572 
573 static void
574 bdev_uring_fini(void)
575 {
576 	spdk_io_device_unregister(&uring_if, NULL);
577 }
578 
579 SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING)
580