xref: /spdk/module/bdev/uring/bdev_uring.c (revision 7506a7aa53d239f533af3bc768f0d2af55e735fe)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "bdev_uring.h"
35 
36 #include "spdk/stdinc.h"
37 
38 #include "spdk/barrier.h"
39 #include "spdk/bdev.h"
40 #include "spdk/env.h"
41 #include "spdk/fd.h"
42 #include "spdk/likely.h"
43 #include "spdk/thread.h"
44 #include "spdk/json.h"
45 #include "spdk/util.h"
46 #include "spdk/string.h"
47 
48 #include "spdk/log.h"
49 #include "spdk_internal/uring.h"
50 
/* Per-bdev I/O channel context. It only records which group channel
 * (the one owning the io_uring) this channel submits through. */
struct bdev_uring_io_channel {
	struct bdev_uring_group_channel *group_ch;
};
54 
55 struct bdev_uring_group_channel {
56 	uint64_t				io_inflight;
57 	uint64_t				io_pending;
58 	struct spdk_poller			*poller;
59 	struct io_uring				uring;
60 };
61 
/* Per-I/O driver context; its size is what bdev_uring_get_ctx_size() reports. */
struct bdev_uring_task {
	/* Requested transfer size in bytes, compared with cqe->res on completion. */
	uint64_t len;
	/* Channel the request was queued on. */
	struct bdev_uring_io_channel *ch;
	/* List linkage; no user of it is visible in this file. */
	TAILQ_ENTRY(bdev_uring_task) link;
};
67 
68 struct bdev_uring {
69 	struct spdk_bdev	bdev;
70 	char			*filename;
71 	int			fd;
72 	TAILQ_ENTRY(bdev_uring)  link;
73 };
74 
/* Number of entries requested from io_uring_queue_init() for each group channel. */
#define SPDK_URING_QUEUE_DEPTH 512
/* Not referenced by the code visible in this file. */
#define MAX_EVENTS_PER_POLL 32

/* Defined further down; declared here because they are used before that point. */
static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);

/* Every bdev created by create_uring_bdev() is linked here until it is destructed. */
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head);
82 
83 static int
84 bdev_uring_get_ctx_size(void)
85 {
86 	return sizeof(struct bdev_uring_task);
87 }
88 
89 static struct spdk_bdev_module uring_if = {
90 	.name		= "uring",
91 	.module_init	= bdev_uring_init,
92 	.module_fini	= bdev_uring_fini,
93 	.get_ctx_size	= bdev_uring_get_ctx_size,
94 };
95 
96 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
97 
98 static int
99 bdev_uring_open(struct bdev_uring *bdev)
100 {
101 	int fd;
102 
103 	fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
104 	if (fd < 0) {
105 		/* Try without O_DIRECT for non-disk files */
106 		fd = open(bdev->filename, O_RDWR | O_NOATIME);
107 		if (fd < 0) {
108 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
109 				    bdev->filename, errno, spdk_strerror(errno));
110 			bdev->fd = -1;
111 			return -1;
112 		}
113 	}
114 
115 	bdev->fd = fd;
116 
117 	return 0;
118 }
119 
120 static int
121 bdev_uring_close(struct bdev_uring *bdev)
122 {
123 	int rc;
124 
125 	if (bdev->fd == -1) {
126 		return 0;
127 	}
128 
129 	rc = close(bdev->fd);
130 	if (rc < 0) {
131 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
132 			    bdev->fd, errno, spdk_strerror(errno));
133 		return -1;
134 	}
135 
136 	bdev->fd = -1;
137 
138 	return 0;
139 }
140 
141 static int64_t
142 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
143 		 struct bdev_uring_task *uring_task,
144 		 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
145 {
146 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
147 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
148 	struct io_uring_sqe *sqe;
149 
150 	sqe = io_uring_get_sqe(&group_ch->uring);
151 	io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
152 	io_uring_sqe_set_data(sqe, uring_task);
153 	uring_task->len = nbytes;
154 	uring_task->ch = uring_ch;
155 
156 	SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n",
157 		      iovcnt, nbytes, offset);
158 
159 	group_ch->io_pending++;
160 	return nbytes;
161 }
162 
163 static int64_t
164 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
165 		  struct bdev_uring_task *uring_task,
166 		  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
167 {
168 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
169 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
170 	struct io_uring_sqe *sqe;
171 
172 	sqe = io_uring_get_sqe(&group_ch->uring);
173 	io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
174 	io_uring_sqe_set_data(sqe, uring_task);
175 	uring_task->len = nbytes;
176 	uring_task->ch = uring_ch;
177 
178 	SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n",
179 		      iovcnt, nbytes, offset);
180 
181 	group_ch->io_pending++;
182 	return nbytes;
183 }
184 
185 static int
186 bdev_uring_destruct(void *ctx)
187 {
188 	struct bdev_uring *uring = ctx;
189 	int rc = 0;
190 
191 	TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
192 	rc = bdev_uring_close(uring);
193 	if (rc < 0) {
194 		SPDK_ERRLOG("bdev_uring_close() failed\n");
195 	}
196 	spdk_io_device_unregister(uring, NULL);
197 	uring_free_bdev(uring);
198 	return rc;
199 }
200 
201 static int
202 bdev_uring_reap(struct io_uring *ring, int max)
203 {
204 	int i, count, ret;
205 	struct io_uring_cqe *cqe;
206 	struct bdev_uring_task *uring_task;
207 	enum spdk_bdev_io_status status;
208 
209 	count = 0;
210 	for (i = 0; i < max; i++) {
211 		ret = io_uring_peek_cqe(ring, &cqe);
212 		if (ret != 0) {
213 			return ret;
214 		}
215 
216 		if (cqe == NULL) {
217 			return count;
218 		}
219 
220 		uring_task = (struct bdev_uring_task *)cqe->user_data;
221 		if (cqe->res != (signed)uring_task->len) {
222 			status = SPDK_BDEV_IO_STATUS_FAILED;
223 		} else {
224 			status = SPDK_BDEV_IO_STATUS_SUCCESS;
225 		}
226 
227 		uring_task->ch->group_ch->io_inflight--;
228 		io_uring_cqe_seen(ring, cqe);
229 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
230 		count++;
231 	}
232 
233 	return count;
234 }
235 
236 static int
237 bdev_uring_group_poll(void *arg)
238 {
239 	struct bdev_uring_group_channel *group_ch = arg;
240 	int to_complete, to_submit;
241 	int count, ret;
242 
243 	to_submit = group_ch->io_pending;
244 
245 	if (to_submit > 0) {
246 		/* If there are I/O to submit, use io_uring_submit here.
247 		 * It will automatically call spdk_io_uring_enter appropriately. */
248 		ret = io_uring_submit(&group_ch->uring);
249 		if (ret < 0) {
250 			return SPDK_POLLER_BUSY;
251 		}
252 
253 		group_ch->io_pending = 0;
254 		group_ch->io_inflight += to_submit;
255 	}
256 
257 	to_complete = group_ch->io_inflight;
258 	count = 0;
259 	if (to_complete > 0) {
260 		count = bdev_uring_reap(&group_ch->uring, to_complete);
261 	}
262 
263 	if (count + to_submit > 0) {
264 		return SPDK_POLLER_BUSY;
265 	} else {
266 		return SPDK_POLLER_IDLE;
267 	}
268 }
269 
270 static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
271 				  bool success)
272 {
273 	if (!success) {
274 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
275 		return;
276 	}
277 
278 	switch (bdev_io->type) {
279 	case SPDK_BDEV_IO_TYPE_READ:
280 		bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
281 				 ch,
282 				 (struct bdev_uring_task *)bdev_io->driver_ctx,
283 				 bdev_io->u.bdev.iovs,
284 				 bdev_io->u.bdev.iovcnt,
285 				 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
286 				 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
287 		break;
288 	case SPDK_BDEV_IO_TYPE_WRITE:
289 		bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
290 				  ch,
291 				  (struct bdev_uring_task *)bdev_io->driver_ctx,
292 				  bdev_io->u.bdev.iovs,
293 				  bdev_io->u.bdev.iovcnt,
294 				  bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
295 				  bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
296 		break;
297 	default:
298 		SPDK_ERRLOG("Wrong io type\n");
299 		break;
300 	}
301 }
302 
303 static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
304 {
305 	switch (bdev_io->type) {
306 	/* Read and write operations must be performed on buffers aligned to
307 	 * bdev->required_alignment. If user specified unaligned buffers,
308 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
309 	case SPDK_BDEV_IO_TYPE_READ:
310 	case SPDK_BDEV_IO_TYPE_WRITE:
311 		spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
312 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
313 		return 0;
314 	default:
315 		return -1;
316 	}
317 }
318 
319 static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
320 {
321 	if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
322 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
323 	}
324 }
325 
326 static bool
327 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
328 {
329 	switch (io_type) {
330 	case SPDK_BDEV_IO_TYPE_READ:
331 	case SPDK_BDEV_IO_TYPE_WRITE:
332 		return true;
333 	default:
334 		return false;
335 	}
336 }
337 
338 static int
339 bdev_uring_create_cb(void *io_device, void *ctx_buf)
340 {
341 	struct bdev_uring_io_channel *ch = ctx_buf;
342 
343 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));
344 
345 	return 0;
346 }
347 
348 static void
349 bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
350 {
351 	struct bdev_uring_io_channel *ch = ctx_buf;
352 
353 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
354 }
355 
/* fn_table hook: the bdev_uring structure passed as ctx is itself the
 * io_device key, so simply ask for a channel on it. */
static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	struct bdev_uring *io_device = ctx;
	struct spdk_io_channel *io_ch = spdk_get_io_channel(io_device);

	return io_ch;
}
363 
364 static int
365 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
366 {
367 	struct bdev_uring *uring = ctx;
368 
369 	spdk_json_write_named_object_begin(w, "uring");
370 
371 	spdk_json_write_named_string(w, "filename", uring->filename);
372 
373 	spdk_json_write_object_end(w);
374 
375 	return 0;
376 }
377 
378 static void
379 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
380 {
381 	struct bdev_uring *uring = bdev->ctxt;
382 
383 	spdk_json_write_object_begin(w);
384 
385 	spdk_json_write_named_string(w, "method", "bdev_uring_create");
386 
387 	spdk_json_write_named_object_begin(w, "params");
388 	spdk_json_write_named_string(w, "name", bdev->name);
389 	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
390 	spdk_json_write_named_string(w, "filename", uring->filename);
391 	spdk_json_write_object_end(w);
392 
393 	spdk_json_write_object_end(w);
394 }
395 
396 static const struct spdk_bdev_fn_table uring_fn_table = {
397 	.destruct		= bdev_uring_destruct,
398 	.submit_request		= bdev_uring_submit_request,
399 	.io_type_supported	= bdev_uring_io_type_supported,
400 	.get_io_channel		= bdev_uring_get_io_channel,
401 	.dump_info_json		= bdev_uring_dump_info_json,
402 	.write_config_json	= bdev_uring_write_json_config,
403 };
404 
405 static void uring_free_bdev(struct bdev_uring *uring)
406 {
407 	if (uring == NULL) {
408 		return;
409 	}
410 	free(uring->filename);
411 	free(uring->bdev.name);
412 	free(uring);
413 }
414 
415 static int
416 bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
417 {
418 	struct bdev_uring_group_channel *ch = ctx_buf;
419 
420 	/* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only
421 	 * local devices but also devices attached from remote target */
422 	if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) {
423 		SPDK_ERRLOG("uring I/O context setup failure\n");
424 		return -1;
425 	}
426 
427 	ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
428 	return 0;
429 }
430 
431 static void
432 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
433 {
434 	struct bdev_uring_group_channel *ch = ctx_buf;
435 
436 	io_uring_queue_exit(&ch->uring);
437 
438 	spdk_poller_unregister(&ch->poller);
439 }
440 
441 struct spdk_bdev *
442 create_uring_bdev(const char *name, const char *filename, uint32_t block_size)
443 {
444 	struct bdev_uring *uring;
445 	uint32_t detected_block_size;
446 	uint64_t bdev_size;
447 	int rc;
448 
449 	uring = calloc(1, sizeof(*uring));
450 	if (!uring) {
451 		SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
452 		return NULL;
453 	}
454 
455 	uring->filename = strdup(filename);
456 	if (!uring->filename) {
457 		goto error_return;
458 	}
459 
460 	if (bdev_uring_open(uring)) {
461 		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno);
462 		goto error_return;
463 	}
464 
465 	bdev_size = spdk_fd_get_size(uring->fd);
466 
467 	uring->bdev.name = strdup(name);
468 	if (!uring->bdev.name) {
469 		goto error_return;
470 	}
471 	uring->bdev.product_name = "URING bdev";
472 	uring->bdev.module = &uring_if;
473 
474 	uring->bdev.write_cache = 1;
475 
476 	detected_block_size = spdk_fd_get_blocklen(uring->fd);
477 	if (block_size == 0) {
478 		/* User did not specify block size - use autodetected block size. */
479 		if (detected_block_size == 0) {
480 			SPDK_ERRLOG("Block size could not be auto-detected\n");
481 			goto error_return;
482 		}
483 		block_size = detected_block_size;
484 	} else {
485 		if (block_size < detected_block_size) {
486 			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
487 				    "auto-detected block size %" PRIu32 "\n",
488 				    block_size, detected_block_size);
489 			goto error_return;
490 		} else if (detected_block_size != 0 && block_size != detected_block_size) {
491 			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
492 				     "auto-detected block size %" PRIu32 "\n",
493 				     block_size, detected_block_size);
494 		}
495 	}
496 
497 	if (block_size < 512) {
498 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
499 		goto error_return;
500 	}
501 
502 	if (!spdk_u32_is_pow2(block_size)) {
503 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
504 		goto error_return;
505 	}
506 
507 	uring->bdev.blocklen = block_size;
508 	uring->bdev.required_alignment = spdk_u32log2(block_size);
509 
510 	if (bdev_size % uring->bdev.blocklen != 0) {
511 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
512 			    bdev_size, uring->bdev.blocklen);
513 		goto error_return;
514 	}
515 
516 	uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
517 	uring->bdev.ctxt = uring;
518 
519 	uring->bdev.fn_table = &uring_fn_table;
520 
521 	spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
522 				sizeof(struct bdev_uring_io_channel),
523 				uring->bdev.name);
524 	rc = spdk_bdev_register(&uring->bdev);
525 	if (rc) {
526 		spdk_io_device_unregister(uring, NULL);
527 		goto error_return;
528 	}
529 
530 	TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
531 	return &uring->bdev;
532 
533 error_return:
534 	bdev_uring_close(uring);
535 	uring_free_bdev(uring);
536 	return NULL;
537 }
538 
539 struct delete_uring_bdev_ctx {
540 	spdk_delete_uring_complete cb_fn;
541 	void *cb_arg;
542 };
543 
544 static void
545 uring_bdev_unregister_cb(void *arg, int bdeverrno)
546 {
547 	struct delete_uring_bdev_ctx *ctx = arg;
548 
549 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
550 	free(ctx);
551 }
552 
553 void
554 delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg)
555 {
556 	struct delete_uring_bdev_ctx *ctx;
557 	int rc;
558 
559 	ctx = calloc(1, sizeof(*ctx));
560 	if (ctx == NULL) {
561 		cb_fn(cb_arg, -ENOMEM);
562 		return;
563 	}
564 
565 	ctx->cb_fn = cb_fn;
566 	ctx->cb_arg = cb_arg;
567 	rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx);
568 	if (rc != 0) {
569 		uring_bdev_unregister_cb(ctx, rc);
570 	}
571 }
572 
573 static int
574 bdev_uring_init(void)
575 {
576 	spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
577 				sizeof(struct bdev_uring_group_channel), "uring_module");
578 
579 	return 0;
580 }
581 
582 static void
583 bdev_uring_fini(void)
584 {
585 	spdk_io_device_unregister(&uring_if, NULL);
586 }
587 
588 SPDK_LOG_REGISTER_COMPONENT(uring)
589