/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 */

#include "bdev_uring.h"

#include "spdk/stdinc.h"

#include "spdk/barrier.h"
#include "spdk/bdev.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk/log.h"
#include "spdk_internal/uring.h"

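/* Per-channel contexts: every uring bdev I/O channel on a thread points at the
 * thread's group channel, which owns the shared io_uring instance and the
 * poller that drives submission and completion for all uring bdevs on that
 * thread.
 */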
struct bdev_uring_io_channel {
	struct bdev_uring_group_channel		*group_ch;
};

struct bdev_uring_group_channel {
	uint64_t				io_inflight;
	uint64_t				io_pending;
	struct spdk_poller			*poller;
	struct io_uring				uring;
};

struct bdev_uring_task {
	uint64_t			len;
	struct bdev_uring_io_channel	*ch;
	TAILQ_ENTRY(bdev_uring_task)	link;
};

struct bdev_uring {
	struct spdk_bdev	bdev;
	char			*filename;
	int			fd;
	TAILQ_ENTRY(bdev_uring)  link;
};

static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head);

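/* Submission queue depth used when initializing each per-thread io_uring instance. */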
#define SPDK_URING_QUEUE_DEPTH 512
#define MAX_EVENTS_PER_POLL 32

static int
bdev_uring_get_ctx_size(void)
{
	return sizeof(struct bdev_uring_task);
}

static struct spdk_bdev_module uring_if = {
	.name		= "uring",
	.module_init	= bdev_uring_init,
	.module_fini	= bdev_uring_fini,
	.get_ctx_size	= bdev_uring_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)

static int
bdev_uring_open(struct bdev_uring *bdev)
{
	int fd;

	fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
	if (fd < 0) {
		/* Try without O_DIRECT for non-disk files */
		fd = open(bdev->filename, O_RDWR | O_NOATIME);
		if (fd < 0) {
			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
				    bdev->filename, errno, spdk_strerror(errno));
			bdev->fd = -1;
			return -1;
		}
	}

	bdev->fd = fd;

	return 0;
}

static int
bdev_uring_close(struct bdev_uring *bdev)
{
	int rc;

	if (bdev->fd == -1) {
		return 0;
	}

	rc = close(bdev->fd);
	if (rc < 0) {
		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
			    bdev->fd, errno, spdk_strerror(errno));
		return -1;
	}

	bdev->fd = -1;

	return 0;
}

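/* Prepare a readv/writev SQE on the group channel's ring and record the
 * expected transfer length. Submission is deferred to the group poller, which
 * batches all pending SQEs into a single io_uring_submit() call.
 */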
static int64_t
bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
		 struct bdev_uring_task *uring_task,
		 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(&group_ch->uring);
	io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
	io_uring_sqe_set_data(sqe, uring_task);
	uring_task->len = nbytes;
	uring_task->ch = uring_ch;

	SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n",
		      iovcnt, nbytes, offset);

	group_ch->io_pending++;
	return nbytes;
}

static int64_t
bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
		  struct bdev_uring_task *uring_task,
		  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
{
	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(&group_ch->uring);
	io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
	io_uring_sqe_set_data(sqe, uring_task);
	uring_task->len = nbytes;
	uring_task->ch = uring_ch;

	SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n",
		      iovcnt, nbytes, offset);

	group_ch->io_pending++;
	return nbytes;
}

static int
bdev_uring_destruct(void *ctx)
{
	struct bdev_uring *uring = ctx;
	int rc = 0;

	TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
	rc = bdev_uring_close(uring);
	if (rc < 0) {
		SPDK_ERRLOG("bdev_uring_close() failed\n");
	}
	spdk_io_device_unregister(uring, NULL);
	uring_free_bdev(uring);
	return rc;
}

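/* Reap up to 'max' completions from the ring. Each I/O is completed with
 * success only if the kernel reported the full expected transfer length;
 * anything else (including a negative errno in cqe->res) is treated as a
 * failure.
 */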
static int
bdev_uring_reap(struct io_uring *ring, int max)
{
	int i, count, ret;
	struct io_uring_cqe *cqe;
	struct bdev_uring_task *uring_task;
	enum spdk_bdev_io_status status;

	count = 0;
	for (i = 0; i < max; i++) {
		ret = io_uring_peek_cqe(ring, &cqe);
		if (ret != 0) {
			return ret;
		}

		if (cqe == NULL) {
			return count;
		}

		uring_task = (struct bdev_uring_task *)cqe->user_data;
		if (cqe->res != (signed)uring_task->len) {
			status = SPDK_BDEV_IO_STATUS_FAILED;
		} else {
			status = SPDK_BDEV_IO_STATUS_SUCCESS;
		}

		uring_task->ch->group_ch->io_inflight--;
		io_uring_cqe_seen(ring, cqe);
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
		count++;
	}

	return count;
}

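/* Group channel poller: flush any pending SQEs with a single io_uring_submit()
 * call, then reap completions for whatever is in flight. Returns BUSY whenever
 * work was submitted or completed so the reactor keeps polling.
 */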
static int
bdev_uring_group_poll(void *arg)
{
	struct bdev_uring_group_channel *group_ch = arg;
	int to_complete, to_submit;
	int count, ret;

	to_submit = group_ch->io_pending;

	if (to_submit > 0) {
		/* If there are I/O to submit, use io_uring_submit here.
		 * It will automatically call spdk_io_uring_enter appropriately. */
		ret = io_uring_submit(&group_ch->uring);
		if (ret < 0) {
			return SPDK_POLLER_BUSY;
		}

		group_ch->io_pending = 0;
		group_ch->io_inflight += to_submit;
	}

	to_complete = group_ch->io_inflight;
	count = 0;
	if (to_complete > 0) {
		count = bdev_uring_reap(&group_ch->uring, to_complete);
	}

	if (count + to_submit > 0) {
		return SPDK_POLLER_BUSY;
	} else {
		return SPDK_POLLER_IDLE;
	}
}

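/* Invoked by the bdev layer once a (properly aligned) data buffer is available
 * for the I/O; dispatches the read or write onto the group channel's ring.
 */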
static void
bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		      bool success)
{
	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
				 ch,
				 (struct bdev_uring_task *)bdev_io->driver_ctx,
				 bdev_io->u.bdev.iovs,
				 bdev_io->u.bdev.iovcnt,
				 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
				 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
				  ch,
				  (struct bdev_uring_task *)bdev_io->driver_ctx,
				  bdev_io->u.bdev.iovs,
				  bdev_io->u.bdev.iovcnt,
				  bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
				  bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	default:
		SPDK_ERRLOG("Wrong io type\n");
		break;
	}
}

static int
_bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	/* Read and write operations must be performed on buffers aligned to
	 * bdev->required_alignment. If user specified unaligned buffers,
	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		return 0;
	default:
		return -1;
	}
}

static void
bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static bool
bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	default:
		return false;
	}
}

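/* Per-bdev I/O channel create/destroy callbacks: each bdev channel simply
 * takes and releases a reference on this thread's group channel.
 */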
static int
bdev_uring_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_uring_io_channel *ch = ctx_buf;

	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));

	return 0;
}

static void
bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_uring_io_channel *ch = ctx_buf;

	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
}

static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	struct bdev_uring *uring = ctx;

	return spdk_get_io_channel(uring);
}

static int
bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct bdev_uring *uring = ctx;

	spdk_json_write_named_object_begin(w, "uring");

	spdk_json_write_named_string(w, "filename", uring->filename);

	spdk_json_write_object_end(w);

	return 0;
}

static void
bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	struct bdev_uring *uring = bdev->ctxt;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_uring_create");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
	spdk_json_write_named_string(w, "filename", uring->filename);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static const struct spdk_bdev_fn_table uring_fn_table = {
	.destruct		= bdev_uring_destruct,
	.submit_request		= bdev_uring_submit_request,
	.io_type_supported	= bdev_uring_io_type_supported,
	.get_io_channel		= bdev_uring_get_io_channel,
	.dump_info_json		= bdev_uring_dump_info_json,
	.write_config_json	= bdev_uring_write_json_config,
};

static void
uring_free_bdev(struct bdev_uring *uring)
{
	if (uring == NULL) {
		return;
	}
	free(uring->filename);
	free(uring->bdev.name);
	free(uring);
}

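/* Group channel create/destroy callbacks: set up an io_uring instance for this
 * thread and register (or tear down) the poller that drives it.
 */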
static int
bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_uring_group_channel *ch = ctx_buf;

	/* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only
	 * local devices but also devices attached from remote target */
	if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) {
		SPDK_ERRLOG("uring I/O context setup failure\n");
		return -1;
	}

	ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
	return 0;
}

static void
bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
{
	struct bdev_uring_group_channel *ch = ctx_buf;

	io_uring_queue_exit(&ch->uring);

	spdk_poller_unregister(&ch->poller);
}

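/* Create a uring bdev backed by the given file: open the file, detect and
 * validate the block size, and register the bdev with the bdev layer.
 */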
struct spdk_bdev *
create_uring_bdev(const char *name, const char *filename, uint32_t block_size)
{
	struct bdev_uring *uring;
	uint32_t detected_block_size;
	uint64_t bdev_size;
	int rc;

	uring = calloc(1, sizeof(*uring));
	if (!uring) {
		SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
		return NULL;
	}

	uring->filename = strdup(filename);
	if (!uring->filename) {
		goto error_return;
	}

	if (bdev_uring_open(uring)) {
		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno);
		goto error_return;
	}

	bdev_size = spdk_fd_get_size(uring->fd);

	uring->bdev.name = strdup(name);
	if (!uring->bdev.name) {
		goto error_return;
	}
	uring->bdev.product_name = "URING bdev";
	uring->bdev.module = &uring_if;

	uring->bdev.write_cache = 1;

	detected_block_size = spdk_fd_get_blocklen(uring->fd);
	if (block_size == 0) {
		/* User did not specify block size - use autodetected block size. */
		if (detected_block_size == 0) {
			SPDK_ERRLOG("Block size could not be auto-detected\n");
			goto error_return;
		}
		block_size = detected_block_size;
	} else {
		if (block_size < detected_block_size) {
			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
				    "auto-detected block size %" PRIu32 "\n",
				    block_size, detected_block_size);
			goto error_return;
		} else if (detected_block_size != 0 && block_size != detected_block_size) {
			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
				     "auto-detected block size %" PRIu32 "\n",
				     block_size, detected_block_size);
		}
	}

	if (block_size < 512) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
		goto error_return;
	}

	if (!spdk_u32_is_pow2(block_size)) {
		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
		goto error_return;
	}

	uring->bdev.blocklen = block_size;
	uring->bdev.required_alignment = spdk_u32log2(block_size);

	if (bdev_size % uring->bdev.blocklen != 0) {
		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
			    bdev_size, uring->bdev.blocklen);
		goto error_return;
	}

	uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
	uring->bdev.ctxt = uring;

	uring->bdev.fn_table = &uring_fn_table;

	spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
				sizeof(struct bdev_uring_io_channel),
				uring->bdev.name);
	rc = spdk_bdev_register(&uring->bdev);
	if (rc) {
		spdk_io_device_unregister(uring, NULL);
		goto error_return;
	}

	TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
	return &uring->bdev;

error_return:
	bdev_uring_close(uring);
	uring_free_bdev(uring);
	return NULL;
}

struct delete_uring_bdev_ctx {
	spdk_delete_uring_complete cb_fn;
	void *cb_arg;
};

static void
uring_bdev_unregister_cb(void *arg, int bdeverrno)
{
	struct delete_uring_bdev_ctx *ctx = arg;

	ctx->cb_fn(ctx->cb_arg, bdeverrno);
	free(ctx);
}

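/* Unregister the named uring bdev. The caller's completion callback is invoked
 * once the unregister finishes, or immediately if it cannot be started.
 */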
void
delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg)
{
	struct delete_uring_bdev_ctx *ctx;
	int rc;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx);
	if (rc != 0) {
		uring_bdev_unregister_cb(ctx, rc);
	}
}

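/* Module init/fini: register (and later unregister) the io_device that
 * provides the per-thread group channels shared by all uring bdevs.
 */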
static int
bdev_uring_init(void)
{
	spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
				sizeof(struct bdev_uring_group_channel), "uring_module");

	return 0;
}

static void
bdev_uring_fini(void)
{
	spdk_io_device_unregister(&uring_if, NULL);
}

SPDK_LOG_REGISTER_COMPONENT(uring)