xref: /spdk/module/bdev/uring/bdev_uring.c (revision a6dbe3721eb3b5990707fc3e378c95e505dd8ab5)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2019 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "bdev_uring.h"
7 
8 #include "spdk/stdinc.h"
9 #include "spdk/config.h"
10 #include "spdk/barrier.h"
11 #include "spdk/bdev.h"
12 #include "spdk/env.h"
13 #include "spdk/fd.h"
14 #include "spdk/likely.h"
15 #include "spdk/thread.h"
16 #include "spdk/json.h"
17 #include "spdk/util.h"
18 #include "spdk/string.h"
19 
20 #include "spdk/log.h"
21 #include "spdk_internal/uring.h"
22 
23 #ifdef SPDK_CONFIG_URING_ZNS
24 #include <linux/blkzoned.h>
25 #define SECTOR_SHIFT 9
26 #endif
27 
/* Zone geometry cached for a zoned backing device. */
struct bdev_uring_zoned_dev {
	/* Number of zones on the device. */
	uint64_t	num_zones;
	/* log2 of the zone size, where the zone size is counted in logical blocks. */
	uint32_t	zone_shift;
	/* Shift that converts a logical block count to/from 512-byte sectors. */
	uint32_t	lba_shift;
};
33 
/* Per-bdev I/O channel context; it only refers to the group channel used for I/O. */
struct bdev_uring_io_channel {
	struct bdev_uring_group_channel	*group_ch;
};
37 
38 struct bdev_uring_group_channel {
39 	uint64_t				io_inflight;
40 	uint64_t				io_pending;
41 	struct spdk_poller			*poller;
42 	struct io_uring				uring;
43 };
44 
/* Per-I/O driver context, stored in the bdev_io's driver_ctx area. */
struct bdev_uring_task {
	/* Byte count requested; compared against the CQE result on completion. */
	uint64_t			len;
	/* Channel the request was issued on. */
	struct bdev_uring_io_channel	*ch;
	TAILQ_ENTRY(bdev_uring_task)	link;
};
50 
51 struct bdev_uring {
52 	struct spdk_bdev	bdev;
53 	struct bdev_uring_zoned_dev	zd;
54 	char			*filename;
55 	int			fd;
56 	TAILQ_ENTRY(bdev_uring)  link;
57 };
58 
/* Forward declarations for functions referenced before their definition. */
static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);

/* All uring bdevs that are currently registered. */
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head);

/* Number of entries requested when the io_uring queue is initialized. */
#define SPDK_URING_QUEUE_DEPTH 512
#define MAX_EVENTS_PER_POLL 32
66 
67 static int
68 bdev_uring_get_ctx_size(void)
69 {
70 	return sizeof(struct bdev_uring_task);
71 }
72 
73 static struct spdk_bdev_module uring_if = {
74 	.name		= "uring",
75 	.module_init	= bdev_uring_init,
76 	.module_fini	= bdev_uring_fini,
77 	.get_ctx_size	= bdev_uring_get_ctx_size,
78 };
79 
80 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
81 
82 static int
83 bdev_uring_open(struct bdev_uring *bdev)
84 {
85 	int fd;
86 
87 	fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
88 	if (fd < 0) {
89 		/* Try without O_DIRECT for non-disk files */
90 		fd = open(bdev->filename, O_RDWR | O_NOATIME);
91 		if (fd < 0) {
92 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
93 				    bdev->filename, errno, spdk_strerror(errno));
94 			bdev->fd = -1;
95 			return -1;
96 		}
97 	}
98 
99 	bdev->fd = fd;
100 
101 	return 0;
102 }
103 
104 static int
105 bdev_uring_close(struct bdev_uring *bdev)
106 {
107 	int rc;
108 
109 	if (bdev->fd == -1) {
110 		return 0;
111 	}
112 
113 	rc = close(bdev->fd);
114 	if (rc < 0) {
115 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
116 			    bdev->fd, errno, spdk_strerror(errno));
117 		return -1;
118 	}
119 
120 	bdev->fd = -1;
121 
122 	return 0;
123 }
124 
125 static int64_t
126 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
127 		 struct bdev_uring_task *uring_task,
128 		 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
129 {
130 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
131 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
132 	struct io_uring_sqe *sqe;
133 
134 	sqe = io_uring_get_sqe(&group_ch->uring);
135 	io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
136 	io_uring_sqe_set_data(sqe, uring_task);
137 	uring_task->len = nbytes;
138 	uring_task->ch = uring_ch;
139 
140 	SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n",
141 		      iovcnt, nbytes, offset);
142 
143 	group_ch->io_pending++;
144 	return nbytes;
145 }
146 
147 static int64_t
148 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
149 		  struct bdev_uring_task *uring_task,
150 		  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
151 {
152 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
153 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
154 	struct io_uring_sqe *sqe;
155 
156 	sqe = io_uring_get_sqe(&group_ch->uring);
157 	io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
158 	io_uring_sqe_set_data(sqe, uring_task);
159 	uring_task->len = nbytes;
160 	uring_task->ch = uring_ch;
161 
162 	SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n",
163 		      iovcnt, nbytes, offset);
164 
165 	group_ch->io_pending++;
166 	return nbytes;
167 }
168 
169 static int
170 bdev_uring_destruct(void *ctx)
171 {
172 	struct bdev_uring *uring = ctx;
173 	int rc = 0;
174 
175 	TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
176 	rc = bdev_uring_close(uring);
177 	if (rc < 0) {
178 		SPDK_ERRLOG("bdev_uring_close() failed\n");
179 	}
180 	spdk_io_device_unregister(uring, NULL);
181 	uring_free_bdev(uring);
182 	return rc;
183 }
184 
185 static int
186 bdev_uring_reap(struct io_uring *ring, int max)
187 {
188 	int i, count, ret;
189 	struct io_uring_cqe *cqe;
190 	struct bdev_uring_task *uring_task;
191 	enum spdk_bdev_io_status status;
192 
193 	count = 0;
194 	for (i = 0; i < max; i++) {
195 		ret = io_uring_peek_cqe(ring, &cqe);
196 		if (ret != 0) {
197 			return ret;
198 		}
199 
200 		if (cqe == NULL) {
201 			return count;
202 		}
203 
204 		uring_task = (struct bdev_uring_task *)cqe->user_data;
205 		if (cqe->res != (signed)uring_task->len) {
206 			status = SPDK_BDEV_IO_STATUS_FAILED;
207 		} else {
208 			status = SPDK_BDEV_IO_STATUS_SUCCESS;
209 		}
210 
211 		uring_task->ch->group_ch->io_inflight--;
212 		io_uring_cqe_seen(ring, cqe);
213 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
214 		count++;
215 	}
216 
217 	return count;
218 }
219 
220 static int
221 bdev_uring_group_poll(void *arg)
222 {
223 	struct bdev_uring_group_channel *group_ch = arg;
224 	int to_complete, to_submit;
225 	int count, ret;
226 
227 	to_submit = group_ch->io_pending;
228 
229 	if (to_submit > 0) {
230 		/* If there are I/O to submit, use io_uring_submit here.
231 		 * It will automatically call spdk_io_uring_enter appropriately. */
232 		ret = io_uring_submit(&group_ch->uring);
233 		if (ret < 0) {
234 			return SPDK_POLLER_BUSY;
235 		}
236 
237 		group_ch->io_pending = 0;
238 		group_ch->io_inflight += to_submit;
239 	}
240 
241 	to_complete = group_ch->io_inflight;
242 	count = 0;
243 	if (to_complete > 0) {
244 		count = bdev_uring_reap(&group_ch->uring, to_complete);
245 	}
246 
247 	if (count + to_submit > 0) {
248 		return SPDK_POLLER_BUSY;
249 	} else {
250 		return SPDK_POLLER_IDLE;
251 	}
252 }
253 
254 static void
255 bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
256 		      bool success)
257 {
258 	if (!success) {
259 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
260 		return;
261 	}
262 
263 	switch (bdev_io->type) {
264 	case SPDK_BDEV_IO_TYPE_READ:
265 		bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
266 				 ch,
267 				 (struct bdev_uring_task *)bdev_io->driver_ctx,
268 				 bdev_io->u.bdev.iovs,
269 				 bdev_io->u.bdev.iovcnt,
270 				 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
271 				 bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
272 		break;
273 	case SPDK_BDEV_IO_TYPE_WRITE:
274 		bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
275 				  ch,
276 				  (struct bdev_uring_task *)bdev_io->driver_ctx,
277 				  bdev_io->u.bdev.iovs,
278 				  bdev_io->u.bdev.iovcnt,
279 				  bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
280 				  bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
281 		break;
282 	default:
283 		SPDK_ERRLOG("Wrong io type\n");
284 		break;
285 	}
286 }
287 
288 #ifdef SPDK_CONFIG_URING_ZNS
/*
 * Read the first line of /sys/block/<basename(devname)>/<attr> into str
 * (at most str_len bytes) and strip the trailing newline.
 *
 * Returns 0 on success, -EINVAL if the path could not be built or nothing
 * could be read, -ENOENT if the attribute file could not be opened.
 */
static int
bdev_uring_read_sysfs_attr(const char *devname, const char *attr, char *str, int str_len)
{
	char *sysfs_path;
	char *dev_base;
	FILE *fp;
	int rc = 0;

	dev_base = basename(devname);
	sysfs_path = spdk_sprintf_alloc("/sys/block/%s/%s", dev_base, attr);
	if (sysfs_path == NULL) {
		return -EINVAL;
	}

	fp = fopen(sysfs_path, "r");
	if (fp == NULL) {
		free(sysfs_path);
		return -ENOENT;
	}

	if (fgets(str, str_len, fp) != NULL) {
		spdk_str_chomp(str);
	} else {
		rc = -EINVAL;
	}

	free(sysfs_path);
	fclose(fp);
	return rc;
}
321 
/*
 * Read a sysfs attribute of the device and parse it as a base-10 number.
 *
 * Returns 0 and stores the value in *val on success. Returns the error from
 * bdev_uring_read_sysfs_attr() if the attribute could not be read, or the
 * negative result of spdk_strtol() if the text could not be parsed; *val is
 * left untouched on failure.
 */
static int
bdev_uring_read_sysfs_attr_long(const char *devname, const char *attr, long *val)
{
	char str[128];
	long parsed;
	int ret;

	ret = bdev_uring_read_sysfs_attr(devname, attr, str, sizeof(str));
	if (ret) {
		return ret;
	}

	parsed = spdk_strtol(str, 10);
	if (parsed < 0) {
		/* A negative result is a parse failure, not a valid attribute value. */
		return (int)parsed;
	}

	*val = parsed;

	return 0;
}
337 
338 static int
339 bdev_uring_fill_zone_type(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
340 {
341 	switch (zones_rep->type) {
342 	case BLK_ZONE_TYPE_CONVENTIONAL:
343 		zone_info->type = SPDK_BDEV_ZONE_TYPE_CNV;
344 		break;
345 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
346 		zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
347 		break;
348 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
349 		zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWP;
350 		break;
351 	default:
352 		SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", zones_rep->type);
353 		return -EIO;
354 	}
355 	return 0;
356 }
357 
358 static int
359 bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
360 {
361 	switch (zones_rep->cond) {
362 	case BLK_ZONE_COND_EMPTY:
363 		zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
364 		break;
365 	case BLK_ZONE_COND_IMP_OPEN:
366 		zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
367 		break;
368 	case BLK_ZONE_COND_EXP_OPEN:
369 		zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
370 		break;
371 	case BLK_ZONE_COND_CLOSED:
372 		zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
373 		break;
374 	case BLK_ZONE_COND_READONLY:
375 		zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
376 		break;
377 	case BLK_ZONE_COND_FULL:
378 		zone_info->state = SPDK_BDEV_ZONE_STATE_FULL;
379 		break;
380 	case BLK_ZONE_COND_OFFLINE:
381 		zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
382 		break;
383 	case BLK_ZONE_COND_NOT_WP:
384 		zone_info->state = SPDK_BDEV_ZONE_STATE_NOT_WP;
385 		break;
386 	default:
387 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond);
388 		return -EIO;
389 	}
390 	return 0;
391 }
392 
393 static int
394 bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
395 {
396 	struct bdev_uring *uring;
397 	struct blk_zone_range range;
398 	long unsigned zone_mgmt_op;
399 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
400 
401 	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
402 
403 	switch (bdev_io->u.zone_mgmt.zone_action) {
404 	case SPDK_BDEV_ZONE_RESET:
405 		zone_mgmt_op = BLKRESETZONE;
406 		break;
407 	case SPDK_BDEV_ZONE_OPEN:
408 		zone_mgmt_op = BLKOPENZONE;
409 		break;
410 	case SPDK_BDEV_ZONE_CLOSE:
411 		zone_mgmt_op = BLKCLOSEZONE;
412 		break;
413 	case SPDK_BDEV_ZONE_FINISH:
414 		zone_mgmt_op = BLKFINISHZONE;
415 		break;
416 	default:
417 		return -EINVAL;
418 	}
419 
420 	range.sector = (zone_id << uring->zd.lba_shift);
421 	range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift);
422 
423 	if (ioctl(uring->fd, zone_mgmt_op, &range)) {
424 		SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n",
425 			    bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno));
426 		return -EINVAL;
427 	}
428 
429 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
430 
431 	return 0;
432 }
433 
434 static int
435 bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
436 {
437 	struct bdev_uring *uring;
438 	struct blk_zone *zones;
439 	struct blk_zone_report *rep;
440 	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
441 	size_t repsize;
442 	uint32_t i, shift;
443 	uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones;
444 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
445 
446 	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
447 	shift = uring->zd.lba_shift;
448 
449 	if ((num_zones > uring->zd.num_zones) || !num_zones) {
450 		return -EINVAL;
451 	}
452 
453 	repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones);
454 	rep = (struct blk_zone_report *)malloc(repsize);
455 	if (!rep) {
456 		return -ENOMEM;
457 	}
458 
459 	zones = (struct blk_zone *)(rep + 1);
460 
461 	while (num_zones && ((zone_id >> uring->zd.zone_shift) <= num_zones)) {
462 		memset(rep, 0, repsize);
463 		rep->sector = zone_id;
464 		rep->nr_zones = num_zones;
465 
466 		if (ioctl(uring->fd, BLKREPORTZONE, rep)) {
467 			SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n",
468 				    errno, strerror(errno));
469 			free(rep);
470 			return -EINVAL;
471 		}
472 
473 		if (!rep->nr_zones) {
474 			break;
475 		}
476 
477 		for (i = 0; i < rep->nr_zones; i++) {
478 			zone_info->zone_id = ((zones + i)->start >> shift);
479 			zone_info->write_pointer = ((zones + i)->wp >> shift);
480 			zone_info->capacity = ((zones + i)->capacity >> shift);
481 
482 			bdev_uring_fill_zone_state(zone_info, zones + i);
483 			bdev_uring_fill_zone_type(zone_info, zones + i);
484 
485 			zone_id = ((zones + i)->start + (zones + i)->len) >> shift;
486 			zone_info++;
487 			num_zones--;
488 		}
489 	}
490 
491 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
492 	free(rep);
493 	return 0;
494 }
495 
496 static int
497 bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
498 {
499 	char str[128];
500 	long int val = 0;
501 	uint32_t zinfo;
502 	int retval = -1;
503 
504 	uring->bdev.zoned = false;
505 
506 	/* Check if this is a zoned block device */
507 	if (bdev_uring_read_sysfs_attr(filename, "queue/zoned", str, sizeof(str))) {
508 		SPDK_ERRLOG("Unable to open file %s/queue/zoned. errno: %d\n", filename, errno);
509 	} else if (strcmp(str, "host-aware") == 0 || strcmp(str, "host-managed") == 0) {
510 		/* Only host-aware & host-managed zns devices */
511 		uring->bdev.zoned = true;
512 
513 		if (ioctl(uring->fd, BLKGETNRZONES, &zinfo)) {
514 			SPDK_ERRLOG("ioctl BLKNRZONES failed %d (%s)\n", errno, strerror(errno));
515 			goto err_ret;
516 		}
517 		uring->zd.num_zones = zinfo;
518 
519 		if (ioctl(uring->fd, BLKGETZONESZ, &zinfo)) {
520 			SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno));
521 			goto err_ret;
522 		}
523 
524 		uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT;
525 		uring->bdev.zone_size = (zinfo >> uring->zd.lba_shift);
526 		uring->zd.zone_shift = spdk_u32log2(zinfo >> uring->zd.lba_shift);
527 
528 		if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_open_zones", &val)) {
529 			SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", errno, strerror(errno));
530 			goto err_ret;
531 		}
532 		uring->bdev.max_open_zones = uring->bdev.optimal_open_zones = (uint32_t)val;
533 
534 		if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_active_zones", &val)) {
535 			SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", errno, strerror(errno));
536 			goto err_ret;
537 		}
538 		uring->bdev.max_active_zones = (uint32_t)val;
539 		retval = 0;
540 	} else {
541 		retval = 0;        /* queue/zoned=none */
542 	}
543 
544 err_ret:
545 	return retval;
546 }
547 #else
548 /* No support for zoned devices */
/* Variant built when SPDK_CONFIG_URING_ZNS is not defined: always fails with -1. */
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
	const int unsupported = -1;

	return unsupported;
}
554 
/* Variant built when SPDK_CONFIG_URING_ZNS is not defined: always fails with -1. */
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
	const int unsupported = -1;

	return unsupported;
}
560 
/*
 * Variant built when SPDK_CONFIG_URING_ZNS is not defined: performs no
 * detection, leaves the bdev untouched and reports success.
 */
static int
bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
{
	const int ok = 0;

	return ok;
}
566 #endif
567 
568 static int
569 _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
570 {
571 
572 	switch (bdev_io->type) {
573 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
574 		return bdev_uring_zone_get_info(bdev_io);
575 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
576 		return bdev_uring_zone_management_op(bdev_io);
577 	/* Read and write operations must be performed on buffers aligned to
578 	 * bdev->required_alignment. If user specified unaligned buffers,
579 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
580 	case SPDK_BDEV_IO_TYPE_READ:
581 	case SPDK_BDEV_IO_TYPE_WRITE:
582 		spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
583 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
584 		return 0;
585 	default:
586 		return -1;
587 	}
588 }
589 
590 static void
591 bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
592 {
593 	if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
594 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
595 	}
596 }
597 
598 static bool
599 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
600 {
601 	switch (io_type) {
602 #ifdef SPDK_CONFIG_URING_ZNS
603 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
604 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
605 #endif
606 	case SPDK_BDEV_IO_TYPE_READ:
607 	case SPDK_BDEV_IO_TYPE_WRITE:
608 		return true;
609 	default:
610 		return false;
611 	}
612 }
613 
614 static int
615 bdev_uring_create_cb(void *io_device, void *ctx_buf)
616 {
617 	struct bdev_uring_io_channel *ch = ctx_buf;
618 
619 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));
620 
621 	return 0;
622 }
623 
624 static void
625 bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
626 {
627 	struct bdev_uring_io_channel *ch = ctx_buf;
628 
629 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
630 }
631 
/* get_io_channel callback: ctx is the bdev_uring, which is also the io_device key. */
static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	struct bdev_uring *io_device = ctx;
	struct spdk_io_channel *io_ch;

	io_ch = spdk_get_io_channel(io_device);

	return io_ch;
}
639 
640 static int
641 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
642 {
643 	struct bdev_uring *uring = ctx;
644 
645 	spdk_json_write_named_object_begin(w, "uring");
646 
647 	spdk_json_write_named_string(w, "filename", uring->filename);
648 
649 	spdk_json_write_object_end(w);
650 
651 	return 0;
652 }
653 
654 static void
655 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
656 {
657 	struct bdev_uring *uring = bdev->ctxt;
658 
659 	spdk_json_write_object_begin(w);
660 
661 	spdk_json_write_named_string(w, "method", "bdev_uring_create");
662 
663 	spdk_json_write_named_object_begin(w, "params");
664 	spdk_json_write_named_string(w, "name", bdev->name);
665 	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
666 	spdk_json_write_named_string(w, "filename", uring->filename);
667 	spdk_json_write_object_end(w);
668 
669 	spdk_json_write_object_end(w);
670 }
671 
672 static const struct spdk_bdev_fn_table uring_fn_table = {
673 	.destruct		= bdev_uring_destruct,
674 	.submit_request		= bdev_uring_submit_request,
675 	.io_type_supported	= bdev_uring_io_type_supported,
676 	.get_io_channel		= bdev_uring_get_io_channel,
677 	.dump_info_json		= bdev_uring_dump_info_json,
678 	.write_config_json	= bdev_uring_write_json_config,
679 };
680 
681 static void
682 uring_free_bdev(struct bdev_uring *uring)
683 {
684 	if (uring == NULL) {
685 		return;
686 	}
687 	free(uring->filename);
688 	free(uring->bdev.name);
689 	free(uring);
690 }
691 
692 static int
693 bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
694 {
695 	struct bdev_uring_group_channel *ch = ctx_buf;
696 
697 	/* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only
698 	 * local devices but also devices attached from remote target */
699 	if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) {
700 		SPDK_ERRLOG("uring I/O context setup failure\n");
701 		return -1;
702 	}
703 
704 	ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
705 	return 0;
706 }
707 
708 static void
709 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
710 {
711 	struct bdev_uring_group_channel *ch = ctx_buf;
712 
713 	io_uring_queue_exit(&ch->uring);
714 
715 	spdk_poller_unregister(&ch->poller);
716 }
717 
718 struct spdk_bdev *
719 create_uring_bdev(const char *name, const char *filename, uint32_t block_size)
720 {
721 	struct bdev_uring *uring;
722 	uint32_t detected_block_size;
723 	uint64_t bdev_size;
724 	int rc;
725 
726 	uring = calloc(1, sizeof(*uring));
727 	if (!uring) {
728 		SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
729 		return NULL;
730 	}
731 
732 	uring->filename = strdup(filename);
733 	if (!uring->filename) {
734 		goto error_return;
735 	}
736 
737 	if (bdev_uring_open(uring)) {
738 		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno);
739 		goto error_return;
740 	}
741 
742 	bdev_size = spdk_fd_get_size(uring->fd);
743 
744 	uring->bdev.name = strdup(name);
745 	if (!uring->bdev.name) {
746 		goto error_return;
747 	}
748 	uring->bdev.product_name = "URING bdev";
749 	uring->bdev.module = &uring_if;
750 
751 	uring->bdev.write_cache = 1;
752 
753 	detected_block_size = spdk_fd_get_blocklen(uring->fd);
754 	if (block_size == 0) {
755 		/* User did not specify block size - use autodetected block size. */
756 		if (detected_block_size == 0) {
757 			SPDK_ERRLOG("Block size could not be auto-detected\n");
758 			goto error_return;
759 		}
760 		block_size = detected_block_size;
761 	} else {
762 		if (block_size < detected_block_size) {
763 			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
764 				    "auto-detected block size %" PRIu32 "\n",
765 				    block_size, detected_block_size);
766 			goto error_return;
767 		} else if (detected_block_size != 0 && block_size != detected_block_size) {
768 			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
769 				     "auto-detected block size %" PRIu32 "\n",
770 				     block_size, detected_block_size);
771 		}
772 	}
773 
774 	if (block_size < 512) {
775 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
776 		goto error_return;
777 	}
778 
779 	if (!spdk_u32_is_pow2(block_size)) {
780 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
781 		goto error_return;
782 	}
783 
784 	uring->bdev.blocklen = block_size;
785 	uring->bdev.required_alignment = spdk_u32log2(block_size);
786 
787 	rc = bdev_uring_check_zoned_support(uring, name, filename);
788 	if (rc) {
789 		goto error_return;
790 	}
791 
792 	if (bdev_size % uring->bdev.blocklen != 0) {
793 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
794 			    bdev_size, uring->bdev.blocklen);
795 		goto error_return;
796 	}
797 
798 	uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
799 	uring->bdev.ctxt = uring;
800 
801 	uring->bdev.fn_table = &uring_fn_table;
802 
803 	spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
804 				sizeof(struct bdev_uring_io_channel),
805 				uring->bdev.name);
806 	rc = spdk_bdev_register(&uring->bdev);
807 	if (rc) {
808 		spdk_io_device_unregister(uring, NULL);
809 		goto error_return;
810 	}
811 
812 	TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
813 	return &uring->bdev;
814 
815 error_return:
816 	bdev_uring_close(uring);
817 	uring_free_bdev(uring);
818 	return NULL;
819 }
820 
821 struct delete_uring_bdev_ctx {
822 	spdk_delete_uring_complete cb_fn;
823 	void *cb_arg;
824 };
825 
826 static void
827 uring_bdev_unregister_cb(void *arg, int bdeverrno)
828 {
829 	struct delete_uring_bdev_ctx *ctx = arg;
830 
831 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
832 	free(ctx);
833 }
834 
835 void
836 delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg)
837 {
838 	struct delete_uring_bdev_ctx *ctx;
839 	int rc;
840 
841 	ctx = calloc(1, sizeof(*ctx));
842 	if (ctx == NULL) {
843 		cb_fn(cb_arg, -ENOMEM);
844 		return;
845 	}
846 
847 	ctx->cb_fn = cb_fn;
848 	ctx->cb_arg = cb_arg;
849 	rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx);
850 	if (rc != 0) {
851 		uring_bdev_unregister_cb(ctx, rc);
852 	}
853 }
854 
855 static int
856 bdev_uring_init(void)
857 {
858 	spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
859 				sizeof(struct bdev_uring_group_channel), "uring_module");
860 
861 	return 0;
862 }
863 
864 static void
865 bdev_uring_fini(void)
866 {
867 	spdk_io_device_unregister(&uring_if, NULL);
868 }
869 
870 SPDK_LOG_REGISTER_COMPONENT(uring)
871