xref: /spdk/lib/vhost/vhost_blk.c (revision b37db06935181fd0e8f5592a96d860040abaa201)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation. All rights reserved.
3  *   All rights reserved.
4  */
5 
6 #include <linux/virtio_blk.h>
7 
8 #include "spdk/env.h"
9 #include "spdk/bdev.h"
10 #include "spdk/bdev_module.h"
11 #include "spdk/thread.h"
12 #include "spdk/likely.h"
13 #include "spdk/string.h"
14 #include "spdk/util.h"
15 #include "spdk/vhost.h"
16 #include "spdk/json.h"
17 
18 #include "vhost_internal.h"
19 #include <rte_version.h>
20 
/* Feature bits that every SPDK vhost-blk device starts from. */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
		(1ULL << VIRTIO_BLK_F_SIZE_MAX)   | \
		(1ULL << VIRTIO_BLK_F_SEG_MAX)    | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY)   | \
		(1ULL << VIRTIO_BLK_F_BLK_SIZE)   | \
		(1ULL << VIRTIO_BLK_F_TOPOLOGY)   | \
		(1ULL << VIRTIO_BLK_F_BARRIER)    | \
		(1ULL << VIRTIO_BLK_F_SCSI)       | \
		(1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_MQ))

/* Feature bits this backend does not support. */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY)   | \
		(1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_BARRIER)    | \
		(1ULL << VIRTIO_BLK_F_SCSI))

/* vhost-user protocol features supported by vhost-blk. */
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ( \
		(1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))

/* Name of the default virtio-blk transport. */
#define VIRTIO_BLK_DEFAULT_TRANSPORT "vhost_user_blk"
39 
40 struct spdk_vhost_user_blk_task {
41 	struct spdk_vhost_blk_task blk_task;
42 	struct spdk_vhost_blk_session *bvsession;
43 	struct spdk_vhost_virtqueue *vq;
44 
45 	uint16_t req_idx;
46 	uint16_t num_descs;
47 	uint16_t buffer_id;
48 	uint16_t inflight_head;
49 
50 	/* If set, the task is currently used for I/O processing. */
51 	bool used;
52 };
53 
54 struct spdk_vhost_blk_dev {
55 	struct spdk_vhost_dev vdev;
56 	struct spdk_bdev *bdev;
57 	struct spdk_bdev_desc *bdev_desc;
58 	const struct spdk_virtio_blk_transport_ops *ops;
59 
60 	bool readonly;
61 };
62 
63 struct spdk_vhost_blk_session {
64 	/* The parent session must be the very first field in this struct */
65 	struct spdk_vhost_session vsession;
66 	struct spdk_vhost_blk_dev *bvdev;
67 	struct spdk_poller *requestq_poller;
68 	struct spdk_io_channel *io_channel;
69 	struct spdk_poller *stop_poller;
70 };
71 
72 /* forward declaration */
73 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
74 
75 static void vhost_user_blk_request_finish(uint8_t status, struct spdk_vhost_blk_task *task,
76 		void *cb_arg);
77 
78 static int
79 vhost_user_process_blk_request(struct spdk_vhost_user_blk_task *user_task)
80 {
81 	struct spdk_vhost_blk_session *bvsession = user_task->bvsession;
82 	struct spdk_vhost_dev *vdev = &bvsession->bvdev->vdev;
83 
84 	return virtio_blk_process_request(vdev, bvsession->io_channel, &user_task->blk_task,
85 					  vhost_user_blk_request_finish, NULL);
86 }
87 
88 static struct spdk_vhost_blk_dev *
89 to_blk_dev(struct spdk_vhost_dev *vdev)
90 {
91 	if (vdev == NULL) {
92 		return NULL;
93 	}
94 
95 	if (vdev->backend->type != VHOST_BACKEND_BLK) {
96 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
97 		return NULL;
98 	}
99 
100 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
101 }
102 
103 struct spdk_bdev *
104 vhost_blk_get_bdev(struct spdk_vhost_dev *vdev)
105 {
106 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
107 
108 	assert(bvdev != NULL);
109 
110 	return bvdev->bdev;
111 }
112 
113 static struct spdk_vhost_blk_session *
114 to_blk_session(struct spdk_vhost_session *vsession)
115 {
116 	assert(vsession->vdev->backend->type == VHOST_BACKEND_BLK);
117 	return (struct spdk_vhost_blk_session *)vsession;
118 }
119 
120 static inline void
121 blk_task_inc_task_cnt(struct spdk_vhost_user_blk_task *task)
122 {
123 	task->bvsession->vsession.task_cnt++;
124 }
125 
126 static inline void
127 blk_task_dec_task_cnt(struct spdk_vhost_user_blk_task *task)
128 {
129 	assert(task->bvsession->vsession.task_cnt > 0);
130 	task->bvsession->vsession.task_cnt--;
131 }
132 
133 static void
134 blk_task_finish(struct spdk_vhost_user_blk_task *task)
135 {
136 	blk_task_dec_task_cnt(task);
137 	task->used = false;
138 }
139 
140 static void
141 blk_task_init(struct spdk_vhost_user_blk_task *task)
142 {
143 	struct spdk_vhost_blk_task *blk_task = &task->blk_task;
144 
145 	task->used = true;
146 	blk_task->iovcnt = SPDK_COUNTOF(blk_task->iovs);
147 	blk_task->status = NULL;
148 	blk_task->used_len = 0;
149 	blk_task->payload_size = 0;
150 }
151 
152 static void
153 blk_task_enqueue(struct spdk_vhost_user_blk_task *task)
154 {
155 	if (task->vq->packed.packed_ring) {
156 		vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq,
157 					     task->num_descs,
158 					     task->buffer_id, task->blk_task.used_len,
159 					     task->inflight_head);
160 	} else {
161 		vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq,
162 					   task->req_idx, task->blk_task.used_len);
163 	}
164 }
165 
166 static void
167 vhost_user_blk_request_finish(uint8_t status, struct spdk_vhost_blk_task *task, void *cb_arg)
168 {
169 	struct spdk_vhost_user_blk_task *user_task;
170 
171 	user_task = SPDK_CONTAINEROF(task, struct spdk_vhost_user_blk_task, blk_task);
172 
173 	blk_task_enqueue(user_task);
174 
175 	SPDK_DEBUGLOG(vhost_blk, "Finished task (%p) req_idx=%d\n status: %" PRIu8"\n",
176 		      user_task, user_task->req_idx, status);
177 	blk_task_finish(user_task);
178 }
179 
180 static void
181 blk_request_finish(uint8_t status, struct spdk_vhost_blk_task *task)
182 {
183 
184 	if (task->status) {
185 		*task->status = status;
186 	}
187 
188 	task->cb(status, task, task->cb_arg);
189 }
190 
191 /*
192  * Process task's descriptor chain and setup data related fields.
193  * Return
194  *   total size of supplied buffers
195  *
196  *   FIXME: Make this function return to rd_cnt and wr_cnt
197  */
198 static int
199 blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
200 			   struct spdk_vhost_virtqueue *vq,
201 			   uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
202 {
203 	struct spdk_vhost_session *vsession = &bvsession->vsession;
204 	struct spdk_vhost_dev *vdev = vsession->vdev;
205 	struct vring_desc *desc, *desc_table;
206 	uint16_t out_cnt = 0, cnt = 0;
207 	uint32_t desc_table_size, len = 0;
208 	uint32_t desc_handled_cnt;
209 	int rc;
210 
211 	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
212 	if (rc != 0) {
213 		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
214 		return -1;
215 	}
216 
217 	desc_handled_cnt = 0;
218 	while (1) {
219 		/*
220 		 * Maximum cnt reached?
221 		 * Should not happen if request is well formatted, otherwise this is a BUG.
222 		 */
223 		if (spdk_unlikely(cnt == *iovs_cnt)) {
224 			SPDK_DEBUGLOG(vhost_blk, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
225 				      vsession->name, req_idx);
226 			return -1;
227 		}
228 
229 		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
230 			SPDK_DEBUGLOG(vhost_blk, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
231 				      vsession->name, req_idx, cnt);
232 			return -1;
233 		}
234 
235 		len += desc->len;
236 
237 		out_cnt += vhost_vring_desc_is_wr(desc);
238 
239 		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
240 		if (rc != 0) {
241 			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
242 				    vsession->name, req_idx);
243 			return -1;
244 		} else if (desc == NULL) {
245 			break;
246 		}
247 
248 		desc_handled_cnt++;
249 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
250 			/* Break a cycle and report an error, if any. */
251 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
252 				    vsession->name, desc_table_size, desc_handled_cnt);
253 			return -1;
254 		}
255 	}
256 
257 	/*
258 	 * There must be least two descriptors.
259 	 * First contain request so it must be readable.
260 	 * Last descriptor contain buffer for response so it must be writable.
261 	 */
262 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
263 		return -1;
264 	}
265 
266 	*length = len;
267 	*iovs_cnt = cnt;
268 	return 0;
269 }
270 
271 static int
272 blk_iovs_packed_desc_setup(struct spdk_vhost_session *vsession,
273 			   struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
274 			   struct vring_packed_desc *desc_table, uint16_t desc_table_size,
275 			   struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
276 {
277 	struct vring_packed_desc *desc;
278 	uint16_t cnt = 0, out_cnt = 0;
279 	uint32_t len = 0;
280 
281 	if (desc_table == NULL) {
282 		desc = &vq->vring.desc_packed[req_idx];
283 	} else {
284 		req_idx = 0;
285 		desc = desc_table;
286 	}
287 
288 	while (1) {
289 		/*
290 		 * Maximum cnt reached?
291 		 * Should not happen if request is well formatted, otherwise this is a BUG.
292 		 */
293 		if (spdk_unlikely(cnt == *iovs_cnt)) {
294 			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
295 				    vsession->name, req_idx);
296 			return -EINVAL;
297 		}
298 
299 		if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
300 			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
301 				    vsession->name, req_idx, cnt);
302 			return -EINVAL;
303 		}
304 
305 		len += desc->len;
306 		out_cnt += vhost_vring_packed_desc_is_wr(desc);
307 
308 		/* desc is NULL means we reach the last desc of this request */
309 		vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
310 		if (desc == NULL) {
311 			break;
312 		}
313 	}
314 
315 	/*
316 	 * There must be least two descriptors.
317 	 * First contain request so it must be readable.
318 	 * Last descriptor contain buffer for response so it must be writable.
319 	 */
320 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
321 		return -EINVAL;
322 	}
323 
324 	*length = len;
325 	*iovs_cnt = cnt;
326 
327 	return 0;
328 }
329 
330 static int
331 blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
332 			    struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
333 			    struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
334 {
335 	struct spdk_vhost_session *vsession = &bvsession->vsession;
336 	struct spdk_vhost_dev *vdev = vsession->vdev;
337 	struct vring_packed_desc *desc = NULL, *desc_table;
338 	uint32_t desc_table_size;
339 	int rc;
340 
341 	rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc,
342 				      &desc_table, &desc_table_size);
343 	if (spdk_unlikely(rc != 0)) {
344 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
345 		return rc;
346 	}
347 
348 	return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size,
349 					  iovs, iovs_cnt, length);
350 }
351 
352 static int
353 blk_iovs_inflight_queue_setup(struct spdk_vhost_blk_session *bvsession,
354 			      struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
355 			      struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
356 {
357 	struct spdk_vhost_session *vsession = &bvsession->vsession;
358 	struct spdk_vhost_dev *vdev = vsession->vdev;
359 	spdk_vhost_inflight_desc *inflight_desc;
360 	struct vring_packed_desc *desc_table;
361 	uint16_t out_cnt = 0, cnt = 0;
362 	uint32_t desc_table_size, len = 0;
363 	int rc = 0;
364 
365 	rc = vhost_inflight_queue_get_desc(vsession, vq->vring_inflight.inflight_packed->desc,
366 					   req_idx, &inflight_desc, &desc_table, &desc_table_size);
367 	if (spdk_unlikely(rc != 0)) {
368 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
369 		return rc;
370 	}
371 
372 	if (desc_table != NULL) {
373 		return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size,
374 						  iovs, iovs_cnt, length);
375 	}
376 
377 	while (1) {
378 		/*
379 		 * Maximum cnt reached?
380 		 * Should not happen if request is well formatted, otherwise this is a BUG.
381 		 */
382 		if (spdk_unlikely(cnt == *iovs_cnt)) {
383 			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
384 				    vsession->name, req_idx);
385 			return -EINVAL;
386 		}
387 
388 		if (spdk_unlikely(vhost_vring_inflight_desc_to_iov(vsession, iovs, &cnt, inflight_desc))) {
389 			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
390 				    vsession->name, req_idx, cnt);
391 			return -EINVAL;
392 		}
393 
394 		len += inflight_desc->len;
395 		out_cnt += vhost_vring_inflight_desc_is_wr(inflight_desc);
396 
397 		/* Without F_NEXT means it's the last desc */
398 		if ((inflight_desc->flags & VRING_DESC_F_NEXT) == 0) {
399 			break;
400 		}
401 
402 		inflight_desc = &vq->vring_inflight.inflight_packed->desc[inflight_desc->next];
403 	}
404 
405 	/*
406 	 * There must be least two descriptors.
407 	 * First contain request so it must be readable.
408 	 * Last descriptor contain buffer for response so it must be writable.
409 	 */
410 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
411 		return -EINVAL;
412 	}
413 
414 	*length = len;
415 	*iovs_cnt = cnt;
416 
417 	return 0;
418 }
419 
420 static void
421 blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
422 {
423 	struct spdk_vhost_blk_task *task = cb_arg;
424 
425 	spdk_bdev_free_io(bdev_io);
426 	blk_request_finish(success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR, task);
427 }
428 
429 static void
430 blk_request_resubmit(void *arg)
431 {
432 	struct spdk_vhost_blk_task *task = arg;
433 	int rc = 0;
434 
435 	rc = virtio_blk_process_request(task->bdev_io_wait_vdev, task->bdev_io_wait_ch, task,
436 					task->cb, task->cb_arg);
437 	if (rc == 0) {
438 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p resubmitted ======\n", task);
439 	} else {
440 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p failed ======\n", task);
441 	}
442 }
443 
444 static inline void
445 blk_request_queue_io(struct spdk_vhost_dev *vdev, struct spdk_io_channel *ch,
446 		     struct spdk_vhost_blk_task *task)
447 {
448 	int rc;
449 	struct spdk_bdev *bdev = vhost_blk_get_bdev(vdev);
450 
451 	task->bdev_io_wait.bdev = bdev;
452 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
453 	task->bdev_io_wait.cb_arg = task;
454 	task->bdev_io_wait_ch = ch;
455 	task->bdev_io_wait_vdev = vdev;
456 
457 	rc = spdk_bdev_queue_io_wait(bdev, ch, &task->bdev_io_wait);
458 	if (rc != 0) {
459 		blk_request_finish(VIRTIO_BLK_S_IOERR, task);
460 	}
461 }
462 
463 int
464 virtio_blk_process_request(struct spdk_vhost_dev *vdev, struct spdk_io_channel *ch,
465 			   struct spdk_vhost_blk_task *task, virtio_blk_request_cb cb, void *cb_arg)
466 {
467 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
468 	struct virtio_blk_outhdr req;
469 	struct virtio_blk_discard_write_zeroes *desc;
470 	struct iovec *iov;
471 	uint32_t type;
472 	uint64_t flush_bytes;
473 	uint32_t payload_len;
474 	uint16_t iovcnt;
475 	int rc;
476 
477 	assert(bvdev != NULL);
478 
479 	task->cb = cb;
480 	task->cb_arg = cb_arg;
481 
482 	iov = &task->iovs[0];
483 	if (spdk_unlikely(iov->iov_len != sizeof(req))) {
484 		SPDK_DEBUGLOG(vhost_blk,
485 			      "First descriptor size is %zu but expected %zu (task = %p).\n",
486 			      iov->iov_len, sizeof(req), task);
487 		blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
488 		return -1;
489 	}
490 
491 	/* Some SeaBIOS versions don't align the virtio_blk_outhdr on an 8-byte boundary, which
492 	 * triggers ubsan errors.  So copy this small 16-byte structure to the stack to workaround
493 	 * this problem.
494 	 */
495 	memcpy(&req, iov->iov_base, sizeof(req));
496 
497 	iov = &task->iovs[task->iovcnt - 1];
498 	if (spdk_unlikely(iov->iov_len != 1)) {
499 		SPDK_DEBUGLOG(vhost_blk,
500 			      "Last descriptor size is %zu but expected %d (task = %p).\n",
501 			      iov->iov_len, 1, task);
502 		blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
503 		return -1;
504 	}
505 
506 	payload_len = task->payload_size;
507 	task->status = iov->iov_base;
508 	payload_len -= sizeof(req) + sizeof(*task->status);
509 	iovcnt = task->iovcnt - 2;
510 
511 	type = req.type;
512 #ifdef VIRTIO_BLK_T_BARRIER
513 	/* Don't care about barrier for now (as QEMU's virtio-blk do). */
514 	type &= ~VIRTIO_BLK_T_BARRIER;
515 #endif
516 
517 	switch (type) {
518 	case VIRTIO_BLK_T_IN:
519 	case VIRTIO_BLK_T_OUT:
520 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
521 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (task = %p).\n",
522 				    type ? "WRITE" : "READ", task);
523 			blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
524 			return -1;
525 		}
526 
527 		if (type == VIRTIO_BLK_T_IN) {
528 			task->used_len = payload_len + sizeof(*task->status);
529 			rc = spdk_bdev_readv(bvdev->bdev_desc, ch,
530 					     &task->iovs[1], iovcnt, req.sector * 512,
531 					     payload_len, blk_request_complete_cb, task);
532 		} else if (!bvdev->readonly) {
533 			task->used_len = sizeof(*task->status);
534 			rc = spdk_bdev_writev(bvdev->bdev_desc, ch,
535 					      &task->iovs[1], iovcnt, req.sector * 512,
536 					      payload_len, blk_request_complete_cb, task);
537 		} else {
538 			SPDK_DEBUGLOG(vhost_blk, "Device is in read-only mode!\n");
539 			rc = -1;
540 		}
541 
542 		if (rc) {
543 			if (rc == -ENOMEM) {
544 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
545 				blk_request_queue_io(vdev, ch, task);
546 			} else {
547 				blk_request_finish(VIRTIO_BLK_S_IOERR, task);
548 				return -1;
549 			}
550 		}
551 		break;
552 	case VIRTIO_BLK_T_DISCARD:
553 		desc = task->iovs[1].iov_base;
554 		if (payload_len != sizeof(*desc)) {
555 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
556 			blk_request_finish(VIRTIO_BLK_S_IOERR, task);
557 			return -1;
558 		}
559 
560 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
561 			SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n");
562 			blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
563 			return -1;
564 		}
565 
566 		rc = spdk_bdev_unmap(bvdev->bdev_desc, ch,
567 				     desc->sector * 512, desc->num_sectors * 512,
568 				     blk_request_complete_cb, task);
569 		if (rc) {
570 			if (rc == -ENOMEM) {
571 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
572 				blk_request_queue_io(vdev, ch, task);
573 			} else {
574 				blk_request_finish(VIRTIO_BLK_S_IOERR, task);
575 				return -1;
576 			}
577 		}
578 		break;
579 	case VIRTIO_BLK_T_WRITE_ZEROES:
580 		desc = task->iovs[1].iov_base;
581 		if (payload_len != sizeof(*desc)) {
582 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
583 			blk_request_finish(VIRTIO_BLK_S_IOERR, task);
584 			return -1;
585 		}
586 
587 		/* Unmap this range, SPDK doesn't support it, kernel will enable this flag by default
588 		 * without checking unmap feature is negotiated or not, the flag isn't mandatory, so
589 		 * just print a warning.
590 		 */
591 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
592 			SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n",
593 				     (uint64_t)desc->sector * 512, (uint64_t)desc->num_sectors * 512);
594 		}
595 
596 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, ch,
597 					    desc->sector * 512, desc->num_sectors * 512,
598 					    blk_request_complete_cb, task);
599 		if (rc) {
600 			if (rc == -ENOMEM) {
601 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
602 				blk_request_queue_io(vdev, ch, task);
603 			} else {
604 				blk_request_finish(VIRTIO_BLK_S_IOERR, task);
605 				return -1;
606 			}
607 		}
608 		break;
609 	case VIRTIO_BLK_T_FLUSH:
610 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
611 		if (req.sector != 0) {
612 			SPDK_NOTICELOG("sector must be zero for flush command\n");
613 			blk_request_finish(VIRTIO_BLK_S_IOERR, task);
614 			return -1;
615 		}
616 		rc = spdk_bdev_flush(bvdev->bdev_desc, ch,
617 				     0, flush_bytes,
618 				     blk_request_complete_cb, task);
619 		if (rc) {
620 			if (rc == -ENOMEM) {
621 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
622 				blk_request_queue_io(vdev, ch, task);
623 			} else if (rc == -ENOTSUP) {
624 				blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
625 				return -1;
626 			} else {
627 				blk_request_finish(VIRTIO_BLK_S_IOERR, task);
628 				return -1;
629 			}
630 		}
631 		break;
632 	case VIRTIO_BLK_T_GET_ID:
633 		if (!iovcnt || !payload_len) {
634 			blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
635 			return -1;
636 		}
637 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
638 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_name(bvdev->bdev),
639 				task->used_len, ' ');
640 		blk_request_finish(VIRTIO_BLK_S_OK, task);
641 		break;
642 	default:
643 		SPDK_DEBUGLOG(vhost_blk, "Not supported request type '%"PRIu32"'.\n", type);
644 		blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
645 		return -1;
646 	}
647 
648 	return 0;
649 }
650 
651 static void
652 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
653 {
654 	struct spdk_vhost_user_blk_task *task;
655 	struct spdk_vhost_blk_task *blk_task;
656 	int rc;
657 
658 	assert(vq->packed.packed_ring == false);
659 
660 	task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[req_idx];
661 	blk_task = &task->blk_task;
662 	if (spdk_unlikely(task->used)) {
663 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
664 			    task->bvsession->vsession.name, req_idx);
665 		blk_task->used_len = 0;
666 		blk_task_enqueue(task);
667 		return;
668 	}
669 
670 	blk_task_inc_task_cnt(task);
671 
672 	blk_task_init(task);
673 
674 	rc = blk_iovs_split_queue_setup(task->bvsession, vq, task->req_idx,
675 					blk_task->iovs, &blk_task->iovcnt, &blk_task->payload_size);
676 
677 	if (rc) {
678 		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
679 		/* Only READ and WRITE are supported for now. */
680 		vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, blk_task, NULL);
681 		return;
682 	}
683 
684 	if (vhost_user_process_blk_request(task) == 0) {
685 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
686 			      req_idx);
687 	} else {
688 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, req_idx);
689 	}
690 }
691 
692 static void
693 process_packed_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
694 {
695 	struct spdk_vhost_user_blk_task *task;
696 	struct spdk_vhost_blk_task *blk_task;
697 	uint16_t task_idx = req_idx, num_descs;
698 	int rc;
699 
700 	assert(vq->packed.packed_ring);
701 
702 	/* Packed ring used the buffer_id as the task_idx to get task struct.
703 	 * In kernel driver, it uses the vq->free_head to set the buffer_id so the value
704 	 * must be in the range of 0 ~ vring.size. The free_head value must be unique
705 	 * in the outstanding requests.
706 	 * We can't use the req_idx as the task_idx because the desc can be reused in
707 	 * the next phase even when it's not completed in the previous phase. For example,
708 	 * At phase 0, last_used_idx was 2 and desc0 was not completed.Then after moving
709 	 * phase 1, last_avail_idx is updated to 1. In this case, req_idx can not be used
710 	 * as task_idx because we will know task[0]->used is true at phase 1.
711 	 * The split queue is quite different, the desc would insert into the free list when
712 	 * device completes the request, the driver gets the desc from the free list which
713 	 * ensures the req_idx is unique in the outstanding requests.
714 	 */
715 	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
716 
717 	task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx];
718 	blk_task = &task->blk_task;
719 	if (spdk_unlikely(task->used)) {
720 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
721 			    task->bvsession->vsession.name, task_idx);
722 		blk_task->used_len = 0;
723 		blk_task_enqueue(task);
724 		return;
725 	}
726 
727 	task->req_idx = req_idx;
728 	task->num_descs = num_descs;
729 	task->buffer_id = task_idx;
730 
731 	rte_vhost_set_inflight_desc_packed(task->bvsession->vsession.vid, vq->vring_idx,
732 					   req_idx, (req_idx + num_descs - 1) % vq->vring.size,
733 					   &task->inflight_head);
734 
735 	blk_task_inc_task_cnt(task);
736 
737 	blk_task_init(task);
738 
739 	rc = blk_iovs_packed_queue_setup(task->bvsession, vq, task->req_idx, blk_task->iovs,
740 					 &blk_task->iovcnt,
741 					 &blk_task->payload_size);
742 	if (rc) {
743 		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
744 		/* Only READ and WRITE are supported for now. */
745 		vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, blk_task, NULL);
746 		return;
747 	}
748 
749 	if (vhost_user_process_blk_request(task) == 0) {
750 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
751 			      task_idx);
752 	} else {
753 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
754 	}
755 }
756 
757 static void
758 process_packed_inflight_blk_task(struct spdk_vhost_virtqueue *vq,
759 				 uint16_t req_idx)
760 {
761 	spdk_vhost_inflight_desc *desc_array = vq->vring_inflight.inflight_packed->desc;
762 	spdk_vhost_inflight_desc *desc = &desc_array[req_idx];
763 	struct spdk_vhost_user_blk_task *task;
764 	struct spdk_vhost_blk_task *blk_task;
765 	uint16_t task_idx, num_descs;
766 	int rc;
767 
768 	task_idx = desc_array[desc->last].id;
769 	num_descs = desc->num;
770 	/* In packed ring reconnection, we use the last_used_idx as the
771 	 * initial value. So when we process the inflight descs we still
772 	 * need to update the available ring index.
773 	 */
774 	vq->last_avail_idx += num_descs;
775 	if (vq->last_avail_idx >= vq->vring.size) {
776 		vq->last_avail_idx -= vq->vring.size;
777 		vq->packed.avail_phase = !vq->packed.avail_phase;
778 	}
779 
780 	task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx];
781 	blk_task = &task->blk_task;
782 	if (spdk_unlikely(task->used)) {
783 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
784 			    task->bvsession->vsession.name, task_idx);
785 		blk_task->used_len = 0;
786 		blk_task_enqueue(task);
787 		return;
788 	}
789 
790 	task->req_idx = req_idx;
791 	task->num_descs = num_descs;
792 	task->buffer_id = task_idx;
793 	/* It's for cleaning inflight entries */
794 	task->inflight_head = req_idx;
795 
796 	blk_task_inc_task_cnt(task);
797 
798 	blk_task_init(task);
799 
800 	rc = blk_iovs_inflight_queue_setup(task->bvsession, vq, task->req_idx, blk_task->iovs,
801 					   &blk_task->iovcnt,
802 					   &blk_task->payload_size);
803 	if (rc) {
804 		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
805 		/* Only READ and WRITE are supported for now. */
806 		vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, blk_task, NULL);
807 		return;
808 	}
809 
810 	if (vhost_user_process_blk_request(task) == 0) {
811 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
812 			      task_idx);
813 	} else {
814 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
815 	}
816 }
817 
818 static int
819 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
820 		     struct spdk_vhost_virtqueue *vq)
821 {
822 	struct spdk_vhost_session *vsession;
823 	spdk_vhost_resubmit_info *resubmit;
824 	spdk_vhost_resubmit_desc *resubmit_list;
825 	uint16_t req_idx;
826 	int i, resubmit_cnt;
827 
828 	resubmit = vq->vring_inflight.resubmit_inflight;
829 	if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL ||
830 			resubmit->resubmit_num == 0)) {
831 		return 0;
832 	}
833 
834 	resubmit_list = resubmit->resubmit_list;
835 	vsession = &bvsession->vsession;
836 
837 	for (i = resubmit->resubmit_num - 1; i >= 0; --i) {
838 		req_idx = resubmit_list[i].index;
839 		SPDK_DEBUGLOG(vhost_blk, "====== Start processing resubmit request idx %"PRIu16"======\n",
840 			      req_idx);
841 
842 		if (spdk_unlikely(req_idx >= vq->vring.size)) {
843 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
844 				    vsession->name, req_idx, vq->vring.size);
845 			vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
846 			continue;
847 		}
848 
849 		if (vq->packed.packed_ring) {
850 			process_packed_inflight_blk_task(vq, req_idx);
851 		} else {
852 			process_blk_task(vq, req_idx);
853 		}
854 	}
855 	resubmit_cnt = resubmit->resubmit_num;
856 	resubmit->resubmit_num = 0;
857 	return resubmit_cnt;
858 }
859 
860 static int
861 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
862 {
863 	struct spdk_vhost_session *vsession = &bvsession->vsession;
864 	uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
865 	uint16_t reqs_cnt, i;
866 	int resubmit_cnt = 0;
867 
868 	resubmit_cnt = submit_inflight_desc(bvsession, vq);
869 
870 	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
871 	if (!reqs_cnt) {
872 		return resubmit_cnt;
873 	}
874 
875 	for (i = 0; i < reqs_cnt; i++) {
876 		SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
877 			      reqs[i]);
878 
879 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
880 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
881 				    vsession->name, reqs[i], vq->vring.size);
882 			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
883 			continue;
884 		}
885 
886 		rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]);
887 
888 		process_blk_task(vq, reqs[i]);
889 	}
890 
891 	return reqs_cnt;
892 }
893 
894 static int
895 process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
896 {
897 	uint16_t i = 0;
898 	uint16_t count = 0;
899 	int resubmit_cnt = 0;
900 
901 	resubmit_cnt = submit_inflight_desc(bvsession, vq);
902 
903 	while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS &&
904 	       vhost_vq_packed_ring_is_avail(vq)) {
905 		SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
906 			      vq->last_avail_idx);
907 		count++;
908 		process_packed_blk_task(vq, vq->last_avail_idx);
909 	}
910 
911 	return count > 0 ? count : resubmit_cnt;
912 }
913 
914 static int
915 _vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
916 {
917 	struct spdk_vhost_session *vsession = vq->vsession;
918 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
919 	bool packed_ring;
920 	int rc = 0;
921 
922 	packed_ring = vq->packed.packed_ring;
923 	if (packed_ring) {
924 		rc = process_packed_vq(bvsession, vq);
925 	} else {
926 		rc = process_vq(bvsession, vq);
927 	}
928 
929 	vhost_session_vq_used_signal(vq);
930 
931 	return rc;
932 
933 }
934 
/* Callback-shaped wrapper around _vdev_vq_worker(); arg is the virtqueue to process. */
static int
vdev_vq_worker(void *arg)
{
	return _vdev_vq_worker((struct spdk_vhost_virtqueue *)arg);
}
942 
943 static int
944 vdev_worker(void *arg)
945 {
946 	struct spdk_vhost_blk_session *bvsession = arg;
947 	struct spdk_vhost_session *vsession = &bvsession->vsession;
948 	uint16_t q_idx;
949 	int rc = 0;
950 
951 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
952 		rc += _vdev_vq_worker(&vsession->virtqueue[q_idx]);
953 	}
954 
955 	return rc > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
956 }
957 
958 static void
959 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
960 {
961 	struct spdk_vhost_session *vsession = &bvsession->vsession;
962 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
963 	uint32_t length;
964 	uint16_t iovcnt, req_idx;
965 
966 	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
967 		return;
968 	}
969 
970 	iovcnt = SPDK_COUNTOF(iovs);
971 	if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
972 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
973 		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
974 	}
975 
976 	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
977 }
978 
979 static void
980 no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
981 {
982 	struct spdk_vhost_session *vsession = &bvsession->vsession;
983 	struct spdk_vhost_user_blk_task *task;
984 	struct spdk_vhost_blk_task *blk_task;
985 	uint32_t length;
986 	uint16_t req_idx = vq->last_avail_idx;
987 	uint16_t task_idx, num_descs;
988 
989 	if (!vhost_vq_packed_ring_is_avail(vq)) {
990 		return;
991 	}
992 
993 	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
994 	task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx];
995 	blk_task = &task->blk_task;
996 	if (spdk_unlikely(task->used)) {
997 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
998 			    vsession->name, req_idx);
999 		vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
1000 					     task->buffer_id, blk_task->used_len,
1001 					     task->inflight_head);
1002 		return;
1003 	}
1004 
1005 	task->req_idx = req_idx;
1006 	task->num_descs = num_descs;
1007 	task->buffer_id = task_idx;
1008 	blk_task_init(task);
1009 
1010 	if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, blk_task->iovs, &blk_task->iovcnt,
1011 					&length)) {
1012 		*(volatile uint8_t *)(blk_task->iovs[blk_task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
1013 		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
1014 	}
1015 
1016 	task->used = false;
1017 	vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
1018 				     task->buffer_id, blk_task->used_len,
1019 				     task->inflight_head);
1020 }
1021 
1022 static int
1023 _no_bdev_vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
1024 {
1025 	struct spdk_vhost_session *vsession = vq->vsession;
1026 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1027 	bool packed_ring;
1028 
1029 	packed_ring = vq->packed.packed_ring;
1030 	if (packed_ring) {
1031 		no_bdev_process_packed_vq(bvsession, vq);
1032 	} else {
1033 		no_bdev_process_vq(bvsession, vq);
1034 	}
1035 
1036 	vhost_session_vq_used_signal(vq);
1037 
1038 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
1039 		vhost_blk_put_io_channel(bvsession->io_channel);
1040 		bvsession->io_channel = NULL;
1041 	}
1042 
1043 	return SPDK_POLLER_BUSY;
1044 }
1045 
/*
 * Per-virtqueue callback used when no bdev is attached (registered as the
 * kick-fd interrupt handler); 'arg' is the struct spdk_vhost_virtqueue.
 */
static int
no_bdev_vdev_vq_worker(void *arg)
{
	return _no_bdev_vdev_vq_worker((struct spdk_vhost_virtqueue *)arg);
}
1053 
1054 static int
1055 no_bdev_vdev_worker(void *arg)
1056 {
1057 	struct spdk_vhost_blk_session *bvsession = arg;
1058 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1059 	uint16_t q_idx;
1060 
1061 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
1062 		_no_bdev_vdev_vq_worker(&vsession->virtqueue[q_idx]);
1063 	}
1064 
1065 	return SPDK_POLLER_BUSY;
1066 }
1067 
1068 static void
1069 vhost_blk_session_unregister_interrupts(struct spdk_vhost_blk_session *bvsession)
1070 {
1071 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1072 	struct spdk_vhost_virtqueue *vq;
1073 	int i;
1074 
1075 	SPDK_DEBUGLOG(vhost_blk, "unregister virtqueues interrupt\n");
1076 	for (i = 0; i < vsession->max_queues; i++) {
1077 		vq = &vsession->virtqueue[i];
1078 		if (vq->intr == NULL) {
1079 			break;
1080 		}
1081 
1082 		SPDK_DEBUGLOG(vhost_blk, "unregister vq[%d]'s kickfd is %d\n",
1083 			      i, vq->vring.kickfd);
1084 		spdk_interrupt_unregister(&vq->intr);
1085 	}
1086 }
1087 
1088 static void
1089 _vhost_blk_vq_register_interrupt(void *arg)
1090 {
1091 	struct spdk_vhost_virtqueue *vq = arg;
1092 	struct spdk_vhost_session *vsession = vq->vsession;
1093 	struct spdk_vhost_blk_dev *bvdev =  to_blk_dev(vsession->vdev);
1094 
1095 	assert(bvdev != NULL);
1096 
1097 	if (bvdev->bdev) {
1098 		vq->intr = spdk_interrupt_register(vq->vring.kickfd, vdev_vq_worker, vq, "vdev_vq_worker");
1099 	} else {
1100 		vq->intr = spdk_interrupt_register(vq->vring.kickfd, no_bdev_vdev_vq_worker, vq,
1101 						   "no_bdev_vdev_vq_worker");
1102 	}
1103 
1104 	if (vq->intr == NULL) {
1105 		SPDK_ERRLOG("Fail to register req notifier handler.\n");
1106 		assert(false);
1107 	}
1108 }
1109 
1110 static int
1111 vhost_blk_vq_enable(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq)
1112 {
1113 	if (spdk_interrupt_mode_is_enabled()) {
1114 		spdk_thread_send_msg(vsession->vdev->thread, _vhost_blk_vq_register_interrupt, vq);
1115 	}
1116 
1117 	return 0;
1118 }
1119 
1120 static int
1121 vhost_blk_session_register_no_bdev_interrupts(struct spdk_vhost_blk_session *bvsession)
1122 {
1123 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1124 	struct spdk_vhost_virtqueue *vq = NULL;
1125 	int i;
1126 
1127 	SPDK_DEBUGLOG(vhost_blk, "Register virtqueues interrupt\n");
1128 	for (i = 0; i < vsession->max_queues; i++) {
1129 		vq = &vsession->virtqueue[i];
1130 		SPDK_DEBUGLOG(vhost_blk, "Register vq[%d]'s kickfd is %d\n",
1131 			      i, vq->vring.kickfd);
1132 		vq->intr = spdk_interrupt_register(vq->vring.kickfd, no_bdev_vdev_vq_worker, vq,
1133 						   "no_bdev_vdev_vq_worker");
1134 		if (vq->intr == NULL) {
1135 			goto err;
1136 		}
1137 
1138 	}
1139 
1140 	return 0;
1141 
1142 err:
1143 	vhost_blk_session_unregister_interrupts(bvsession);
1144 	return -1;
1145 }
1146 
1147 static void
1148 vhost_blk_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
1149 {
1150 	struct spdk_vhost_blk_session *bvsession = cb_arg;
1151 
1152 	vhost_user_session_set_interrupt_mode(&bvsession->vsession, interrupt_mode);
1153 }
1154 
1155 static void
1156 bdev_event_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
1157 {
1158 	enum spdk_bdev_event_type type = (enum spdk_bdev_event_type)(uintptr_t)ctx;
1159 	struct spdk_vhost_blk_dev *bvdev;
1160 
1161 	if (type == SPDK_BDEV_EVENT_REMOVE) {
1162 		/* All sessions have been notified, time to close the bdev */
1163 		bvdev = to_blk_dev(vdev);
1164 		assert(bvdev != NULL);
1165 		spdk_bdev_close(bvdev->bdev_desc);
1166 		bvdev->bdev_desc = NULL;
1167 		bvdev->bdev = NULL;
1168 	}
1169 }
1170 
1171 static int
1172 vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev,
1173 			     struct spdk_vhost_session *vsession,
1174 			     void *ctx)
1175 {
1176 	SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid);
1177 #if RTE_VERSION >= RTE_VERSION_NUM(23, 03, 0, 0)
1178 	rte_vhost_backend_config_change(vsession->vid, false);
1179 #else
1180 	rte_vhost_slave_config_change(vsession->vid, false);
1181 #endif
1182 
1183 	return 0;
1184 }
1185 
1186 static void
1187 vhost_user_blk_resize_cb(struct spdk_vhost_dev *vdev, bdev_event_cb_complete cb, void *cb_arg)
1188 {
1189 	vhost_user_dev_foreach_session(vdev, vhost_session_bdev_resize_cb,
1190 				       cb, cb_arg);
1191 }
1192 
1193 static int
1194 vhost_user_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
1195 				  struct spdk_vhost_session *vsession,
1196 				  void *ctx)
1197 {
1198 	struct spdk_vhost_blk_session *bvsession;
1199 	int rc;
1200 
1201 	bvsession = to_blk_session(vsession);
1202 	if (bvsession->requestq_poller) {
1203 		spdk_poller_unregister(&bvsession->requestq_poller);
1204 		if (spdk_interrupt_mode_is_enabled()) {
1205 			vhost_blk_session_unregister_interrupts(bvsession);
1206 			rc = vhost_blk_session_register_no_bdev_interrupts(bvsession);
1207 			if (rc) {
1208 				SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
1209 				return rc;
1210 			}
1211 		}
1212 
1213 		bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
1214 		spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode,
1215 					       bvsession);
1216 	}
1217 
1218 	return 0;
1219 }
1220 
1221 static void
1222 vhost_user_bdev_remove_cb(struct spdk_vhost_dev *vdev, bdev_event_cb_complete cb, void *cb_arg)
1223 {
1224 	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
1225 		     vdev->name);
1226 
1227 	vhost_user_dev_foreach_session(vdev, vhost_user_session_bdev_remove_cb,
1228 				       cb, cb_arg);
1229 }
1230 
1231 static void
1232 vhost_user_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_vhost_dev *vdev,
1233 			 bdev_event_cb_complete cb, void *cb_arg)
1234 {
1235 	switch (type) {
1236 	case SPDK_BDEV_EVENT_REMOVE:
1237 		vhost_user_bdev_remove_cb(vdev, cb, cb_arg);
1238 		break;
1239 	case SPDK_BDEV_EVENT_RESIZE:
1240 		vhost_user_blk_resize_cb(vdev, cb, cb_arg);
1241 		break;
1242 	default:
1243 		assert(false);
1244 		return;
1245 	}
1246 }
1247 
1248 static void
1249 bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
1250 	      void *event_ctx)
1251 {
1252 	struct spdk_vhost_dev *vdev = (struct spdk_vhost_dev *)event_ctx;
1253 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1254 
1255 	assert(bvdev != NULL);
1256 
1257 	SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n",
1258 		      type,
1259 		      bdev->name);
1260 
1261 	switch (type) {
1262 	case SPDK_BDEV_EVENT_REMOVE:
1263 	case SPDK_BDEV_EVENT_RESIZE:
1264 		bvdev->ops->bdev_event(type, vdev, bdev_event_cpl_cb, (void *)type);
1265 		break;
1266 	default:
1267 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
1268 		break;
1269 	}
1270 }
1271 
1272 static void
1273 free_task_pool(struct spdk_vhost_blk_session *bvsession)
1274 {
1275 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1276 	struct spdk_vhost_virtqueue *vq;
1277 	uint16_t i;
1278 
1279 	for (i = 0; i < vsession->max_queues; i++) {
1280 		vq = &vsession->virtqueue[i];
1281 		if (vq->tasks == NULL) {
1282 			continue;
1283 		}
1284 
1285 		spdk_free(vq->tasks);
1286 		vq->tasks = NULL;
1287 	}
1288 }
1289 
1290 static int
1291 alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid)
1292 {
1293 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1294 	struct spdk_vhost_virtqueue *vq;
1295 	struct spdk_vhost_user_blk_task *task;
1296 	uint32_t task_cnt;
1297 	uint32_t j;
1298 
1299 	if (qid >= SPDK_VHOST_MAX_VQUEUES) {
1300 		return -EINVAL;
1301 	}
1302 
1303 	vq = &vsession->virtqueue[qid];
1304 	if (vq->vring.desc == NULL) {
1305 		return 0;
1306 	}
1307 
1308 	task_cnt = vq->vring.size;
1309 	if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
1310 		/* sanity check */
1311 		SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
1312 			    vsession->name, qid, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
1313 		return -1;
1314 	}
1315 	vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_user_blk_task) * task_cnt,
1316 				 SPDK_CACHE_LINE_SIZE, NULL,
1317 				 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1318 	if (vq->tasks == NULL) {
1319 		SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
1320 			    vsession->name, task_cnt, qid);
1321 		return -1;
1322 	}
1323 
1324 	for (j = 0; j < task_cnt; j++) {
1325 		task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[j];
1326 		task->bvsession = bvsession;
1327 		task->req_idx = j;
1328 		task->vq = vq;
1329 	}
1330 
1331 	return 0;
1332 }
1333 
1334 static int
1335 vhost_blk_start(struct spdk_vhost_dev *vdev,
1336 		struct spdk_vhost_session *vsession, void *unused)
1337 {
1338 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1339 	struct spdk_vhost_blk_dev *bvdev;
1340 	int i;
1341 
1342 	/* return if start is already in progress */
1343 	if (bvsession->requestq_poller) {
1344 		SPDK_INFOLOG(vhost, "%s: start in progress\n", vsession->name);
1345 		return -EINPROGRESS;
1346 	}
1347 
1348 	/* validate all I/O queues are in a contiguous index range */
1349 	for (i = 0; i < vsession->max_queues; i++) {
1350 		/* vring.desc and vring.desc_packed are in a union struct
1351 		 * so q->vring.desc can replace q->vring.desc_packed.
1352 		 */
1353 		if (vsession->virtqueue[i].vring.desc == NULL) {
1354 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
1355 			return -1;
1356 		}
1357 	}
1358 
1359 	bvdev = to_blk_dev(vdev);
1360 	assert(bvdev != NULL);
1361 	bvsession->bvdev = bvdev;
1362 
1363 	if (bvdev->bdev) {
1364 		bvsession->io_channel = vhost_blk_get_io_channel(vdev);
1365 		if (!bvsession->io_channel) {
1366 			free_task_pool(bvsession);
1367 			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
1368 			return -1;
1369 		}
1370 	}
1371 
1372 	if (bvdev->bdev) {
1373 		bvsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, bvsession, 0);
1374 	} else {
1375 		bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
1376 	}
1377 	SPDK_INFOLOG(vhost, "%s: started poller on lcore %d\n",
1378 		     vsession->name, spdk_env_get_current_core());
1379 
1380 	spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode,
1381 				       bvsession);
1382 
1383 	return 0;
1384 }
1385 
1386 static int
1387 destroy_session_poller_cb(void *arg)
1388 {
1389 	struct spdk_vhost_blk_session *bvsession = arg;
1390 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1391 	struct spdk_vhost_user_dev *user_dev = to_user_dev(vsession->vdev);
1392 	int i;
1393 
1394 	if (vsession->task_cnt > 0 || (pthread_mutex_trylock(&user_dev->lock) != 0)) {
1395 		assert(vsession->stop_retry_count > 0);
1396 		vsession->stop_retry_count--;
1397 		if (vsession->stop_retry_count == 0) {
1398 			SPDK_ERRLOG("%s: Timedout when destroy session (task_cnt %d)\n", vsession->name,
1399 				    vsession->task_cnt);
1400 			spdk_poller_unregister(&bvsession->stop_poller);
1401 			vhost_user_session_stop_done(vsession, -ETIMEDOUT);
1402 		}
1403 
1404 		return SPDK_POLLER_BUSY;
1405 	}
1406 
1407 	for (i = 0; i < vsession->max_queues; i++) {
1408 		vsession->virtqueue[i].next_event_time = 0;
1409 		vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
1410 	}
1411 
1412 	SPDK_INFOLOG(vhost, "%s: stopping poller on lcore %d\n",
1413 		     vsession->name, spdk_env_get_current_core());
1414 
1415 	if (bvsession->io_channel) {
1416 		vhost_blk_put_io_channel(bvsession->io_channel);
1417 		bvsession->io_channel = NULL;
1418 	}
1419 
1420 	free_task_pool(bvsession);
1421 	spdk_poller_unregister(&bvsession->stop_poller);
1422 	vhost_user_session_stop_done(vsession, 0);
1423 
1424 	pthread_mutex_unlock(&user_dev->lock);
1425 	return SPDK_POLLER_BUSY;
1426 }
1427 
1428 static int
1429 vhost_blk_stop(struct spdk_vhost_dev *vdev,
1430 	       struct spdk_vhost_session *vsession, void *unused)
1431 {
1432 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1433 
1434 	/* return if stop is already in progress */
1435 	if (bvsession->stop_poller) {
1436 		return -EINPROGRESS;
1437 	}
1438 
1439 	spdk_poller_unregister(&bvsession->requestq_poller);
1440 	vhost_blk_session_unregister_interrupts(bvsession);
1441 
1442 	bvsession->vsession.stop_retry_count = (SPDK_VHOST_SESSION_STOP_RETRY_TIMEOUT_IN_SEC * 1000 *
1443 						1000) / SPDK_VHOST_SESSION_STOP_RETRY_PERIOD_IN_US;
1444 	bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
1445 				 bvsession, SPDK_VHOST_SESSION_STOP_RETRY_PERIOD_IN_US);
1446 	return 0;
1447 }
1448 
1449 static void
1450 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1451 {
1452 	struct spdk_vhost_blk_dev *bvdev;
1453 
1454 	bvdev = to_blk_dev(vdev);
1455 	assert(bvdev != NULL);
1456 
1457 	spdk_json_write_named_object_begin(w, "block");
1458 
1459 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1460 
1461 	spdk_json_write_name(w, "bdev");
1462 	if (bvdev->bdev) {
1463 		spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev));
1464 	} else {
1465 		spdk_json_write_null(w);
1466 	}
1467 	spdk_json_write_named_string(w, "transport", bvdev->ops->name);
1468 
1469 	spdk_json_write_object_end(w);
1470 }
1471 
1472 static void
1473 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1474 {
1475 	struct spdk_vhost_blk_dev *bvdev;
1476 
1477 	bvdev = to_blk_dev(vdev);
1478 	assert(bvdev != NULL);
1479 
1480 	if (!bvdev->bdev) {
1481 		return;
1482 	}
1483 
1484 	spdk_json_write_object_begin(w);
1485 	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
1486 
1487 	spdk_json_write_named_object_begin(w, "params");
1488 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
1489 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
1490 	spdk_json_write_named_string(w, "cpumask",
1491 				     spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
1492 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1493 	spdk_json_write_named_string(w, "transport", bvdev->ops->name);
1494 	spdk_json_write_object_end(w);
1495 
1496 	spdk_json_write_object_end(w);
1497 }
1498 
/* Forward declaration: vhost_blk_device_backend refers to this function
 * before its definition further down in this file.
 */
static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
1500 
1501 static int
1502 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
1503 		     uint32_t len)
1504 {
1505 	struct virtio_blk_config blkcfg;
1506 	struct spdk_bdev *bdev;
1507 	uint32_t blk_size;
1508 	uint64_t blkcnt;
1509 
1510 	memset(&blkcfg, 0, sizeof(blkcfg));
1511 	bdev = vhost_blk_get_bdev(vdev);
1512 	if (bdev == NULL) {
1513 		/* We can't just return -1 here as this GET_CONFIG message might
1514 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
1515 		 * error to QEMU, who might then decide to terminate itself.
1516 		 * We don't want that. A simple reboot shouldn't break the system.
1517 		 *
1518 		 * Presenting a block device with block size 0 and block count 0
1519 		 * doesn't cause any problems on QEMU side and the virtio-pci
1520 		 * device is even still available inside the VM, but there will
1521 		 * be no block device created for it - the kernel drivers will
1522 		 * silently reject it.
1523 		 */
1524 		blk_size = 0;
1525 		blkcnt = 0;
1526 	} else {
1527 		blk_size = spdk_bdev_get_block_size(bdev);
1528 		blkcnt = spdk_bdev_get_num_blocks(bdev);
1529 		if (spdk_bdev_get_buf_align(bdev) > 1) {
1530 			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
1531 			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, SPDK_BDEV_IO_NUM_CHILD_IOV - 2 - 1);
1532 		} else {
1533 			blkcfg.size_max = 131072;
1534 			/*  -2 for REQ and RESP and -1 for region boundary splitting */
1535 			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
1536 		}
1537 	}
1538 
1539 	blkcfg.blk_size = blk_size;
1540 	/* minimum I/O size in blocks */
1541 	blkcfg.min_io_size = 1;
1542 	/* expressed in 512 Bytes sectors */
1543 	blkcfg.capacity = (blkcnt * blk_size) / 512;
1544 	/* QEMU can overwrite this value when started */
1545 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
1546 
1547 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1548 		/* 16MiB, expressed in 512 Bytes */
1549 		blkcfg.max_discard_sectors = 32768;
1550 		blkcfg.max_discard_seg = 1;
1551 		blkcfg.discard_sector_alignment = blk_size / 512;
1552 	}
1553 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1554 		blkcfg.max_write_zeroes_sectors = 32768;
1555 		blkcfg.max_write_zeroes_seg = 1;
1556 	}
1557 
1558 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
1559 
1560 	return 0;
1561 }
1562 
1563 static int
1564 vhost_blk_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
1565 			 uint32_t iops_threshold)
1566 {
1567 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1568 
1569 	assert(bvdev != NULL);
1570 
1571 	return bvdev->ops->set_coalescing(vdev, delay_base_us, iops_threshold);
1572 }
1573 
1574 static void
1575 vhost_blk_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
1576 			 uint32_t *iops_threshold)
1577 {
1578 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1579 
1580 	assert(bvdev != NULL);
1581 
1582 	bvdev->ops->get_coalescing(vdev, delay_base_us, iops_threshold);
1583 }
1584 
/* vhost-user session callbacks of the blk backend. */
static const struct spdk_vhost_user_dev_backend vhost_blk_user_device_backend = {
	/* Size of the blk-specific part that follows the generic session. */
	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
	.start_session =  vhost_blk_start,
	.stop_session = vhost_blk_stop,
	.alloc_vq_tasks = alloc_vq_task_pool,
	.enable_vq = vhost_blk_vq_enable,
};
1592 
/* Device-level callbacks of the blk backend, passed to vhost_dev_register(). */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
	.type = VHOST_BACKEND_BLK,
	.vhost_get_config = vhost_blk_get_config,
	.dump_info_json = vhost_blk_dump_info_json,
	.write_config_json = vhost_blk_write_config_json,
	.remove_device = vhost_blk_destroy,
	.set_coalescing = vhost_blk_set_coalescing,
	.get_coalescing = vhost_blk_get_coalescing,
};
1602 
1603 int
1604 virtio_blk_construct_ctrlr(struct spdk_vhost_dev *vdev, const char *address,
1605 			   struct spdk_cpuset *cpumask, const struct spdk_json_val *params,
1606 			   const struct spdk_vhost_user_dev_backend *user_backend)
1607 {
1608 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1609 
1610 	assert(bvdev != NULL);
1611 
1612 	return bvdev->ops->create_ctrlr(vdev, cpumask, address, params, (void *)user_backend);
1613 }
1614 
1615 int
1616 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
1617 			 const char *transport, const struct spdk_json_val *params)
1618 {
1619 	struct spdk_vhost_blk_dev *bvdev = NULL;
1620 	struct spdk_vhost_dev *vdev;
1621 	struct spdk_bdev *bdev;
1622 	const char *transport_name = VIRTIO_BLK_DEFAULT_TRANSPORT;
1623 	int ret = 0;
1624 
1625 	bvdev = calloc(1, sizeof(*bvdev));
1626 	if (bvdev == NULL) {
1627 		ret = -ENOMEM;
1628 		goto out;
1629 	}
1630 
1631 	if (transport != NULL) {
1632 		transport_name = transport;
1633 	}
1634 
1635 	bvdev->ops = virtio_blk_get_transport_ops(transport_name);
1636 	if (!bvdev->ops) {
1637 		ret = -EINVAL;
1638 		SPDK_ERRLOG("Transport type '%s' unavailable.\n", transport_name);
1639 		goto out;
1640 	}
1641 
1642 	ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc);
1643 	if (ret != 0) {
1644 		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
1645 			    name, dev_name, ret);
1646 		goto out;
1647 	}
1648 	bdev = spdk_bdev_desc_get_bdev(bvdev->bdev_desc);
1649 
1650 	vdev = &bvdev->vdev;
1651 	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
1652 	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
1653 	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
1654 
1655 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1656 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1657 	}
1658 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1659 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1660 	}
1661 
1662 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1663 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1664 	}
1665 
1666 	bvdev->bdev = bdev;
1667 	bvdev->readonly = false;
1668 	ret = vhost_dev_register(vdev, name, cpumask, params, &vhost_blk_device_backend,
1669 				 &vhost_blk_user_device_backend, false);
1670 	if (ret != 0) {
1671 		spdk_bdev_close(bvdev->bdev_desc);
1672 		goto out;
1673 	}
1674 
1675 	SPDK_INFOLOG(vhost, "%s: using bdev '%s'\n", name, dev_name);
1676 out:
1677 	if (ret != 0 && bvdev) {
1678 		free(bvdev);
1679 	}
1680 	return ret;
1681 }
1682 
1683 int
1684 virtio_blk_destroy_ctrlr(struct spdk_vhost_dev *vdev)
1685 {
1686 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1687 
1688 	assert(bvdev != NULL);
1689 
1690 	return bvdev->ops->destroy_ctrlr(vdev);
1691 }
1692 
1693 static int
1694 vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1695 {
1696 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1697 	int rc;
1698 
1699 	assert(bvdev != NULL);
1700 
1701 	rc = vhost_dev_unregister(&bvdev->vdev);
1702 	if (rc != 0) {
1703 		return rc;
1704 	}
1705 
1706 	if (bvdev->bdev_desc) {
1707 		spdk_bdev_close(bvdev->bdev_desc);
1708 		bvdev->bdev_desc = NULL;
1709 	}
1710 	bvdev->bdev = NULL;
1711 
1712 	free(bvdev);
1713 	return 0;
1714 }
1715 
1716 struct spdk_io_channel *
1717 vhost_blk_get_io_channel(struct spdk_vhost_dev *vdev)
1718 {
1719 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1720 
1721 	assert(bvdev != NULL);
1722 
1723 	return spdk_bdev_get_io_channel(bvdev->bdev_desc);
1724 }
1725 
/* Release an I/O channel obtained with vhost_blk_get_io_channel(). */
void
vhost_blk_put_io_channel(struct spdk_io_channel *ch)
{
	spdk_put_io_channel(ch);
}
1731 
1732 static struct spdk_virtio_blk_transport *
1733 vhost_user_blk_create(const struct spdk_json_val *params)
1734 {
1735 	int ret;
1736 	struct spdk_virtio_blk_transport *vhost_user_blk;
1737 
1738 	vhost_user_blk = calloc(1, sizeof(*vhost_user_blk));
1739 	if (!vhost_user_blk) {
1740 		return NULL;
1741 	}
1742 
1743 	ret = vhost_user_init();
1744 	if (ret != 0) {
1745 		free(vhost_user_blk);
1746 		return NULL;
1747 	}
1748 
1749 	return vhost_user_blk;
1750 }
1751 
1752 static int
1753 vhost_user_blk_destroy(struct spdk_virtio_blk_transport *transport,
1754 		       spdk_vhost_fini_cb cb_fn)
1755 {
1756 	vhost_user_fini(cb_fn);
1757 	free(transport);
1758 	return 0;
1759 }
1760 
/* Options decoded from the JSON params when creating a vhost-user blk controller. */
struct rpc_vhost_blk {
	/* Expose the device as read-only (sets VIRTIO_BLK_F_RO). */
	bool readonly;
	/* Offer the packed ring feature (VIRTIO_F_RING_PACKED). */
	bool packed_ring;
};
1765 
/* JSON decoders for struct rpc_vhost_blk.
 * NOTE(review): the trailing 'true' presumably marks each key as optional -
 * confirm against struct spdk_json_object_decoder.
 */
static const struct spdk_json_object_decoder rpc_construct_vhost_blk[] = {
	{"readonly", offsetof(struct rpc_vhost_blk, readonly), spdk_json_decode_bool, true},
	{"packed_ring", offsetof(struct rpc_vhost_blk, packed_ring), spdk_json_decode_bool, true},
};
1770 
1771 static int
1772 vhost_user_blk_create_ctrlr(struct spdk_vhost_dev *vdev, struct spdk_cpuset *cpumask,
1773 			    const char *address, const struct spdk_json_val *params, void *custom_opts)
1774 {
1775 	struct rpc_vhost_blk req = {0};
1776 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1777 
1778 	assert(bvdev != NULL);
1779 
1780 	if (spdk_json_decode_object_relaxed(params, rpc_construct_vhost_blk,
1781 					    SPDK_COUNTOF(rpc_construct_vhost_blk),
1782 					    &req)) {
1783 		SPDK_DEBUGLOG(vhost_blk, "spdk_json_decode_object failed\n");
1784 		return -EINVAL;
1785 	}
1786 
1787 	if (req.packed_ring) {
1788 		vdev->virtio_features |= (uint64_t)req.packed_ring << VIRTIO_F_RING_PACKED;
1789 	}
1790 	if (req.readonly) {
1791 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
1792 		bvdev->readonly = req.readonly;
1793 	}
1794 
1795 	return vhost_user_dev_create(vdev, address, cpumask, custom_opts, false);
1796 }
1797 
/*
 * Transport 'destroy_ctrlr' operation: unregister the vhost-user device and
 * return the result of vhost_user_dev_unregister().
 */
static int
vhost_user_blk_destroy_ctrlr(struct spdk_vhost_dev *vdev)
{
	int rc = vhost_user_dev_unregister(vdev);

	return rc;
}
1803 
1804 static void
1805 vhost_user_blk_dump_opts(struct spdk_virtio_blk_transport *transport, struct spdk_json_write_ctx *w)
1806 {
1807 	assert(w != NULL);
1808 
1809 	spdk_json_write_named_string(w, "name", transport->ops->name);
1810 }
1811 
/* Operations of the "vhost_user_blk" virtio-blk transport. */
static const struct spdk_virtio_blk_transport_ops vhost_user_blk = {
	.name = "vhost_user_blk",

	.dump_opts = vhost_user_blk_dump_opts,

	/* Transport lifetime */
	.create = vhost_user_blk_create,
	.destroy = vhost_user_blk_destroy,

	/* Controller lifetime */
	.create_ctrlr = vhost_user_blk_create_ctrlr,
	.destroy_ctrlr = vhost_user_blk_destroy_ctrlr,

	/* bdev event handling and interrupt coalescing */
	.bdev_event = vhost_user_bdev_event_cb,
	.set_coalescing = vhost_user_set_coalescing,
	.get_coalescing = vhost_user_get_coalescing,
};
1827 
/* Register the vhost-user blk transport operations defined above. */
SPDK_VIRTIO_BLK_TRANSPORT_REGISTER(vhost_user_blk, &vhost_user_blk);

/* Log components used by the SPDK_DEBUGLOG() calls in this file. */
SPDK_LOG_REGISTER_COMPONENT(vhost_blk)
SPDK_LOG_REGISTER_COMPONENT(vhost_blk_data)
1832