xref: /spdk/lib/vhost/vhost_blk.c (revision 26ae3d66d89392c108a30e405ca8424617a03417)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/thread.h"
40 #include "spdk/likely.h"
41 #include "spdk/string.h"
42 #include "spdk/util.h"
43 #include "spdk/vhost.h"
44 
45 #include "vhost_internal.h"
46 #include <rte_version.h>
47 
/*
 * Minimal set of features supported by every SPDK VHOST-BLK device:
 * the generic SPDK_VHOST_FEATURES plus the virtio-blk feature bits listed here.
 */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
		(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
		(1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER)  | \
		(1ULL << VIRTIO_BLK_F_SCSI)     | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_MQ))

/*
 * Features that are not supported.
 * GEOMETRY, CONFIG_WCE, BARRIER and SCSI also appear in
 * SPDK_VHOST_BLK_FEATURES_BASE; how the two masks are combined is decided
 * by the code that consumes them, outside this part of the file.
 */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI))

/* Vhost-user protocol features supported by vhost-blk */
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
64 
/* Per-request context. A virtqueue's `tasks` array holds these, indexed by task index. */
struct spdk_vhost_blk_task {
	/* NOTE(review): not referenced in this part of the file - confirm it is still needed. */
	struct spdk_bdev_io *bdev_io;
	/* Session the task belongs to; its vsession.task_cnt counts tasks in flight. */
	struct spdk_vhost_blk_session *bvsession;
	/* Virtqueue the request is processed and completed on. */
	struct spdk_vhost_virtqueue *vq;

	/*
	 * Points at the status byte of the request (base of its last iovec).
	 * NULL until the request has been parsed successfully.
	 */
	volatile uint8_t *status;

	/* Index of the request's first descriptor. */
	uint16_t req_idx;
	/* Packed ring only: number of descriptors of the request, reported back on completion. */
	uint16_t num_descs;
	/* Packed ring only: buffer id of the request; doubles as the task index. */
	uint16_t buffer_id;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written; passed to the ring when the request is completed. */
	uint32_t used_len;
	/* Number of entries in iovs; reset to the array capacity by blk_task_init(). */
	uint16_t iovcnt;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
87 
/* A vhost-blk device: a generic vhost device bound to one bdev. */
struct spdk_vhost_blk_dev {
	/* Generic vhost device; to_blk_dev() recovers this struct from a pointer to it. */
	struct spdk_vhost_dev vdev;
	/* Backing bdev; set to NULL once hot-remove has completed. */
	struct spdk_bdev *bdev;
	/* Open descriptor for bdev; closed and set to NULL once hot-remove has completed. */
	struct spdk_bdev_desc *bdev_desc;
	/* dummy_io_channel is used to hold a bdev reference */
	struct spdk_io_channel *dummy_io_channel;
	/* If set, VIRTIO_BLK_T_OUT requests are failed instead of being submitted. */
	bool readonly;
};
96 
/* Per-connection state of a vhost-blk device. */
struct spdk_vhost_blk_session {
	/* The parent session must be the very first field in this struct */
	struct spdk_vhost_session vsession;
	/* Device this session is attached to. */
	struct spdk_vhost_blk_dev *bvdev;
	/* Poller that services the virtqueues; replaced by the no-bdev worker on hot-remove. */
	struct spdk_poller *requestq_poller;
	/* bdev I/O channel used for submissions; released by the no-bdev worker once no task is outstanding. */
	struct spdk_io_channel *io_channel;
	/* NOTE(review): not referenced in this part of the file - presumably used while stopping the session. */
	struct spdk_poller *stop_poller;
};
105 
/*
 * Forward declaration: the address of the backend is compared against
 * vdev->backend by to_blk_session() and to_blk_dev() below.
 */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend;

/* Forward declaration: blk_request_resubmit() calls this before it is defined. */
static int
process_blk_request(struct spdk_vhost_blk_task *task,
		    struct spdk_vhost_blk_session *bvsession,
		    struct spdk_vhost_virtqueue *vq);
113 
114 static struct spdk_vhost_blk_session *
115 to_blk_session(struct spdk_vhost_session *vsession)
116 {
117 	assert(vsession->vdev->backend == &vhost_blk_device_backend);
118 	return (struct spdk_vhost_blk_session *)vsession;
119 }
120 
121 static void
122 blk_task_finish(struct spdk_vhost_blk_task *task)
123 {
124 	assert(task->bvsession->vsession.task_cnt > 0);
125 	task->bvsession->vsession.task_cnt--;
126 	task->used = false;
127 }
128 
129 static void
130 blk_task_init(struct spdk_vhost_blk_task *task)
131 {
132 	task->used = true;
133 	task->iovcnt = SPDK_COUNTOF(task->iovs);
134 	task->status = NULL;
135 	task->used_len = 0;
136 }
137 
138 static void
139 blk_task_enqueue(struct spdk_vhost_blk_task *task)
140 {
141 	if (task->vq->packed.packed_ring) {
142 		vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq,
143 					     task->num_descs,
144 					     task->buffer_id, task->used_len);
145 	} else {
146 		vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq,
147 					   task->req_idx, task->used_len);
148 	}
149 }
150 
151 static void
152 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
153 {
154 	if (task->status) {
155 		*task->status = status;
156 	}
157 
158 	blk_task_enqueue(task);
159 	blk_task_finish(task);
160 	SPDK_DEBUGLOG(vhost_blk_data, "Invalid request (status=%" PRIu8")\n", status);
161 }
162 
163 /*
164  * Process task's descriptor chain and setup data related fields.
165  * Return
166  *   total size of suplied buffers
167  *
168  *   FIXME: Make this function return to rd_cnt and wr_cnt
169  */
170 static int
171 blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
172 			   struct spdk_vhost_virtqueue *vq,
173 			   uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
174 {
175 	struct spdk_vhost_session *vsession = &bvsession->vsession;
176 	struct spdk_vhost_dev *vdev = vsession->vdev;
177 	struct vring_desc *desc, *desc_table;
178 	uint16_t out_cnt = 0, cnt = 0;
179 	uint32_t desc_table_size, len = 0;
180 	uint32_t desc_handled_cnt;
181 	int rc;
182 
183 	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
184 	if (rc != 0) {
185 		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
186 		return -1;
187 	}
188 
189 	desc_handled_cnt = 0;
190 	while (1) {
191 		/*
192 		 * Maximum cnt reached?
193 		 * Should not happen if request is well formatted, otherwise this is a BUG.
194 		 */
195 		if (spdk_unlikely(cnt == *iovs_cnt)) {
196 			SPDK_DEBUGLOG(vhost_blk, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
197 				      vsession->name, req_idx);
198 			return -1;
199 		}
200 
201 		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
202 			SPDK_DEBUGLOG(vhost_blk, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
203 				      vsession->name, req_idx, cnt);
204 			return -1;
205 		}
206 
207 		len += desc->len;
208 
209 		out_cnt += vhost_vring_desc_is_wr(desc);
210 
211 		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
212 		if (rc != 0) {
213 			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
214 				    vsession->name, req_idx);
215 			return -1;
216 		} else if (desc == NULL) {
217 			break;
218 		}
219 
220 		desc_handled_cnt++;
221 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
222 			/* Break a cycle and report an error, if any. */
223 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
224 				    vsession->name, desc_table_size, desc_handled_cnt);
225 			return -1;
226 		}
227 	}
228 
229 	/*
230 	 * There must be least two descriptors.
231 	 * First contain request so it must be readable.
232 	 * Last descriptor contain buffer for response so it must be writable.
233 	 */
234 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
235 		return -1;
236 	}
237 
238 	*length = len;
239 	*iovs_cnt = cnt;
240 	return 0;
241 }
242 
243 static int
244 blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
245 			    struct spdk_vhost_virtqueue *vq,
246 			    uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
247 {
248 	struct spdk_vhost_session *vsession = &bvsession->vsession;
249 	struct spdk_vhost_dev *vdev = vsession->vdev;
250 	struct vring_packed_desc *desc = NULL, *desc_table;
251 	uint16_t out_cnt = 0, cnt = 0;
252 	uint32_t desc_table_size, len = 0;
253 	int rc = 0;
254 
255 	rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc,
256 				      &desc_table, &desc_table_size);
257 	if (spdk_unlikely(rc != 0)) {
258 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
259 		return rc;
260 	}
261 
262 	if (desc_table != NULL) {
263 		req_idx = 0;
264 	}
265 
266 	while (1) {
267 		/*
268 		 * Maximum cnt reached?
269 		 * Should not happen if request is well formatted, otherwise this is a BUG.
270 		 */
271 		if (spdk_unlikely(cnt == *iovs_cnt)) {
272 			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
273 				    vsession->name, req_idx);
274 			return -EINVAL;
275 		}
276 
277 		if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
278 			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
279 				    vsession->name, req_idx, cnt);
280 			return -EINVAL;
281 		}
282 
283 		len += desc->len;
284 		out_cnt += vhost_vring_packed_desc_is_wr(desc);
285 
286 		/* desc is NULL means we reach the last desc of this request */
287 		vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
288 		if (desc == NULL) {
289 			break;
290 		}
291 	}
292 
293 	/*
294 	 * There must be least two descriptors.
295 	 * First contain request so it must be readable.
296 	 * Last descriptor contain buffer for response so it must be writable.
297 	 */
298 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
299 		return -EINVAL;
300 	}
301 
302 	*length = len;
303 	*iovs_cnt = cnt;
304 
305 	return 0;
306 }
307 
308 static void
309 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
310 {
311 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
312 
313 	blk_task_enqueue(task);
314 
315 	SPDK_DEBUGLOG(vhost_blk, "Finished task (%p) req_idx=%d\n status: %s\n", task,
316 		      task->req_idx, success ? "OK" : "FAIL");
317 	blk_task_finish(task);
318 }
319 
/* bdev completion callback: free the bdev I/O, then finish the task passed as cb_arg. */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);

	blk_request_finish(success, (struct spdk_vhost_blk_task *)cb_arg);
}
328 
329 static void
330 blk_request_resubmit(void *arg)
331 {
332 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
333 	int rc = 0;
334 
335 	blk_task_init(task);
336 
337 	rc = process_blk_request(task, task->bvsession, task->vq);
338 	if (rc == 0) {
339 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p resubmitted ======\n", task);
340 	} else {
341 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p failed ======\n", task);
342 	}
343 }
344 
345 static inline void
346 blk_request_queue_io(struct spdk_vhost_blk_task *task)
347 {
348 	int rc;
349 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
350 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
351 
352 	task->bdev_io_wait.bdev = bdev;
353 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
354 	task->bdev_io_wait.cb_arg = task;
355 
356 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
357 	if (rc != 0) {
358 		SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
359 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
360 	}
361 }
362 
363 static int
364 process_blk_request(struct spdk_vhost_blk_task *task,
365 		    struct spdk_vhost_blk_session *bvsession,
366 		    struct spdk_vhost_virtqueue *vq)
367 {
368 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
369 	const struct virtio_blk_outhdr *req;
370 	struct virtio_blk_discard_write_zeroes *desc;
371 	struct iovec *iov;
372 	uint32_t type;
373 	uint32_t payload_len;
374 	uint64_t flush_bytes;
375 	int rc;
376 
377 	if (vq->packed.packed_ring) {
378 		rc = blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
379 						 &payload_len);
380 	} else {
381 		rc = blk_iovs_split_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
382 						&payload_len);
383 	}
384 
385 	if (rc) {
386 		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
387 		/* Only READ and WRITE are supported for now. */
388 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
389 		return -1;
390 	}
391 
392 	iov = &task->iovs[0];
393 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
394 		SPDK_DEBUGLOG(vhost_blk,
395 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
396 			      iov->iov_len, sizeof(*req), task->req_idx);
397 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
398 		return -1;
399 	}
400 
401 	req = iov->iov_base;
402 
403 	iov = &task->iovs[task->iovcnt - 1];
404 	if (spdk_unlikely(iov->iov_len != 1)) {
405 		SPDK_DEBUGLOG(vhost_blk,
406 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
407 			      iov->iov_len, 1, task->req_idx);
408 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
409 		return -1;
410 	}
411 
412 	task->status = iov->iov_base;
413 	payload_len -= sizeof(*req) + sizeof(*task->status);
414 	task->iovcnt -= 2;
415 
416 	type = req->type;
417 #ifdef VIRTIO_BLK_T_BARRIER
418 	/* Don't care about barier for now (as QEMU's virtio-blk do). */
419 	type &= ~VIRTIO_BLK_T_BARRIER;
420 #endif
421 
422 	switch (type) {
423 	case VIRTIO_BLK_T_IN:
424 	case VIRTIO_BLK_T_OUT:
425 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
426 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
427 				    type ? "WRITE" : "READ", task->req_idx);
428 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
429 			return -1;
430 		}
431 
432 		if (type == VIRTIO_BLK_T_IN) {
433 			task->used_len = payload_len + sizeof(*task->status);
434 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
435 					     &task->iovs[1], task->iovcnt, req->sector * 512,
436 					     payload_len, blk_request_complete_cb, task);
437 		} else if (!bvdev->readonly) {
438 			task->used_len = sizeof(*task->status);
439 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
440 					      &task->iovs[1], task->iovcnt, req->sector * 512,
441 					      payload_len, blk_request_complete_cb, task);
442 		} else {
443 			SPDK_DEBUGLOG(vhost_blk, "Device is in read-only mode!\n");
444 			rc = -1;
445 		}
446 
447 		if (rc) {
448 			if (rc == -ENOMEM) {
449 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
450 				blk_request_queue_io(task);
451 			} else {
452 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
453 				return -1;
454 			}
455 		}
456 		break;
457 	case VIRTIO_BLK_T_DISCARD:
458 		desc = task->iovs[1].iov_base;
459 		if (payload_len != sizeof(*desc)) {
460 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
461 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
462 			return -1;
463 		}
464 
465 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
466 				     desc->sector * 512, desc->num_sectors * 512,
467 				     blk_request_complete_cb, task);
468 		if (rc) {
469 			if (rc == -ENOMEM) {
470 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
471 				blk_request_queue_io(task);
472 			} else {
473 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
474 				return -1;
475 			}
476 		}
477 		break;
478 	case VIRTIO_BLK_T_WRITE_ZEROES:
479 		desc = task->iovs[1].iov_base;
480 		if (payload_len != sizeof(*desc)) {
481 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
482 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
483 			return -1;
484 		}
485 
486 		/* Zeroed and Unmap the range, SPDK doen't support it. */
487 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
488 			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
489 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
490 			return -1;
491 		}
492 
493 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
494 					    desc->sector * 512, desc->num_sectors * 512,
495 					    blk_request_complete_cb, task);
496 		if (rc) {
497 			if (rc == -ENOMEM) {
498 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
499 				blk_request_queue_io(task);
500 			} else {
501 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
502 				return -1;
503 			}
504 		}
505 		break;
506 	case VIRTIO_BLK_T_FLUSH:
507 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
508 		if (req->sector != 0) {
509 			SPDK_NOTICELOG("sector must be zero for flush command\n");
510 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
511 			return -1;
512 		}
513 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
514 				     0, flush_bytes,
515 				     blk_request_complete_cb, task);
516 		if (rc) {
517 			if (rc == -ENOMEM) {
518 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
519 				blk_request_queue_io(task);
520 			} else {
521 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
522 				return -1;
523 			}
524 		}
525 		break;
526 	case VIRTIO_BLK_T_GET_ID:
527 		if (!task->iovcnt || !payload_len) {
528 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
529 			return -1;
530 		}
531 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
532 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
533 				task->used_len, ' ');
534 		blk_request_finish(true, task);
535 		break;
536 	default:
537 		SPDK_DEBUGLOG(vhost_blk, "Not supported request type '%"PRIu32"'.\n", type);
538 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
539 		return -1;
540 	}
541 
542 	return 0;
543 }
544 
545 static void
546 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
547 {
548 	struct spdk_vhost_blk_task *task;
549 	uint16_t task_idx = req_idx, num_descs;
550 
551 	if (vq->packed.packed_ring) {
552 		/* Packed ring used the buffer_id as the task_idx to get task struct.
553 		 * In kernel driver, it uses the vq->free_head to set the buffer_id so the value
554 		 * must be in the range of 0 ~ vring.size. The free_head value must be unique
555 		 * in the outstanding requests.
556 		 * We can't use the req_idx as the task_idx because the desc can be reused in
557 		 * the next phase even when it's not completed in the previous phase. For example,
558 		 * At phase 0, last_used_idx was 2 and desc0 was not completed.Then after moving
559 		 * phase 1, last_avail_idx is updated to 1. In this case, req_idx can not be used
560 		 * as task_idx because we will know task[0]->used is true at phase 1.
561 		 * The split queue is quite different, the desc would insert into the free list when
562 		 * device completes the request, the driver gets the desc from the free list which
563 		 * ensures the req_idx is unique in the outstanding requests.
564 		 */
565 		task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
566 	}
567 
568 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
569 	if (spdk_unlikely(task->used)) {
570 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
571 			    task->bvsession->vsession.name, task_idx);
572 		task->used_len = 0;
573 		blk_task_enqueue(task);
574 		return;
575 	}
576 
577 	if (vq->packed.packed_ring) {
578 		task->req_idx = req_idx;
579 		task->num_descs = num_descs;
580 		task->buffer_id = task_idx;
581 	}
582 
583 	task->bvsession->vsession.task_cnt++;
584 
585 	blk_task_init(task);
586 
587 	if (process_blk_request(task, task->bvsession, vq) == 0) {
588 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
589 			      task_idx);
590 	} else {
591 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
592 	}
593 }
594 
595 static void
596 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
597 		     struct spdk_vhost_virtqueue *vq)
598 {
599 	struct spdk_vhost_session *vsession = &bvsession->vsession;
600 	spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight;
601 	spdk_vhost_resubmit_desc *resubmit_list;
602 	uint16_t req_idx;
603 
604 	if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) {
605 		return;
606 	}
607 
608 	resubmit_list = resubmit->resubmit_list;
609 	while (resubmit->resubmit_num-- > 0) {
610 		req_idx = resubmit_list[resubmit->resubmit_num].index;
611 		SPDK_DEBUGLOG(vhost_blk, "====== Start processing request idx %"PRIu16"======\n",
612 			      req_idx);
613 
614 		if (spdk_unlikely(req_idx >= vq->vring.size)) {
615 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
616 				    vsession->name, req_idx, vq->vring.size);
617 			vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
618 			continue;
619 		}
620 
621 		process_blk_task(vq, req_idx);
622 	}
623 
624 	free(resubmit_list);
625 	resubmit->resubmit_list = NULL;
626 }
627 
628 static void
629 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
630 {
631 	struct spdk_vhost_session *vsession = &bvsession->vsession;
632 	uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
633 	uint16_t reqs_cnt, i;
634 
635 	submit_inflight_desc(bvsession, vq);
636 
637 	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
638 	if (!reqs_cnt) {
639 		return;
640 	}
641 
642 	for (i = 0; i < reqs_cnt; i++) {
643 		SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
644 			      reqs[i]);
645 
646 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
647 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
648 				    vsession->name, reqs[i], vq->vring.size);
649 			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
650 			continue;
651 		}
652 
653 		rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]);
654 
655 		process_blk_task(vq, reqs[i]);
656 	}
657 }
658 
659 static void
660 process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
661 {
662 	uint16_t i = 0;
663 
664 	while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS &&
665 	       vhost_vq_packed_ring_is_avail(vq)) {
666 		SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
667 			      vq->last_avail_idx);
668 
669 		process_blk_task(vq, vq->last_avail_idx);
670 	}
671 }
672 
673 static int
674 _vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
675 {
676 	struct spdk_vhost_session *vsession = vq->vsession;
677 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
678 	bool packed_ring;
679 
680 	packed_ring = vq->packed.packed_ring;
681 	if (packed_ring) {
682 		process_packed_vq(bvsession, vq);
683 	} else {
684 		process_vq(bvsession, vq);
685 	}
686 
687 	vhost_session_vq_used_signal(vq);
688 
689 	return SPDK_POLLER_BUSY;
690 
691 }
692 
/* Adapter with a void * argument: arg is the virtqueue to service. */
static int
vdev_vq_worker(void *arg)
{
	return _vdev_vq_worker((struct spdk_vhost_virtqueue *)arg);
}
700 
701 static int
702 vdev_worker(void *arg)
703 {
704 	struct spdk_vhost_blk_session *bvsession = arg;
705 	struct spdk_vhost_session *vsession = &bvsession->vsession;
706 	uint16_t q_idx;
707 
708 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
709 		_vdev_vq_worker(&vsession->virtqueue[q_idx]);
710 	}
711 
712 	return SPDK_POLLER_BUSY;
713 }
714 
715 static void
716 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
717 {
718 	struct spdk_vhost_session *vsession = &bvsession->vsession;
719 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
720 	uint32_t length;
721 	uint16_t iovcnt, req_idx;
722 
723 	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
724 		return;
725 	}
726 
727 	iovcnt = SPDK_COUNTOF(iovs);
728 	if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
729 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
730 		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
731 	}
732 
733 	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
734 }
735 
736 static void
737 no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
738 {
739 	struct spdk_vhost_session *vsession = &bvsession->vsession;
740 	struct spdk_vhost_blk_task *task;
741 	uint32_t length;
742 	uint16_t req_idx = vq->last_avail_idx;
743 	uint16_t task_idx, num_descs;
744 
745 	if (!vhost_vq_packed_ring_is_avail(vq)) {
746 		return;
747 	}
748 
749 	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
750 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
751 	if (spdk_unlikely(task->used)) {
752 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
753 			    vsession->name, req_idx);
754 		vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
755 					     task->buffer_id, task->used_len);
756 		return;
757 	}
758 
759 	task->req_idx = req_idx;
760 	task->num_descs = num_descs;
761 	task->buffer_id = task_idx;
762 	blk_task_init(task);
763 
764 	if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
765 					&length)) {
766 		*(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
767 		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
768 	}
769 
770 	task->used = false;
771 	vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
772 				     task->buffer_id, task->used_len);
773 }
774 
775 static int
776 _no_bdev_vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
777 {
778 	struct spdk_vhost_session *vsession = vq->vsession;
779 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
780 	bool packed_ring;
781 
782 	packed_ring = vq->packed.packed_ring;
783 	if (packed_ring) {
784 		no_bdev_process_packed_vq(bvsession, vq);
785 	} else {
786 		no_bdev_process_vq(bvsession, vq);
787 	}
788 
789 	vhost_session_vq_used_signal(vq);
790 
791 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
792 		spdk_put_io_channel(bvsession->io_channel);
793 		bvsession->io_channel = NULL;
794 	}
795 
796 	return SPDK_POLLER_BUSY;
797 }
798 
/* Adapter with a void * argument: arg is the virtqueue to service without a bdev. */
static int
no_bdev_vdev_vq_worker(void *arg)
{
	return _no_bdev_vdev_vq_worker((struct spdk_vhost_virtqueue *)arg);
}
806 
807 static int
808 no_bdev_vdev_worker(void *arg)
809 {
810 	struct spdk_vhost_blk_session *bvsession = arg;
811 	struct spdk_vhost_session *vsession = &bvsession->vsession;
812 	uint16_t q_idx;
813 
814 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
815 		_no_bdev_vdev_vq_worker(&vsession->virtqueue[q_idx]);
816 	}
817 
818 	return SPDK_POLLER_BUSY;
819 }
820 
821 static void
822 vhost_blk_session_unregister_interrupts(struct spdk_vhost_blk_session *bvsession)
823 {
824 	struct spdk_vhost_session *vsession = &bvsession->vsession;
825 	struct spdk_vhost_virtqueue *vq;
826 	int i;
827 
828 	SPDK_DEBUGLOG(vhost_blk, "unregister virtqueues interrupt\n");
829 	for (i = 0; i < vsession->max_queues; i++) {
830 		vq = &vsession->virtqueue[i];
831 		if (vq->intr == NULL) {
832 			break;
833 		}
834 
835 		SPDK_DEBUGLOG(vhost_blk, "unregister vq[%d]'s kickfd is %d\n",
836 			      i, vq->vring.kickfd);
837 		spdk_interrupt_unregister(&vq->intr);
838 	}
839 }
840 
841 static int
842 vhost_blk_session_register_interrupts(struct spdk_vhost_blk_session *bvsession,
843 				      spdk_interrupt_fn fn)
844 {
845 	struct spdk_vhost_session *vsession = &bvsession->vsession;
846 	struct spdk_vhost_virtqueue *vq = NULL;
847 	int i;
848 
849 	SPDK_DEBUGLOG(vhost_blk, "Register virtqueues interrupt\n");
850 	for (i = 0; i < vsession->max_queues; i++) {
851 		vq = &vsession->virtqueue[i];
852 		SPDK_DEBUGLOG(vhost_blk, "Register vq[%d]'s kickfd is %d\n",
853 			      i, vq->vring.kickfd);
854 
855 		vq->intr = SPDK_INTERRUPT_REGISTER(vq->vring.kickfd, fn, vq);
856 		if (vq->intr == NULL) {
857 			SPDK_ERRLOG("Fail to register req notifier handler.\n");
858 			goto err;
859 		}
860 	}
861 
862 	return 0;
863 
864 err:
865 	vhost_blk_session_unregister_interrupts(bvsession);
866 
867 	return -1;
868 }
869 
870 static struct spdk_vhost_blk_dev *
871 to_blk_dev(struct spdk_vhost_dev *vdev)
872 {
873 	if (vdev == NULL) {
874 		return NULL;
875 	}
876 
877 	if (vdev->backend != &vhost_blk_device_backend) {
878 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
879 		return NULL;
880 	}
881 
882 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
883 }
884 
885 static int
886 vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev,
887 			     struct spdk_vhost_session *vsession,
888 			     void *ctx)
889 {
890 #if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0)
891 	SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid);
892 	rte_vhost_slave_config_change(vsession->vid, false);
893 #else
894 	SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n");
895 #endif
896 
897 	return 0;
898 }
899 
900 static void
901 blk_resize_cb(void *resize_ctx)
902 {
903 	struct spdk_vhost_blk_dev *bvdev = resize_ctx;
904 
905 	spdk_vhost_lock();
906 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb,
907 				  NULL, NULL);
908 	spdk_vhost_unlock();
909 }
910 
911 static void
912 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
913 {
914 
915 	/* All sessions have been notified, time to close the bdev */
916 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
917 
918 	assert(bvdev != NULL);
919 	spdk_put_io_channel(bvdev->dummy_io_channel);
920 	spdk_bdev_close(bvdev->bdev_desc);
921 	bvdev->bdev_desc = NULL;
922 	bvdev->bdev = NULL;
923 }
924 
925 static int
926 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
927 			     struct spdk_vhost_session *vsession,
928 			     void *ctx)
929 {
930 	struct spdk_vhost_blk_session *bvsession;
931 	int rc;
932 
933 	bvsession = (struct spdk_vhost_blk_session *)vsession;
934 	if (bvsession->requestq_poller) {
935 		spdk_poller_unregister(&bvsession->requestq_poller);
936 		bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
937 	}
938 
939 	if (vsession->virtqueue[0].intr) {
940 		vhost_blk_session_unregister_interrupts(bvsession);
941 		rc = vhost_blk_session_register_interrupts(bvsession, no_bdev_vdev_vq_worker);
942 		if (rc) {
943 			SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
944 			return -1;
945 		}
946 
947 	}
948 
949 	return 0;
950 }
951 
952 static void
953 bdev_remove_cb(void *remove_ctx)
954 {
955 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
956 
957 	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
958 		     bvdev->vdev.name);
959 
960 	spdk_vhost_lock();
961 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
962 				  vhost_dev_bdev_remove_cpl_cb, NULL);
963 	spdk_vhost_unlock();
964 }
965 
966 static void
967 bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
968 	      void *event_ctx)
969 {
970 	SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n",
971 		      type,
972 		      bdev->name);
973 
974 	switch (type) {
975 	case SPDK_BDEV_EVENT_REMOVE:
976 		SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name);
977 		bdev_remove_cb(event_ctx);
978 		break;
979 	case SPDK_BDEV_EVENT_RESIZE:
980 		SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name);
981 		blk_resize_cb(event_ctx);
982 		break;
983 	default:
984 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
985 		break;
986 	}
987 }
988 
989 static void
990 free_task_pool(struct spdk_vhost_blk_session *bvsession)
991 {
992 	struct spdk_vhost_session *vsession = &bvsession->vsession;
993 	struct spdk_vhost_virtqueue *vq;
994 	uint16_t i;
995 
996 	for (i = 0; i < vsession->max_queues; i++) {
997 		vq = &vsession->virtqueue[i];
998 		if (vq->tasks == NULL) {
999 			continue;
1000 		}
1001 
1002 		spdk_free(vq->tasks);
1003 		vq->tasks = NULL;
1004 	}
1005 }
1006 
1007 static int
1008 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
1009 {
1010 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1011 	struct spdk_vhost_virtqueue *vq;
1012 	struct spdk_vhost_blk_task *task;
1013 	uint32_t task_cnt;
1014 	uint16_t i;
1015 	uint32_t j;
1016 
1017 	for (i = 0; i < vsession->max_queues; i++) {
1018 		vq = &vsession->virtqueue[i];
1019 		if (vq->vring.desc == NULL) {
1020 			continue;
1021 		}
1022 
1023 		task_cnt = vq->vring.size;
1024 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
1025 			/* sanity check */
1026 			SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
1027 				    vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
1028 			free_task_pool(bvsession);
1029 			return -1;
1030 		}
1031 		vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
1032 					 SPDK_CACHE_LINE_SIZE, NULL,
1033 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1034 		if (vq->tasks == NULL) {
1035 			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
1036 				    vsession->name, task_cnt, i);
1037 			free_task_pool(bvsession);
1038 			return -1;
1039 		}
1040 
1041 		for (j = 0; j < task_cnt; j++) {
1042 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
1043 			task->bvsession = bvsession;
1044 			task->req_idx = j;
1045 			task->vq = vq;
1046 		}
1047 	}
1048 
1049 	return 0;
1050 }
1051 
1052 static int
1053 vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
1054 		   struct spdk_vhost_session *vsession, void *unused)
1055 {
1056 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1057 	struct spdk_vhost_blk_dev *bvdev;
1058 	int i, rc = 0;
1059 
1060 	bvdev = to_blk_dev(vdev);
1061 	assert(bvdev != NULL);
1062 	bvsession->bvdev = bvdev;
1063 
1064 	/* validate all I/O queues are in a contiguous index range */
1065 	for (i = 0; i < vsession->max_queues; i++) {
1066 		/* vring.desc and vring.desc_packed are in a union struct
1067 		 * so q->vring.desc can replace q->vring.desc_packed.
1068 		 */
1069 		if (vsession->virtqueue[i].vring.desc == NULL) {
1070 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
1071 			rc = -1;
1072 			goto out;
1073 		}
1074 	}
1075 
1076 	rc = alloc_task_pool(bvsession);
1077 	if (rc != 0) {
1078 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
1079 		goto out;
1080 	}
1081 
1082 	if (bvdev->bdev) {
1083 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
1084 		if (!bvsession->io_channel) {
1085 			free_task_pool(bvsession);
1086 			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
1087 			rc = -1;
1088 			goto out;
1089 		}
1090 	}
1091 
1092 	if (spdk_interrupt_mode_is_enabled()) {
1093 		rc = vhost_blk_session_register_interrupts(bvsession,
1094 				bvdev->bdev ? vdev_vq_worker : no_bdev_vdev_vq_worker);
1095 		if (rc) {
1096 			SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
1097 			goto out;
1098 		}
1099 		SPDK_INFOLOG(vhost, "%s: started interrupt source on lcore %d\n",
1100 			     vsession->name, spdk_env_get_current_core());
1101 	} else {
1102 		bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
1103 					     bvsession, 0);
1104 		SPDK_INFOLOG(vhost, "%s: started poller on lcore %d\n",
1105 			     vsession->name, spdk_env_get_current_core());
1106 	}
1107 
1108 out:
1109 	vhost_session_start_done(vsession, rc);
1110 	return rc;
1111 }
1112 
1113 static int
1114 vhost_blk_start(struct spdk_vhost_session *vsession)
1115 {
1116 	return vhost_session_send_event(vsession, vhost_blk_start_cb,
1117 					3, "start session");
1118 }
1119 
1120 static int
1121 destroy_session_poller_cb(void *arg)
1122 {
1123 	struct spdk_vhost_blk_session *bvsession = arg;
1124 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1125 	int i;
1126 
1127 	if (vsession->task_cnt > 0) {
1128 		return SPDK_POLLER_BUSY;
1129 	}
1130 
1131 	if (spdk_vhost_trylock() != 0) {
1132 		return SPDK_POLLER_BUSY;
1133 	}
1134 
1135 	for (i = 0; i < vsession->max_queues; i++) {
1136 		vsession->virtqueue[i].next_event_time = 0;
1137 		vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
1138 	}
1139 
1140 	SPDK_INFOLOG(vhost, "%s: stopping poller on lcore %d\n",
1141 		     vsession->name, spdk_env_get_current_core());
1142 
1143 	if (bvsession->io_channel) {
1144 		spdk_put_io_channel(bvsession->io_channel);
1145 		bvsession->io_channel = NULL;
1146 	}
1147 
1148 	free_task_pool(bvsession);
1149 	spdk_poller_unregister(&bvsession->stop_poller);
1150 	vhost_session_stop_done(vsession, 0);
1151 
1152 	spdk_vhost_unlock();
1153 	return SPDK_POLLER_BUSY;
1154 }
1155 
1156 static int
1157 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
1158 		  struct spdk_vhost_session *vsession, void *unused)
1159 {
1160 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1161 
1162 	spdk_poller_unregister(&bvsession->requestq_poller);
1163 
1164 	if (vsession->virtqueue[0].intr) {
1165 		vhost_blk_session_unregister_interrupts(bvsession);
1166 	}
1167 
1168 	bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
1169 				 bvsession, 1000);
1170 	return 0;
1171 }
1172 
1173 static int
1174 vhost_blk_stop(struct spdk_vhost_session *vsession)
1175 {
1176 	return vhost_session_send_event(vsession, vhost_blk_stop_cb,
1177 					3, "stop session");
1178 }
1179 
1180 static void
1181 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1182 {
1183 	struct spdk_vhost_blk_dev *bvdev;
1184 
1185 	bvdev = to_blk_dev(vdev);
1186 	assert(bvdev != NULL);
1187 
1188 	spdk_json_write_named_object_begin(w, "block");
1189 
1190 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1191 
1192 	spdk_json_write_name(w, "bdev");
1193 	if (bvdev->bdev) {
1194 		spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev));
1195 	} else {
1196 		spdk_json_write_null(w);
1197 	}
1198 
1199 	spdk_json_write_object_end(w);
1200 }
1201 
1202 static void
1203 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1204 {
1205 	struct spdk_vhost_blk_dev *bvdev;
1206 
1207 	bvdev = to_blk_dev(vdev);
1208 	assert(bvdev != NULL);
1209 
1210 	if (!bvdev->bdev) {
1211 		return;
1212 	}
1213 
1214 	spdk_json_write_object_begin(w);
1215 	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
1216 
1217 	spdk_json_write_named_object_begin(w, "params");
1218 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
1219 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
1220 	spdk_json_write_named_string(w, "cpumask",
1221 				     spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
1222 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1223 	spdk_json_write_object_end(w);
1224 
1225 	spdk_json_write_object_end(w);
1226 }
1227 
1228 static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
1229 
1230 static int
1231 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
1232 		     uint32_t len)
1233 {
1234 	struct virtio_blk_config blkcfg;
1235 	struct spdk_vhost_blk_dev *bvdev;
1236 	struct spdk_bdev *bdev;
1237 	uint32_t blk_size;
1238 	uint64_t blkcnt;
1239 
1240 	memset(&blkcfg, 0, sizeof(blkcfg));
1241 	bvdev = to_blk_dev(vdev);
1242 	assert(bvdev != NULL);
1243 	bdev = bvdev->bdev;
1244 	if (bdev == NULL) {
1245 		/* We can't just return -1 here as this GET_CONFIG message might
1246 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
1247 		 * error to QEMU, who might then decide to terminate itself.
1248 		 * We don't want that. A simple reboot shouldn't break the system.
1249 		 *
1250 		 * Presenting a block device with block size 0 and block count 0
1251 		 * doesn't cause any problems on QEMU side and the virtio-pci
1252 		 * device is even still available inside the VM, but there will
1253 		 * be no block device created for it - the kernel drivers will
1254 		 * silently reject it.
1255 		 */
1256 		blk_size = 0;
1257 		blkcnt = 0;
1258 	} else {
1259 		blk_size = spdk_bdev_get_block_size(bdev);
1260 		blkcnt = spdk_bdev_get_num_blocks(bdev);
1261 		if (spdk_bdev_get_buf_align(bdev) > 1) {
1262 			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
1263 			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
1264 		} else {
1265 			blkcfg.size_max = 131072;
1266 			/*  -2 for REQ and RESP and -1 for region boundary splitting */
1267 			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
1268 		}
1269 	}
1270 
1271 	blkcfg.blk_size = blk_size;
1272 	/* minimum I/O size in blocks */
1273 	blkcfg.min_io_size = 1;
1274 	/* expressed in 512 Bytes sectors */
1275 	blkcfg.capacity = (blkcnt * blk_size) / 512;
1276 	/* QEMU can overwrite this value when started */
1277 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
1278 
1279 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1280 		/* 16MiB, expressed in 512 Bytes */
1281 		blkcfg.max_discard_sectors = 32768;
1282 		blkcfg.max_discard_seg = 1;
1283 		blkcfg.discard_sector_alignment = blk_size / 512;
1284 	}
1285 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1286 		blkcfg.max_write_zeroes_sectors = 32768;
1287 		blkcfg.max_write_zeroes_seg = 1;
1288 	}
1289 
1290 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
1291 
1292 	return 0;
1293 }
1294 
1295 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
1296 	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
1297 	.start_session =  vhost_blk_start,
1298 	.stop_session = vhost_blk_stop,
1299 	.vhost_get_config = vhost_blk_get_config,
1300 	.dump_info_json = vhost_blk_dump_info_json,
1301 	.write_config_json = vhost_blk_write_config_json,
1302 	.remove_device = vhost_blk_destroy,
1303 };
1304 
1305 int
1306 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
1307 			 bool readonly, bool packed_ring)
1308 {
1309 	struct spdk_vhost_blk_dev *bvdev = NULL;
1310 	struct spdk_vhost_dev *vdev;
1311 	struct spdk_bdev *bdev;
1312 	int ret = 0;
1313 
1314 	spdk_vhost_lock();
1315 
1316 	bvdev = calloc(1, sizeof(*bvdev));
1317 	if (bvdev == NULL) {
1318 		ret = -ENOMEM;
1319 		goto out;
1320 	}
1321 
1322 	ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc);
1323 	if (ret != 0) {
1324 		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
1325 			    name, dev_name, ret);
1326 		goto out;
1327 	}
1328 	bdev = spdk_bdev_desc_get_bdev(bvdev->bdev_desc);
1329 
1330 	vdev = &bvdev->vdev;
1331 	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
1332 	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
1333 	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
1334 
1335 	vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED;
1336 
1337 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1338 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1339 	}
1340 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1341 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1342 	}
1343 	if (readonly) {
1344 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
1345 	}
1346 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1347 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1348 	}
1349 
1350 	/*
1351 	 * When starting qemu with vhost-user-blk multiqueue, the vhost device will
1352 	 * be started/stopped many times, related to the queues num, as the
1353 	 * vhost-user backend doesn't know the exact number of queues used for this
1354 	 * device. The target have to stop and start the device once got a valid
1355 	 * IO queue.
1356 	 * When stoping and starting the vhost device, the backend bdev io device
1357 	 * will be deleted and created repeatedly.
1358 	 * Hold a bdev reference so that in the struct spdk_vhost_blk_dev, so that
1359 	 * the io device will not be deleted.
1360 	 */
1361 	bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
1362 
1363 	bvdev->bdev = bdev;
1364 	bvdev->readonly = readonly;
1365 	ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
1366 	if (ret != 0) {
1367 		spdk_put_io_channel(bvdev->dummy_io_channel);
1368 		spdk_bdev_close(bvdev->bdev_desc);
1369 		goto out;
1370 	}
1371 
1372 	SPDK_INFOLOG(vhost, "%s: using bdev '%s'\n", name, dev_name);
1373 out:
1374 	if (ret != 0 && bvdev) {
1375 		free(bvdev);
1376 	}
1377 	spdk_vhost_unlock();
1378 	return ret;
1379 }
1380 
1381 static int
1382 vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1383 {
1384 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1385 	int rc;
1386 
1387 	assert(bvdev != NULL);
1388 
1389 	rc = vhost_dev_unregister(&bvdev->vdev);
1390 	if (rc != 0) {
1391 		return rc;
1392 	}
1393 
1394 	/* if the bdev is removed, don't need call spdk_put_io_channel. */
1395 	if (bvdev->bdev) {
1396 		spdk_put_io_channel(bvdev->dummy_io_channel);
1397 	}
1398 
1399 	if (bvdev->bdev_desc) {
1400 		spdk_bdev_close(bvdev->bdev_desc);
1401 		bvdev->bdev_desc = NULL;
1402 	}
1403 	bvdev->bdev = NULL;
1404 
1405 	free(bvdev);
1406 	return 0;
1407 }
1408 
/* Log components registered by this file; vhost_blk is the one passed to SPDK_DEBUGLOG in bdev_event_cb. */
SPDK_LOG_REGISTER_COMPONENT(vhost_blk)
SPDK_LOG_REGISTER_COMPONENT(vhost_blk_data)
1411