xref: /spdk/lib/vhost/vhost_blk.c (revision 32999ab917f67af61872f868585fd3d78ad6fb8a)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/thread.h"
40 #include "spdk/likely.h"
41 #include "spdk/string.h"
42 #include "spdk/util.h"
43 #include "spdk/vhost.h"
44 
45 #include "vhost_internal.h"
46 #include <rte_version.h>
47 
48 /* Minimal set of features supported by every SPDK VHOST-BLK device */
49 #define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
50 		(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
51 		(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
52 		(1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER)  | \
53 		(1ULL << VIRTIO_BLK_F_SCSI)     | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
54 		(1ULL << VIRTIO_BLK_F_MQ))
55 
56 /* Features that are not supported */
57 #define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
58 		(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
59 		(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI))
60 
61 /* Vhost-blk supported protocol features */
62 #define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
63 		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
64 
65 struct spdk_vhost_blk_task {
66 	struct spdk_bdev_io *bdev_io;
67 	struct spdk_vhost_blk_session *bvsession;
68 	struct spdk_vhost_virtqueue *vq;
69 
70 	volatile uint8_t *status;
71 
72 	uint16_t req_idx;
73 	uint16_t num_descs;
74 	uint16_t buffer_id;
75 	uint16_t inflight_head;
76 
77 	/* used while the task waits for bdev I/O resources (see blk_request_queue_io) */
78 	struct spdk_bdev_io_wait_entry bdev_io_wait;
79 
80 	/* If set, the task is currently used for I/O processing. */
81 	bool used;
82 
83 	/** Number of bytes that were written. */
84 	uint32_t used_len;
85 	uint16_t iovcnt;
86 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
87 
88 	/** Size of whole payload in bytes */
89 	uint32_t payload_size;
90 };
91 
92 struct spdk_vhost_blk_dev {
93 	struct spdk_vhost_dev vdev;
94 	struct spdk_bdev *bdev;
95 	struct spdk_bdev_desc *bdev_desc;
96 	/* dummy_io_channel is used to hold a bdev reference */
97 	struct spdk_io_channel *dummy_io_channel;
98 	bool readonly;
99 };
100 
101 struct spdk_vhost_blk_session {
102 	/* The parent session must be the very first field in this struct */
103 	struct spdk_vhost_session vsession;
104 	struct spdk_vhost_blk_dev *bvdev;
105 	struct spdk_poller *requestq_poller;
106 	struct spdk_io_channel *io_channel;
107 	struct spdk_poller *stop_poller;
108 };
109 
110 /* forward declaration */
111 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
112 
113 static int
114 process_blk_request(struct spdk_vhost_blk_task *task,
115 		    struct spdk_vhost_blk_session *bvsession);
116 
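/* Downcast a generic vhost session to its vhost-blk session; only valid for sessions
 * created by the vhost-blk backend (asserted below).
 */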
117 static struct spdk_vhost_blk_session *
118 to_blk_session(struct spdk_vhost_session *vsession)
119 {
120 	assert(vsession->vdev->backend == &vhost_blk_device_backend);
121 	return (struct spdk_vhost_blk_session *)vsession;
122 }
123 
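/* Mark the task as free again and drop it from the session's outstanding task count. */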
124 static void
125 blk_task_finish(struct spdk_vhost_blk_task *task)
126 {
127 	assert(task->bvsession->vsession.task_cnt > 0);
128 	task->bvsession->vsession.task_cnt--;
129 	task->used = false;
130 }
131 
132 static void
133 blk_task_init(struct spdk_vhost_blk_task *task)
134 {
135 	task->used = true;
136 	task->iovcnt = SPDK_COUNTOF(task->iovs);
137 	task->status = NULL;
138 	task->used_len = 0;
139 	task->payload_size = 0;
140 }
141 
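/* Return the completed descriptor chain to the guest via the packed or the split used
 * ring, depending on the virtqueue type.
 */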
142 static void
143 blk_task_enqueue(struct spdk_vhost_blk_task *task)
144 {
145 	if (task->vq->packed.packed_ring) {
146 		vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq,
147 					     task->num_descs,
148 					     task->buffer_id, task->used_len,
149 					     task->inflight_head);
150 	} else {
151 		vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq,
152 					   task->req_idx, task->used_len);
153 	}
154 }
155 
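/* Fail a request without submitting bdev I/O: write the given virtio-blk status
 * (if a status buffer was mapped) and complete the task.
 */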
156 static void
157 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
158 {
159 	if (task->status) {
160 		*task->status = status;
161 	}
162 
163 	blk_task_enqueue(task);
164 	blk_task_finish(task);
165 	SPDK_DEBUGLOG(vhost_blk_data, "Invalid request (status=%" PRIu8")\n", status);
166 }
167 
168 /*
169  * Process the task's descriptor chain and set up data-related fields.
170  * Return
171  *   total size of supplied buffers
172  *
173  *   FIXME: Make this function return rd_cnt and wr_cnt
174  */
175 static int
176 blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
177 			   struct spdk_vhost_virtqueue *vq,
178 			   uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
179 {
180 	struct spdk_vhost_session *vsession = &bvsession->vsession;
181 	struct spdk_vhost_dev *vdev = vsession->vdev;
182 	struct vring_desc *desc, *desc_table;
183 	uint16_t out_cnt = 0, cnt = 0;
184 	uint32_t desc_table_size, len = 0;
185 	uint32_t desc_handled_cnt;
186 	int rc;
187 
188 	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
189 	if (rc != 0) {
190 		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
191 		return -1;
192 	}
193 
194 	desc_handled_cnt = 0;
195 	while (1) {
196 		/*
197 		 * Maximum cnt reached?
198 		 * Should not happen if the request is well formed; otherwise this is a BUG.
199 		 */
200 		if (spdk_unlikely(cnt == *iovs_cnt)) {
201 			SPDK_DEBUGLOG(vhost_blk, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
202 				      vsession->name, req_idx);
203 			return -1;
204 		}
205 
206 		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
207 			SPDK_DEBUGLOG(vhost_blk, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
208 				      vsession->name, req_idx, cnt);
209 			return -1;
210 		}
211 
212 		len += desc->len;
213 
214 		out_cnt += vhost_vring_desc_is_wr(desc);
215 
216 		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
217 		if (rc != 0) {
218 			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
219 				    vsession->name, req_idx);
220 			return -1;
221 		} else if (desc == NULL) {
222 			break;
223 		}
224 
225 		desc_handled_cnt++;
226 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
227 			/* The descriptor chain contains a cycle: bail out and report an error. */
228 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
229 				    vsession->name, desc_table_size, desc_handled_cnt);
230 			return -1;
231 		}
232 	}
233 
234 	/*
235 	 * There must be at least two descriptors.
236 	 * The first contains the request, so it must be readable.
237 	 * The last descriptor contains the buffer for the response, so it must be writable.
238 	 */
239 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
240 		return -1;
241 	}
242 
243 	*length = len;
244 	*iovs_cnt = cnt;
245 	return 0;
246 }
247 
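/* Walk a packed-ring descriptor chain (either inline in the ring or in an indirect
 * descriptor table) and convert it into iovecs.
 */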
248 static int
249 blk_iovs_packed_desc_setup(struct spdk_vhost_session *vsession,
250 			   struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
251 			   struct vring_packed_desc *desc_table, uint16_t desc_table_size,
252 			   struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
253 {
254 	struct vring_packed_desc *desc;
255 	uint16_t cnt = 0, out_cnt = 0;
256 	uint32_t len = 0;
257 
258 	if (desc_table == NULL) {
259 		desc = &vq->vring.desc_packed[req_idx];
260 	} else {
261 		req_idx = 0;
262 		desc = desc_table;
263 	}
264 
265 	while (1) {
266 		/*
267 		 * Maximum cnt reached?
268 		 * Should not happen if the request is well formed; otherwise this is a BUG.
269 		 */
270 		if (spdk_unlikely(cnt == *iovs_cnt)) {
271 			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
272 				    vsession->name, req_idx);
273 			return -EINVAL;
274 		}
275 
276 		if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
277 			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
278 				    vsession->name, req_idx, cnt);
279 			return -EINVAL;
280 		}
281 
282 		len += desc->len;
283 		out_cnt += vhost_vring_packed_desc_is_wr(desc);
284 
285 		/* desc == NULL means we have reached the last desc of this request */
286 		vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
287 		if (desc == NULL) {
288 			break;
289 		}
290 	}
291 
292 	/*
293 	 * There must be at least two descriptors.
294 	 * The first contains the request, so it must be readable.
295 	 * The last descriptor contains the buffer for the response, so it must be writable.
296 	 */
297 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
298 		return -EINVAL;
299 	}
300 
301 	*length = len;
302 	*iovs_cnt = cnt;
303 
304 	return 0;
305 }
306 
307 static int
308 blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
309 			    struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
310 			    struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
311 {
312 	struct spdk_vhost_session *vsession = &bvsession->vsession;
313 	struct spdk_vhost_dev *vdev = vsession->vdev;
314 	struct vring_packed_desc *desc = NULL, *desc_table;
315 	uint32_t desc_table_size;
316 	int rc;
317 
318 	rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc,
319 				      &desc_table, &desc_table_size);
320 	if (spdk_unlikely(rc != 0)) {
321 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
322 		return rc;
323 	}
324 
325 	return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size,
326 					  iovs, iovs_cnt, length);
327 }
328 
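/* Rebuild the iovecs for a request that was still inflight across a reconnect,
 * using the inflight descriptor array instead of the live ring.
 */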
329 static int
330 blk_iovs_inflight_queue_setup(struct spdk_vhost_blk_session *bvsession,
331 			      struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
332 			      struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
333 {
334 	struct spdk_vhost_session *vsession = &bvsession->vsession;
335 	struct spdk_vhost_dev *vdev = vsession->vdev;
336 	spdk_vhost_inflight_desc *inflight_desc;
337 	struct vring_packed_desc *desc_table;
338 	uint16_t out_cnt = 0, cnt = 0;
339 	uint32_t desc_table_size, len = 0;
340 	int rc = 0;
341 
342 	rc = vhost_inflight_queue_get_desc(vsession, vq->vring_inflight.inflight_packed->desc,
343 					   req_idx, &inflight_desc, &desc_table, &desc_table_size);
344 	if (spdk_unlikely(rc != 0)) {
345 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
346 		return rc;
347 	}
348 
349 	if (desc_table != NULL) {
350 		return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size,
351 						  iovs, iovs_cnt, length);
352 	}
353 
354 	while (1) {
355 		/*
356 		 * Maximum cnt reached?
357 		 * Should not happen if the request is well formed; otherwise this is a BUG.
358 		 */
359 		if (spdk_unlikely(cnt == *iovs_cnt)) {
360 			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
361 				    vsession->name, req_idx);
362 			return -EINVAL;
363 		}
364 
365 		if (spdk_unlikely(vhost_vring_inflight_desc_to_iov(vsession, iovs, &cnt, inflight_desc))) {
366 			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
367 				    vsession->name, req_idx, cnt);
368 			return -EINVAL;
369 		}
370 
371 		len += inflight_desc->len;
372 		out_cnt += vhost_vring_inflight_desc_is_wr(inflight_desc);
373 
374 		/* A desc without VRING_DESC_F_NEXT is the last desc of this request */
375 		if ((inflight_desc->flags & VRING_DESC_F_NEXT) == 0) {
376 			break;
377 		}
378 
379 		inflight_desc = &vq->vring_inflight.inflight_packed->desc[inflight_desc->next];
380 	}
381 
382 	/*
383 	 * There must be at least two descriptors.
384 	 * The first contains the request, so it must be readable.
385 	 * The last descriptor contains the buffer for the response, so it must be writable.
386 	 */
387 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
388 		return -EINVAL;
389 	}
390 
391 	*length = len;
392 	*iovs_cnt = cnt;
393 
394 	return 0;
395 }
396 
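/* Write the final virtio-blk status byte and hand the request back to the guest. */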
397 static void
398 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
399 {
400 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
401 
402 	blk_task_enqueue(task);
403 
404 	SPDK_DEBUGLOG(vhost_blk, "Finished task (%p) req_idx=%d status: %s\n", task,
405 		      task->req_idx, success ? "OK" : "FAIL");
406 	blk_task_finish(task);
407 }
408 
409 static void
410 blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
411 {
412 	struct spdk_vhost_blk_task *task = cb_arg;
413 
414 	spdk_bdev_free_io(bdev_io);
415 	blk_request_finish(success, task);
416 }
417 
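/* bdev io_wait callback: retry a request that previously failed with -ENOMEM. */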
418 static void
419 blk_request_resubmit(void *arg)
420 {
421 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
422 	int rc = 0;
423 
424 	rc = process_blk_request(task, task->bvsession);
425 	if (rc == 0) {
426 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p resubmitted ======\n", task);
427 	} else {
428 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p failed ======\n", task);
429 	}
430 }
431 
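/* Park the task on the bdev's io_wait queue so that it is resubmitted once the bdev
 * can accept new I/O again.
 */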
432 static inline void
433 blk_request_queue_io(struct spdk_vhost_blk_task *task)
434 {
435 	int rc;
436 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
437 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
438 
439 	task->bdev_io_wait.bdev = bdev;
440 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
441 	task->bdev_io_wait.cb_arg = task;
442 
443 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
444 	if (rc != 0) {
445 		SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
446 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
447 	}
448 }
449 
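/* Parse the virtio_blk_outhdr, validate the descriptor layout and submit the matching
 * bdev I/O (read, write, discard, write zeroes, flush or get-id).
 */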
450 static int
451 process_blk_request(struct spdk_vhost_blk_task *task,
452 		    struct spdk_vhost_blk_session *bvsession)
453 {
454 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
455 	const struct virtio_blk_outhdr *req;
456 	struct virtio_blk_discard_write_zeroes *desc;
457 	struct iovec *iov;
458 	uint32_t type;
459 	uint64_t flush_bytes;
460 	uint32_t payload_len;
461 	int rc;
462 
463 	iov = &task->iovs[0];
464 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
465 		SPDK_DEBUGLOG(vhost_blk,
466 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
467 			      iov->iov_len, sizeof(*req), task->req_idx);
468 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
469 		return -1;
470 	}
471 
472 	req = iov->iov_base;
473 
474 	iov = &task->iovs[task->iovcnt - 1];
475 	if (spdk_unlikely(iov->iov_len != 1)) {
476 		SPDK_DEBUGLOG(vhost_blk,
477 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
478 			      iov->iov_len, 1, task->req_idx);
479 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
480 		return -1;
481 	}
482 
483 	payload_len = task->payload_size;
484 	task->status = iov->iov_base;
485 	payload_len -= sizeof(*req) + sizeof(*task->status);
486 	task->iovcnt -= 2;
487 
488 	type = req->type;
489 #ifdef VIRTIO_BLK_T_BARRIER
490 	/* Don't care about barriers for now (as QEMU's virtio-blk does). */
491 	type &= ~VIRTIO_BLK_T_BARRIER;
492 #endif
493 
494 	switch (type) {
495 	case VIRTIO_BLK_T_IN:
496 	case VIRTIO_BLK_T_OUT:
497 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
498 			SPDK_ERRLOG("%s - passed IO buffer is not a multiple of 512 bytes (req_idx = %"PRIu16").\n",
499 				    type ? "WRITE" : "READ", task->req_idx);
500 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
501 			return -1;
502 		}
503 
504 		if (type == VIRTIO_BLK_T_IN) {
505 			task->used_len = payload_len + sizeof(*task->status);
506 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
507 					     &task->iovs[1], task->iovcnt, req->sector * 512,
508 					     payload_len, blk_request_complete_cb, task);
509 		} else if (!bvdev->readonly) {
510 			task->used_len = sizeof(*task->status);
511 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
512 					      &task->iovs[1], task->iovcnt, req->sector * 512,
513 					      payload_len, blk_request_complete_cb, task);
514 		} else {
515 			SPDK_DEBUGLOG(vhost_blk, "Device is in read-only mode!\n");
516 			rc = -1;
517 		}
518 
519 		if (rc) {
520 			if (rc == -ENOMEM) {
521 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
522 				blk_request_queue_io(task);
523 			} else {
524 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
525 				return -1;
526 			}
527 		}
528 		break;
529 	case VIRTIO_BLK_T_DISCARD:
530 		desc = task->iovs[1].iov_base;
531 		if (payload_len != sizeof(*desc)) {
532 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
533 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
534 			return -1;
535 		}
536 
537 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
538 			SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n");
539 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
540 			return -1;
541 		}
542 
543 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
544 				     desc->sector * 512, desc->num_sectors * 512,
545 				     blk_request_complete_cb, task);
546 		if (rc) {
547 			if (rc == -ENOMEM) {
548 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
549 				blk_request_queue_io(task);
550 			} else {
551 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
552 				return -1;
553 			}
554 		}
555 		break;
556 	case VIRTIO_BLK_T_WRITE_ZEROES:
557 		desc = task->iovs[1].iov_base;
558 		if (payload_len != sizeof(*desc)) {
559 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
560 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
561 			return -1;
562 		}
563 
564 		/* SPDK doesn't support unmapping this range. The kernel enables this flag by default
565 		 * without checking whether the unmap feature was negotiated, and the flag isn't
566 		 * mandatory, so just print a warning and ignore it.
567 		 */
568 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
569 			SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n",
570 				     (uint64_t)desc->sector * 512, (uint64_t)desc->num_sectors * 512);
571 		}
572 
573 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
574 					    desc->sector * 512, desc->num_sectors * 512,
575 					    blk_request_complete_cb, task);
576 		if (rc) {
577 			if (rc == -ENOMEM) {
578 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
579 				blk_request_queue_io(task);
580 			} else {
581 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
582 				return -1;
583 			}
584 		}
585 		break;
586 	case VIRTIO_BLK_T_FLUSH:
587 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
588 		if (req->sector != 0) {
589 			SPDK_NOTICELOG("sector must be zero for flush command\n");
590 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
591 			return -1;
592 		}
593 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
594 				     0, flush_bytes,
595 				     blk_request_complete_cb, task);
596 		if (rc) {
597 			if (rc == -ENOMEM) {
598 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
599 				blk_request_queue_io(task);
600 			} else {
601 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
602 				return -1;
603 			}
604 		}
605 		break;
606 	case VIRTIO_BLK_T_GET_ID:
607 		if (!task->iovcnt || !payload_len) {
608 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
609 			return -1;
610 		}
611 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
612 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
613 				task->used_len, ' ');
614 		blk_request_finish(true, task);
615 		break;
616 	default:
617 		SPDK_DEBUGLOG(vhost_blk, "Not supported request type '%"PRIu32"'.\n", type);
618 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
619 		return -1;
620 	}
621 
622 	return 0;
623 }
624 
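/* Handle one request from a split virtqueue: claim the per-descriptor task, build its
 * iovecs and submit it.
 */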
625 static void
626 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
627 {
628 	struct spdk_vhost_blk_task *task;
629 	int rc;
630 
631 	assert(vq->packed.packed_ring == false);
632 
633 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[req_idx];
634 	if (spdk_unlikely(task->used)) {
635 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
636 			    task->bvsession->vsession.name, req_idx);
637 		task->used_len = 0;
638 		blk_task_enqueue(task);
639 		return;
640 	}
641 
642 	task->bvsession->vsession.task_cnt++;
643 
644 	blk_task_init(task);
645 
646 	rc = blk_iovs_split_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
647 					&task->payload_size);
648 
649 	if (rc) {
650 		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
651 		/* Only READ and WRITE are supported for now. */
652 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
653 		return;
654 	}
655 
656 	if (process_blk_request(task, task->bvsession) == 0) {
657 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
658 			      req_idx);
659 	} else {
660 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, req_idx);
661 	}
662 }
663 
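/* Handle one request from a packed virtqueue; see the comment below for why buffer_id
 * is used as the task index.
 */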
664 static void
665 process_packed_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
666 {
667 	struct spdk_vhost_blk_task *task;
668 	uint16_t task_idx = req_idx, num_descs;
669 	int rc;
670 
671 	assert(vq->packed.packed_ring);
672 
673 	/* A packed ring uses the buffer_id as the task_idx to look up the task struct.
674 	 * The kernel driver uses vq->free_head to set the buffer_id, so the value
675 	 * must be in the range of 0 ~ vring.size and the free_head value is unique
676 	 * among the outstanding requests.
677 	 * We can't use the req_idx as the task_idx because a desc can be reused in
678 	 * the next phase even when it wasn't completed in the previous phase. For example,
679 	 * at phase 0, last_used_idx was 2 and desc0 was not completed. Then, after moving to
680 	 * phase 1, last_avail_idx is updated to 1. In this case, req_idx cannot be used
681 	 * as the task_idx because we would find task[0]->used still true at phase 1.
682 	 * The split queue is quite different: a desc is only inserted into the free list when
683 	 * the device completes the request, and the driver takes descs from the free list,
684 	 * which ensures the req_idx is unique among the outstanding requests.
685 	 */
686 	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
687 
688 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
689 	if (spdk_unlikely(task->used)) {
690 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
691 			    task->bvsession->vsession.name, task_idx);
692 		task->used_len = 0;
693 		blk_task_enqueue(task);
694 		return;
695 	}
696 
697 	task->req_idx = req_idx;
698 	task->num_descs = num_descs;
699 	task->buffer_id = task_idx;
700 
701 	rte_vhost_set_inflight_desc_packed(task->bvsession->vsession.vid, vq->vring_idx,
702 					   req_idx, (req_idx + num_descs - 1) % vq->vring.size,
703 					   &task->inflight_head);
704 
705 	task->bvsession->vsession.task_cnt++;
706 
707 	blk_task_init(task);
708 
709 	rc = blk_iovs_packed_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
710 					 &task->payload_size);
711 	if (rc) {
712 		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
713 		/* Only READ and WRITE are supported for now. */
714 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
715 		return;
716 	}
717 
718 	if (process_blk_request(task, task->bvsession) == 0) {
719 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
720 			      task_idx);
721 	} else {
722 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
723 	}
724 }
725 
726 static void
727 process_packed_inflight_blk_task(struct spdk_vhost_virtqueue *vq,
728 				 uint16_t req_idx)
729 {
730 	spdk_vhost_inflight_desc *desc_array = vq->vring_inflight.inflight_packed->desc;
731 	spdk_vhost_inflight_desc *desc = &desc_array[req_idx];
732 	struct spdk_vhost_blk_task *task;
733 	uint16_t task_idx, num_descs;
734 	int rc;
735 
736 	task_idx = desc_array[desc->last].id;
737 	num_descs = desc->num;
738 	/* In packed ring reconnection, we use the last_used_idx as the
739 	 * initial value. So when we process the inflight descs we still
740 	 * need to update the available ring index.
741 	 */
742 	vq->last_avail_idx += num_descs;
743 	if (vq->last_avail_idx >= vq->vring.size) {
744 		vq->last_avail_idx -= vq->vring.size;
745 		vq->packed.avail_phase = !vq->packed.avail_phase;
746 	}
747 
748 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
749 	if (spdk_unlikely(task->used)) {
750 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
751 			    task->bvsession->vsession.name, task_idx);
752 		task->used_len = 0;
753 		blk_task_enqueue(task);
754 		return;
755 	}
756 
757 	task->req_idx = req_idx;
758 	task->num_descs = num_descs;
759 	task->buffer_id = task_idx;
760 	/* It's for cleaning inflight entries */
761 	task->inflight_head = req_idx;
762 
763 	task->bvsession->vsession.task_cnt++;
764 
765 	blk_task_init(task);
766 
767 	rc = blk_iovs_inflight_queue_setup(task->bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
768 					   &task->payload_size);
769 	if (rc) {
770 		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
771 		/* Only READ and WRITE are supported for now. */
772 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
773 		return;
774 	}
775 
776 	if (process_blk_request(task, task->bvsession) == 0) {
777 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
778 			      task_idx);
779 	} else {
780 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
781 	}
782 }
783 
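/* Resubmit all requests recorded as inflight (e.g. after a guest reconnect) before
 * processing new submissions.
 */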
784 static void
785 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
786 		     struct spdk_vhost_virtqueue *vq)
787 {
788 	struct spdk_vhost_session *vsession = &bvsession->vsession;
789 	spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight;
790 	spdk_vhost_resubmit_desc *resubmit_list;
791 	uint16_t req_idx;
792 
793 	if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) {
794 		return;
795 	}
796 
797 	resubmit_list = resubmit->resubmit_list;
798 	while (resubmit->resubmit_num-- > 0) {
799 		req_idx = resubmit_list[resubmit->resubmit_num].index;
800 		SPDK_DEBUGLOG(vhost_blk, "====== Start processing request idx %"PRIu16" ======\n",
801 			      req_idx);
802 
803 		if (spdk_unlikely(req_idx >= vq->vring.size)) {
804 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
805 				    vsession->name, req_idx, vq->vring.size);
806 			vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
807 			continue;
808 		}
809 
810 		if (vq->packed.packed_ring) {
811 			process_packed_inflight_blk_task(vq, req_idx);
812 		} else {
813 			process_blk_task(vq, req_idx);
814 		}
815 	}
816 
817 	free(resubmit_list);
818 	resubmit->resubmit_list = NULL;
819 }
820 
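/* Drain one split virtqueue: resubmit any inflight requests first, then submit each
 * newly available request.
 */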
821 static void
822 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
823 {
824 	struct spdk_vhost_session *vsession = &bvsession->vsession;
825 	uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
826 	uint16_t reqs_cnt, i;
827 
828 	submit_inflight_desc(bvsession, vq);
829 
830 	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
831 	if (!reqs_cnt) {
832 		return;
833 	}
834 
835 	for (i = 0; i < reqs_cnt; i++) {
836 		SPDK_DEBUGLOG(vhost_blk, "====== Start processing request idx %"PRIu16" ======\n",
837 			      reqs[i]);
838 
839 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
840 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
841 				    vsession->name, reqs[i], vq->vring.size);
842 			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
843 			continue;
844 		}
845 
846 		rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]);
847 
848 		process_blk_task(vq, reqs[i]);
849 	}
850 }
851 
852 static void
853 process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
854 {
855 	uint16_t i = 0;
856 
857 	submit_inflight_desc(bvsession, vq);
858 
859 	while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS &&
860 	       vhost_vq_packed_ring_is_avail(vq)) {
861 		SPDK_DEBUGLOG(vhost_blk, "====== Start processing request idx %"PRIu16" ======\n",
862 			      vq->last_avail_idx);
863 
864 		process_packed_blk_task(vq, vq->last_avail_idx);
865 	}
866 }
867 
868 static int
869 _vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
870 {
871 	struct spdk_vhost_session *vsession = vq->vsession;
872 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
873 	bool packed_ring;
874 
875 	packed_ring = vq->packed.packed_ring;
876 	if (packed_ring) {
877 		process_packed_vq(bvsession, vq);
878 	} else {
879 		process_vq(bvsession, vq);
880 	}
881 
882 	vhost_session_vq_used_signal(vq);
883 
884 	return SPDK_POLLER_BUSY;
885 
886 }
887 
888 static int
889 vdev_vq_worker(void *arg)
890 {
891 	struct spdk_vhost_virtqueue *vq = arg;
892 
893 	return _vdev_vq_worker(vq);
894 }
895 
896 static int
897 vdev_worker(void *arg)
898 {
899 	struct spdk_vhost_blk_session *bvsession = arg;
900 	struct spdk_vhost_session *vsession = &bvsession->vsession;
901 	uint16_t q_idx;
902 
903 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
904 		_vdev_vq_worker(&vsession->virtqueue[q_idx]);
905 	}
906 
907 	return SPDK_POLLER_BUSY;
908 }
909 
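/* With no bdev attached, fail incoming requests with VIRTIO_BLK_S_IOERR instead of
 * submitting them.
 */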
910 static void
911 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
912 {
913 	struct spdk_vhost_session *vsession = &bvsession->vsession;
914 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
915 	uint32_t length;
916 	uint16_t iovcnt, req_idx;
917 
918 	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
919 		return;
920 	}
921 
922 	iovcnt = SPDK_COUNTOF(iovs);
923 	if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
924 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
925 		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
926 	}
927 
928 	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
929 }
930 
931 static void
932 no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
933 {
934 	struct spdk_vhost_session *vsession = &bvsession->vsession;
935 	struct spdk_vhost_blk_task *task;
936 	uint32_t length;
937 	uint16_t req_idx = vq->last_avail_idx;
938 	uint16_t task_idx, num_descs;
939 
940 	if (!vhost_vq_packed_ring_is_avail(vq)) {
941 		return;
942 	}
943 
944 	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
945 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
946 	if (spdk_unlikely(task->used)) {
947 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
948 			    vsession->name, req_idx);
949 		vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
950 					     task->buffer_id, task->used_len,
951 					     task->inflight_head);
952 		return;
953 	}
954 
955 	task->req_idx = req_idx;
956 	task->num_descs = num_descs;
957 	task->buffer_id = task_idx;
958 	blk_task_init(task);
959 
960 	if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
961 					&length)) {
962 		*(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
963 		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
964 	}
965 
966 	task->used = false;
967 	vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
968 				     task->buffer_id, task->used_len,
969 				     task->inflight_head);
970 }
971 
972 static int
973 _no_bdev_vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
974 {
975 	struct spdk_vhost_session *vsession = vq->vsession;
976 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
977 	bool packed_ring;
978 
979 	packed_ring = vq->packed.packed_ring;
980 	if (packed_ring) {
981 		no_bdev_process_packed_vq(bvsession, vq);
982 	} else {
983 		no_bdev_process_vq(bvsession, vq);
984 	}
985 
986 	vhost_session_vq_used_signal(vq);
987 
988 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
989 		spdk_put_io_channel(bvsession->io_channel);
990 		bvsession->io_channel = NULL;
991 	}
992 
993 	return SPDK_POLLER_BUSY;
994 }
995 
996 static int
997 no_bdev_vdev_vq_worker(void *arg)
998 {
999 	struct spdk_vhost_virtqueue *vq = arg;
1000 
1001 	return _no_bdev_vdev_vq_worker(vq);
1002 }
1003 
1004 static int
1005 no_bdev_vdev_worker(void *arg)
1006 {
1007 	struct spdk_vhost_blk_session *bvsession = arg;
1008 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1009 	uint16_t q_idx;
1010 
1011 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
1012 		_no_bdev_vdev_vq_worker(&vsession->virtqueue[q_idx]);
1013 	}
1014 
1015 	return SPDK_POLLER_BUSY;
1016 }
1017 
1018 static void
1019 vhost_blk_session_unregister_interrupts(struct spdk_vhost_blk_session *bvsession)
1020 {
1021 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1022 	struct spdk_vhost_virtqueue *vq;
1023 	int i;
1024 
1025 	SPDK_DEBUGLOG(vhost_blk, "unregister virtqueues interrupt\n");
1026 	for (i = 0; i < vsession->max_queues; i++) {
1027 		vq = &vsession->virtqueue[i];
1028 		if (vq->intr == NULL) {
1029 			break;
1030 		}
1031 
1032 		SPDK_DEBUGLOG(vhost_blk, "unregistering vq[%d], kickfd %d\n",
1033 			      i, vq->vring.kickfd);
1034 		spdk_interrupt_unregister(&vq->intr);
1035 	}
1036 }
1037 
1038 static int
1039 vhost_blk_session_register_interrupts(struct spdk_vhost_blk_session *bvsession,
1040 				      spdk_interrupt_fn fn)
1041 {
1042 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1043 	struct spdk_vhost_virtqueue *vq = NULL;
1044 	int i;
1045 
1046 	SPDK_DEBUGLOG(vhost_blk, "Register virtqueues interrupt\n");
1047 	for (i = 0; i < vsession->max_queues; i++) {
1048 		vq = &vsession->virtqueue[i];
1049 		SPDK_DEBUGLOG(vhost_blk, "registering vq[%d], kickfd %d\n",
1050 			      i, vq->vring.kickfd);
1051 
1052 		vq->intr = SPDK_INTERRUPT_REGISTER(vq->vring.kickfd, fn, vq);
1053 		if (vq->intr == NULL) {
1054 			SPDK_ERRLOG("Failed to register req notifier handler.\n");
1055 			goto err;
1056 		}
1057 	}
1058 
1059 	return 0;
1060 
1061 err:
1062 	vhost_blk_session_unregister_interrupts(bvsession);
1063 
1064 	return -1;
1065 }
1066 
1067 static void
1068 vhost_blk_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
1069 {
1070 	struct spdk_vhost_blk_session *bvsession = cb_arg;
1071 
1072 	vhost_session_set_interrupt_mode(&bvsession->vsession, interrupt_mode);
1073 }
1074 
1075 static struct spdk_vhost_blk_dev *
1076 to_blk_dev(struct spdk_vhost_dev *vdev)
1077 {
1078 	if (vdev == NULL) {
1079 		return NULL;
1080 	}
1081 
1082 	if (vdev->backend != &vhost_blk_device_backend) {
1083 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
1084 		return NULL;
1085 	}
1086 
1087 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
1088 }
1089 
1090 static int
1091 vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev,
1092 			     struct spdk_vhost_session *vsession,
1093 			     void *ctx)
1094 {
1095 #if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0)
1096 	SPDK_NOTICELOG("bdev resize: sending slave msg to vid(%d)\n", vsession->vid);
1097 	rte_vhost_slave_config_change(vsession->vid, false);
1098 #else
1099 	SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n");
1100 #endif
1101 
1102 	return 0;
1103 }
1104 
1105 static void
1106 blk_resize_cb(void *resize_ctx)
1107 {
1108 	struct spdk_vhost_blk_dev *bvdev = resize_ctx;
1109 
1110 	spdk_vhost_lock();
1111 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb,
1112 				  NULL, NULL);
1113 	spdk_vhost_unlock();
1114 }
1115 
1116 static void
1117 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
1118 {
1119 
1120 	/* All sessions have been notified, time to close the bdev */
1121 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1122 
1123 	assert(bvdev != NULL);
1124 	spdk_put_io_channel(bvdev->dummy_io_channel);
1125 	spdk_bdev_close(bvdev->bdev_desc);
1126 	bvdev->bdev_desc = NULL;
1127 	bvdev->bdev = NULL;
1128 }
1129 
1130 static int
1131 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
1132 			     struct spdk_vhost_session *vsession,
1133 			     void *ctx)
1134 {
1135 	struct spdk_vhost_blk_session *bvsession;
1136 	int rc;
1137 
1138 	bvsession = to_blk_session(vsession);
1139 	if (bvsession->requestq_poller) {
1140 		spdk_poller_unregister(&bvsession->requestq_poller);
1141 		if (vsession->virtqueue[0].intr) {
1142 			vhost_blk_session_unregister_interrupts(bvsession);
1143 			rc = vhost_blk_session_register_interrupts(bvsession, no_bdev_vdev_vq_worker);
1144 			if (rc) {
1145 				SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
1146 				return rc;
1147 			}
1148 		}
1149 
1150 		bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
1151 		spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode,
1152 					       bvsession);
1153 	}
1154 
1155 	return 0;
1156 }
1157 
1158 static void
1159 bdev_remove_cb(void *remove_ctx)
1160 {
1161 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
1162 
1163 	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
1164 		     bvdev->vdev.name);
1165 
1166 	spdk_vhost_lock();
1167 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
1168 				  vhost_dev_bdev_remove_cpl_cb, NULL);
1169 	spdk_vhost_unlock();
1170 }
1171 
1172 static void
1173 bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
1174 	      void *event_ctx)
1175 {
1176 	SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n",
1177 		      type,
1178 		      bdev->name);
1179 
1180 	switch (type) {
1181 	case SPDK_BDEV_EVENT_REMOVE:
1182 		SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name);
1183 		bdev_remove_cb(event_ctx);
1184 		break;
1185 	case SPDK_BDEV_EVENT_RESIZE:
1186 		SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name);
1187 		blk_resize_cb(event_ctx);
1188 		break;
1189 	default:
1190 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
1191 		break;
1192 	}
1193 }
1194 
1195 static void
1196 free_task_pool(struct spdk_vhost_blk_session *bvsession)
1197 {
1198 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1199 	struct spdk_vhost_virtqueue *vq;
1200 	uint16_t i;
1201 
1202 	for (i = 0; i < vsession->max_queues; i++) {
1203 		vq = &vsession->virtqueue[i];
1204 		if (vq->tasks == NULL) {
1205 			continue;
1206 		}
1207 
1208 		spdk_free(vq->tasks);
1209 		vq->tasks = NULL;
1210 	}
1211 }
1212 
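/* Allocate one pre-initialized task per descriptor for every initialized virtqueue of
 * the session.
 */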
1213 static int
1214 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
1215 {
1216 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1217 	struct spdk_vhost_virtqueue *vq;
1218 	struct spdk_vhost_blk_task *task;
1219 	uint32_t task_cnt;
1220 	uint16_t i;
1221 	uint32_t j;
1222 
1223 	for (i = 0; i < vsession->max_queues; i++) {
1224 		vq = &vsession->virtqueue[i];
1225 		if (vq->vring.desc == NULL) {
1226 			continue;
1227 		}
1228 
1229 		task_cnt = vq->vring.size;
1230 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
1231 			/* sanity check */
1232 			SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
1233 				    vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
1234 			free_task_pool(bvsession);
1235 			return -1;
1236 		}
1237 		vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
1238 					 SPDK_CACHE_LINE_SIZE, NULL,
1239 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1240 		if (vq->tasks == NULL) {
1241 			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
1242 				    vsession->name, task_cnt, i);
1243 			free_task_pool(bvsession);
1244 			return -1;
1245 		}
1246 
1247 		for (j = 0; j < task_cnt; j++) {
1248 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
1249 			task->bvsession = bvsession;
1250 			task->req_idx = j;
1251 			task->vq = vq;
1252 		}
1253 	}
1254 
1255 	return 0;
1256 }
1257 
1258 static int
1259 vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
1260 		   struct spdk_vhost_session *vsession, void *unused)
1261 {
1262 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1263 	struct spdk_vhost_blk_dev *bvdev;
1264 	int i, rc = 0;
1265 
1266 	bvdev = to_blk_dev(vdev);
1267 	assert(bvdev != NULL);
1268 	bvsession->bvdev = bvdev;
1269 
1270 	/* validate all I/O queues are in a contiguous index range */
1271 	for (i = 0; i < vsession->max_queues; i++) {
1272 		/* vring.desc and vring.desc_packed are in a union struct
1273 		 * so checking q->vring.desc covers q->vring.desc_packed as well.
1274 		 */
1275 		if (vsession->virtqueue[i].vring.desc == NULL) {
1276 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
1277 			rc = -1;
1278 			goto out;
1279 		}
1280 	}
1281 
1282 	rc = alloc_task_pool(bvsession);
1283 	if (rc != 0) {
1284 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
1285 		goto out;
1286 	}
1287 
1288 	if (bvdev->bdev) {
1289 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
1290 		if (!bvsession->io_channel) {
1291 			free_task_pool(bvsession);
1292 			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
1293 			rc = -1;
1294 			goto out;
1295 		}
1296 	}
1297 
1298 	if (spdk_interrupt_mode_is_enabled()) {
1299 		rc = vhost_blk_session_register_interrupts(bvsession,
1300 				bvdev->bdev ? vdev_vq_worker : no_bdev_vdev_vq_worker);
1301 		if (rc) {
1302 			SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
1303 			goto out;
1304 		}
1305 	}
1306 
1307 	bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
1308 				     bvsession, 0);
1309 	SPDK_INFOLOG(vhost, "%s: started poller on lcore %d\n",
1310 		     vsession->name, spdk_env_get_current_core());
1311 
1312 	spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode,
1313 				       bvsession);
1314 
1315 out:
1316 	vhost_session_start_done(vsession, rc);
1317 	return rc;
1318 }
1319 
1320 static int
1321 vhost_blk_start(struct spdk_vhost_session *vsession)
1322 {
1323 	return vhost_session_send_event(vsession, vhost_blk_start_cb,
1324 					3, "start session");
1325 }
1326 
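/* Poller that waits until all outstanding I/O has completed before releasing the
 * session's resources and finishing the stop sequence.
 */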
1327 static int
1328 destroy_session_poller_cb(void *arg)
1329 {
1330 	struct spdk_vhost_blk_session *bvsession = arg;
1331 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1332 	int i;
1333 
1334 	if (vsession->task_cnt > 0) {
1335 		return SPDK_POLLER_BUSY;
1336 	}
1337 
1338 	if (spdk_vhost_trylock() != 0) {
1339 		return SPDK_POLLER_BUSY;
1340 	}
1341 
1342 	for (i = 0; i < vsession->max_queues; i++) {
1343 		vsession->virtqueue[i].next_event_time = 0;
1344 		vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
1345 	}
1346 
1347 	SPDK_INFOLOG(vhost, "%s: stopping poller on lcore %d\n",
1348 		     vsession->name, spdk_env_get_current_core());
1349 
1350 	if (bvsession->io_channel) {
1351 		spdk_put_io_channel(bvsession->io_channel);
1352 		bvsession->io_channel = NULL;
1353 	}
1354 
1355 	free_task_pool(bvsession);
1356 	spdk_poller_unregister(&bvsession->stop_poller);
1357 	vhost_session_stop_done(vsession, 0);
1358 
1359 	spdk_vhost_unlock();
1360 	return SPDK_POLLER_BUSY;
1361 }
1362 
1363 static int
1364 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
1365 		  struct spdk_vhost_session *vsession, void *unused)
1366 {
1367 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1368 
1369 	spdk_poller_unregister(&bvsession->requestq_poller);
1370 
1371 	if (vsession->virtqueue[0].intr) {
1372 		vhost_blk_session_unregister_interrupts(bvsession);
1373 	}
1374 
1375 	bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
1376 				 bvsession, 1000);
1377 	return 0;
1378 }
1379 
1380 static int
1381 vhost_blk_stop(struct spdk_vhost_session *vsession)
1382 {
1383 	return vhost_session_send_event(vsession, vhost_blk_stop_cb,
1384 					3, "stop session");
1385 }
1386 
1387 static void
1388 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1389 {
1390 	struct spdk_vhost_blk_dev *bvdev;
1391 
1392 	bvdev = to_blk_dev(vdev);
1393 	assert(bvdev != NULL);
1394 
1395 	spdk_json_write_named_object_begin(w, "block");
1396 
1397 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1398 
1399 	spdk_json_write_name(w, "bdev");
1400 	if (bvdev->bdev) {
1401 		spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev));
1402 	} else {
1403 		spdk_json_write_null(w);
1404 	}
1405 
1406 	spdk_json_write_object_end(w);
1407 }
1408 
1409 static void
1410 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1411 {
1412 	struct spdk_vhost_blk_dev *bvdev;
1413 
1414 	bvdev = to_blk_dev(vdev);
1415 	assert(bvdev != NULL);
1416 
1417 	if (!bvdev->bdev) {
1418 		return;
1419 	}
1420 
1421 	spdk_json_write_object_begin(w);
1422 	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
1423 
1424 	spdk_json_write_named_object_begin(w, "params");
1425 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
1426 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
1427 	spdk_json_write_named_string(w, "cpumask",
1428 				     spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
1429 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1430 	spdk_json_write_object_end(w);
1431 
1432 	spdk_json_write_object_end(w);
1433 }
1434 
1435 static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
1436 
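/* Fill in the virtio_blk_config structure returned to the vhost-user master on
 * VHOST_USER_GET_CONFIG.
 */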
1437 static int
1438 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
1439 		     uint32_t len)
1440 {
1441 	struct virtio_blk_config blkcfg;
1442 	struct spdk_vhost_blk_dev *bvdev;
1443 	struct spdk_bdev *bdev;
1444 	uint32_t blk_size;
1445 	uint64_t blkcnt;
1446 
1447 	memset(&blkcfg, 0, sizeof(blkcfg));
1448 	bvdev = to_blk_dev(vdev);
1449 	assert(bvdev != NULL);
1450 	bdev = bvdev->bdev;
1451 	if (bdev == NULL) {
1452 		/* We can't just return -1 here as this GET_CONFIG message might
1453 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
1454 		 * error to QEMU, which might then decide to terminate itself.
1455 		 * We don't want that. A simple reboot shouldn't break the system.
1456 		 *
1457 		 * Presenting a block device with block size 0 and block count 0
1458 		 * doesn't cause any problems on QEMU side and the virtio-pci
1459 		 * device is even still available inside the VM, but there will
1460 		 * be no block device created for it - the kernel drivers will
1461 		 * silently reject it.
1462 		 */
1463 		blk_size = 0;
1464 		blkcnt = 0;
1465 	} else {
1466 		blk_size = spdk_bdev_get_block_size(bdev);
1467 		blkcnt = spdk_bdev_get_num_blocks(bdev);
1468 		if (spdk_bdev_get_buf_align(bdev) > 1) {
1469 			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
1470 			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
1471 		} else {
1472 			blkcfg.size_max = 131072;
1473 			/*  -2 for REQ and RESP and -1 for region boundary splitting */
1474 			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
1475 		}
1476 	}
1477 
1478 	blkcfg.blk_size = blk_size;
1479 	/* minimum I/O size in blocks */
1480 	blkcfg.min_io_size = 1;
1481 	/* expressed in 512-byte sectors */
1482 	blkcfg.capacity = (blkcnt * blk_size) / 512;
1483 	/* QEMU can overwrite this value when started */
1484 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
1485 
1486 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1487 		/* 16 MiB, expressed in 512-byte sectors */
1488 		blkcfg.max_discard_sectors = 32768;
1489 		blkcfg.max_discard_seg = 1;
1490 		blkcfg.discard_sector_alignment = blk_size / 512;
1491 	}
1492 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1493 		blkcfg.max_write_zeroes_sectors = 32768;
1494 		blkcfg.max_write_zeroes_seg = 1;
1495 	}
1496 
1497 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
1498 
1499 	return 0;
1500 }
1501 
1502 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
1503 	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
1504 	.start_session =  vhost_blk_start,
1505 	.stop_session = vhost_blk_stop,
1506 	.vhost_get_config = vhost_blk_get_config,
1507 	.dump_info_json = vhost_blk_dump_info_json,
1508 	.write_config_json = vhost_blk_write_config_json,
1509 	.remove_device = vhost_blk_destroy,
1510 };
1511 
1512 int
1513 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
1514 			 bool readonly, bool packed_ring)
1515 {
1516 	struct spdk_vhost_blk_dev *bvdev = NULL;
1517 	struct spdk_vhost_dev *vdev;
1518 	struct spdk_bdev *bdev;
1519 	int ret = 0;
1520 
1521 	spdk_vhost_lock();
1522 
1523 	bvdev = calloc(1, sizeof(*bvdev));
1524 	if (bvdev == NULL) {
1525 		ret = -ENOMEM;
1526 		goto out;
1527 	}
1528 
1529 	ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc);
1530 	if (ret != 0) {
1531 		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
1532 			    name, dev_name, ret);
1533 		goto out;
1534 	}
1535 	bdev = spdk_bdev_desc_get_bdev(bvdev->bdev_desc);
1536 
1537 	vdev = &bvdev->vdev;
1538 	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
1539 	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
1540 	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
1541 
1542 	vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED;
1543 
1544 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1545 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1546 	}
1547 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1548 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1549 	}
1550 	if (readonly) {
1551 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
1552 	}
1553 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1554 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1555 	}
1556 
1557 	/*
1558 	 * When starting QEMU with vhost-user-blk multiqueue, the vhost device will
1559 	 * be started/stopped many times, depending on the number of queues, as the
1560 	 * vhost-user backend doesn't know the exact number of queues used for this
1561 	 * device. The target has to stop and start the device once it gets a valid
1562 	 * I/O queue.
1563 	 * While the vhost device is being stopped and started, the backend bdev io device
1564 	 * will be deleted and created repeatedly.
1565 	 * Hold a bdev reference in struct spdk_vhost_blk_dev so that
1566 	 * the io device will not be deleted.
1567 	 */
1568 	bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
1569 
1570 	bvdev->bdev = bdev;
1571 	bvdev->readonly = readonly;
1572 	ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
1573 	if (ret != 0) {
1574 		spdk_put_io_channel(bvdev->dummy_io_channel);
1575 		spdk_bdev_close(bvdev->bdev_desc);
1576 		goto out;
1577 	}
1578 
1579 	SPDK_INFOLOG(vhost, "%s: using bdev '%s'\n", name, dev_name);
1580 out:
1581 	if (ret != 0 && bvdev) {
1582 		free(bvdev);
1583 	}
1584 	spdk_vhost_unlock();
1585 	return ret;
1586 }
1587 
1588 static int
1589 vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1590 {
1591 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1592 	int rc;
1593 
1594 	assert(bvdev != NULL);
1595 
1596 	rc = vhost_dev_unregister(&bvdev->vdev);
1597 	if (rc != 0) {
1598 		return rc;
1599 	}
1600 
1601 	/* If the bdev has been removed, there is no need to call spdk_put_io_channel(). */
1602 	if (bvdev->bdev) {
1603 		spdk_put_io_channel(bvdev->dummy_io_channel);
1604 	}
1605 
1606 	if (bvdev->bdev_desc) {
1607 		spdk_bdev_close(bvdev->bdev_desc);
1608 		bvdev->bdev_desc = NULL;
1609 	}
1610 	bvdev->bdev = NULL;
1611 
1612 	free(bvdev);
1613 	return 0;
1614 }
1615 
1616 SPDK_LOG_REGISTER_COMPONENT(vhost_blk)
1617 SPDK_LOG_REGISTER_COMPONENT(vhost_blk_data)
1618