xref: /spdk/lib/vhost/vhost_blk.c (revision 8a76c2484a2eae4014a1c22e985b20b2cef801df)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/conf.h"
40 #include "spdk/thread.h"
41 #include "spdk/likely.h"
42 #include "spdk/string.h"
43 #include "spdk/util.h"
44 #include "spdk/vhost.h"
45 
46 #include "vhost_internal.h"
47 #include <rte_version.h>
48 
/*
 * Minimal set of features supported by every SPDK VHOST-BLK device:
 * the generic SPDK vhost features plus the virtio-blk bits listed here.
 * GEOMETRY, BARRIER, SCSI and CONFIG_WCE also appear in the disabled mask below.
 */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
		(1ULL << VIRTIO_BLK_F_SIZE_MAX)   | \
		(1ULL << VIRTIO_BLK_F_SEG_MAX)    | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY)   | \
		(1ULL << VIRTIO_BLK_F_BLK_SIZE)   | \
		(1ULL << VIRTIO_BLK_F_TOPOLOGY)   | \
		(1ULL << VIRTIO_BLK_F_BARRIER)    | \
		(1ULL << VIRTIO_BLK_F_SCSI)       | \
		(1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_MQ))

/* Features that are not supported: the generic disabled set plus these virtio-blk bits. */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY)   | \
		(1ULL << VIRTIO_BLK_F_BARRIER)    | \
		(1ULL << VIRTIO_BLK_F_SCSI)       | \
		(1ULL << VIRTIO_BLK_F_CONFIG_WCE))

/*
 * Vhost-user protocol features supported by vhost-blk.
 * The inflight shared-memory fd feature is only offered when SPDK is not
 * built with SPDK_CONFIG_VHOST_INTERNAL_LIB.
 */
#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
#else
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
#endif
69 
70 struct spdk_vhost_blk_task {
71 	struct spdk_bdev_io *bdev_io;
72 	struct spdk_vhost_blk_session *bvsession;
73 	struct spdk_vhost_virtqueue *vq;
74 
75 	volatile uint8_t *status;
76 
77 	uint16_t req_idx;
78 	uint16_t num_descs;
79 	uint16_t buffer_id;
80 
81 	/* for io wait */
82 	struct spdk_bdev_io_wait_entry bdev_io_wait;
83 
84 	/* If set, the task is currently used for I/O processing. */
85 	bool used;
86 
87 	/** Number of bytes that were written. */
88 	uint32_t used_len;
89 	uint16_t iovcnt;
90 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
91 };
92 
93 struct spdk_vhost_blk_dev {
94 	struct spdk_vhost_dev vdev;
95 	struct spdk_bdev *bdev;
96 	struct spdk_bdev_desc *bdev_desc;
97 	/* dummy_io_channel is used to hold a bdev reference */
98 	struct spdk_io_channel *dummy_io_channel;
99 	bool readonly;
100 };
101 
102 struct spdk_vhost_blk_session {
103 	/* The parent session must be the very first field in this struct */
104 	struct spdk_vhost_session vsession;
105 	struct spdk_vhost_blk_dev *bvdev;
106 	struct spdk_poller *requestq_poller;
107 	struct spdk_io_channel *io_channel;
108 	struct spdk_poller *stop_poller;
109 };
110 
111 /* forward declaration */
112 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
113 
114 static int
115 process_blk_request(struct spdk_vhost_blk_task *task,
116 		    struct spdk_vhost_blk_session *bvsession,
117 		    struct spdk_vhost_virtqueue *vq);
118 
119 static void
120 blk_task_finish(struct spdk_vhost_blk_task *task)
121 {
122 	assert(task->bvsession->vsession.task_cnt > 0);
123 	task->bvsession->vsession.task_cnt--;
124 	task->used = false;
125 }
126 
127 static void
128 blk_task_init(struct spdk_vhost_blk_task *task)
129 {
130 	task->used = true;
131 	task->iovcnt = SPDK_COUNTOF(task->iovs);
132 	task->status = NULL;
133 	task->used_len = 0;
134 }
135 
136 static void
137 blk_task_enqueue(struct spdk_vhost_blk_task *task)
138 {
139 	if (task->vq->packed.packed_ring) {
140 		vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq,
141 					     task->num_descs,
142 					     task->buffer_id, task->used_len);
143 	} else {
144 		vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq,
145 					   task->req_idx, task->used_len);
146 	}
147 }
148 
149 static void
150 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
151 {
152 	if (task->status) {
153 		*task->status = status;
154 	}
155 
156 	blk_task_enqueue(task);
157 	blk_task_finish(task);
158 	SPDK_DEBUGLOG(vhost_blk_data, "Invalid request (status=%" PRIu8")\n", status);
159 }
160 
161 /*
162  * Process task's descriptor chain and setup data related fields.
163  * Return
164  *   total size of suplied buffers
165  *
166  *   FIXME: Make this function return to rd_cnt and wr_cnt
167  */
168 static int
169 blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
170 			   struct spdk_vhost_virtqueue *vq,
171 			   uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
172 {
173 	struct spdk_vhost_session *vsession = &bvsession->vsession;
174 	struct spdk_vhost_dev *vdev = vsession->vdev;
175 	struct vring_desc *desc, *desc_table;
176 	uint16_t out_cnt = 0, cnt = 0;
177 	uint32_t desc_table_size, len = 0;
178 	uint32_t desc_handled_cnt;
179 	int rc;
180 
181 	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
182 	if (rc != 0) {
183 		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
184 		return -1;
185 	}
186 
187 	desc_handled_cnt = 0;
188 	while (1) {
189 		/*
190 		 * Maximum cnt reached?
191 		 * Should not happen if request is well formatted, otherwise this is a BUG.
192 		 */
193 		if (spdk_unlikely(cnt == *iovs_cnt)) {
194 			SPDK_DEBUGLOG(vhost_blk, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
195 				      vsession->name, req_idx);
196 			return -1;
197 		}
198 
199 		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
200 			SPDK_DEBUGLOG(vhost_blk, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
201 				      vsession->name, req_idx, cnt);
202 			return -1;
203 		}
204 
205 		len += desc->len;
206 
207 		out_cnt += vhost_vring_desc_is_wr(desc);
208 
209 		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
210 		if (rc != 0) {
211 			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
212 				    vsession->name, req_idx);
213 			return -1;
214 		} else if (desc == NULL) {
215 			break;
216 		}
217 
218 		desc_handled_cnt++;
219 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
220 			/* Break a cycle and report an error, if any. */
221 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
222 				    vsession->name, desc_table_size, desc_handled_cnt);
223 			return -1;
224 		}
225 	}
226 
227 	/*
228 	 * There must be least two descriptors.
229 	 * First contain request so it must be readable.
230 	 * Last descriptor contain buffer for response so it must be writable.
231 	 */
232 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
233 		return -1;
234 	}
235 
236 	*length = len;
237 	*iovs_cnt = cnt;
238 	return 0;
239 }
240 
241 static int
242 blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
243 			    struct spdk_vhost_virtqueue *vq,
244 			    uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
245 {
246 	struct spdk_vhost_session *vsession = &bvsession->vsession;
247 	struct spdk_vhost_dev *vdev = vsession->vdev;
248 	struct vring_packed_desc *desc = NULL, *desc_table;
249 	uint16_t out_cnt = 0, cnt = 0;
250 	uint32_t desc_table_size, len = 0;
251 	int rc = 0;
252 
253 	rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc,
254 				      &desc_table, &desc_table_size);
255 	if (spdk_unlikely(rc != 0)) {
256 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
257 		return rc;
258 	}
259 
260 	if (desc_table != NULL) {
261 		req_idx = 0;
262 	}
263 
264 	while (1) {
265 		/*
266 		 * Maximum cnt reached?
267 		 * Should not happen if request is well formatted, otherwise this is a BUG.
268 		 */
269 		if (spdk_unlikely(cnt == *iovs_cnt)) {
270 			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
271 				    vsession->name, req_idx);
272 			return -EINVAL;
273 		}
274 
275 		if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
276 			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
277 				    vsession->name, req_idx, cnt);
278 			return -EINVAL;
279 		}
280 
281 		len += desc->len;
282 		out_cnt += vhost_vring_packed_desc_is_wr(desc);
283 
284 		/* desc is NULL means we reach the last desc of this request */
285 		vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
286 		if (desc == NULL) {
287 			break;
288 		}
289 	}
290 
291 	/*
292 	 * There must be least two descriptors.
293 	 * First contain request so it must be readable.
294 	 * Last descriptor contain buffer for response so it must be writable.
295 	 */
296 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
297 		return -EINVAL;
298 	}
299 
300 	*length = len;
301 	*iovs_cnt = cnt;
302 
303 	return 0;
304 }
305 
306 static void
307 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
308 {
309 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
310 
311 	blk_task_enqueue(task);
312 
313 	SPDK_DEBUGLOG(vhost_blk, "Finished task (%p) req_idx=%d\n status: %s\n", task,
314 		      task->req_idx, success ? "OK" : "FAIL");
315 	blk_task_finish(task);
316 }
317 
/*
 * bdev completion callback: return the bdev_io to the bdev layer first, then
 * complete the task passed as cb_arg with the reported result.
 */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_vhost_blk_task *finished = cb_arg;

	spdk_bdev_free_io(bdev_io);

	blk_request_finish(success, finished);
}
326 
327 static void
328 blk_request_resubmit(void *arg)
329 {
330 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
331 	int rc = 0;
332 
333 	blk_task_init(task);
334 
335 	rc = process_blk_request(task, task->bvsession, task->vq);
336 	if (rc == 0) {
337 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p resubmitted ======\n", task);
338 	} else {
339 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p failed ======\n", task);
340 	}
341 }
342 
343 static inline void
344 blk_request_queue_io(struct spdk_vhost_blk_task *task)
345 {
346 	int rc;
347 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
348 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
349 
350 	task->bdev_io_wait.bdev = bdev;
351 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
352 	task->bdev_io_wait.cb_arg = task;
353 
354 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
355 	if (rc != 0) {
356 		SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
357 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
358 	}
359 }
360 
361 static int
362 process_blk_request(struct spdk_vhost_blk_task *task,
363 		    struct spdk_vhost_blk_session *bvsession,
364 		    struct spdk_vhost_virtqueue *vq)
365 {
366 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
367 	const struct virtio_blk_outhdr *req;
368 	struct virtio_blk_discard_write_zeroes *desc;
369 	struct iovec *iov;
370 	uint32_t type;
371 	uint32_t payload_len;
372 	uint64_t flush_bytes;
373 	int rc;
374 
375 	if (vq->packed.packed_ring) {
376 		rc = blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
377 						 &payload_len);
378 	} else {
379 		rc = blk_iovs_split_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
380 						&payload_len);
381 	}
382 
383 	if (rc) {
384 		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
385 		/* Only READ and WRITE are supported for now. */
386 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
387 		return -1;
388 	}
389 
390 	iov = &task->iovs[0];
391 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
392 		SPDK_DEBUGLOG(vhost_blk,
393 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
394 			      iov->iov_len, sizeof(*req), task->req_idx);
395 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
396 		return -1;
397 	}
398 
399 	req = iov->iov_base;
400 
401 	iov = &task->iovs[task->iovcnt - 1];
402 	if (spdk_unlikely(iov->iov_len != 1)) {
403 		SPDK_DEBUGLOG(vhost_blk,
404 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
405 			      iov->iov_len, 1, task->req_idx);
406 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
407 		return -1;
408 	}
409 
410 	task->status = iov->iov_base;
411 	payload_len -= sizeof(*req) + sizeof(*task->status);
412 	task->iovcnt -= 2;
413 
414 	type = req->type;
415 #ifdef VIRTIO_BLK_T_BARRIER
416 	/* Don't care about barier for now (as QEMU's virtio-blk do). */
417 	type &= ~VIRTIO_BLK_T_BARRIER;
418 #endif
419 
420 	switch (type) {
421 	case VIRTIO_BLK_T_IN:
422 	case VIRTIO_BLK_T_OUT:
423 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
424 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
425 				    type ? "WRITE" : "READ", task->req_idx);
426 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
427 			return -1;
428 		}
429 
430 		if (type == VIRTIO_BLK_T_IN) {
431 			task->used_len = payload_len + sizeof(*task->status);
432 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
433 					     &task->iovs[1], task->iovcnt, req->sector * 512,
434 					     payload_len, blk_request_complete_cb, task);
435 		} else if (!bvdev->readonly) {
436 			task->used_len = sizeof(*task->status);
437 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
438 					      &task->iovs[1], task->iovcnt, req->sector * 512,
439 					      payload_len, blk_request_complete_cb, task);
440 		} else {
441 			SPDK_DEBUGLOG(vhost_blk, "Device is in read-only mode!\n");
442 			rc = -1;
443 		}
444 
445 		if (rc) {
446 			if (rc == -ENOMEM) {
447 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
448 				blk_request_queue_io(task);
449 			} else {
450 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
451 				return -1;
452 			}
453 		}
454 		break;
455 	case VIRTIO_BLK_T_DISCARD:
456 		desc = task->iovs[1].iov_base;
457 		if (payload_len != sizeof(*desc)) {
458 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
459 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
460 			return -1;
461 		}
462 
463 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
464 				     desc->sector * 512, desc->num_sectors * 512,
465 				     blk_request_complete_cb, task);
466 		if (rc) {
467 			if (rc == -ENOMEM) {
468 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
469 				blk_request_queue_io(task);
470 			} else {
471 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
472 				return -1;
473 			}
474 		}
475 		break;
476 	case VIRTIO_BLK_T_WRITE_ZEROES:
477 		desc = task->iovs[1].iov_base;
478 		if (payload_len != sizeof(*desc)) {
479 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
480 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
481 			return -1;
482 		}
483 
484 		/* Zeroed and Unmap the range, SPDK doen't support it. */
485 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
486 			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
487 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
488 			return -1;
489 		}
490 
491 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
492 					    desc->sector * 512, desc->num_sectors * 512,
493 					    blk_request_complete_cb, task);
494 		if (rc) {
495 			if (rc == -ENOMEM) {
496 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
497 				blk_request_queue_io(task);
498 			} else {
499 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
500 				return -1;
501 			}
502 		}
503 		break;
504 	case VIRTIO_BLK_T_FLUSH:
505 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
506 		if (req->sector != 0) {
507 			SPDK_NOTICELOG("sector must be zero for flush command\n");
508 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
509 			return -1;
510 		}
511 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
512 				     0, flush_bytes,
513 				     blk_request_complete_cb, task);
514 		if (rc) {
515 			if (rc == -ENOMEM) {
516 				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
517 				blk_request_queue_io(task);
518 			} else {
519 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
520 				return -1;
521 			}
522 		}
523 		break;
524 	case VIRTIO_BLK_T_GET_ID:
525 		if (!task->iovcnt || !payload_len) {
526 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
527 			return -1;
528 		}
529 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
530 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
531 				task->used_len, ' ');
532 		blk_request_finish(true, task);
533 		break;
534 	default:
535 		SPDK_DEBUGLOG(vhost_blk, "Not supported request type '%"PRIu32"'.\n", type);
536 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
537 		return -1;
538 	}
539 
540 	return 0;
541 }
542 
543 static void
544 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
545 {
546 	struct spdk_vhost_blk_task *task;
547 	uint16_t task_idx = req_idx, num_descs;
548 
549 	if (vq->packed.packed_ring) {
550 		/* Packed ring used the buffer_id as the task_idx to get task struct.
551 		 * In kernel driver, it uses the vq->free_head to set the buffer_id so the value
552 		 * must be in the range of 0 ~ vring.size. The free_head value must be unique
553 		 * in the outstanding requests.
554 		 * We can't use the req_idx as the task_idx because the desc can be reused in
555 		 * the next phase even when it's not completed in the previous phase. For example,
556 		 * At phase 0, last_used_idx was 2 and desc0 was not completed.Then after moving
557 		 * phase 1, last_avail_idx is updated to 1. In this case, req_idx can not be used
558 		 * as task_idx because we will know task[0]->used is true at phase 1.
559 		 * The split queue is quite different, the desc would insert into the free list when
560 		 * device completes the request, the driver gets the desc from the free list which
561 		 * ensures the req_idx is unique in the outstanding requests.
562 		 */
563 		task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
564 	}
565 
566 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
567 	if (spdk_unlikely(task->used)) {
568 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
569 			    task->bvsession->vsession.name, task_idx);
570 		task->used_len = 0;
571 		blk_task_enqueue(task);
572 		return;
573 	}
574 
575 	if (vq->packed.packed_ring) {
576 		task->req_idx = req_idx;
577 		task->num_descs = num_descs;
578 		task->buffer_id = task_idx;
579 	}
580 
581 	task->bvsession->vsession.task_cnt++;
582 
583 	blk_task_init(task);
584 
585 	if (process_blk_request(task, task->bvsession, vq) == 0) {
586 		SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
587 			      task_idx);
588 	} else {
589 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
590 	}
591 }
592 
593 static void
594 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
595 		     struct spdk_vhost_virtqueue *vq)
596 {
597 	struct spdk_vhost_session *vsession = &bvsession->vsession;
598 	spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight;
599 	spdk_vhost_resubmit_desc *resubmit_list;
600 	uint16_t req_idx;
601 
602 	if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) {
603 		return;
604 	}
605 
606 	resubmit_list = resubmit->resubmit_list;
607 	while (resubmit->resubmit_num-- > 0) {
608 		req_idx = resubmit_list[resubmit->resubmit_num].index;
609 		SPDK_DEBUGLOG(vhost_blk, "====== Start processing request idx %"PRIu16"======\n",
610 			      req_idx);
611 
612 		if (spdk_unlikely(req_idx >= vq->vring.size)) {
613 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
614 				    vsession->name, req_idx, vq->vring.size);
615 			vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
616 			continue;
617 		}
618 
619 		process_blk_task(vq, req_idx);
620 	}
621 
622 	free(resubmit_list);
623 	resubmit->resubmit_list = NULL;
624 }
625 
626 static void
627 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
628 {
629 	struct spdk_vhost_session *vsession = &bvsession->vsession;
630 	uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
631 	uint16_t reqs_cnt, i;
632 
633 	submit_inflight_desc(bvsession, vq);
634 
635 	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
636 	if (!reqs_cnt) {
637 		return;
638 	}
639 
640 	for (i = 0; i < reqs_cnt; i++) {
641 		SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
642 			      reqs[i]);
643 
644 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
645 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
646 				    vsession->name, reqs[i], vq->vring.size);
647 			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
648 			continue;
649 		}
650 
651 		rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]);
652 
653 		process_blk_task(vq, reqs[i]);
654 	}
655 }
656 
657 static void
658 process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
659 {
660 	uint16_t i = 0;
661 
662 	while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS &&
663 	       vhost_vq_packed_ring_is_avail(vq)) {
664 		SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
665 			      vq->last_avail_idx);
666 
667 		process_blk_task(vq, vq->last_avail_idx);
668 	}
669 }
670 
671 static int
672 vdev_worker(void *arg)
673 {
674 	struct spdk_vhost_blk_session *bvsession = arg;
675 	struct spdk_vhost_session *vsession = &bvsession->vsession;
676 
677 	uint16_t q_idx;
678 	bool packed_ring;
679 
680 	/* In a session, every vq supports the same format */
681 	packed_ring = vsession->virtqueue[0].packed.packed_ring;
682 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
683 		if (packed_ring) {
684 			process_packed_vq(bvsession, &vsession->virtqueue[q_idx]);
685 		} else {
686 			process_vq(bvsession, &vsession->virtqueue[q_idx]);
687 		}
688 	}
689 
690 	vhost_session_used_signal(vsession);
691 
692 	return SPDK_POLLER_BUSY;
693 }
694 
695 static void
696 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
697 {
698 	struct spdk_vhost_session *vsession = &bvsession->vsession;
699 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
700 	uint32_t length;
701 	uint16_t iovcnt, req_idx;
702 
703 	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
704 		return;
705 	}
706 
707 	iovcnt = SPDK_COUNTOF(iovs);
708 	if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
709 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
710 		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
711 	}
712 
713 	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
714 }
715 
716 static void
717 no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
718 {
719 	struct spdk_vhost_session *vsession = &bvsession->vsession;
720 	struct spdk_vhost_blk_task *task;
721 	uint32_t length;
722 	uint16_t req_idx = vq->last_avail_idx;
723 	uint16_t task_idx, num_descs;
724 
725 	if (!vhost_vq_packed_ring_is_avail(vq)) {
726 		return;
727 	}
728 
729 	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
730 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
731 	if (spdk_unlikely(task->used)) {
732 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
733 			    vsession->name, req_idx);
734 		vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
735 					     task->buffer_id, task->used_len);
736 		return;
737 	}
738 
739 	task->req_idx = req_idx;
740 	task->num_descs = num_descs;
741 	task->buffer_id = task_idx;
742 	blk_task_init(task);
743 
744 	if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
745 					&length)) {
746 		*(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
747 		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
748 	}
749 
750 	task->used = false;
751 	vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
752 				     task->buffer_id, task->used_len);
753 }
754 
755 static int
756 no_bdev_vdev_worker(void *arg)
757 {
758 	struct spdk_vhost_blk_session *bvsession = arg;
759 	struct spdk_vhost_session *vsession = &bvsession->vsession;
760 	uint16_t q_idx;
761 	bool packed_ring;
762 
763 	/* In a session, every vq supports the same format */
764 	packed_ring = vsession->virtqueue[0].packed.packed_ring;
765 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
766 		if (packed_ring) {
767 			no_bdev_process_packed_vq(bvsession, &vsession->virtqueue[q_idx]);
768 		} else {
769 			no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
770 		}
771 	}
772 
773 	vhost_session_used_signal(vsession);
774 
775 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
776 		spdk_put_io_channel(bvsession->io_channel);
777 		bvsession->io_channel = NULL;
778 	}
779 
780 	return SPDK_POLLER_BUSY;
781 }
782 
783 static struct spdk_vhost_blk_session *
784 to_blk_session(struct spdk_vhost_session *vsession)
785 {
786 	assert(vsession->vdev->backend == &vhost_blk_device_backend);
787 	return (struct spdk_vhost_blk_session *)vsession;
788 }
789 
790 static struct spdk_vhost_blk_dev *
791 to_blk_dev(struct spdk_vhost_dev *vdev)
792 {
793 	if (vdev == NULL) {
794 		return NULL;
795 	}
796 
797 	if (vdev->backend != &vhost_blk_device_backend) {
798 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
799 		return NULL;
800 	}
801 
802 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
803 }
804 
805 static int
806 vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev,
807 			     struct spdk_vhost_session *vsession,
808 			     void *ctx)
809 {
810 #if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0)
811 	SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid);
812 	rte_vhost_slave_config_change(vsession->vid, false);
813 #else
814 	SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n");
815 #endif
816 
817 	return 0;
818 }
819 
820 static void
821 blk_resize_cb(void *resize_ctx)
822 {
823 	struct spdk_vhost_blk_dev *bvdev = resize_ctx;
824 
825 	spdk_vhost_lock();
826 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb,
827 				  NULL, NULL);
828 	spdk_vhost_unlock();
829 }
830 
831 static void
832 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
833 {
834 
835 	/* All sessions have been notified, time to close the bdev */
836 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
837 
838 	assert(bvdev != NULL);
839 	spdk_put_io_channel(bvdev->dummy_io_channel);
840 	spdk_bdev_close(bvdev->bdev_desc);
841 	bvdev->bdev_desc = NULL;
842 	bvdev->bdev = NULL;
843 }
844 
845 static int
846 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
847 			     struct spdk_vhost_session *vsession,
848 			     void *ctx)
849 {
850 	struct spdk_vhost_blk_session *bvsession;
851 
852 	bvsession = (struct spdk_vhost_blk_session *)vsession;
853 	if (bvsession->requestq_poller) {
854 		spdk_poller_unregister(&bvsession->requestq_poller);
855 		bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
856 	}
857 
858 	return 0;
859 }
860 
861 static void
862 bdev_remove_cb(void *remove_ctx)
863 {
864 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
865 
866 	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
867 		     bvdev->vdev.name);
868 
869 	spdk_vhost_lock();
870 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
871 				  vhost_dev_bdev_remove_cpl_cb, NULL);
872 	spdk_vhost_unlock();
873 }
874 
875 static void
876 bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
877 	      void *event_ctx)
878 {
879 	SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n",
880 		      type,
881 		      bdev->name);
882 
883 	switch (type) {
884 	case SPDK_BDEV_EVENT_REMOVE:
885 		SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name);
886 		bdev_remove_cb(event_ctx);
887 		break;
888 	case SPDK_BDEV_EVENT_RESIZE:
889 		SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name);
890 		blk_resize_cb(event_ctx);
891 		break;
892 	default:
893 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
894 		break;
895 	}
896 }
897 
898 static void
899 free_task_pool(struct spdk_vhost_blk_session *bvsession)
900 {
901 	struct spdk_vhost_session *vsession = &bvsession->vsession;
902 	struct spdk_vhost_virtqueue *vq;
903 	uint16_t i;
904 
905 	for (i = 0; i < vsession->max_queues; i++) {
906 		vq = &vsession->virtqueue[i];
907 		if (vq->tasks == NULL) {
908 			continue;
909 		}
910 
911 		spdk_free(vq->tasks);
912 		vq->tasks = NULL;
913 	}
914 }
915 
916 static int
917 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
918 {
919 	struct spdk_vhost_session *vsession = &bvsession->vsession;
920 	struct spdk_vhost_virtqueue *vq;
921 	struct spdk_vhost_blk_task *task;
922 	uint32_t task_cnt;
923 	uint16_t i;
924 	uint32_t j;
925 
926 	for (i = 0; i < vsession->max_queues; i++) {
927 		vq = &vsession->virtqueue[i];
928 		if (vq->vring.desc == NULL) {
929 			continue;
930 		}
931 
932 		task_cnt = vq->vring.size;
933 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
934 			/* sanity check */
935 			SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
936 				    vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
937 			free_task_pool(bvsession);
938 			return -1;
939 		}
940 		vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
941 					 SPDK_CACHE_LINE_SIZE, NULL,
942 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
943 		if (vq->tasks == NULL) {
944 			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
945 				    vsession->name, task_cnt, i);
946 			free_task_pool(bvsession);
947 			return -1;
948 		}
949 
950 		for (j = 0; j < task_cnt; j++) {
951 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
952 			task->bvsession = bvsession;
953 			task->req_idx = j;
954 			task->vq = vq;
955 		}
956 	}
957 
958 	return 0;
959 }
960 
961 static int
962 vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
963 		   struct spdk_vhost_session *vsession, void *unused)
964 {
965 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
966 	struct spdk_vhost_blk_dev *bvdev;
967 	int i, rc = 0;
968 
969 	bvdev = to_blk_dev(vdev);
970 	assert(bvdev != NULL);
971 	bvsession->bvdev = bvdev;
972 
973 	/* validate all I/O queues are in a contiguous index range */
974 	for (i = 0; i < vsession->max_queues; i++) {
975 		/* vring.desc and vring.desc_packed are in a union struct
976 		 * so q->vring.desc can replace q->vring.desc_packed.
977 		 */
978 		if (vsession->virtqueue[i].vring.desc == NULL) {
979 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
980 			rc = -1;
981 			goto out;
982 		}
983 	}
984 
985 	rc = alloc_task_pool(bvsession);
986 	if (rc != 0) {
987 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
988 		goto out;
989 	}
990 
991 	if (bvdev->bdev) {
992 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
993 		if (!bvsession->io_channel) {
994 			free_task_pool(bvsession);
995 			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
996 			rc = -1;
997 			goto out;
998 		}
999 	}
1000 
1001 	bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
1002 				     bvsession, 0);
1003 	SPDK_INFOLOG(vhost, "%s: started poller on lcore %d\n",
1004 		     vsession->name, spdk_env_get_current_core());
1005 out:
1006 	vhost_session_start_done(vsession, rc);
1007 	return rc;
1008 }
1009 
1010 static int
1011 vhost_blk_start(struct spdk_vhost_session *vsession)
1012 {
1013 	return vhost_session_send_event(vsession, vhost_blk_start_cb,
1014 					3, "start session");
1015 }
1016 
1017 static int
1018 destroy_session_poller_cb(void *arg)
1019 {
1020 	struct spdk_vhost_blk_session *bvsession = arg;
1021 	struct spdk_vhost_session *vsession = &bvsession->vsession;
1022 	int i;
1023 
1024 	if (vsession->task_cnt > 0) {
1025 		return SPDK_POLLER_BUSY;
1026 	}
1027 
1028 	if (spdk_vhost_trylock() != 0) {
1029 		return SPDK_POLLER_BUSY;
1030 	}
1031 
1032 	for (i = 0; i < vsession->max_queues; i++) {
1033 		vsession->virtqueue[i].next_event_time = 0;
1034 		vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
1035 	}
1036 
1037 	SPDK_INFOLOG(vhost, "%s: stopping poller on lcore %d\n",
1038 		     vsession->name, spdk_env_get_current_core());
1039 
1040 	if (bvsession->io_channel) {
1041 		spdk_put_io_channel(bvsession->io_channel);
1042 		bvsession->io_channel = NULL;
1043 	}
1044 
1045 	free_task_pool(bvsession);
1046 	spdk_poller_unregister(&bvsession->stop_poller);
1047 	vhost_session_stop_done(vsession, 0);
1048 
1049 	spdk_vhost_unlock();
1050 	return SPDK_POLLER_BUSY;
1051 }
1052 
1053 static int
1054 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
1055 		  struct spdk_vhost_session *vsession, void *unused)
1056 {
1057 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1058 
1059 	spdk_poller_unregister(&bvsession->requestq_poller);
1060 	bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
1061 				 bvsession, 1000);
1062 	return 0;
1063 }
1064 
1065 static int
1066 vhost_blk_stop(struct spdk_vhost_session *vsession)
1067 {
1068 	return vhost_session_send_event(vsession, vhost_blk_stop_cb,
1069 					3, "stop session");
1070 }
1071 
1072 static void
1073 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1074 {
1075 	struct spdk_vhost_blk_dev *bvdev;
1076 
1077 	bvdev = to_blk_dev(vdev);
1078 	assert(bvdev != NULL);
1079 
1080 	spdk_json_write_named_object_begin(w, "block");
1081 
1082 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1083 
1084 	spdk_json_write_name(w, "bdev");
1085 	if (bvdev->bdev) {
1086 		spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev));
1087 	} else {
1088 		spdk_json_write_null(w);
1089 	}
1090 
1091 	spdk_json_write_object_end(w);
1092 }
1093 
1094 static void
1095 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1096 {
1097 	struct spdk_vhost_blk_dev *bvdev;
1098 
1099 	bvdev = to_blk_dev(vdev);
1100 	assert(bvdev != NULL);
1101 
1102 	if (!bvdev->bdev) {
1103 		return;
1104 	}
1105 
1106 	spdk_json_write_object_begin(w);
1107 	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
1108 
1109 	spdk_json_write_named_object_begin(w, "params");
1110 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
1111 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
1112 	spdk_json_write_named_string(w, "cpumask",
1113 				     spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
1114 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1115 	spdk_json_write_object_end(w);
1116 
1117 	spdk_json_write_object_end(w);
1118 }
1119 
/* Declared ahead of its definition because the backend table below refers to it. */
static int vhost_blk_destroy(struct spdk_vhost_dev *vdev);
1121 
1122 static int
1123 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
1124 		     uint32_t len)
1125 {
1126 	struct virtio_blk_config blkcfg;
1127 	struct spdk_vhost_blk_dev *bvdev;
1128 	struct spdk_bdev *bdev;
1129 	uint32_t blk_size;
1130 	uint64_t blkcnt;
1131 
1132 	memset(&blkcfg, 0, sizeof(blkcfg));
1133 	bvdev = to_blk_dev(vdev);
1134 	assert(bvdev != NULL);
1135 	bdev = bvdev->bdev;
1136 	if (bdev == NULL) {
1137 		/* We can't just return -1 here as this GET_CONFIG message might
1138 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
1139 		 * error to QEMU, who might then decide to terminate itself.
1140 		 * We don't want that. A simple reboot shouldn't break the system.
1141 		 *
1142 		 * Presenting a block device with block size 0 and block count 0
1143 		 * doesn't cause any problems on QEMU side and the virtio-pci
1144 		 * device is even still available inside the VM, but there will
1145 		 * be no block device created for it - the kernel drivers will
1146 		 * silently reject it.
1147 		 */
1148 		blk_size = 0;
1149 		blkcnt = 0;
1150 	} else {
1151 		blk_size = spdk_bdev_get_block_size(bdev);
1152 		blkcnt = spdk_bdev_get_num_blocks(bdev);
1153 		if (spdk_bdev_get_buf_align(bdev) > 1) {
1154 			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
1155 			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
1156 		} else {
1157 			blkcfg.size_max = 131072;
1158 			/*  -2 for REQ and RESP and -1 for region boundary splitting */
1159 			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
1160 		}
1161 	}
1162 
1163 	blkcfg.blk_size = blk_size;
1164 	/* minimum I/O size in blocks */
1165 	blkcfg.min_io_size = 1;
1166 	/* expressed in 512 Bytes sectors */
1167 	blkcfg.capacity = (blkcnt * blk_size) / 512;
1168 	/* QEMU can overwrite this value when started */
1169 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
1170 
1171 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1172 		/* 16MiB, expressed in 512 Bytes */
1173 		blkcfg.max_discard_sectors = 32768;
1174 		blkcfg.max_discard_seg = 1;
1175 		blkcfg.discard_sector_alignment = blk_size / 512;
1176 	}
1177 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1178 		blkcfg.max_write_zeroes_sectors = 32768;
1179 		blkcfg.max_write_zeroes_seg = 1;
1180 	}
1181 
1182 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
1183 
1184 	return 0;
1185 }
1186 
1187 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
1188 	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
1189 	.start_session =  vhost_blk_start,
1190 	.stop_session = vhost_blk_stop,
1191 	.vhost_get_config = vhost_blk_get_config,
1192 	.dump_info_json = vhost_blk_dump_info_json,
1193 	.write_config_json = vhost_blk_write_config_json,
1194 	.remove_device = vhost_blk_destroy,
1195 };
1196 
/*
 * Create a vhost-blk controller for every configuration section whose name
 * starts with "VhostBlk".
 *
 * Each such section must carry a numeric suffix and a "Name" value.
 * "Cpumask", "ReadOnly" (default false) and "PackedRing" (default false)
 * are optional. A section without a "Dev" value is skipped.
 *
 * Returns 0 when all matching sections were processed, -1 on the first
 * malformed section or failed controller construction.
 */
int
vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *sp;

	for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
		unsigned ctrlr_num;
		char *name;
		char *cpumask;
		char *bdev_name;
		bool readonly;
		bool packed_ring;

		if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
			continue;
		}

		if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
				    spdk_conf_section_get_name(sp));
			return -1;
		}

		name = spdk_conf_section_get_val(sp, "Name");
		if (name == NULL) {
			SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
			return -1;
		}

		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
		readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
		packed_ring = spdk_conf_section_get_boolval(sp, "PackedRing", false);

		/* A section without a backing device is not an error; it is ignored. */
		bdev_name = spdk_conf_section_get_val(sp, "Dev");
		if (bdev_name != NULL &&
		    spdk_vhost_blk_construct(name, cpumask, bdev_name,
					     readonly, packed_ring) < 0) {
			return -1;
		}
	}

	return 0;
}
1242 
1243 int
1244 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
1245 			 bool readonly, bool packed_ring)
1246 {
1247 	struct spdk_vhost_blk_dev *bvdev = NULL;
1248 	struct spdk_vhost_dev *vdev;
1249 	struct spdk_bdev *bdev;
1250 	int ret = 0;
1251 
1252 	spdk_vhost_lock();
1253 
1254 	bvdev = calloc(1, sizeof(*bvdev));
1255 	if (bvdev == NULL) {
1256 		ret = -ENOMEM;
1257 		goto out;
1258 	}
1259 
1260 	ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc);
1261 	if (ret != 0) {
1262 		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
1263 			    name, dev_name, ret);
1264 		goto out;
1265 	}
1266 	bdev = spdk_bdev_desc_get_bdev(bvdev->bdev_desc);
1267 
1268 	vdev = &bvdev->vdev;
1269 	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
1270 	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
1271 	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
1272 
1273 	vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED;
1274 
1275 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1276 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1277 	}
1278 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1279 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1280 	}
1281 	if (readonly) {
1282 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
1283 	}
1284 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1285 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1286 	}
1287 
1288 	/*
1289 	 * When starting qemu with vhost-user-blk multiqueue, the vhost device will
1290 	 * be started/stopped many times, related to the queues num, as the
1291 	 * vhost-user backend doesn't know the exact number of queues used for this
1292 	 * device. The target have to stop and start the device once got a valid
1293 	 * IO queue.
1294 	 * When stoping and starting the vhost device, the backend bdev io device
1295 	 * will be deleted and created repeatedly.
1296 	 * Hold a bdev reference so that in the struct spdk_vhost_blk_dev, so that
1297 	 * the io device will not be deleted.
1298 	 */
1299 	bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
1300 
1301 	bvdev->bdev = bdev;
1302 	bvdev->readonly = readonly;
1303 	ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
1304 	if (ret != 0) {
1305 		spdk_put_io_channel(bvdev->dummy_io_channel);
1306 		spdk_bdev_close(bvdev->bdev_desc);
1307 		goto out;
1308 	}
1309 
1310 	SPDK_INFOLOG(vhost, "%s: using bdev '%s'\n", name, dev_name);
1311 out:
1312 	if (ret != 0 && bvdev) {
1313 		free(bvdev);
1314 	}
1315 	spdk_vhost_unlock();
1316 	return ret;
1317 }
1318 
1319 static int
1320 vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1321 {
1322 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1323 	int rc;
1324 
1325 	assert(bvdev != NULL);
1326 
1327 	rc = vhost_dev_unregister(&bvdev->vdev);
1328 	if (rc != 0) {
1329 		return rc;
1330 	}
1331 
1332 	/* if the bdev is removed, don't need call spdk_put_io_channel. */
1333 	if (bvdev->bdev) {
1334 		spdk_put_io_channel(bvdev->dummy_io_channel);
1335 	}
1336 
1337 	if (bvdev->bdev_desc) {
1338 		spdk_bdev_close(bvdev->bdev_desc);
1339 		bvdev->bdev_desc = NULL;
1340 	}
1341 	bvdev->bdev = NULL;
1342 
1343 	free(bvdev);
1344 	return 0;
1345 }
1346 
/* Log components registered by this file: "vhost_blk" and "vhost_blk_data". */
SPDK_LOG_REGISTER_COMPONENT(vhost_blk)
SPDK_LOG_REGISTER_COMPONENT(vhost_blk_data)
1349