xref: /spdk/lib/vhost/vhost_blk.c (revision 6569a0ea0630b45bb83582ac2893cda584375dc4)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/conf.h"
40 #include "spdk/thread.h"
41 #include "spdk/likely.h"
42 #include "spdk/string.h"
43 #include "spdk/util.h"
44 #include "spdk/vhost.h"
45 
46 #include "vhost_internal.h"
47 
/*
 * Minimal set of features supported by every SPDK VHOST-BLK device.
 *
 * NOTE(review): GEOMETRY, CONFIG_WCE, BARRIER and SCSI are listed here and are also
 * listed in SPDK_VHOST_BLK_DISABLED_FEATURES below. How the two masks are combined
 * is decided by code outside this section - confirm before relying on either mask
 * on its own.
 */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
		(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
		(1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER)  | \
		(1ULL << VIRTIO_BLK_F_SCSI)     | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_MQ))

/* Features that are not supported (generic vhost ones plus the blk-specific bits below). */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI))

/*
 * Vhost-user protocol features supported by vhost-blk.
 * INFLIGHT_SHMFD is only included when SPDK_CONFIG_VHOST_INTERNAL_LIB is not defined.
 */
#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
#else
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
#endif
68 
/*
 * Per-request state. One array of these is allocated per virtqueue (see
 * alloc_task_pool()) with one entry per ring slot.
 */
struct spdk_vhost_blk_task {
	struct spdk_bdev_io *bdev_io;
	/* Owning session; set once when the task pool is allocated. */
	struct spdk_vhost_blk_session *bvsession;
	/* Virtqueue this task belongs to; set once when the task pool is allocated. */
	struct spdk_vhost_virtqueue *vq;

	/*
	 * Points at the one-byte status buffer of the request (the last iovec), or NULL
	 * until process_blk_request() has validated the request layout.
	 */
	volatile uint8_t *status;

	/* Index of the request's descriptor in the ring. */
	uint16_t req_idx;
	/* Packed ring only: value reported by vhost_vring_packed_desc_get_buffer_id(). */
	uint16_t num_descs;
	/* Packed ring only: buffer id, which is also this task's index in vq->tasks. */
	uint16_t buffer_id;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written. */
	uint32_t used_len;
	/*
	 * Capacity of iovs[] after blk_task_init(); number of filled entries after the
	 * iov setup; number of payload entries once process_blk_request() has removed
	 * the header and status entries from the count.
	 */
	uint16_t iovcnt;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
91 
/* A vhost-blk device: the generic vhost device plus the bdev it exposes. */
struct spdk_vhost_blk_dev {
	/* Generic vhost device; to_blk_dev() recovers this struct from a pointer to it. */
	struct spdk_vhost_dev vdev;
	/* Backing bdev; set to NULL once the bdev has been hot-removed. */
	struct spdk_bdev *bdev;
	/* Open descriptor on the bdev; closed and set to NULL on hot-remove. */
	struct spdk_bdev_desc *bdev_desc;
	/* When set, VIRTIO_BLK_T_OUT requests are failed instead of written. */
	bool readonly;
};
98 
/* Per-connection state of a vhost-blk device. */
struct spdk_vhost_blk_session {
	/* The parent session must be the very first field in this struct */
	struct spdk_vhost_session vsession;
	/* Device this session belongs to; assigned when the session is started. */
	struct spdk_vhost_blk_dev *bvdev;
	/* Poller running vdev_worker() or, without a bdev, no_bdev_vdev_worker(). */
	struct spdk_poller *requestq_poller;
	/* bdev I/O channel used for all submissions of this session; NULL when released. */
	struct spdk_io_channel *io_channel;
	/* Poller running destroy_session_poller_cb() while the session is being stopped. */
	struct spdk_poller *stop_poller;
};
107 
108 /* forward declaration */
109 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
110 
111 static int
112 process_blk_request(struct spdk_vhost_blk_task *task,
113 		    struct spdk_vhost_blk_session *bvsession,
114 		    struct spdk_vhost_virtqueue *vq);
115 
116 static void
117 blk_task_finish(struct spdk_vhost_blk_task *task)
118 {
119 	assert(task->bvsession->vsession.task_cnt > 0);
120 	task->bvsession->vsession.task_cnt--;
121 	task->used = false;
122 }
123 
124 static void
125 blk_task_init(struct spdk_vhost_blk_task *task)
126 {
127 	task->used = true;
128 	task->iovcnt = SPDK_COUNTOF(task->iovs);
129 	task->status = NULL;
130 	task->used_len = 0;
131 }
132 
133 static void
134 blk_task_enqueue(struct spdk_vhost_blk_task *task)
135 {
136 	if (task->vq->packed.packed_ring) {
137 		vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq,
138 					     task->num_descs,
139 					     task->buffer_id, task->used_len);
140 	} else {
141 		vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq,
142 					   task->req_idx, task->used_len);
143 	}
144 }
145 
146 static void
147 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
148 {
149 	if (task->status) {
150 		*task->status = status;
151 	}
152 
153 	blk_task_enqueue(task);
154 	blk_task_finish(task);
155 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
156 }
157 
158 /*
159  * Process task's descriptor chain and setup data related fields.
160  * Return
161  *   total size of suplied buffers
162  *
163  *   FIXME: Make this function return to rd_cnt and wr_cnt
164  */
165 static int
166 blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
167 			   struct spdk_vhost_virtqueue *vq,
168 			   uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
169 {
170 	struct spdk_vhost_session *vsession = &bvsession->vsession;
171 	struct spdk_vhost_dev *vdev = vsession->vdev;
172 	struct vring_desc *desc, *desc_table;
173 	uint16_t out_cnt = 0, cnt = 0;
174 	uint32_t desc_table_size, len = 0;
175 	uint32_t desc_handled_cnt;
176 	int rc;
177 
178 	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
179 	if (rc != 0) {
180 		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
181 		return -1;
182 	}
183 
184 	desc_handled_cnt = 0;
185 	while (1) {
186 		/*
187 		 * Maximum cnt reached?
188 		 * Should not happen if request is well formatted, otherwise this is a BUG.
189 		 */
190 		if (spdk_unlikely(cnt == *iovs_cnt)) {
191 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
192 				      vsession->name, req_idx);
193 			return -1;
194 		}
195 
196 		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
197 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
198 				      vsession->name, req_idx, cnt);
199 			return -1;
200 		}
201 
202 		len += desc->len;
203 
204 		out_cnt += vhost_vring_desc_is_wr(desc);
205 
206 		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
207 		if (rc != 0) {
208 			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
209 				    vsession->name, req_idx);
210 			return -1;
211 		} else if (desc == NULL) {
212 			break;
213 		}
214 
215 		desc_handled_cnt++;
216 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
217 			/* Break a cycle and report an error, if any. */
218 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
219 				    vsession->name, desc_table_size, desc_handled_cnt);
220 			return -1;
221 		}
222 	}
223 
224 	/*
225 	 * There must be least two descriptors.
226 	 * First contain request so it must be readable.
227 	 * Last descriptor contain buffer for response so it must be writable.
228 	 */
229 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
230 		return -1;
231 	}
232 
233 	*length = len;
234 	*iovs_cnt = cnt;
235 	return 0;
236 }
237 
238 static int
239 blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
240 			    struct spdk_vhost_virtqueue *vq,
241 			    uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
242 {
243 	struct spdk_vhost_session *vsession = &bvsession->vsession;
244 	struct spdk_vhost_dev *vdev = vsession->vdev;
245 	struct vring_packed_desc *desc = NULL, *desc_table;
246 	uint16_t out_cnt = 0, cnt = 0;
247 	uint32_t desc_table_size, len = 0;
248 	int rc = 0;
249 
250 	rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc,
251 				      &desc_table, &desc_table_size);
252 	if (spdk_unlikely(rc != 0)) {
253 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
254 		return rc;
255 	}
256 
257 	if (desc_table != NULL) {
258 		req_idx = 0;
259 	}
260 
261 	while (1) {
262 		/*
263 		 * Maximum cnt reached?
264 		 * Should not happen if request is well formatted, otherwise this is a BUG.
265 		 */
266 		if (spdk_unlikely(cnt == *iovs_cnt)) {
267 			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
268 				    vsession->name, req_idx);
269 			return -EINVAL;
270 		}
271 
272 		if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
273 			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
274 				    vsession->name, req_idx, cnt);
275 			return -EINVAL;
276 		}
277 
278 		len += desc->len;
279 		out_cnt += vhost_vring_packed_desc_is_wr(desc);
280 
281 		/* desc is NULL means we reach the last desc of this request */
282 		vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
283 		if (desc == NULL) {
284 			break;
285 		}
286 	}
287 
288 	/*
289 	 * There must be least two descriptors.
290 	 * First contain request so it must be readable.
291 	 * Last descriptor contain buffer for response so it must be writable.
292 	 */
293 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
294 		return -EINVAL;
295 	}
296 
297 	*length = len;
298 	*iovs_cnt = cnt;
299 
300 	return 0;
301 }
302 
303 static void
304 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
305 {
306 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
307 
308 	blk_task_enqueue(task);
309 
310 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
311 		      task->req_idx, success ? "OK" : "FAIL");
312 	blk_task_finish(task);
313 }
314 
/*
 * bdev completion callback: release the bdev I/O and finish the vhost task
 * that was passed as the callback argument.
 */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
	blk_request_finish(success, (struct spdk_vhost_blk_task *)cb_arg);
}
323 
324 static void
325 blk_request_resubmit(void *arg)
326 {
327 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
328 	int rc = 0;
329 
330 	blk_task_init(task);
331 
332 	rc = process_blk_request(task, task->bvsession, task->vq);
333 	if (rc == 0) {
334 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
335 	} else {
336 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
337 	}
338 }
339 
340 static inline void
341 blk_request_queue_io(struct spdk_vhost_blk_task *task)
342 {
343 	int rc;
344 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
345 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
346 
347 	task->bdev_io_wait.bdev = bdev;
348 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
349 	task->bdev_io_wait.cb_arg = task;
350 
351 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
352 	if (rc != 0) {
353 		SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
354 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
355 	}
356 }
357 
358 static int
359 process_blk_request(struct spdk_vhost_blk_task *task,
360 		    struct spdk_vhost_blk_session *bvsession,
361 		    struct spdk_vhost_virtqueue *vq)
362 {
363 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
364 	const struct virtio_blk_outhdr *req;
365 	struct virtio_blk_discard_write_zeroes *desc;
366 	struct iovec *iov;
367 	uint32_t type;
368 	uint32_t payload_len;
369 	uint64_t flush_bytes;
370 	int rc;
371 
372 	if (vq->packed.packed_ring) {
373 		rc = blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
374 						 &payload_len);
375 	} else {
376 		rc = blk_iovs_split_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
377 						&payload_len);
378 	}
379 
380 	if (rc) {
381 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
382 		/* Only READ and WRITE are supported for now. */
383 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
384 		return -1;
385 	}
386 
387 	iov = &task->iovs[0];
388 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
389 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
390 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
391 			      iov->iov_len, sizeof(*req), task->req_idx);
392 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
393 		return -1;
394 	}
395 
396 	req = iov->iov_base;
397 
398 	iov = &task->iovs[task->iovcnt - 1];
399 	if (spdk_unlikely(iov->iov_len != 1)) {
400 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
401 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
402 			      iov->iov_len, 1, task->req_idx);
403 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
404 		return -1;
405 	}
406 
407 	task->status = iov->iov_base;
408 	payload_len -= sizeof(*req) + sizeof(*task->status);
409 	task->iovcnt -= 2;
410 
411 	type = req->type;
412 #ifdef VIRTIO_BLK_T_BARRIER
413 	/* Don't care about barier for now (as QEMU's virtio-blk do). */
414 	type &= ~VIRTIO_BLK_T_BARRIER;
415 #endif
416 
417 	switch (type) {
418 	case VIRTIO_BLK_T_IN:
419 	case VIRTIO_BLK_T_OUT:
420 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
421 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
422 				    type ? "WRITE" : "READ", task->req_idx);
423 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
424 			return -1;
425 		}
426 
427 		if (type == VIRTIO_BLK_T_IN) {
428 			task->used_len = payload_len + sizeof(*task->status);
429 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
430 					     &task->iovs[1], task->iovcnt, req->sector * 512,
431 					     payload_len, blk_request_complete_cb, task);
432 		} else if (!bvdev->readonly) {
433 			task->used_len = sizeof(*task->status);
434 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
435 					      &task->iovs[1], task->iovcnt, req->sector * 512,
436 					      payload_len, blk_request_complete_cb, task);
437 		} else {
438 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
439 			rc = -1;
440 		}
441 
442 		if (rc) {
443 			if (rc == -ENOMEM) {
444 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
445 				blk_request_queue_io(task);
446 			} else {
447 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
448 				return -1;
449 			}
450 		}
451 		break;
452 	case VIRTIO_BLK_T_DISCARD:
453 		desc = task->iovs[1].iov_base;
454 		if (payload_len != sizeof(*desc)) {
455 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
456 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
457 			return -1;
458 		}
459 
460 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
461 				     desc->sector * 512, desc->num_sectors * 512,
462 				     blk_request_complete_cb, task);
463 		if (rc) {
464 			if (rc == -ENOMEM) {
465 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
466 				blk_request_queue_io(task);
467 			} else {
468 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
469 				return -1;
470 			}
471 		}
472 		break;
473 	case VIRTIO_BLK_T_WRITE_ZEROES:
474 		desc = task->iovs[1].iov_base;
475 		if (payload_len != sizeof(*desc)) {
476 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
477 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
478 			return -1;
479 		}
480 
481 		/* Zeroed and Unmap the range, SPDK doen't support it. */
482 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
483 			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
484 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
485 			return -1;
486 		}
487 
488 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
489 					    desc->sector * 512, desc->num_sectors * 512,
490 					    blk_request_complete_cb, task);
491 		if (rc) {
492 			if (rc == -ENOMEM) {
493 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
494 				blk_request_queue_io(task);
495 			} else {
496 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
497 				return -1;
498 			}
499 		}
500 		break;
501 	case VIRTIO_BLK_T_FLUSH:
502 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
503 		if (req->sector != 0) {
504 			SPDK_NOTICELOG("sector must be zero for flush command\n");
505 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
506 			return -1;
507 		}
508 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
509 				     0, flush_bytes,
510 				     blk_request_complete_cb, task);
511 		if (rc) {
512 			if (rc == -ENOMEM) {
513 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
514 				blk_request_queue_io(task);
515 			} else {
516 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
517 				return -1;
518 			}
519 		}
520 		break;
521 	case VIRTIO_BLK_T_GET_ID:
522 		if (!task->iovcnt || !payload_len) {
523 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
524 			return -1;
525 		}
526 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
527 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
528 				task->used_len, ' ');
529 		blk_request_finish(true, task);
530 		break;
531 	default:
532 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
533 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
534 		return -1;
535 	}
536 
537 	return 0;
538 }
539 
540 static void
541 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
542 {
543 	struct spdk_vhost_blk_task *task;
544 	uint16_t task_idx = req_idx, num_descs;
545 
546 	if (vq->packed.packed_ring) {
547 		/* Packed ring used the buffer_id as the task_idx to get task struct.
548 		 * In kernel driver, it uses the vq->free_head to set the buffer_id so the value
549 		 * must be in the range of 0 ~ vring.size. The free_head value must be unique
550 		 * in the outstanding requests.
551 		 * We can't use the req_idx as the task_idx because the desc can be reused in
552 		 * the next phase even when it's not completed in the previous phase. For example,
553 		 * At phase 0, last_used_idx was 2 and desc0 was not completed.Then after moving
554 		 * phase 1, last_avail_idx is updated to 1. In this case, req_idx can not be used
555 		 * as task_idx because we will know task[0]->used is true at phase 1.
556 		 * The split queue is quite different, the desc would insert into the free list when
557 		 * device completes the request, the driver gets the desc from the free list which
558 		 * ensures the req_idx is unique in the outstanding requests.
559 		 */
560 		task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
561 	}
562 
563 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
564 	if (spdk_unlikely(task->used)) {
565 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
566 			    task->bvsession->vsession.name, task_idx);
567 		task->used_len = 0;
568 		blk_task_enqueue(task);
569 		return;
570 	}
571 
572 	if (vq->packed.packed_ring) {
573 		task->req_idx = req_idx;
574 		task->num_descs = num_descs;
575 		task->buffer_id = task_idx;
576 	}
577 
578 	task->bvsession->vsession.task_cnt++;
579 
580 	blk_task_init(task);
581 
582 	if (process_blk_request(task, task->bvsession, vq) == 0) {
583 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
584 			      task_idx);
585 	} else {
586 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
587 	}
588 }
589 
590 static void
591 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
592 		     struct spdk_vhost_virtqueue *vq)
593 {
594 	struct spdk_vhost_session *vsession = &bvsession->vsession;
595 	spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight;
596 	spdk_vhost_resubmit_desc *resubmit_list;
597 	uint16_t req_idx;
598 
599 	if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) {
600 		return;
601 	}
602 
603 	resubmit_list = resubmit->resubmit_list;
604 	while (resubmit->resubmit_num-- > 0) {
605 		req_idx = resubmit_list[resubmit->resubmit_num].index;
606 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n",
607 			      req_idx);
608 
609 		if (spdk_unlikely(req_idx >= vq->vring.size)) {
610 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
611 				    vsession->name, req_idx, vq->vring.size);
612 			vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
613 			continue;
614 		}
615 
616 		process_blk_task(vq, req_idx);
617 	}
618 
619 	free(resubmit_list);
620 	resubmit->resubmit_list = NULL;
621 }
622 
623 static void
624 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
625 {
626 	struct spdk_vhost_session *vsession = &bvsession->vsession;
627 	uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
628 	uint16_t reqs_cnt, i;
629 
630 	submit_inflight_desc(bvsession, vq);
631 
632 	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
633 	if (!reqs_cnt) {
634 		return;
635 	}
636 
637 	for (i = 0; i < reqs_cnt; i++) {
638 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
639 			      reqs[i]);
640 
641 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
642 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
643 				    vsession->name, reqs[i], vq->vring.size);
644 			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
645 			continue;
646 		}
647 
648 		rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]);
649 
650 		process_blk_task(vq, reqs[i]);
651 	}
652 }
653 
654 static void
655 process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
656 {
657 	uint16_t i = 0;
658 
659 	while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS &&
660 	       vhost_vq_packed_ring_is_avail(vq)) {
661 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
662 			      vq->last_avail_idx);
663 
664 		process_blk_task(vq, vq->last_avail_idx);
665 	}
666 }
667 
668 static int
669 vdev_worker(void *arg)
670 {
671 	struct spdk_vhost_blk_session *bvsession = arg;
672 	struct spdk_vhost_session *vsession = &bvsession->vsession;
673 
674 	uint16_t q_idx;
675 	bool packed_ring;
676 
677 	/* In a session, every vq supports the same format */
678 	packed_ring = vsession->virtqueue[0].packed.packed_ring;
679 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
680 		if (packed_ring) {
681 			process_packed_vq(bvsession, &vsession->virtqueue[q_idx]);
682 		} else {
683 			process_vq(bvsession, &vsession->virtqueue[q_idx]);
684 		}
685 	}
686 
687 	vhost_session_used_signal(vsession);
688 
689 	return -1;
690 }
691 
692 static void
693 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
694 {
695 	struct spdk_vhost_session *vsession = &bvsession->vsession;
696 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
697 	uint32_t length;
698 	uint16_t iovcnt, req_idx;
699 
700 	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
701 		return;
702 	}
703 
704 	iovcnt = SPDK_COUNTOF(iovs);
705 	if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
706 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
707 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
708 	}
709 
710 	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
711 }
712 
713 static void
714 no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
715 {
716 	struct spdk_vhost_session *vsession = &bvsession->vsession;
717 	struct spdk_vhost_blk_task *task;
718 	uint32_t length;
719 	uint16_t req_idx = vq->last_avail_idx;
720 	uint16_t task_idx, num_descs;
721 
722 	if (!vhost_vq_packed_ring_is_avail(vq)) {
723 		return;
724 	}
725 
726 	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
727 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
728 	if (spdk_unlikely(task->used)) {
729 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
730 			    vsession->name, req_idx);
731 		vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
732 					     task->buffer_id, task->used_len);
733 		return;
734 	}
735 
736 	task->req_idx = req_idx;
737 	task->num_descs = num_descs;
738 	task->buffer_id = task_idx;
739 	blk_task_init(task);
740 
741 	if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
742 					&length)) {
743 		*(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
744 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
745 	}
746 
747 	task->used = false;
748 	vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
749 				     task->buffer_id, task->used_len);
750 }
751 
752 static int
753 no_bdev_vdev_worker(void *arg)
754 {
755 	struct spdk_vhost_blk_session *bvsession = arg;
756 	struct spdk_vhost_session *vsession = &bvsession->vsession;
757 	uint16_t q_idx;
758 	bool packed_ring;
759 
760 	/* In a session, every vq supports the same format */
761 	packed_ring = vsession->virtqueue[0].packed.packed_ring;
762 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
763 		if (packed_ring) {
764 			no_bdev_process_packed_vq(bvsession, &vsession->virtqueue[q_idx]);
765 		} else {
766 			no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
767 		}
768 	}
769 
770 	vhost_session_used_signal(vsession);
771 
772 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
773 		spdk_put_io_channel(bvsession->io_channel);
774 		bvsession->io_channel = NULL;
775 	}
776 
777 	return -1;
778 }
779 
780 static struct spdk_vhost_blk_session *
781 to_blk_session(struct spdk_vhost_session *vsession)
782 {
783 	assert(vsession->vdev->backend == &vhost_blk_device_backend);
784 	return (struct spdk_vhost_blk_session *)vsession;
785 }
786 
787 static struct spdk_vhost_blk_dev *
788 to_blk_dev(struct spdk_vhost_dev *vdev)
789 {
790 	if (vdev == NULL) {
791 		return NULL;
792 	}
793 
794 	if (vdev->backend != &vhost_blk_device_backend) {
795 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
796 		return NULL;
797 	}
798 
799 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
800 }
801 
802 struct spdk_bdev *
803 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
804 {
805 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
806 
807 	assert(bvdev != NULL);
808 	return bvdev->bdev;
809 }
810 
811 static void
812 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
813 {
814 
815 	/* All sessions have been notified, time to close the bdev */
816 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
817 
818 	assert(bvdev != NULL);
819 	spdk_bdev_close(bvdev->bdev_desc);
820 	bvdev->bdev_desc = NULL;
821 	bvdev->bdev = NULL;
822 }
823 
824 static int
825 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
826 			     struct spdk_vhost_session *vsession,
827 			     void *ctx)
828 {
829 	struct spdk_vhost_blk_session *bvsession;
830 
831 	bvsession = (struct spdk_vhost_blk_session *)vsession;
832 	if (bvsession->requestq_poller) {
833 		spdk_poller_unregister(&bvsession->requestq_poller);
834 		bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
835 	}
836 
837 	return 0;
838 }
839 
840 static void
841 bdev_remove_cb(void *remove_ctx)
842 {
843 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
844 
845 	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
846 		     bvdev->vdev.name);
847 
848 	spdk_vhost_lock();
849 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
850 				  vhost_dev_bdev_remove_cpl_cb, NULL);
851 	spdk_vhost_unlock();
852 }
853 
854 static void
855 free_task_pool(struct spdk_vhost_blk_session *bvsession)
856 {
857 	struct spdk_vhost_session *vsession = &bvsession->vsession;
858 	struct spdk_vhost_virtqueue *vq;
859 	uint16_t i;
860 
861 	for (i = 0; i < vsession->max_queues; i++) {
862 		vq = &vsession->virtqueue[i];
863 		if (vq->tasks == NULL) {
864 			continue;
865 		}
866 
867 		spdk_free(vq->tasks);
868 		vq->tasks = NULL;
869 	}
870 }
871 
872 static int
873 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
874 {
875 	struct spdk_vhost_session *vsession = &bvsession->vsession;
876 	struct spdk_vhost_virtqueue *vq;
877 	struct spdk_vhost_blk_task *task;
878 	uint32_t task_cnt;
879 	uint16_t i;
880 	uint32_t j;
881 
882 	for (i = 0; i < vsession->max_queues; i++) {
883 		vq = &vsession->virtqueue[i];
884 		if (vq->vring.desc == NULL) {
885 			continue;
886 		}
887 
888 		task_cnt = vq->vring.size;
889 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
890 			/* sanity check */
891 			SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
892 				    vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
893 			free_task_pool(bvsession);
894 			return -1;
895 		}
896 		vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
897 					 SPDK_CACHE_LINE_SIZE, NULL,
898 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
899 		if (vq->tasks == NULL) {
900 			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
901 				    vsession->name, task_cnt, i);
902 			free_task_pool(bvsession);
903 			return -1;
904 		}
905 
906 		for (j = 0; j < task_cnt; j++) {
907 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
908 			task->bvsession = bvsession;
909 			task->req_idx = j;
910 			task->vq = vq;
911 		}
912 	}
913 
914 	return 0;
915 }
916 
917 static int
918 vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
919 		   struct spdk_vhost_session *vsession, void *unused)
920 {
921 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
922 	struct spdk_vhost_blk_dev *bvdev;
923 	int i, rc = 0;
924 
925 	bvdev = to_blk_dev(vdev);
926 	assert(bvdev != NULL);
927 	bvsession->bvdev = bvdev;
928 
929 	/* validate all I/O queues are in a contiguous index range */
930 	for (i = 0; i < vsession->max_queues; i++) {
931 		/* vring.desc and vring.desc_packed are in a union struct
932 		 * so q->vring.desc can replace q->vring.desc_packed.
933 		 */
934 		if (vsession->virtqueue[i].vring.desc == NULL) {
935 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
936 			rc = -1;
937 			goto out;
938 		}
939 	}
940 
941 	rc = alloc_task_pool(bvsession);
942 	if (rc != 0) {
943 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
944 		goto out;
945 	}
946 
947 	if (bvdev->bdev) {
948 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
949 		if (!bvsession->io_channel) {
950 			free_task_pool(bvsession);
951 			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
952 			rc = -1;
953 			goto out;
954 		}
955 	}
956 
957 	bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
958 				     bvsession, 0);
959 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
960 		     vsession->name, spdk_env_get_current_core());
961 out:
962 	vhost_session_start_done(vsession, rc);
963 	return rc;
964 }
965 
966 static int
967 vhost_blk_start(struct spdk_vhost_session *vsession)
968 {
969 	return vhost_session_send_event(vsession, vhost_blk_start_cb,
970 					3, "start session");
971 }
972 
973 static int
974 destroy_session_poller_cb(void *arg)
975 {
976 	struct spdk_vhost_blk_session *bvsession = arg;
977 	struct spdk_vhost_session *vsession = &bvsession->vsession;
978 	int i;
979 
980 	if (vsession->task_cnt > 0) {
981 		return -1;
982 	}
983 
984 	if (spdk_vhost_trylock() != 0) {
985 		return -1;
986 	}
987 
988 	for (i = 0; i < vsession->max_queues; i++) {
989 		vsession->virtqueue[i].next_event_time = 0;
990 		vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
991 	}
992 
993 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
994 		     vsession->name, spdk_env_get_current_core());
995 
996 	if (bvsession->io_channel) {
997 		spdk_put_io_channel(bvsession->io_channel);
998 		bvsession->io_channel = NULL;
999 	}
1000 
1001 	free_task_pool(bvsession);
1002 	spdk_poller_unregister(&bvsession->stop_poller);
1003 	vhost_session_stop_done(vsession, 0);
1004 
1005 	spdk_vhost_unlock();
1006 	return -1;
1007 }
1008 
1009 static int
1010 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
1011 		  struct spdk_vhost_session *vsession, void *unused)
1012 {
1013 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
1014 
1015 	spdk_poller_unregister(&bvsession->requestq_poller);
1016 	bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
1017 				 bvsession, 1000);
1018 	return 0;
1019 }
1020 
1021 static int
1022 vhost_blk_stop(struct spdk_vhost_session *vsession)
1023 {
1024 	return vhost_session_send_event(vsession, vhost_blk_stop_cb,
1025 					3, "stop session");
1026 }
1027 
1028 static void
1029 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1030 {
1031 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
1032 	struct spdk_vhost_blk_dev *bvdev;
1033 
1034 	bvdev = to_blk_dev(vdev);
1035 	assert(bvdev != NULL);
1036 	spdk_json_write_named_object_begin(w, "block");
1037 
1038 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1039 
1040 	spdk_json_write_name(w, "bdev");
1041 	if (bdev) {
1042 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
1043 	} else {
1044 		spdk_json_write_null(w);
1045 	}
1046 
1047 	spdk_json_write_object_end(w);
1048 }
1049 
1050 static void
1051 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1052 {
1053 	struct spdk_vhost_blk_dev *bvdev;
1054 
1055 	bvdev = to_blk_dev(vdev);
1056 	assert(bvdev != NULL);
1057 	if (!bvdev->bdev) {
1058 		return;
1059 	}
1060 
1061 	spdk_json_write_object_begin(w);
1062 	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
1063 
1064 	spdk_json_write_named_object_begin(w, "params");
1065 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
1066 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
1067 	spdk_json_write_named_string(w, "cpumask",
1068 				     spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
1069 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
1070 	spdk_json_write_object_end(w);
1071 
1072 	spdk_json_write_object_end(w);
1073 }
1074 
1075 static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
1076 
1077 static int
1078 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
1079 		     uint32_t len)
1080 {
1081 	struct virtio_blk_config blkcfg;
1082 	struct spdk_vhost_blk_dev *bvdev;
1083 	struct spdk_bdev *bdev;
1084 	uint32_t blk_size;
1085 	uint64_t blkcnt;
1086 
1087 	memset(&blkcfg, 0, sizeof(blkcfg));
1088 	bvdev = to_blk_dev(vdev);
1089 	assert(bvdev != NULL);
1090 	bdev = bvdev->bdev;
1091 	if (bdev == NULL) {
1092 		/* We can't just return -1 here as this GET_CONFIG message might
1093 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
1094 		 * error to QEMU, who might then decide to terminate itself.
1095 		 * We don't want that. A simple reboot shouldn't break the system.
1096 		 *
1097 		 * Presenting a block device with block size 0 and block count 0
1098 		 * doesn't cause any problems on QEMU side and the virtio-pci
1099 		 * device is even still available inside the VM, but there will
1100 		 * be no block device created for it - the kernel drivers will
1101 		 * silently reject it.
1102 		 */
1103 		blk_size = 0;
1104 		blkcnt = 0;
1105 	} else {
1106 		blk_size = spdk_bdev_get_block_size(bdev);
1107 		blkcnt = spdk_bdev_get_num_blocks(bdev);
1108 		if (spdk_bdev_get_buf_align(bdev) > 1) {
1109 			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
1110 			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
1111 		} else {
1112 			blkcfg.size_max = 131072;
1113 			/*  -2 for REQ and RESP and -1 for region boundary splitting */
1114 			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
1115 		}
1116 	}
1117 
1118 	blkcfg.blk_size = blk_size;
1119 	/* minimum I/O size in blocks */
1120 	blkcfg.min_io_size = 1;
1121 	/* expressed in 512 Bytes sectors */
1122 	blkcfg.capacity = (blkcnt * blk_size) / 512;
1123 	/* QEMU can overwrite this value when started */
1124 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
1125 
1126 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1127 		/* 16MiB, expressed in 512 Bytes */
1128 		blkcfg.max_discard_sectors = 32768;
1129 		blkcfg.max_discard_seg = 1;
1130 		blkcfg.discard_sector_alignment = blk_size / 512;
1131 	}
1132 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1133 		blkcfg.max_write_zeroes_sectors = 32768;
1134 		blkcfg.max_write_zeroes_seg = 1;
1135 	}
1136 
1137 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
1138 
1139 	return 0;
1140 }
1141 
1142 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
1143 	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
1144 	.start_session =  vhost_blk_start,
1145 	.stop_session = vhost_blk_stop,
1146 	.vhost_get_config = vhost_blk_get_config,
1147 	.dump_info_json = vhost_blk_dump_info_json,
1148 	.write_config_json = vhost_blk_write_config_json,
1149 	.remove_device = vhost_blk_destroy,
1150 };
1151 
/*
 * Create a vhost-blk controller for each configuration section whose name
 * starts with "VhostBlk".
 *
 * Each matching section must have a numeric suffix and a "Name" value.
 * "Cpumask", "ReadOnly" (default false) and "PackedRing" (default false) are
 * also read. A section without a "Dev" value is skipped silently.
 *
 * Returns 0 on success, or -1 on the first malformed section or failed
 * controller construction.
 */
int
vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *sp;

	for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
		const char *section_name;
		unsigned ctrlr_num;
		char *name;
		char *cpumask;
		char *bdev_name;
		bool readonly;
		bool packed_ring;

		if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
			continue;
		}

		section_name = spdk_conf_section_get_name(sp);
		if (sscanf(section_name, "VhostBlk%u", &ctrlr_num) != 1) {
			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
				    section_name);
			return -1;
		}

		name = spdk_conf_section_get_val(sp, "Name");
		if (name == NULL) {
			SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
			return -1;
		}

		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
		readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
		packed_ring = spdk_conf_section_get_boolval(sp, "PackedRing", false);
		bdev_name = spdk_conf_section_get_val(sp, "Dev");

		/* No "Dev" entry: nothing to construct for this section. */
		if (bdev_name != NULL &&
		    spdk_vhost_blk_construct(name, cpumask, bdev_name,
					     readonly, packed_ring) < 0) {
			return -1;
		}
	}

	return 0;
}
1197 
1198 int
1199 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
1200 			 bool readonly, bool packed_ring)
1201 {
1202 	struct spdk_vhost_blk_dev *bvdev = NULL;
1203 	struct spdk_vhost_dev *vdev;
1204 	struct spdk_bdev *bdev;
1205 	int ret = 0;
1206 
1207 	spdk_vhost_lock();
1208 	bdev = spdk_bdev_get_by_name(dev_name);
1209 	if (bdev == NULL) {
1210 		SPDK_ERRLOG("%s: bdev '%s' not found\n",
1211 			    name, dev_name);
1212 		ret = -ENODEV;
1213 		goto out;
1214 	}
1215 
1216 	bvdev = calloc(1, sizeof(*bvdev));
1217 	if (bvdev == NULL) {
1218 		ret = -ENOMEM;
1219 		goto out;
1220 	}
1221 
1222 	vdev = &bvdev->vdev;
1223 	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
1224 	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
1225 	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
1226 
1227 	vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED;
1228 
1229 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1230 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1231 	}
1232 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1233 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1234 	}
1235 	if (readonly) {
1236 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
1237 	}
1238 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1239 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1240 	}
1241 
1242 	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
1243 	if (ret != 0) {
1244 		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
1245 			    name, dev_name, ret);
1246 		goto out;
1247 	}
1248 
1249 	bvdev->bdev = bdev;
1250 	bvdev->readonly = readonly;
1251 	ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
1252 	if (ret != 0) {
1253 		spdk_bdev_close(bvdev->bdev_desc);
1254 		goto out;
1255 	}
1256 
1257 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name);
1258 out:
1259 	if (ret != 0 && bvdev) {
1260 		free(bvdev);
1261 	}
1262 	spdk_vhost_unlock();
1263 	return ret;
1264 }
1265 
1266 static int
1267 vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1268 {
1269 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1270 	int rc;
1271 
1272 	assert(bvdev != NULL);
1273 	rc = vhost_dev_unregister(&bvdev->vdev);
1274 	if (rc != 0) {
1275 		return rc;
1276 	}
1277 
1278 	if (bvdev->bdev_desc) {
1279 		spdk_bdev_close(bvdev->bdev_desc);
1280 		bvdev->bdev_desc = NULL;
1281 	}
1282 	bvdev->bdev = NULL;
1283 
1284 	free(bvdev);
1285 	return 0;
1286 }
1287 
1288 SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
1289 SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
1290