xref: /spdk/lib/vhost/vhost_blk.c (revision 72f8c6a1f3f4aa1b3c373ced13e8d0ec06825ddc)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/conf.h"
39 #include "spdk/thread.h"
40 #include "spdk/likely.h"
41 #include "spdk/string.h"
42 #include "spdk/util.h"
43 #include "spdk/vhost.h"
44 
45 #include "vhost_internal.h"
46 
/*
 * Per-request context for a single virtio-blk request. One task is
 * pre-allocated per virtqueue slot (see alloc_task_pool()), indexed by
 * the request's head descriptor index.
 */
struct spdk_vhost_blk_task {
	/* Outstanding bdev I/O for this request, if any. */
	struct spdk_bdev_io *bdev_io;
	/* Owning vhost-blk device. */
	struct spdk_vhost_blk_dev *bvdev;
	/* Virtqueue this request arrived on. */
	struct spdk_vhost_virtqueue *vq;

	/* Guest-visible status byte; points into the last (writable)
	 * descriptor of the request chain. NULL until the chain is parsed. */
	volatile uint8_t *status;

	/* Index of the request's head descriptor in the virtqueue. */
	uint16_t req_idx;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written. */
	uint32_t used_len;
	/* Number of valid entries in iovs[]. */
	uint16_t iovcnt;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
67 
struct spdk_vhost_blk_dev {
	/* Embedded generic vhost device; to_blk_dev() recovers the containing
	 * structure from it via SPDK_CONTAINEROF(). */
	struct spdk_vhost_dev vdev;
	/* Backing block device; NULL after hot-remove. */
	struct spdk_bdev *bdev;
	/* Open descriptor on bdev; closed on hot-remove and destroy. */
	struct spdk_bdev_desc *bdev_desc;
	/* I/O channel obtained on start; released when the device stops or
	 * after hot-remove once all in-flight I/O drains. */
	struct spdk_io_channel *bdev_io_channel;
	/* Polls the virtqueues for new requests while the device is started. */
	struct spdk_poller *requestq_poller;
	/* State used by the deferred stop path (destroy_device_poller_cb). */
	struct spdk_vhost_dev_destroy_ctx destroy_ctx;
	/* Reject writes and advertise VIRTIO_BLK_F_RO to the guest. */
	bool readonly;
};
77 
78 /* forward declaration */
79 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
80 
81 static int
82 process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
83 		    struct spdk_vhost_virtqueue *vq);
84 
85 static void
86 blk_task_finish(struct spdk_vhost_blk_task *task)
87 {
88 	assert(task->bvdev->vdev.task_cnt > 0);
89 	task->bvdev->vdev.task_cnt--;
90 	task->used = false;
91 }
92 
93 static void
94 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
95 {
96 	if (task->status) {
97 		*task->status = status;
98 	}
99 
100 	spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx,
101 					task->used_len);
102 	blk_task_finish(task);
103 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
104 }
105 
106 /*
107  * Process task's descriptor chain and setup data related fields.
108  * Return
109  *   total size of suplied buffers
110  *
111  *   FIXME: Make this function return to rd_cnt and wr_cnt
112  */
113 static int
114 blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
115 	       struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
116 {
117 	struct vring_desc *desc, *desc_table;
118 	uint16_t out_cnt = 0, cnt = 0;
119 	uint32_t desc_table_size, len = 0;
120 	int rc;
121 
122 	rc = spdk_vhost_vq_get_desc(vdev, vq, req_idx, &desc, &desc_table, &desc_table_size);
123 	if (rc != 0) {
124 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
125 		return -1;
126 	}
127 
128 	while (1) {
129 		/*
130 		 * Maximum cnt reached?
131 		 * Should not happen if request is well formatted, otherwise this is a BUG.
132 		 */
133 		if (spdk_unlikely(cnt == *iovs_cnt)) {
134 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Max IOVs in request reached (req_idx = %"PRIu16").\n",
135 				      req_idx);
136 			return -1;
137 		}
138 
139 		if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &cnt, desc))) {
140 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
141 				      req_idx, cnt);
142 			return -1;
143 		}
144 
145 		len += desc->len;
146 
147 		out_cnt += spdk_vhost_vring_desc_is_wr(desc);
148 
149 		rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
150 		if (rc != 0) {
151 			SPDK_ERRLOG("%s: Descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
152 				    vdev->name, req_idx);
153 			return -1;
154 		} else if (desc == NULL) {
155 			break;
156 		}
157 	}
158 
159 	/*
160 	 * There must be least two descriptors.
161 	 * First contain request so it must be readable.
162 	 * Last descriptor contain buffer for response so it must be writable.
163 	 */
164 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
165 		return -1;
166 	}
167 
168 	*length = len;
169 	*iovs_cnt = cnt;
170 	return 0;
171 }
172 
173 static void
174 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
175 {
176 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
177 	spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx,
178 					task->used_len);
179 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
180 		      task->req_idx, success ? "OK" : "FAIL");
181 	blk_task_finish(task);
182 }
183 
184 static void
185 blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
186 {
187 	struct spdk_vhost_blk_task *task = cb_arg;
188 
189 	spdk_bdev_free_io(bdev_io);
190 	blk_request_finish(success, task);
191 }
192 
193 static void
194 blk_request_resubmit(void *arg)
195 {
196 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
197 	int rc = 0;
198 
199 	rc = process_blk_request(task, task->bvdev, task->vq);
200 	if (rc == 0) {
201 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
202 	} else {
203 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
204 	}
205 }
206 
207 static inline void
208 blk_request_queue_io(struct spdk_vhost_blk_task *task)
209 {
210 	int rc;
211 	struct spdk_vhost_blk_dev *bvdev = task->bvdev;
212 	struct spdk_bdev *bdev = bvdev->bdev;
213 
214 	task->bdev_io_wait.bdev = bdev;
215 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
216 	task->bdev_io_wait.cb_arg = task;
217 
218 	rc = spdk_bdev_queue_io_wait(bdev, bvdev->bdev_io_channel, &task->bdev_io_wait);
219 	if (rc != 0) {
220 		SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc);
221 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
222 	}
223 }
224 
/*
 * Parse and submit one virtio-blk request.
 *
 * The descriptor chain layout is: [0] = struct virtio_blk_outhdr (read-only),
 * [1..n-2] = data payload, [n-1] = 1-byte status (writable). On any parse
 * error the request is completed immediately via invalid_blk_request().
 * Returns 0 if the request was submitted (or queued for retry), -1 on error.
 */
static int
process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
		    struct spdk_vhost_virtqueue *vq)
{
	const struct virtio_blk_outhdr *req;
	struct iovec *iov;
	uint32_t type;
	uint32_t payload_len;
	int rc;

	if (blk_iovs_setup(&bvdev->vdev, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
		/* Only READ and WRITE are supported for now. */
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	/* First descriptor must be exactly the request header. */
	iov = &task->iovs[0];
	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
			      iov->iov_len, sizeof(*req), task->req_idx);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	req = iov->iov_base;

	/* Last descriptor must be the single status byte. */
	iov = &task->iovs[task->iovcnt - 1];
	if (spdk_unlikely(iov->iov_len != 1)) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
			      iov->iov_len, 1, task->req_idx);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	/* Strip header and status from the payload accounting; the data
	 * iovecs are task->iovs[1 .. iovcnt] after the decrement below. */
	task->status = iov->iov_base;
	payload_len -= sizeof(*req) + sizeof(*task->status);
	task->iovcnt -= 2;

	type = req->type;
#ifdef VIRTIO_BLK_T_BARRIER
	/* Don't care about barrier for now (as QEMU's virtio-blk do). */
	type &= ~VIRTIO_BLK_T_BARRIER;
#endif

	switch (type) {
	case VIRTIO_BLK_T_IN:
	case VIRTIO_BLK_T_OUT:
		/* virtio-blk payloads are expressed in 512-byte sectors. */
		if (spdk_unlikely((payload_len & (512 - 1)) != 0)) {
			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
				    type ? "WRITE" : "READ", task->req_idx);
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}

		if (type == VIRTIO_BLK_T_IN) {
			/* Reads also "write" the payload into guest memory. */
			task->used_len = payload_len + sizeof(*task->status);
			rc = spdk_bdev_readv(bvdev->bdev_desc, bvdev->bdev_io_channel,
					     &task->iovs[1], task->iovcnt, req->sector * 512,
					     payload_len, blk_request_complete_cb, task);
		} else if (!bvdev->readonly) {
			/* Writes only return the status byte to the guest. */
			task->used_len = sizeof(*task->status);
			rc = spdk_bdev_writev(bvdev->bdev_desc, bvdev->bdev_io_channel,
					      &task->iovs[1], task->iovcnt, req->sector * 512,
					      payload_len, blk_request_complete_cb, task);
		} else {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
			rc = -1;
		}

		if (rc) {
			if (rc == -ENOMEM) {
				/* Out of bdev resources; park the task and
				 * retry via blk_request_resubmit(). */
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_GET_ID:
		if (!task->iovcnt || !payload_len) {
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}
		/* Return the bdev product name, space-padded, truncated to
		 * the guest buffer / VIRTIO_BLK_ID_BYTES. */
		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
				task->used_len, ' ');
		blk_request_finish(true, task);
		break;
	default:
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	return 0;
}
325 
/*
 * Drain up to 32 new requests from one virtqueue's avail ring and submit
 * each through process_blk_request(). Malformed request indexes (out of
 * range, or whose pre-allocated task is still in flight) are returned to
 * the guest immediately with used_len 0.
 */
static void
process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_blk_task *task;
	int rc;
	uint16_t reqs[32];
	uint16_t reqs_cnt, i;

	reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
	if (!reqs_cnt) {
		return;
	}

	for (i = 0; i < reqs_cnt; i++) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
			      reqs[i]);

		/* Guard against a misbehaving guest handing us an index
		 * outside the ring. */
		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
				    bvdev->vdev.name, reqs[i], vq->vring.size);
			spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0);
			continue;
		}

		/* Tasks are pre-allocated one per ring slot (alloc_task_pool). */
		task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
		if (spdk_unlikely(task->used)) {
			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
				    bvdev->vdev.name, reqs[i]);
			spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0);
			continue;
		}

		bvdev->vdev.task_cnt++;

		/* Reset per-request state before parsing the chain. */
		task->used = true;
		task->iovcnt = SPDK_COUNTOF(task->iovs);
		task->status = NULL;
		task->used_len = 0;

		rc = process_blk_request(task, bvdev, vq);
		if (rc == 0) {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
				      reqs[i]);
		} else {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
		}
	}
}
374 
375 static int
376 vdev_worker(void *arg)
377 {
378 	struct spdk_vhost_blk_dev *bvdev = arg;
379 	uint16_t q_idx;
380 
381 	for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) {
382 		process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]);
383 	}
384 
385 	spdk_vhost_dev_used_signal(&bvdev->vdev);
386 
387 	return -1;
388 }
389 
/*
 * Fail one pending request on a virtqueue whose bdev has been hot-removed.
 * If the descriptor chain still parses, write VIRTIO_BLK_S_IOERR directly
 * into the guest's status byte (the last iovec) before returning the
 * chain on the used ring.
 */
static void
no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
{
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
	uint32_t length;
	uint16_t iovcnt, req_idx;

	/* Pop a single request; nothing to do if the ring is empty. */
	if (spdk_vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
		return;
	}

	iovcnt = SPDK_COUNTOF(iovs);
	if (blk_iovs_setup(&bvdev->vdev, vq, req_idx, iovs, &iovcnt, &length) == 0) {
		/* Last descriptor holds the guest-visible status byte. */
		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
	}

	spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, req_idx, 0);
}
409 
410 static int
411 no_bdev_vdev_worker(void *arg)
412 {
413 	struct spdk_vhost_blk_dev *bvdev = arg;
414 	uint16_t q_idx;
415 
416 	for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) {
417 		no_bdev_process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]);
418 	}
419 
420 	spdk_vhost_dev_used_signal(&bvdev->vdev);
421 
422 	if (bvdev->vdev.task_cnt == 0 && bvdev->bdev_io_channel) {
423 		spdk_put_io_channel(bvdev->bdev_io_channel);
424 		bvdev->bdev_io_channel = NULL;
425 	}
426 
427 	return -1;
428 }
429 
430 static struct spdk_vhost_blk_dev *
431 to_blk_dev(struct spdk_vhost_dev *vdev)
432 {
433 	if (vdev == NULL) {
434 		return NULL;
435 	}
436 
437 	if (vdev->backend != &vhost_blk_device_backend) {
438 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
439 		return NULL;
440 	}
441 
442 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
443 }
444 
445 struct spdk_bdev *
446 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
447 {
448 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
449 
450 	assert(bvdev != NULL);
451 	return bvdev->bdev;
452 }
453 
454 static int
455 _bdev_remove_cb(struct spdk_vhost_dev *vdev, void *arg)
456 {
457 	struct spdk_vhost_blk_dev *bvdev = arg;
458 
459 	SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
460 		     bvdev->vdev.name);
461 	if (bvdev->requestq_poller) {
462 		spdk_poller_unregister(&bvdev->requestq_poller);
463 		bvdev->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvdev, 0);
464 	}
465 
466 	spdk_bdev_close(bvdev->bdev_desc);
467 	bvdev->bdev_desc = NULL;
468 	bvdev->bdev = NULL;
469 	return 0;
470 }
471 
/* bdev hot-remove notification (registered via spdk_bdev_open); defers the
 * actual teardown to _bdev_remove_cb on the device's own thread. */
static void
bdev_remove_cb(void *remove_ctx)
{
	struct spdk_vhost_blk_dev *bvdev = remove_ctx;

	spdk_vhost_call_external_event(bvdev->vdev.name, _bdev_remove_cb, bvdev);
}
479 
480 static void
481 free_task_pool(struct spdk_vhost_blk_dev *bvdev)
482 {
483 	struct spdk_vhost_virtqueue *vq;
484 	uint16_t i;
485 
486 	for (i = 0; i < bvdev->vdev.max_queues; i++) {
487 		vq = &bvdev->vdev.virtqueue[i];
488 		if (vq->tasks == NULL) {
489 			continue;
490 		}
491 
492 		spdk_dma_free(vq->tasks);
493 		vq->tasks = NULL;
494 	}
495 }
496 
497 static int
498 alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
499 {
500 	struct spdk_vhost_virtqueue *vq;
501 	struct spdk_vhost_blk_task *task;
502 	uint32_t task_cnt;
503 	uint16_t i;
504 	uint32_t j;
505 
506 	for (i = 0; i < bvdev->vdev.max_queues; i++) {
507 		vq = &bvdev->vdev.virtqueue[i];
508 		if (vq->vring.desc == NULL) {
509 			continue;
510 		}
511 
512 		task_cnt = vq->vring.size;
513 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
514 			/* sanity check */
515 			SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
516 				    bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
517 			free_task_pool(bvdev);
518 			return -1;
519 		}
520 		vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
521 					     SPDK_CACHE_LINE_SIZE, NULL);
522 		if (vq->tasks == NULL) {
523 			SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
524 				    bvdev->vdev.name, task_cnt, i);
525 			free_task_pool(bvdev);
526 			return -1;
527 		}
528 
529 		for (j = 0; j < task_cnt; j++) {
530 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
531 			task->bvdev = bvdev;
532 			task->req_idx = j;
533 			task->vq = vq;
534 		}
535 	}
536 
537 	return 0;
538 }
539 
540 /*
541  * A new device is added to a data core. First the device is added to the main linked list
542  * and then allocated to a specific data core.
543  *
544  */
545 static int
546 spdk_vhost_blk_start(struct spdk_vhost_dev *vdev, void *event_ctx)
547 {
548 	struct spdk_vhost_blk_dev *bvdev;
549 	int i, rc = 0;
550 
551 	bvdev = to_blk_dev(vdev);
552 	if (bvdev == NULL) {
553 		SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n");
554 		rc = -1;
555 		goto out;
556 	}
557 
558 	/* validate all I/O queues are in a contiguous index range */
559 	for (i = 0; i < vdev->max_queues; i++) {
560 		if (vdev->virtqueue[i].vring.desc == NULL) {
561 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i);
562 			rc = -1;
563 			goto out;
564 		}
565 	}
566 
567 	rc = alloc_task_pool(bvdev);
568 	if (rc != 0) {
569 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name);
570 		goto out;
571 	}
572 
573 	if (bvdev->bdev) {
574 		bvdev->bdev_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
575 		if (!bvdev->bdev_io_channel) {
576 			free_task_pool(bvdev);
577 			SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name);
578 			rc = -1;
579 			goto out;
580 		}
581 	}
582 
583 	bvdev->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
584 				 bvdev, 0);
585 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
586 		     vdev->name, vdev->lcore);
587 out:
588 	spdk_vhost_dev_backend_event_done(event_ctx, rc);
589 	return rc;
590 }
591 
/*
 * Periodic poller driving device stop: waits until all in-flight tasks
 * complete, flushes any pending used-ring signals to the guest, releases
 * the bdev I/O channel and the task pool, then reports stop completion.
 */
static int
destroy_device_poller_cb(void *arg)
{
	struct spdk_vhost_blk_dev *bvdev = arg;
	int i;

	/* Keep polling until every outstanding request has completed. */
	if (bvdev->vdev.task_cnt > 0) {
		return -1;
	}

	/* Force an immediate interrupt for any not-yet-signalled used-ring
	 * entries on every queue. */
	for (i = 0; i < bvdev->vdev.max_queues; i++) {
		bvdev->vdev.virtqueue[i].next_event_time = 0;
		spdk_vhost_vq_used_signal(&bvdev->vdev, &bvdev->vdev.virtqueue[i]);
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", bvdev->vdev.name);

	if (bvdev->bdev_io_channel) {
		spdk_put_io_channel(bvdev->bdev_io_channel);
		bvdev->bdev_io_channel = NULL;
	}

	free_task_pool(bvdev);
	/* Unregister this poller itself, then ack the stop event. */
	spdk_poller_unregister(&bvdev->destroy_ctx.poller);
	spdk_vhost_dev_backend_event_done(bvdev->destroy_ctx.event_ctx, 0);

	return -1;
}
620 
621 static int
622 spdk_vhost_blk_stop(struct spdk_vhost_dev *vdev, void *event_ctx)
623 {
624 	struct spdk_vhost_blk_dev *bvdev;
625 
626 	bvdev = to_blk_dev(vdev);
627 	if (bvdev == NULL) {
628 		SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n");
629 		goto err;
630 	}
631 
632 	bvdev->destroy_ctx.event_ctx = event_ctx;
633 	spdk_poller_unregister(&bvdev->requestq_poller);
634 	bvdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb,
635 				    bvdev, 1000);
636 	return 0;
637 
638 err:
639 	spdk_vhost_dev_backend_event_done(event_ctx, -1);
640 	return -1;
641 }
642 
643 static void
644 spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
645 {
646 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
647 	struct spdk_vhost_blk_dev *bvdev;
648 
649 	bvdev = to_blk_dev(vdev);
650 	if (bvdev == NULL) {
651 		return;
652 	}
653 
654 	assert(bvdev != NULL);
655 	spdk_json_write_name(w, "block");
656 	spdk_json_write_object_begin(w);
657 
658 	spdk_json_write_name(w, "readonly");
659 	spdk_json_write_bool(w, bvdev->readonly);
660 
661 	spdk_json_write_name(w, "bdev");
662 	if (bdev) {
663 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
664 	} else {
665 		spdk_json_write_null(w);
666 	}
667 
668 	spdk_json_write_object_end(w);
669 }
670 
671 static void
672 spdk_vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
673 {
674 	struct spdk_vhost_blk_dev *bvdev;
675 
676 	bvdev = to_blk_dev(vdev);
677 	if (bvdev == NULL) {
678 		return;
679 	}
680 
681 	if (!bvdev->bdev) {
682 		return;
683 	}
684 
685 	spdk_json_write_object_begin(w);
686 	spdk_json_write_named_string(w, "method", "construct_vhost_blk_controller");
687 
688 	spdk_json_write_named_object_begin(w, "params");
689 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
690 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
691 	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
692 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
693 	spdk_json_write_object_end(w);
694 
695 	spdk_json_write_object_end(w);
696 }
697 
698 static int spdk_vhost_blk_destroy(struct spdk_vhost_dev *dev);
699 
700 static int
701 spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
702 			  uint32_t len)
703 {
704 	struct virtio_blk_config *blkcfg = (struct virtio_blk_config *)config;
705 	struct spdk_vhost_blk_dev *bvdev;
706 	struct spdk_bdev *bdev;
707 	uint32_t blk_size;
708 	uint64_t blkcnt;
709 
710 	bvdev = to_blk_dev(vdev);
711 	if (bvdev == NULL) {
712 		SPDK_ERRLOG("Trying to get virito_blk configuration failed\n");
713 		return -1;
714 	}
715 
716 	if (len < sizeof(*blkcfg)) {
717 		return -1;
718 	}
719 
720 	bdev = bvdev->bdev;
721 	if (bdev == NULL) {
722 		/* We can't just return -1 here as this GET_CONFIG message might
723 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
724 		 * error to QEMU, who might then decide to terminate itself.
725 		 * We don't want that. A simple reboot shouldn't break the system.
726 		 *
727 		 * Presenting a block device with block size 0 and block count 0
728 		 * doesn't cause any problems on QEMU side and the virtio-pci
729 		 * device is even still available inside the VM, but there will
730 		 * be no block device created for it - the kernel drivers will
731 		 * silently reject it.
732 		 */
733 		blk_size = 0;
734 		blkcnt = 0;
735 	} else {
736 		blk_size = spdk_bdev_get_block_size(bdev);
737 		blkcnt = spdk_bdev_get_num_blocks(bdev);
738 	}
739 
740 	memset(blkcfg, 0, sizeof(*blkcfg));
741 	blkcfg->blk_size = blk_size;
742 	/* minimum I/O size in blocks */
743 	blkcfg->min_io_size = 1;
744 	/* expressed in 512 Bytes sectors */
745 	blkcfg->capacity = (blkcnt * blk_size) / 512;
746 	blkcfg->size_max = 131072;
747 	/*  -2 for REQ and RESP and -1 for region boundary splitting */
748 	blkcfg->seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
749 	/* QEMU can overwrite this value when started */
750 	blkcfg->num_queues = SPDK_VHOST_MAX_VQUEUES;
751 
752 	return 0;
753 }
754 
/* Backend ops + feature bits registered for every vhost-blk controller.
 * virtio_features lists everything we can negotiate; disabled_features
 * are bits we refuse even if the guest offers them (e.g. F_RO is only
 * re-enabled per-device in spdk_vhost_blk_construct() for readonly
 * controllers). */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
	.virtio_features = SPDK_VHOST_FEATURES |
	(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) |
	(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) |
	(1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
	(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI) |
	(1ULL << VIRTIO_BLK_F_FLUSH)    | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_MQ),
	.disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
	(1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI),
	.start_device =  spdk_vhost_blk_start,
	.stop_device = spdk_vhost_blk_stop,
	.vhost_get_config = spdk_vhost_blk_get_config,
	.dump_info_json = spdk_vhost_blk_dump_info_json,
	.write_config_json = spdk_vhost_blk_write_config_json,
	.remove_device = spdk_vhost_blk_destroy,
};
773 
774 int
775 spdk_vhost_blk_controller_construct(void)
776 {
777 	struct spdk_conf_section *sp;
778 	unsigned ctrlr_num;
779 	char *bdev_name;
780 	char *cpumask;
781 	char *name;
782 	bool readonly;
783 
784 	for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
785 		if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
786 			continue;
787 		}
788 
789 		if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
790 			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
791 				    spdk_conf_section_get_name(sp));
792 			return -1;
793 		}
794 
795 		name = spdk_conf_section_get_val(sp, "Name");
796 		if (name == NULL) {
797 			SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
798 			return -1;
799 		}
800 
801 		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
802 		readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
803 
804 		bdev_name = spdk_conf_section_get_val(sp, "Dev");
805 		if (bdev_name == NULL) {
806 			continue;
807 		}
808 
809 		if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) {
810 			return -1;
811 		}
812 	}
813 
814 	return 0;
815 }
816 
/*
 * Create a vhost-blk controller backed by the named bdev.
 * Opens the bdev, registers the vhost device, and (for readonly
 * controllers) enables VIRTIO_BLK_F_RO on the vhost-user socket.
 * All steps are performed under the global vhost lock; on failure each
 * already-completed step is unwound in reverse order.
 * Returns 0 on success or a negative errno-style value.
 */
int
spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
{
	struct spdk_vhost_blk_dev *bvdev = NULL;
	struct spdk_bdev *bdev;
	int ret = 0;

	spdk_vhost_lock();
	bdev = spdk_bdev_get_by_name(dev_name);
	if (bdev == NULL) {
		SPDK_ERRLOG("Controller %s: bdev '%s' not found\n",
			    name, dev_name);
		ret = -ENODEV;
		goto out;
	}

	bvdev = spdk_dma_zmalloc(sizeof(*bvdev), SPDK_CACHE_LINE_SIZE, NULL);
	if (bvdev == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	/* bdev_remove_cb will be invoked if the bdev is hot-removed later. */
	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
	if (ret != 0) {
		SPDK_ERRLOG("Controller %s: could not open bdev '%s', error=%d\n",
			    name, dev_name, ret);
		goto out;
	}

	bvdev->bdev = bdev;
	bvdev->readonly = readonly;
	ret = spdk_vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend);
	if (ret != 0) {
		/* Unwind: close the descriptor opened above. */
		spdk_bdev_close(bvdev->bdev_desc);
		goto out;
	}

	/* F_RO is in disabled_features by default; re-enable it on this
	 * device's socket so the guest sees a read-only disk. */
	if (readonly && rte_vhost_driver_enable_features(bvdev->vdev.path, (1ULL << VIRTIO_BLK_F_RO))) {
		SPDK_ERRLOG("Controller %s: failed to set as a readonly\n", name);
		spdk_bdev_close(bvdev->bdev_desc);

		if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) {
			SPDK_ERRLOG("Controller %s: failed to remove controller\n", name);
		}

		ret = -1;
		goto out;
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name);
out:
	/* On any failure, free the (possibly never-registered) device. */
	if (ret != 0 && bvdev) {
		spdk_dma_free(bvdev);
	}
	spdk_vhost_unlock();
	return ret;
}
874 
875 static int
876 spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev)
877 {
878 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
879 	int rc;
880 
881 	if (!bvdev) {
882 		return -EINVAL;
883 	}
884 
885 	rc = spdk_vhost_dev_unregister(&bvdev->vdev);
886 	if (rc != 0) {
887 		return rc;
888 	}
889 
890 	if (bvdev->bdev_desc) {
891 		spdk_bdev_close(bvdev->bdev_desc);
892 		bvdev->bdev_desc = NULL;
893 	}
894 	bvdev->bdev = NULL;
895 
896 	spdk_dma_free(bvdev);
897 	return 0;
898 }
899 
/* Register the debug-log flags used throughout this file ("vhost_blk" for
 * control flow, "vhost_blk_data" for per-request data-path tracing). */
SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
902