xref: /spdk/lib/vhost/vhost_blk.c (revision 5977aad8f7486552c94c5cc93ea9bb110e1cb5d0)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/conf.h"
39 #include "spdk/thread.h"
40 #include "spdk/likely.h"
41 #include "spdk/string.h"
42 #include "spdk/util.h"
43 #include "spdk/vhost.h"
44 
45 #include "vhost_internal.h"
46 
/*
 * Per-request state for a single virtio-blk request. One task is
 * preallocated for every descriptor slot of each virtqueue (see
 * alloc_task_pool()), indexed by the request's descriptor index.
 */
struct spdk_vhost_blk_task {
	struct spdk_bdev_io *bdev_io;
	struct spdk_vhost_blk_dev *bvdev;
	struct spdk_vhost_virtqueue *vq;

	/* Guest-visible 1-byte response buffer (last descriptor of the chain);
	 * NULL until the descriptor chain has been parsed. */
	volatile uint8_t *status;

	/* Index of the request's head descriptor in the virtqueue. */
	uint16_t req_idx;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written. */
	uint32_t used_len;
	/* Number of valid entries in iovs[]. */
	uint16_t iovcnt;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
67 
/*
 * A vhost-blk controller: a generic vhost device plus the bdev it exposes.
 * The generic spdk_vhost_dev must stay the first member so that
 * SPDK_CONTAINEROF() in to_blk_dev() and the vhost core can convert
 * between the two representations.
 */
struct spdk_vhost_blk_dev {
	struct spdk_vhost_dev vdev;
	/* Backing bdev; NULL after hot-remove (see _bdev_remove_cb()). */
	struct spdk_bdev *bdev;
	struct spdk_bdev_desc *bdev_desc;
	struct spdk_io_channel *bdev_io_channel;
	/* Either vdev_worker() or no_bdev_vdev_worker(), depending on
	 * whether a bdev is currently attached. */
	struct spdk_poller *requestq_poller;
	struct spdk_vhost_dev_destroy_ctx destroy_ctx;
	/* If set, VIRTIO_BLK_F_RO is advertised and writes are rejected. */
	bool readonly;
};
77 
78 /* forward declaration */
79 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
80 
81 static int
82 process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
83 		    struct spdk_vhost_virtqueue *vq);
84 
85 static void
86 blk_task_finish(struct spdk_vhost_blk_task *task)
87 {
88 	assert(task->bvdev->vdev.session->task_cnt > 0);
89 	task->bvdev->vdev.session->task_cnt--;
90 	task->used = false;
91 }
92 
93 static void
94 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
95 {
96 	if (task->status) {
97 		*task->status = status;
98 	}
99 
100 	spdk_vhost_vq_used_ring_enqueue(task->bvdev->vdev.session, task->vq, task->req_idx,
101 					task->used_len);
102 	blk_task_finish(task);
103 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
104 }
105 
106 /*
107  * Process task's descriptor chain and setup data related fields.
108  * Return
109  *   total size of suplied buffers
110  *
111  *   FIXME: Make this function return to rd_cnt and wr_cnt
112  */
113 static int
114 blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
115 	       struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
116 {
117 	struct vring_desc *desc, *desc_table;
118 	uint16_t out_cnt = 0, cnt = 0;
119 	uint32_t desc_table_size, len = 0;
120 	int rc;
121 
122 	rc = spdk_vhost_vq_get_desc(vdev->session, vq, req_idx, &desc, &desc_table, &desc_table_size);
123 	if (rc != 0) {
124 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
125 		return -1;
126 	}
127 
128 	while (1) {
129 		/*
130 		 * Maximum cnt reached?
131 		 * Should not happen if request is well formatted, otherwise this is a BUG.
132 		 */
133 		if (spdk_unlikely(cnt == *iovs_cnt)) {
134 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Max IOVs in request reached (req_idx = %"PRIu16").\n",
135 				      req_idx);
136 			return -1;
137 		}
138 
139 		if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev->session, iovs, &cnt, desc))) {
140 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
141 				      req_idx, cnt);
142 			return -1;
143 		}
144 
145 		len += desc->len;
146 
147 		out_cnt += spdk_vhost_vring_desc_is_wr(desc);
148 
149 		rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
150 		if (rc != 0) {
151 			SPDK_ERRLOG("%s: Descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
152 				    vdev->name, req_idx);
153 			return -1;
154 		} else if (desc == NULL) {
155 			break;
156 		}
157 	}
158 
159 	/*
160 	 * There must be least two descriptors.
161 	 * First contain request so it must be readable.
162 	 * Last descriptor contain buffer for response so it must be writable.
163 	 */
164 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
165 		return -1;
166 	}
167 
168 	*length = len;
169 	*iovs_cnt = cnt;
170 	return 0;
171 }
172 
173 static void
174 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
175 {
176 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
177 	spdk_vhost_vq_used_ring_enqueue(task->bvdev->vdev.session, task->vq, task->req_idx,
178 					task->used_len);
179 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
180 		      task->req_idx, success ? "OK" : "FAIL");
181 	blk_task_finish(task);
182 }
183 
184 static void
185 blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
186 {
187 	struct spdk_vhost_blk_task *task = cb_arg;
188 
189 	spdk_bdev_free_io(bdev_io);
190 	blk_request_finish(success, task);
191 }
192 
193 static void
194 blk_request_resubmit(void *arg)
195 {
196 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
197 	int rc = 0;
198 
199 	rc = process_blk_request(task, task->bvdev, task->vq);
200 	if (rc == 0) {
201 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
202 	} else {
203 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
204 	}
205 }
206 
207 static inline void
208 blk_request_queue_io(struct spdk_vhost_blk_task *task)
209 {
210 	int rc;
211 	struct spdk_vhost_blk_dev *bvdev = task->bvdev;
212 	struct spdk_bdev *bdev = bvdev->bdev;
213 
214 	task->bdev_io_wait.bdev = bdev;
215 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
216 	task->bdev_io_wait.cb_arg = task;
217 
218 	rc = spdk_bdev_queue_io_wait(bdev, bvdev->bdev_io_channel, &task->bdev_io_wait);
219 	if (rc != 0) {
220 		SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc);
221 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
222 	}
223 }
224 
/*
 * Parse and execute one virtio-blk request from @vq at @task->req_idx.
 *
 * The descriptor chain is mapped into task->iovs: iovs[0] must be exactly
 * the virtio_blk_outhdr request header, iovs[iovcnt - 1] the 1-byte status
 * response, and everything in between is payload.
 *
 * Returns 0 if the request was submitted to the bdev (or completed inline,
 * or queued on the io_wait queue) and -1 on error; in the error case the
 * task has already been completed with an appropriate virtio status.
 */
static int
process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev,
		    struct spdk_vhost_virtqueue *vq)
{
	const struct virtio_blk_outhdr *req;
	struct virtio_blk_discard_write_zeroes *desc;
	struct iovec *iov;
	uint32_t type;
	uint32_t payload_len;
	int rc;

	if (blk_iovs_setup(&bvdev->vdev, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
		/* Only READ and WRITE are supported for now. */
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	/* The first descriptor must hold exactly the request header. */
	iov = &task->iovs[0];
	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
			      iov->iov_len, sizeof(*req), task->req_idx);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	req = iov->iov_base;

	/* The last descriptor must be the 1-byte status response. */
	iov = &task->iovs[task->iovcnt - 1];
	if (spdk_unlikely(iov->iov_len != 1)) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
			      iov->iov_len, 1, task->req_idx);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	task->status = iov->iov_base;
	/* Strip header and status from the payload length; the checks above
	 * guarantee this cannot underflow. */
	payload_len -= sizeof(*req) + sizeof(*task->status);
	task->iovcnt -= 2;

	type = req->type;
#ifdef VIRTIO_BLK_T_BARRIER
	/* Don't care about barrier for now (as QEMU's virtio-blk do). */
	type &= ~VIRTIO_BLK_T_BARRIER;
#endif

	switch (type) {
	case VIRTIO_BLK_T_IN:
	case VIRTIO_BLK_T_OUT:
		/* virtio-blk payloads are always expressed in 512-byte sectors. */
		if (spdk_unlikely((payload_len & (512 - 1)) != 0)) {
			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
				    type ? "WRITE" : "READ", task->req_idx);
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}

		if (type == VIRTIO_BLK_T_IN) {
			/* Reads return the data plus the status byte. */
			task->used_len = payload_len + sizeof(*task->status);
			rc = spdk_bdev_readv(bvdev->bdev_desc, bvdev->bdev_io_channel,
					     &task->iovs[1], task->iovcnt, req->sector * 512,
					     payload_len, blk_request_complete_cb, task);
		} else if (!bvdev->readonly) {
			/* Writes return only the status byte. */
			task->used_len = sizeof(*task->status);
			rc = spdk_bdev_writev(bvdev->bdev_desc, bvdev->bdev_io_channel,
					      &task->iovs[1], task->iovcnt, req->sector * 512,
					      payload_len, blk_request_complete_cb, task);
		} else {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
			rc = -1;
		}

		if (rc) {
			if (rc == -ENOMEM) {
				/* No bdev_io available right now - retry via io_wait. */
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_DISCARD:
		desc = task->iovs[1].iov_base;
		if (payload_len != sizeof(*desc)) {
			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
			return -1;
		}

		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvdev->bdev_io_channel,
				     desc->sector * 512, desc->num_sectors * 512,
				     blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_WRITE_ZEROES:
		desc = task->iovs[1].iov_base;
		if (payload_len != sizeof(*desc)) {
			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
			return -1;
		}

		/* Zeroed and Unmap the range, SPDK doesn't support it. */
		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}

		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvdev->bdev_io_channel,
					    desc->sector * 512, desc->num_sectors * 512,
					    blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_GET_ID:
		if (!task->iovcnt || !payload_len) {
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}
		/* Answer with the bdev product name, space-padded, completed inline. */
		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
				task->used_len, ' ');
		blk_request_finish(true, task);
		break;
	default:
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	return 0;
}
375 
376 static void
377 process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
378 {
379 	struct spdk_vhost_blk_task *task;
380 	struct spdk_vhost_session *vsession = bvdev->vdev.session;
381 	int rc;
382 	uint16_t reqs[32];
383 	uint16_t reqs_cnt, i;
384 
385 	reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
386 	if (!reqs_cnt) {
387 		return;
388 	}
389 
390 	for (i = 0; i < reqs_cnt; i++) {
391 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
392 			      reqs[i]);
393 
394 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
395 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
396 				    bvdev->vdev.name, reqs[i], vq->vring.size);
397 			spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
398 			continue;
399 		}
400 
401 		task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
402 		if (spdk_unlikely(task->used)) {
403 			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
404 				    bvdev->vdev.name, reqs[i]);
405 			spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
406 			continue;
407 		}
408 
409 		vsession->task_cnt++;
410 
411 		task->used = true;
412 		task->iovcnt = SPDK_COUNTOF(task->iovs);
413 		task->status = NULL;
414 		task->used_len = 0;
415 
416 		rc = process_blk_request(task, bvdev, vq);
417 		if (rc == 0) {
418 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
419 				      reqs[i]);
420 		} else {
421 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
422 		}
423 	}
424 }
425 
426 static int
427 vdev_worker(void *arg)
428 {
429 	struct spdk_vhost_blk_dev *bvdev = arg;
430 	struct spdk_vhost_session *vsession = bvdev->vdev.session;
431 	uint16_t q_idx;
432 
433 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
434 		process_vq(bvdev, &vsession->virtqueue[q_idx]);
435 	}
436 
437 	spdk_vhost_session_used_signal(vsession);
438 
439 	return -1;
440 }
441 
442 static void
443 no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq)
444 {
445 	struct spdk_vhost_session *vsession = bvdev->vdev.session;
446 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
447 	uint32_t length;
448 	uint16_t iovcnt, req_idx;
449 
450 	if (spdk_vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
451 		return;
452 	}
453 
454 	iovcnt = SPDK_COUNTOF(iovs);
455 	if (blk_iovs_setup(&bvdev->vdev, vq, req_idx, iovs, &iovcnt, &length) == 0) {
456 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
457 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
458 	}
459 
460 	spdk_vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
461 }
462 
463 static int
464 no_bdev_vdev_worker(void *arg)
465 {
466 	struct spdk_vhost_blk_dev *bvdev = arg;
467 	struct spdk_vhost_session *vsession = bvdev->vdev.session;
468 	uint16_t q_idx;
469 
470 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
471 		no_bdev_process_vq(bvdev, &vsession->virtqueue[q_idx]);
472 	}
473 
474 	spdk_vhost_session_used_signal(vsession);
475 
476 	if (vsession->task_cnt == 0 && bvdev->bdev_io_channel) {
477 		spdk_put_io_channel(bvdev->bdev_io_channel);
478 		bvdev->bdev_io_channel = NULL;
479 	}
480 
481 	return -1;
482 }
483 
/*
 * Downcast a generic vhost device to its containing vhost-blk device.
 * Returns NULL (logging an error) if @vdev is NULL or is backed by a
 * different device type.
 */
static struct spdk_vhost_blk_dev *
to_blk_dev(struct spdk_vhost_dev *vdev)
{
	if (vdev == NULL) {
		return NULL;
	}

	if (vdev->backend != &vhost_blk_device_backend) {
		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
		return NULL;
	}

	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
}
498 
/*
 * Return the bdev backing @vdev. @vdev must be a vhost-blk device
 * (asserted); the result may be NULL after the bdev was hot-removed.
 */
struct spdk_bdev *
spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
{
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);

	assert(bvdev != NULL);
	return bvdev->bdev;
}
507 
508 static int
509 _bdev_remove_cb(struct spdk_vhost_dev *vdev, void *arg)
510 {
511 	struct spdk_vhost_blk_dev *bvdev = arg;
512 
513 	SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
514 		     bvdev->vdev.name);
515 	if (bvdev->requestq_poller) {
516 		spdk_poller_unregister(&bvdev->requestq_poller);
517 		bvdev->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvdev, 0);
518 	}
519 
520 	spdk_bdev_close(bvdev->bdev_desc);
521 	bvdev->bdev_desc = NULL;
522 	bvdev->bdev = NULL;
523 	return 0;
524 }
525 
/*
 * bdev hot-remove callback (may run on any thread): defer the actual
 * teardown to the controller's own thread via an external event.
 */
static void
bdev_remove_cb(void *remove_ctx)
{
	struct spdk_vhost_blk_dev *bvdev = remove_ctx;

	spdk_vhost_call_external_event(bvdev->vdev.name, _bdev_remove_cb, bvdev);
}
533 
534 static void
535 free_task_pool(struct spdk_vhost_blk_dev *bvdev)
536 {
537 	struct spdk_vhost_session *vsession = bvdev->vdev.session;
538 	struct spdk_vhost_virtqueue *vq;
539 	uint16_t i;
540 
541 	for (i = 0; i < vsession->max_queues; i++) {
542 		vq = &vsession->virtqueue[i];
543 		if (vq->tasks == NULL) {
544 			continue;
545 		}
546 
547 		spdk_dma_free(vq->tasks);
548 		vq->tasks = NULL;
549 	}
550 }
551 
552 static int
553 alloc_task_pool(struct spdk_vhost_blk_dev *bvdev)
554 {
555 	struct spdk_vhost_session *vsession = bvdev->vdev.session;
556 	struct spdk_vhost_virtqueue *vq;
557 	struct spdk_vhost_blk_task *task;
558 	uint32_t task_cnt;
559 	uint16_t i;
560 	uint32_t j;
561 
562 	for (i = 0; i < vsession->max_queues; i++) {
563 		vq = &vsession->virtqueue[i];
564 		if (vq->vring.desc == NULL) {
565 			continue;
566 		}
567 
568 		task_cnt = vq->vring.size;
569 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
570 			/* sanity check */
571 			SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
572 				    bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
573 			free_task_pool(bvdev);
574 			return -1;
575 		}
576 		vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
577 					     SPDK_CACHE_LINE_SIZE, NULL);
578 		if (vq->tasks == NULL) {
579 			SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
580 				    bvdev->vdev.name, task_cnt, i);
581 			free_task_pool(bvdev);
582 			return -1;
583 		}
584 
585 		for (j = 0; j < task_cnt; j++) {
586 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
587 			task->bvdev = bvdev;
588 			task->req_idx = j;
589 			task->vq = vq;
590 		}
591 	}
592 
593 	return 0;
594 }
595 
596 /*
597  * A new device is added to a data core. First the device is added to the main linked list
598  * and then allocated to a specific data core.
599  *
600  */
601 static int
602 spdk_vhost_blk_start(struct spdk_vhost_dev *vdev, void *event_ctx)
603 {
604 	struct spdk_vhost_blk_dev *bvdev;
605 	struct spdk_vhost_session *vsession = vdev->session;
606 	int i, rc = 0;
607 
608 	bvdev = to_blk_dev(vdev);
609 	if (bvdev == NULL) {
610 		SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n");
611 		rc = -1;
612 		goto out;
613 	}
614 
615 	/* validate all I/O queues are in a contiguous index range */
616 	for (i = 0; i < vsession->max_queues; i++) {
617 		if (vsession->virtqueue[i].vring.desc == NULL) {
618 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i);
619 			rc = -1;
620 			goto out;
621 		}
622 	}
623 
624 	rc = alloc_task_pool(bvdev);
625 	if (rc != 0) {
626 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name);
627 		goto out;
628 	}
629 
630 	if (bvdev->bdev) {
631 		bvdev->bdev_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
632 		if (!bvdev->bdev_io_channel) {
633 			free_task_pool(bvdev);
634 			SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name);
635 			rc = -1;
636 			goto out;
637 		}
638 	}
639 
640 	bvdev->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
641 				 bvdev, 0);
642 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
643 		     vdev->name, vdev->lcore);
644 out:
645 	spdk_vhost_dev_backend_event_done(event_ctx, rc);
646 	return rc;
647 }
648 
/*
 * Stop-side poller: runs every 1 ms until all in-flight tasks have
 * completed, then signals the used rings one last time, releases the I/O
 * channel and task pools, unregisters itself, and reports stop completion.
 */
static int
destroy_device_poller_cb(void *arg)
{
	struct spdk_vhost_blk_dev *bvdev = arg;
	struct spdk_vhost_session *vsession = bvdev->vdev.session;
	int i;

	/* Keep polling until every outstanding I/O has completed. */
	if (vsession->task_cnt > 0) {
		return -1;
	}

	/* Force an immediate guest notification for each queue. */
	for (i = 0; i < vsession->max_queues; i++) {
		vsession->virtqueue[i].next_event_time = 0;
		spdk_vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", bvdev->vdev.name);

	if (bvdev->bdev_io_channel) {
		spdk_put_io_channel(bvdev->bdev_io_channel);
		bvdev->bdev_io_channel = NULL;
	}

	free_task_pool(bvdev);
	spdk_poller_unregister(&bvdev->destroy_ctx.poller);
	spdk_vhost_dev_backend_event_done(bvdev->destroy_ctx.event_ctx, 0);

	return -1;
}
678 
679 static int
680 spdk_vhost_blk_stop(struct spdk_vhost_dev *vdev, void *event_ctx)
681 {
682 	struct spdk_vhost_blk_dev *bvdev;
683 
684 	bvdev = to_blk_dev(vdev);
685 	if (bvdev == NULL) {
686 		SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n");
687 		goto err;
688 	}
689 
690 	bvdev->destroy_ctx.event_ctx = event_ctx;
691 	spdk_poller_unregister(&bvdev->requestq_poller);
692 	bvdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb,
693 				    bvdev, 1000);
694 	return 0;
695 
696 err:
697 	spdk_vhost_dev_backend_event_done(event_ctx, -1);
698 	return -1;
699 }
700 
701 static void
702 spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
703 {
704 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
705 	struct spdk_vhost_blk_dev *bvdev;
706 
707 	bvdev = to_blk_dev(vdev);
708 	if (bvdev == NULL) {
709 		return;
710 	}
711 
712 	assert(bvdev != NULL);
713 	spdk_json_write_name(w, "block");
714 	spdk_json_write_object_begin(w);
715 
716 	spdk_json_write_name(w, "readonly");
717 	spdk_json_write_bool(w, bvdev->readonly);
718 
719 	spdk_json_write_name(w, "bdev");
720 	if (bdev) {
721 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
722 	} else {
723 		spdk_json_write_null(w);
724 	}
725 
726 	spdk_json_write_object_end(w);
727 }
728 
729 static void
730 spdk_vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
731 {
732 	struct spdk_vhost_blk_dev *bvdev;
733 
734 	bvdev = to_blk_dev(vdev);
735 	if (bvdev == NULL) {
736 		return;
737 	}
738 
739 	if (!bvdev->bdev) {
740 		return;
741 	}
742 
743 	spdk_json_write_object_begin(w);
744 	spdk_json_write_named_string(w, "method", "construct_vhost_blk_controller");
745 
746 	spdk_json_write_named_object_begin(w, "params");
747 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
748 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
749 	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
750 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
751 	spdk_json_write_object_end(w);
752 
753 	spdk_json_write_object_end(w);
754 }
755 
756 static int spdk_vhost_blk_destroy(struct spdk_vhost_dev *dev);
757 
758 static int
759 spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
760 			  uint32_t len)
761 {
762 	struct virtio_blk_config blkcfg;
763 	struct spdk_vhost_blk_dev *bvdev;
764 	struct spdk_bdev *bdev;
765 	uint32_t blk_size;
766 	uint64_t blkcnt;
767 
768 	bvdev = to_blk_dev(vdev);
769 	if (bvdev == NULL) {
770 		SPDK_ERRLOG("Trying to get virito_blk configuration failed\n");
771 		return -1;
772 	}
773 
774 	bdev = bvdev->bdev;
775 	if (bdev == NULL) {
776 		/* We can't just return -1 here as this GET_CONFIG message might
777 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
778 		 * error to QEMU, who might then decide to terminate itself.
779 		 * We don't want that. A simple reboot shouldn't break the system.
780 		 *
781 		 * Presenting a block device with block size 0 and block count 0
782 		 * doesn't cause any problems on QEMU side and the virtio-pci
783 		 * device is even still available inside the VM, but there will
784 		 * be no block device created for it - the kernel drivers will
785 		 * silently reject it.
786 		 */
787 		blk_size = 0;
788 		blkcnt = 0;
789 	} else {
790 		blk_size = spdk_bdev_get_block_size(bdev);
791 		blkcnt = spdk_bdev_get_num_blocks(bdev);
792 	}
793 
794 	memset(&blkcfg, 0, sizeof(blkcfg));
795 	blkcfg.blk_size = blk_size;
796 	/* minimum I/O size in blocks */
797 	blkcfg.min_io_size = 1;
798 	/* expressed in 512 Bytes sectors */
799 	blkcfg.capacity = (blkcnt * blk_size) / 512;
800 	blkcfg.size_max = 131072;
801 	/*  -2 for REQ and RESP and -1 for region boundary splitting */
802 	blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
803 	/* QEMU can overwrite this value when started */
804 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
805 
806 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
807 		/* 16MiB, expressed in 512 Bytes */
808 		blkcfg.max_discard_sectors = 32768;
809 		blkcfg.max_discard_seg = 1;
810 		blkcfg.discard_sector_alignment = blk_size / 512;
811 	}
812 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
813 		blkcfg.max_write_zeroes_sectors = 32768;
814 		blkcfg.max_write_zeroes_seg = 1;
815 	}
816 
817 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
818 
819 	return 0;
820 }
821 
/*
 * vhost-blk backend descriptor: the feature bits this backend may
 * advertise, the ones kept disabled unless explicitly enabled per device
 * (e.g. VIRTIO_BLK_F_RO / F_DISCARD / F_WRITE_ZEROES are turned on in
 * spdk_vhost_blk_construct() when the bdev supports them), and the
 * lifecycle/config callbacks invoked by the generic vhost layer.
 */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
	.virtio_features = SPDK_VHOST_FEATURES |
	(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) |
	(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) |
	(1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
	(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI) |
	(1ULL << VIRTIO_BLK_F_FLUSH)    | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_MQ)       | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	.disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
	(1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	.start_device =  spdk_vhost_blk_start,
	.stop_device = spdk_vhost_blk_stop,
	.vhost_get_config = spdk_vhost_blk_get_config,
	.dump_info_json = spdk_vhost_blk_dump_info_json,
	.write_config_json = spdk_vhost_blk_write_config_json,
	.remove_device = spdk_vhost_blk_destroy,
};
842 
/*
 * Create vhost-blk controllers from the legacy config file: one controller
 * per [VhostBlkN] section with Name, optional Cpumask/ReadOnly, and Dev
 * (sections without Dev are silently skipped).
 * Returns 0 on success, -1 on the first malformed section or failed
 * construction.
 */
int
spdk_vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *sp;

	for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
		unsigned ctrlr_num;
		const char *section_name;
		char *name, *cpumask, *bdev_name;
		bool readonly;

		if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
			continue;
		}

		section_name = spdk_conf_section_get_name(sp);
		if (sscanf(section_name, "VhostBlk%u", &ctrlr_num) != 1) {
			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
				    section_name);
			return -1;
		}

		name = spdk_conf_section_get_val(sp, "Name");
		if (name == NULL) {
			SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
			return -1;
		}

		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
		readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);

		bdev_name = spdk_conf_section_get_val(sp, "Dev");
		if (bdev_name == NULL) {
			/* No backing bdev configured - skip this section. */
			continue;
		}

		if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) {
			return -1;
		}
	}

	return 0;
}
885 
/*
 * Create and register a vhost-blk controller exposing bdev @dev_name.
 *
 * \param name controller name (basename of the vhost-user socket).
 * \param cpumask reactor cpumask string, or NULL for the default.
 * \param dev_name name of the bdev to expose.
 * \param readonly if true, advertise VIRTIO_BLK_F_RO and reject writes.
 * \return 0 on success, negative errno-style value on error.
 */
int
spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
{
	struct spdk_vhost_blk_dev *bvdev = NULL;
	struct spdk_bdev *bdev;
	uint64_t features = 0;
	int ret = 0;

	spdk_vhost_lock();
	bdev = spdk_bdev_get_by_name(dev_name);
	if (bdev == NULL) {
		SPDK_ERRLOG("Controller %s: bdev '%s' not found\n",
			    name, dev_name);
		ret = -ENODEV;
		goto out;
	}

	bvdev = spdk_dma_zmalloc(sizeof(*bvdev), SPDK_CACHE_LINE_SIZE, NULL);
	if (bvdev == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	/* Keep the bdev claimed and get hot-remove notifications. */
	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
	if (ret != 0) {
		SPDK_ERRLOG("Controller %s: could not open bdev '%s', error=%d\n",
			    name, dev_name, ret);
		goto out;
	}

	bvdev->bdev = bdev;
	bvdev->readonly = readonly;
	ret = spdk_vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend);
	if (ret != 0) {
		spdk_bdev_close(bvdev->bdev_desc);
		goto out;
	}

	/* Enable optional features the backing bdev actually supports. */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
		features |= (1ULL << VIRTIO_BLK_F_DISCARD);
	}
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
	}
	if (readonly) {
		features |= (1ULL << VIRTIO_BLK_F_RO);
	}

	if (features && rte_vhost_driver_enable_features(bvdev->vdev.path, features)) {
		SPDK_ERRLOG("Controller %s: failed to enable features 0x%"PRIx64"\n", name, features);

		/* Roll back the registration before failing. */
		if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) {
			SPDK_ERRLOG("Controller %s: failed to remove controller\n", name);
		}

		spdk_bdev_close(bvdev->bdev_desc);
		ret = -1;
		goto out;
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name);
out:
	/* On any failure, free the partially constructed device. */
	if (ret != 0 && bvdev) {
		spdk_dma_free(bvdev);
	}
	spdk_vhost_unlock();
	return ret;
}
954 
955 static int
956 spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev)
957 {
958 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
959 	int rc;
960 
961 	if (!bvdev) {
962 		return -EINVAL;
963 	}
964 
965 	rc = spdk_vhost_dev_unregister(&bvdev->vdev);
966 	if (rc != 0) {
967 		return rc;
968 	}
969 
970 	if (bvdev->bdev_desc) {
971 		spdk_bdev_close(bvdev->bdev_desc);
972 		bvdev->bdev_desc = NULL;
973 	}
974 	bvdev->bdev = NULL;
975 
976 	spdk_dma_free(bvdev);
977 	return 0;
978 }
979 
/* Register the debug-log flags used by the SPDK_DEBUGLOG() calls above. */
SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
982