xref: /spdk/lib/vhost/vhost_blk.c (revision 73f79a5c56823bb53e2891c7e16a961b3c192fbc)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/conf.h"
39 #include "spdk/thread.h"
40 #include "spdk/likely.h"
41 #include "spdk/string.h"
42 #include "spdk/util.h"
43 #include "spdk/vhost.h"
44 
45 #include "vhost_internal.h"
46 
/*
 * Per-request context of a vhost-blk I/O.
 * Tasks are preallocated per virtqueue (see alloc_task_pool()) and looked up
 * by the request index taken from the avail ring (see process_vq()).
 */
struct spdk_vhost_blk_task {
	/* NOTE(review): not referenced by any code in this file - confirm it is still needed. */
	struct spdk_bdev_io *bdev_io;
	/* Session this task belongs to; assigned once in alloc_task_pool(). */
	struct spdk_vhost_blk_session *bvsession;
	/* Virtqueue this task belongs to; assigned once in alloc_task_pool(). */
	struct spdk_vhost_virtqueue *vq;

	/*
	 * Points at the status byte supplied by the guest in the last buffer of
	 * the request. NULL until the request has been parsed successfully.
	 */
	volatile uint8_t *status;

	/* Index of this task in vq->tasks; also the id put on the used ring. */
	uint16_t req_idx;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written. */
	uint32_t used_len;
	/*
	 * Before parsing: capacity of iovs[]. After process_blk_request() parsed
	 * the request: number of payload iovecs (header and status excluded).
	 */
	uint16_t iovcnt;
	/* iovs[0] is the request header, the last filled entry is the status byte. */
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
67 
/* A vhost-blk controller: a generic vhost device backed by a single bdev. */
struct spdk_vhost_blk_dev {
	/* Generic vhost device; to_blk_dev() recovers the container from it. */
	struct spdk_vhost_dev vdev;
	/* Backing bdev; set to NULL once the bdev has been hot-removed. */
	struct spdk_bdev *bdev;
	/* Open descriptor of the backing bdev; NULL after it has been closed. */
	struct spdk_bdev_desc *bdev_desc;
	/* When true, write requests are rejected by process_blk_request(). */
	bool readonly;
};
74 
/* Per-connection state of a vhost-blk controller. */
struct spdk_vhost_blk_session {
	/* The parent session must be the very first field in this struct */
	struct spdk_vhost_session vsession;
	/* Controller this session is attached to; set when the session starts. */
	struct spdk_vhost_blk_dev *bvdev;
	/* Poller running vdev_worker() or no_bdev_vdev_worker() while the session is started. */
	struct spdk_poller *requestq_poller;
	/* bdev I/O channel used for all requests of this session; NULL when there is no bdev. */
	struct spdk_io_channel *io_channel;
	/* Poller and event context used while the session is being stopped. */
	struct spdk_vhost_dev_destroy_ctx destroy_ctx;
};
83 
/*
 * Forward declarations.
 * The backend table is needed by to_blk_session()/to_blk_dev() before it is
 * defined; process_blk_request() is needed by blk_request_resubmit().
 */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend;

static int
process_blk_request(struct spdk_vhost_blk_task *task,
		    struct spdk_vhost_blk_session *bvsession,
		    struct spdk_vhost_virtqueue *vq);
91 
92 static void
93 blk_task_finish(struct spdk_vhost_blk_task *task)
94 {
95 	assert(task->bvsession->vsession.task_cnt > 0);
96 	task->bvsession->vsession.task_cnt--;
97 	task->used = false;
98 }
99 
100 static void
101 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
102 {
103 	if (task->status) {
104 		*task->status = status;
105 	}
106 
107 	spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
108 					task->used_len);
109 	blk_task_finish(task);
110 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
111 }
112 
113 /*
114  * Process task's descriptor chain and setup data related fields.
115  * Return
116  *   total size of suplied buffers
117  *
118  *   FIXME: Make this function return to rd_cnt and wr_cnt
119  */
120 static int
121 blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
122 	       uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
123 {
124 	struct spdk_vhost_session *vsession = &bvsession->vsession;
125 	struct spdk_vhost_dev *vdev = vsession->vdev;
126 	struct vring_desc *desc, *desc_table;
127 	uint16_t out_cnt = 0, cnt = 0;
128 	uint32_t desc_table_size, len = 0;
129 	uint32_t desc_handled_cnt;
130 	int rc;
131 
132 	rc = spdk_vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
133 	if (rc != 0) {
134 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
135 		return -1;
136 	}
137 
138 	desc_handled_cnt = 0;
139 	while (1) {
140 		/*
141 		 * Maximum cnt reached?
142 		 * Should not happen if request is well formatted, otherwise this is a BUG.
143 		 */
144 		if (spdk_unlikely(cnt == *iovs_cnt)) {
145 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Max IOVs in request reached (req_idx = %"PRIu16").\n",
146 				      req_idx);
147 			return -1;
148 		}
149 
150 		if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
151 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
152 				      req_idx, cnt);
153 			return -1;
154 		}
155 
156 		len += desc->len;
157 
158 		out_cnt += spdk_vhost_vring_desc_is_wr(desc);
159 
160 		rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
161 		if (rc != 0) {
162 			SPDK_ERRLOG("%s: Descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
163 				    vdev->name, req_idx);
164 			return -1;
165 		} else if (desc == NULL) {
166 			break;
167 		}
168 
169 		desc_handled_cnt++;
170 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
171 			/* Break a cycle and report an error, if any. */
172 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
173 				    vdev->name, desc_table_size, desc_handled_cnt);
174 			return -1;
175 		}
176 	}
177 
178 	/*
179 	 * There must be least two descriptors.
180 	 * First contain request so it must be readable.
181 	 * Last descriptor contain buffer for response so it must be writable.
182 	 */
183 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
184 		return -1;
185 	}
186 
187 	*length = len;
188 	*iovs_cnt = cnt;
189 	return 0;
190 }
191 
192 static void
193 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
194 {
195 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
196 	spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
197 					task->used_len);
198 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
199 		      task->req_idx, success ? "OK" : "FAIL");
200 	blk_task_finish(task);
201 }
202 
/*
 * bdev completion callback for every I/O submitted by process_blk_request().
 * cb_arg is the task the I/O was submitted for.
 */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	/* The bdev_io is released first; only the success flag is needed afterwards. */
	spdk_bdev_free_io(bdev_io);
	blk_request_finish(success, (struct spdk_vhost_blk_task *)cb_arg);
}
211 
212 static void
213 blk_request_resubmit(void *arg)
214 {
215 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
216 	int rc = 0;
217 
218 	rc = process_blk_request(task, task->bvsession, task->vq);
219 	if (rc == 0) {
220 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
221 	} else {
222 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
223 	}
224 }
225 
226 static inline void
227 blk_request_queue_io(struct spdk_vhost_blk_task *task)
228 {
229 	int rc;
230 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
231 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
232 
233 	task->bdev_io_wait.bdev = bdev;
234 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
235 	task->bdev_io_wait.cb_arg = task;
236 
237 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
238 	if (rc != 0) {
239 		SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc);
240 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
241 	}
242 }
243 
244 static int
245 process_blk_request(struct spdk_vhost_blk_task *task,
246 		    struct spdk_vhost_blk_session *bvsession,
247 		    struct spdk_vhost_virtqueue *vq)
248 {
249 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
250 	const struct virtio_blk_outhdr *req;
251 	struct virtio_blk_discard_write_zeroes *desc;
252 	struct iovec *iov;
253 	uint32_t type;
254 	uint32_t payload_len;
255 	uint64_t flush_bytes;
256 	int rc;
257 
258 	if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
259 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
260 		/* Only READ and WRITE are supported for now. */
261 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
262 		return -1;
263 	}
264 
265 	iov = &task->iovs[0];
266 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
267 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
268 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
269 			      iov->iov_len, sizeof(*req), task->req_idx);
270 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
271 		return -1;
272 	}
273 
274 	req = iov->iov_base;
275 
276 	iov = &task->iovs[task->iovcnt - 1];
277 	if (spdk_unlikely(iov->iov_len != 1)) {
278 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
279 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
280 			      iov->iov_len, 1, task->req_idx);
281 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
282 		return -1;
283 	}
284 
285 	task->status = iov->iov_base;
286 	payload_len -= sizeof(*req) + sizeof(*task->status);
287 	task->iovcnt -= 2;
288 
289 	type = req->type;
290 #ifdef VIRTIO_BLK_T_BARRIER
291 	/* Don't care about barier for now (as QEMU's virtio-blk do). */
292 	type &= ~VIRTIO_BLK_T_BARRIER;
293 #endif
294 
295 	switch (type) {
296 	case VIRTIO_BLK_T_IN:
297 	case VIRTIO_BLK_T_OUT:
298 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
299 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
300 				    type ? "WRITE" : "READ", task->req_idx);
301 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
302 			return -1;
303 		}
304 
305 		if (type == VIRTIO_BLK_T_IN) {
306 			task->used_len = payload_len + sizeof(*task->status);
307 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
308 					     &task->iovs[1], task->iovcnt, req->sector * 512,
309 					     payload_len, blk_request_complete_cb, task);
310 		} else if (!bvdev->readonly) {
311 			task->used_len = sizeof(*task->status);
312 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
313 					      &task->iovs[1], task->iovcnt, req->sector * 512,
314 					      payload_len, blk_request_complete_cb, task);
315 		} else {
316 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
317 			rc = -1;
318 		}
319 
320 		if (rc) {
321 			if (rc == -ENOMEM) {
322 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
323 				blk_request_queue_io(task);
324 			} else {
325 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
326 				return -1;
327 			}
328 		}
329 		break;
330 	case VIRTIO_BLK_T_DISCARD:
331 		desc = task->iovs[1].iov_base;
332 		if (payload_len != sizeof(*desc)) {
333 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
334 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
335 			return -1;
336 		}
337 
338 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
339 				     desc->sector * 512, desc->num_sectors * 512,
340 				     blk_request_complete_cb, task);
341 		if (rc) {
342 			if (rc == -ENOMEM) {
343 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
344 				blk_request_queue_io(task);
345 			} else {
346 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
347 				return -1;
348 			}
349 		}
350 		break;
351 	case VIRTIO_BLK_T_WRITE_ZEROES:
352 		desc = task->iovs[1].iov_base;
353 		if (payload_len != sizeof(*desc)) {
354 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
355 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
356 			return -1;
357 		}
358 
359 		/* Zeroed and Unmap the range, SPDK doen't support it. */
360 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
361 			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
362 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
363 			return -1;
364 		}
365 
366 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
367 					    desc->sector * 512, desc->num_sectors * 512,
368 					    blk_request_complete_cb, task);
369 		if (rc) {
370 			if (rc == -ENOMEM) {
371 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
372 				blk_request_queue_io(task);
373 			} else {
374 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
375 				return -1;
376 			}
377 		}
378 		break;
379 	case VIRTIO_BLK_T_FLUSH:
380 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
381 		if (req->sector != 0) {
382 			SPDK_NOTICELOG("sector must be zero for flush command\n");
383 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
384 			return -1;
385 		}
386 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
387 				     0, flush_bytes,
388 				     blk_request_complete_cb, task);
389 		if (rc) {
390 			if (rc == -ENOMEM) {
391 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
392 				blk_request_queue_io(task);
393 			} else {
394 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
395 				return -1;
396 			}
397 		}
398 		break;
399 	case VIRTIO_BLK_T_GET_ID:
400 		if (!task->iovcnt || !payload_len) {
401 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
402 			return -1;
403 		}
404 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
405 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
406 				task->used_len, ' ');
407 		blk_request_finish(true, task);
408 		break;
409 	default:
410 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
411 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
412 		return -1;
413 	}
414 
415 	return 0;
416 }
417 
418 static void
419 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
420 {
421 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
422 	struct spdk_vhost_blk_task *task;
423 	struct spdk_vhost_session *vsession = &bvsession->vsession;
424 	int rc;
425 	uint16_t reqs[32];
426 	uint16_t reqs_cnt, i;
427 
428 	reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
429 	if (!reqs_cnt) {
430 		return;
431 	}
432 
433 	for (i = 0; i < reqs_cnt; i++) {
434 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
435 			      reqs[i]);
436 
437 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
438 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
439 				    bvdev->vdev.name, reqs[i], vq->vring.size);
440 			spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
441 			continue;
442 		}
443 
444 		task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
445 		if (spdk_unlikely(task->used)) {
446 			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
447 				    bvdev->vdev.name, reqs[i]);
448 			spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
449 			continue;
450 		}
451 
452 		vsession->task_cnt++;
453 
454 		task->used = true;
455 		task->iovcnt = SPDK_COUNTOF(task->iovs);
456 		task->status = NULL;
457 		task->used_len = 0;
458 
459 		rc = process_blk_request(task, bvsession, vq);
460 		if (rc == 0) {
461 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
462 				      reqs[i]);
463 		} else {
464 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
465 		}
466 	}
467 }
468 
469 static int
470 vdev_worker(void *arg)
471 {
472 	struct spdk_vhost_blk_session *bvsession = arg;
473 	struct spdk_vhost_session *vsession = &bvsession->vsession;
474 
475 	uint16_t q_idx;
476 
477 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
478 		process_vq(bvsession, &vsession->virtqueue[q_idx]);
479 	}
480 
481 	spdk_vhost_session_used_signal(vsession);
482 
483 	return -1;
484 }
485 
486 static void
487 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
488 {
489 	struct spdk_vhost_session *vsession = &bvsession->vsession;
490 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
491 	uint32_t length;
492 	uint16_t iovcnt, req_idx;
493 
494 	if (spdk_vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
495 		return;
496 	}
497 
498 	iovcnt = SPDK_COUNTOF(iovs);
499 	if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
500 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
501 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
502 	}
503 
504 	spdk_vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
505 }
506 
507 static int
508 no_bdev_vdev_worker(void *arg)
509 {
510 	struct spdk_vhost_blk_session *bvsession = arg;
511 	struct spdk_vhost_session *vsession = &bvsession->vsession;
512 	uint16_t q_idx;
513 
514 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
515 		no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
516 	}
517 
518 	spdk_vhost_session_used_signal(vsession);
519 
520 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
521 		spdk_put_io_channel(bvsession->io_channel);
522 		bvsession->io_channel = NULL;
523 	}
524 
525 	return -1;
526 }
527 
528 static struct spdk_vhost_blk_session *
529 to_blk_session(struct spdk_vhost_session *vsession)
530 {
531 	if (vsession == NULL) {
532 		return NULL;
533 	}
534 
535 	if (vsession->vdev->backend != &vhost_blk_device_backend) {
536 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vsession->vdev->name);
537 		return NULL;
538 	}
539 
540 	return (struct spdk_vhost_blk_session *)vsession;
541 }
542 
543 static struct spdk_vhost_blk_dev *
544 to_blk_dev(struct spdk_vhost_dev *vdev)
545 {
546 	if (vdev == NULL) {
547 		return NULL;
548 	}
549 
550 	if (vdev->backend != &vhost_blk_device_backend) {
551 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
552 		return NULL;
553 	}
554 
555 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
556 }
557 
558 struct spdk_bdev *
559 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
560 {
561 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
562 
563 	assert(bvdev != NULL);
564 	return bvdev->bdev;
565 }
566 
567 static int
568 _spdk_vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, struct spdk_vhost_session *vsession,
569 				   void *ctx)
570 {
571 	struct spdk_vhost_blk_session *bvsession;
572 
573 	if (vdev == NULL) {
574 		/* Nothing to do */
575 		return 0;
576 	}
577 
578 	if (vsession == NULL) {
579 		/* All sessions have been notified, time to close the bdev */
580 		struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
581 
582 		assert(bvdev != NULL);
583 
584 		spdk_bdev_close(bvdev->bdev_desc);
585 		bvdev->bdev_desc = NULL;
586 		bvdev->bdev = NULL;
587 		return 0;
588 	}
589 
590 	bvsession = (struct spdk_vhost_blk_session *)vsession;
591 	if (bvsession->requestq_poller) {
592 		spdk_poller_unregister(&bvsession->requestq_poller);
593 		bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
594 	}
595 
596 	return 0;
597 }
598 
599 static int
600 _bdev_remove_cb(struct spdk_vhost_dev *vdev, void *arg)
601 {
602 	SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
603 		     vdev->name);
604 	spdk_vhost_dev_foreach_session(vdev, _spdk_vhost_session_bdev_remove_cb, NULL);
605 	return 0;
606 }
607 
608 static void
609 bdev_remove_cb(void *remove_ctx)
610 {
611 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
612 
613 	spdk_vhost_call_external_event(bvdev->vdev.name, _bdev_remove_cb, bvdev);
614 }
615 
616 static void
617 free_task_pool(struct spdk_vhost_blk_session *bvsession)
618 {
619 	struct spdk_vhost_session *vsession = &bvsession->vsession;
620 	struct spdk_vhost_virtqueue *vq;
621 	uint16_t i;
622 
623 	for (i = 0; i < vsession->max_queues; i++) {
624 		vq = &vsession->virtqueue[i];
625 		if (vq->tasks == NULL) {
626 			continue;
627 		}
628 
629 		spdk_dma_free(vq->tasks);
630 		vq->tasks = NULL;
631 	}
632 }
633 
634 static int
635 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
636 {
637 	struct spdk_vhost_session *vsession = &bvsession->vsession;
638 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
639 	struct spdk_vhost_virtqueue *vq;
640 	struct spdk_vhost_blk_task *task;
641 	uint32_t task_cnt;
642 	uint16_t i;
643 	uint32_t j;
644 
645 	for (i = 0; i < vsession->max_queues; i++) {
646 		vq = &vsession->virtqueue[i];
647 		if (vq->vring.desc == NULL) {
648 			continue;
649 		}
650 
651 		task_cnt = vq->vring.size;
652 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
653 			/* sanity check */
654 			SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
655 				    bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
656 			free_task_pool(bvsession);
657 			return -1;
658 		}
659 		vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
660 					     SPDK_CACHE_LINE_SIZE, NULL);
661 		if (vq->tasks == NULL) {
662 			SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
663 				    bvdev->vdev.name, task_cnt, i);
664 			free_task_pool(bvsession);
665 			return -1;
666 		}
667 
668 		for (j = 0; j < task_cnt; j++) {
669 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
670 			task->bvsession = bvsession;
671 			task->req_idx = j;
672 			task->vq = vq;
673 		}
674 	}
675 
676 	return 0;
677 }
678 
679 static int
680 spdk_vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
681 			struct spdk_vhost_session *vsession, void *event_ctx)
682 {
683 	struct spdk_vhost_blk_dev *bvdev;
684 	struct spdk_vhost_blk_session *bvsession;
685 	int i, rc = 0;
686 
687 	bvsession = to_blk_session(vsession);
688 	if (bvsession == NULL) {
689 		SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n");
690 		rc = -1;
691 		goto out;
692 	}
693 
694 	bvdev = to_blk_dev(vdev);
695 	assert(bvdev != NULL);
696 	bvsession->bvdev = bvdev;
697 
698 	/* validate all I/O queues are in a contiguous index range */
699 	for (i = 0; i < vsession->max_queues; i++) {
700 		if (vsession->virtqueue[i].vring.desc == NULL) {
701 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i);
702 			rc = -1;
703 			goto out;
704 		}
705 	}
706 
707 	rc = alloc_task_pool(bvsession);
708 	if (rc != 0) {
709 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name);
710 		goto out;
711 	}
712 
713 	if (bvdev->bdev) {
714 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
715 		if (!bvsession->io_channel) {
716 			free_task_pool(bvsession);
717 			SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name);
718 			rc = -1;
719 			goto out;
720 		}
721 	}
722 
723 	bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
724 				     bvsession, 0);
725 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
726 		     vdev->name, vsession->lcore);
727 out:
728 	spdk_vhost_session_event_done(event_ctx, rc);
729 	return rc;
730 }
731 
732 static int
733 spdk_vhost_blk_start(struct spdk_vhost_session *vsession)
734 {
735 	int rc;
736 
737 	vsession->lcore = spdk_vhost_allocate_reactor(vsession->vdev->cpumask);
738 	rc = spdk_vhost_session_send_event(vsession, spdk_vhost_blk_start_cb,
739 					   3, "start session");
740 
741 	if (rc != 0) {
742 		spdk_vhost_free_reactor(vsession->lcore);
743 		vsession->lcore = -1;
744 	}
745 
746 	return rc;
747 }
748 
749 static int
750 destroy_session_poller_cb(void *arg)
751 {
752 	struct spdk_vhost_blk_session *bvsession = arg;
753 	struct spdk_vhost_session *vsession = &bvsession->vsession;
754 	int i;
755 
756 	if (vsession->task_cnt > 0) {
757 		return -1;
758 	}
759 
760 	for (i = 0; i < vsession->max_queues; i++) {
761 		vsession->virtqueue[i].next_event_time = 0;
762 		spdk_vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
763 	}
764 
765 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", vsession->vdev->name);
766 
767 	if (bvsession->io_channel) {
768 		spdk_put_io_channel(bvsession->io_channel);
769 		bvsession->io_channel = NULL;
770 	}
771 
772 	free_task_pool(bvsession);
773 	spdk_poller_unregister(&bvsession->destroy_ctx.poller);
774 	spdk_vhost_session_event_done(bvsession->destroy_ctx.event_ctx, 0);
775 
776 	return -1;
777 }
778 
779 static int
780 spdk_vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
781 		       struct spdk_vhost_session *vsession, void *event_ctx)
782 {
783 	struct spdk_vhost_blk_session *bvsession;
784 
785 	bvsession = to_blk_session(vsession);
786 	if (bvsession == NULL) {
787 		SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n");
788 		goto err;
789 	}
790 
791 	bvsession->destroy_ctx.event_ctx = event_ctx;
792 	spdk_poller_unregister(&bvsession->requestq_poller);
793 	bvsession->destroy_ctx.poller = spdk_poller_register(destroy_session_poller_cb,
794 					bvsession, 1000);
795 	return 0;
796 
797 err:
798 	spdk_vhost_session_event_done(event_ctx, -1);
799 	return -1;
800 }
801 
802 static int
803 spdk_vhost_blk_stop(struct spdk_vhost_session *vsession)
804 {
805 	int rc;
806 
807 	rc = spdk_vhost_session_send_event(vsession, spdk_vhost_blk_stop_cb,
808 					   3, "stop session");
809 	if (rc != 0) {
810 		return rc;
811 	}
812 
813 	spdk_vhost_free_reactor(vsession->lcore);
814 	vsession->lcore = -1;
815 	return 0;
816 }
817 
818 static void
819 spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
820 {
821 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
822 	struct spdk_vhost_blk_dev *bvdev;
823 
824 	bvdev = to_blk_dev(vdev);
825 	if (bvdev == NULL) {
826 		return;
827 	}
828 
829 	assert(bvdev != NULL);
830 	spdk_json_write_named_object_begin(w, "block");
831 
832 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
833 
834 	spdk_json_write_name(w, "bdev");
835 	if (bdev) {
836 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
837 	} else {
838 		spdk_json_write_null(w);
839 	}
840 
841 	spdk_json_write_object_end(w);
842 }
843 
844 static void
845 spdk_vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
846 {
847 	struct spdk_vhost_blk_dev *bvdev;
848 
849 	bvdev = to_blk_dev(vdev);
850 	if (bvdev == NULL) {
851 		return;
852 	}
853 
854 	if (!bvdev->bdev) {
855 		return;
856 	}
857 
858 	spdk_json_write_object_begin(w);
859 	spdk_json_write_named_string(w, "method", "construct_vhost_blk_controller");
860 
861 	spdk_json_write_named_object_begin(w, "params");
862 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
863 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
864 	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
865 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
866 	spdk_json_write_object_end(w);
867 
868 	spdk_json_write_object_end(w);
869 }
870 
/* Forward declaration: referenced by vhost_blk_device_backend before its definition. */
static int spdk_vhost_blk_destroy(struct spdk_vhost_dev *dev);
872 
873 static int
874 spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
875 			  uint32_t len)
876 {
877 	struct virtio_blk_config blkcfg;
878 	struct spdk_vhost_blk_dev *bvdev;
879 	struct spdk_bdev *bdev;
880 	uint32_t blk_size;
881 	uint64_t blkcnt;
882 
883 	bvdev = to_blk_dev(vdev);
884 	if (bvdev == NULL) {
885 		SPDK_ERRLOG("Trying to get virito_blk configuration failed\n");
886 		return -1;
887 	}
888 
889 	bdev = bvdev->bdev;
890 	if (bdev == NULL) {
891 		/* We can't just return -1 here as this GET_CONFIG message might
892 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
893 		 * error to QEMU, who might then decide to terminate itself.
894 		 * We don't want that. A simple reboot shouldn't break the system.
895 		 *
896 		 * Presenting a block device with block size 0 and block count 0
897 		 * doesn't cause any problems on QEMU side and the virtio-pci
898 		 * device is even still available inside the VM, but there will
899 		 * be no block device created for it - the kernel drivers will
900 		 * silently reject it.
901 		 */
902 		blk_size = 0;
903 		blkcnt = 0;
904 	} else {
905 		blk_size = spdk_bdev_get_block_size(bdev);
906 		blkcnt = spdk_bdev_get_num_blocks(bdev);
907 	}
908 
909 	memset(&blkcfg, 0, sizeof(blkcfg));
910 	blkcfg.blk_size = blk_size;
911 	/* minimum I/O size in blocks */
912 	blkcfg.min_io_size = 1;
913 	/* expressed in 512 Bytes sectors */
914 	blkcfg.capacity = (blkcnt * blk_size) / 512;
915 	blkcfg.size_max = 131072;
916 	/*  -2 for REQ and RESP and -1 for region boundary splitting */
917 	blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
918 	/* QEMU can overwrite this value when started */
919 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
920 
921 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
922 		/* 16MiB, expressed in 512 Bytes */
923 		blkcfg.max_discard_sectors = 32768;
924 		blkcfg.max_discard_seg = 1;
925 		blkcfg.discard_sector_alignment = blk_size / 512;
926 	}
927 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
928 		blkcfg.max_write_zeroes_sectors = 32768;
929 		blkcfg.max_write_zeroes_seg = 1;
930 	}
931 
932 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
933 
934 	return 0;
935 }
936 
/*
 * Backend descriptor of vhost-blk controllers.
 * Its address also serves as the type tag checked by to_blk_session() and
 * to_blk_dev().
 */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
	/* Every virtio feature this backend is able to offer. */
	.virtio_features = SPDK_VHOST_FEATURES |
	(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) |
	(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) |
	(1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
	(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI) |
	(1ULL << VIRTIO_BLK_F_FLUSH)    | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_MQ)       | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	/*
	 * Features listed as disabled. RO, FLUSH, DISCARD and WRITE_ZEROES are
	 * re-enabled per controller in spdk_vhost_blk_construct() depending on
	 * the read-only flag and on what the bdev supports.
	 */
	.disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
	(1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	/* Size of the blk-specific part that follows the generic session. */
	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
	.start_session =  spdk_vhost_blk_start,
	.stop_session = spdk_vhost_blk_stop,
	.vhost_get_config = spdk_vhost_blk_get_config,
	.dump_info_json = spdk_vhost_blk_dump_info_json,
	.write_config_json = spdk_vhost_blk_write_config_json,
	.remove_device = spdk_vhost_blk_destroy,
};
958 
/*
 * Create a vhost-blk controller for every "VhostBlk<N>" section of the
 * configuration file.
 *
 * Each section must have a "Name"; "Cpumask" and "ReadOnly" are optional.
 * Sections without a "Dev" entry are skipped.
 * Returns 0 on success, -1 on the first malformed section or failed
 * controller construction.
 */
int
spdk_vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *sp = spdk_conf_first_section(NULL);
	unsigned ctrlr_num;
	char *bdev_name;
	char *cpumask;
	char *name;
	bool readonly;

	while (sp != NULL) {
		if (spdk_conf_section_match_prefix(sp, "VhostBlk")) {
			if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
				SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
					    spdk_conf_section_get_name(sp));
				return -1;
			}

			name = spdk_conf_section_get_val(sp, "Name");
			if (name == NULL) {
				SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
				return -1;
			}

			cpumask = spdk_conf_section_get_val(sp, "Cpumask");
			readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);

			/* A section without a bdev is silently ignored. */
			bdev_name = spdk_conf_section_get_val(sp, "Dev");
			if (bdev_name != NULL &&
			    spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) {
				return -1;
			}
		}

		sp = spdk_conf_next_section(sp);
	}

	return 0;
}
1001 
1002 int
1003 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
1004 {
1005 	struct spdk_vhost_blk_dev *bvdev = NULL;
1006 	struct spdk_bdev *bdev;
1007 	uint64_t features = 0;
1008 	int ret = 0;
1009 
1010 	spdk_vhost_lock();
1011 	bdev = spdk_bdev_get_by_name(dev_name);
1012 	if (bdev == NULL) {
1013 		SPDK_ERRLOG("Controller %s: bdev '%s' not found\n",
1014 			    name, dev_name);
1015 		ret = -ENODEV;
1016 		goto out;
1017 	}
1018 
1019 	bvdev = spdk_dma_zmalloc(sizeof(*bvdev), SPDK_CACHE_LINE_SIZE, NULL);
1020 	if (bvdev == NULL) {
1021 		ret = -ENOMEM;
1022 		goto out;
1023 	}
1024 
1025 	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
1026 	if (ret != 0) {
1027 		SPDK_ERRLOG("Controller %s: could not open bdev '%s', error=%d\n",
1028 			    name, dev_name, ret);
1029 		goto out;
1030 	}
1031 
1032 	bvdev->bdev = bdev;
1033 	bvdev->readonly = readonly;
1034 	ret = spdk_vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend);
1035 	if (ret != 0) {
1036 		spdk_bdev_close(bvdev->bdev_desc);
1037 		goto out;
1038 	}
1039 
1040 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1041 		features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1042 	}
1043 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1044 		features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1045 	}
1046 	if (readonly) {
1047 		features |= (1ULL << VIRTIO_BLK_F_RO);
1048 	}
1049 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1050 		features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1051 	}
1052 
1053 	if (features && rte_vhost_driver_enable_features(bvdev->vdev.path, features)) {
1054 		SPDK_ERRLOG("Controller %s: failed to enable features 0x%"PRIx64"\n", name, features);
1055 
1056 		if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) {
1057 			SPDK_ERRLOG("Controller %s: failed to remove controller\n", name);
1058 		}
1059 
1060 		spdk_bdev_close(bvdev->bdev_desc);
1061 		ret = -1;
1062 		goto out;
1063 	}
1064 
1065 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name);
1066 out:
1067 	if (ret != 0 && bvdev) {
1068 		spdk_dma_free(bvdev);
1069 	}
1070 	spdk_vhost_unlock();
1071 	return ret;
1072 }
1073 
1074 static int
1075 spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1076 {
1077 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1078 	int rc;
1079 
1080 	if (!bvdev) {
1081 		return -EINVAL;
1082 	}
1083 
1084 	rc = spdk_vhost_dev_unregister(&bvdev->vdev);
1085 	if (rc != 0) {
1086 		return rc;
1087 	}
1088 
1089 	if (bvdev->bdev_desc) {
1090 		spdk_bdev_close(bvdev->bdev_desc);
1091 		bvdev->bdev_desc = NULL;
1092 	}
1093 	bvdev->bdev = NULL;
1094 
1095 	spdk_dma_free(bvdev);
1096 	return 0;
1097 }
1098 
/*
 * Log flags used by the SPDK_DEBUGLOG() calls in this file:
 * "vhost_blk" for request processing, "vhost_blk_data" for rejected and
 * aborted requests.
 */
SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
1101