xref: /spdk/lib/vhost/vhost_blk.c (revision 9889ab2dc80e40dae92dcef361d53dcba722043d)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/conf.h"
40 #include "spdk/thread.h"
41 #include "spdk/likely.h"
42 #include "spdk/string.h"
43 #include "spdk/util.h"
44 #include "spdk/vhost.h"
45 
46 #include "vhost_internal.h"
47 
/* Per-request state; one task exists per virtqueue ring slot (see
 * alloc_task_pool()), indexed by the request's head descriptor index. */
struct spdk_vhost_blk_task {
	struct spdk_bdev_io *bdev_io;
	struct spdk_vhost_blk_session *bvsession;
	struct spdk_vhost_virtqueue *vq;

	/* Guest's 1-byte response buffer (last descriptor of the chain);
	 * NULL until the request has been parsed. */
	volatile uint8_t *status;

	/* Head descriptor index of this request in the virtqueue. */
	uint16_t req_idx;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written. */
	uint32_t used_len;
	/* Mapped guest buffers for this request, including the request
	 * header and the status byte. */
	uint16_t iovcnt;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
68 
/* A vhost-blk controller: generic vhost device plus its backing bdev. */
struct spdk_vhost_blk_dev {
	/* Embedded generic device; to_blk_dev() upcasts via SPDK_CONTAINEROF. */
	struct spdk_vhost_dev vdev;
	struct spdk_bdev *bdev;
	struct spdk_bdev_desc *bdev_desc;
	/* Reject guest writes and advertise VIRTIO_BLK_F_RO when set. */
	bool readonly;
};
75 
/* Per-connection (per-VM) state for a vhost-blk controller. */
struct spdk_vhost_blk_session {
	/* The parent session must be the very first field in this struct */
	struct spdk_vhost_session vsession;
	struct spdk_vhost_blk_dev *bvdev;
	/* Runs vdev_worker, or no_bdev_vdev_worker after bdev hot-removal. */
	struct spdk_poller *requestq_poller;
	struct spdk_io_channel *io_channel;
	/* Waits for in-flight I/O to drain while the session stops. */
	struct spdk_poller *stop_poller;
};
84 
85 /* forward declaration */
86 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
87 
88 static int
89 process_blk_request(struct spdk_vhost_blk_task *task,
90 		    struct spdk_vhost_blk_session *bvsession,
91 		    struct spdk_vhost_virtqueue *vq);
92 
93 static void
94 blk_task_finish(struct spdk_vhost_blk_task *task)
95 {
96 	assert(task->bvsession->vsession.task_cnt > 0);
97 	task->bvsession->vsession.task_cnt--;
98 	task->used = false;
99 }
100 
101 static void
102 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
103 {
104 	if (task->status) {
105 		*task->status = status;
106 	}
107 
108 	vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
109 				   task->used_len);
110 	blk_task_finish(task);
111 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
112 }
113 
114 /*
115  * Process task's descriptor chain and setup data related fields.
116  * Return
117  *   total size of suplied buffers
118  *
119  *   FIXME: Make this function return to rd_cnt and wr_cnt
120  */
121 static int
122 blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
123 	       uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
124 {
125 	struct spdk_vhost_session *vsession = &bvsession->vsession;
126 	struct spdk_vhost_dev *vdev = vsession->vdev;
127 	struct vring_desc *desc, *desc_table;
128 	uint16_t out_cnt = 0, cnt = 0;
129 	uint32_t desc_table_size, len = 0;
130 	uint32_t desc_handled_cnt;
131 	int rc;
132 
133 	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
134 	if (rc != 0) {
135 		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
136 		return -1;
137 	}
138 
139 	desc_handled_cnt = 0;
140 	while (1) {
141 		/*
142 		 * Maximum cnt reached?
143 		 * Should not happen if request is well formatted, otherwise this is a BUG.
144 		 */
145 		if (spdk_unlikely(cnt == *iovs_cnt)) {
146 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
147 				      vsession->name, req_idx);
148 			return -1;
149 		}
150 
151 		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
152 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
153 				      vsession->name, req_idx, cnt);
154 			return -1;
155 		}
156 
157 		len += desc->len;
158 
159 		out_cnt += vhost_vring_desc_is_wr(desc);
160 
161 		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
162 		if (rc != 0) {
163 			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
164 				    vsession->name, req_idx);
165 			return -1;
166 		} else if (desc == NULL) {
167 			break;
168 		}
169 
170 		desc_handled_cnt++;
171 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
172 			/* Break a cycle and report an error, if any. */
173 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
174 				    vsession->name, desc_table_size, desc_handled_cnt);
175 			return -1;
176 		}
177 	}
178 
179 	/*
180 	 * There must be least two descriptors.
181 	 * First contain request so it must be readable.
182 	 * Last descriptor contain buffer for response so it must be writable.
183 	 */
184 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
185 		return -1;
186 	}
187 
188 	*length = len;
189 	*iovs_cnt = cnt;
190 	return 0;
191 }
192 
193 static void
194 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
195 {
196 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
197 	vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
198 				   task->used_len);
199 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
200 		      task->req_idx, success ? "OK" : "FAIL");
201 	blk_task_finish(task);
202 }
203 
204 static void
205 blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
206 {
207 	struct spdk_vhost_blk_task *task = cb_arg;
208 
209 	spdk_bdev_free_io(bdev_io);
210 	blk_request_finish(success, task);
211 }
212 
213 static void
214 blk_request_resubmit(void *arg)
215 {
216 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
217 	int rc = 0;
218 
219 	rc = process_blk_request(task, task->bvsession, task->vq);
220 	if (rc == 0) {
221 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
222 	} else {
223 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
224 	}
225 }
226 
227 static inline void
228 blk_request_queue_io(struct spdk_vhost_blk_task *task)
229 {
230 	int rc;
231 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
232 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
233 
234 	task->bdev_io_wait.bdev = bdev;
235 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
236 	task->bdev_io_wait.cb_arg = task;
237 
238 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
239 	if (rc != 0) {
240 		SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
241 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
242 	}
243 }
244 
/*
 * Parse and submit one virtio-blk request.
 *
 * Maps the descriptor chain, validates the layout (header + payload +
 * 1-byte status), then dispatches by request type to the matching bdev
 * call. Returns 0 when the request was submitted (or completed inline for
 * GET_ID / queued on -ENOMEM); -1 when it was failed immediately via
 * invalid_blk_request().
 */
static int
process_blk_request(struct spdk_vhost_blk_task *task,
		    struct spdk_vhost_blk_session *bvsession,
		    struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
	const struct virtio_blk_outhdr *req;
	struct virtio_blk_discard_write_zeroes *desc;
	struct iovec *iov;
	uint32_t type;
	uint32_t payload_len;
	uint64_t flush_bytes;
	int rc;

	if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
		/* Could not map the descriptor chain - fail the request.
		 * task->status may still be NULL here, which
		 * invalid_blk_request() handles. */
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	/* The first descriptor must be exactly the virtio-blk header. */
	iov = &task->iovs[0];
	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
			      iov->iov_len, sizeof(*req), task->req_idx);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	req = iov->iov_base;

	/* The last descriptor must be the 1-byte status buffer. */
	iov = &task->iovs[task->iovcnt - 1];
	if (spdk_unlikely(iov->iov_len != 1)) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
			      iov->iov_len, 1, task->req_idx);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	/* Strip header and status; iovs[1..iovcnt] now cover only payload. */
	task->status = iov->iov_base;
	payload_len -= sizeof(*req) + sizeof(*task->status);
	task->iovcnt -= 2;

	type = req->type;
#ifdef VIRTIO_BLK_T_BARRIER
	/* Don't care about barrier for now (as QEMU's virtio-blk does). */
	type &= ~VIRTIO_BLK_T_BARRIER;
#endif

	switch (type) {
	case VIRTIO_BLK_T_IN:
	case VIRTIO_BLK_T_OUT:
		/* Payload must be a non-zero multiple of the 512-byte
		 * virtio sector size. */
		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
				    type ? "WRITE" : "READ", task->req_idx);
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}

		if (type == VIRTIO_BLK_T_IN) {
			/* Read: the guest gets payload + status byte back. */
			task->used_len = payload_len + sizeof(*task->status);
			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
					     &task->iovs[1], task->iovcnt, req->sector * 512,
					     payload_len, blk_request_complete_cb, task);
		} else if (!bvdev->readonly) {
			/* Write: only the status byte is written back. */
			task->used_len = sizeof(*task->status);
			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
					      &task->iovs[1], task->iovcnt, req->sector * 512,
					      payload_len, blk_request_complete_cb, task);
		} else {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
			rc = -1;
		}

		if (rc) {
			if (rc == -ENOMEM) {
				/* No free bdev_io - park and retry later. */
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_DISCARD:
		desc = task->iovs[1].iov_base;
		if (payload_len != sizeof(*desc)) {
			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
			return -1;
		}

		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
				     desc->sector * 512, desc->num_sectors * 512,
				     blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_WRITE_ZEROES:
		desc = task->iovs[1].iov_base;
		if (payload_len != sizeof(*desc)) {
			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
			return -1;
		}

		/* Write Zeroes combined with the Unmap flag (deallocate the
		 * range) is not supported by SPDK. */
		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}

		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
					    desc->sector * 512, desc->num_sectors * 512,
					    blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_FLUSH:
		/* Flush always covers the whole device. */
		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
		if (req->sector != 0) {
			SPDK_NOTICELOG("sector must be zero for flush command\n");
			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
			return -1;
		}
		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
				     0, flush_bytes,
				     blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
				blk_request_queue_io(task);
			} else {
				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_GET_ID:
		if (!task->iovcnt || !payload_len) {
			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
			return -1;
		}
		/* Completed inline: copy the (space-padded) product name. */
		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
				task->used_len, ' ');
		blk_request_finish(true, task);
		break;
	default:
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
		return -1;
	}

	return 0;
}
418 
/*
 * Drain up to 32 request heads from the avail ring of @vq and submit each.
 * Out-of-range or already-pending indexes are completed immediately with
 * used_len 0 instead of crashing on corrupted guest state.
 */
static void
process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_blk_task *task;
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	int rc;
	uint16_t reqs[32];
	uint16_t reqs_cnt, i;

	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
	if (!reqs_cnt) {
		return;
	}

	for (i = 0; i < reqs_cnt; i++) {
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
			      reqs[i]);

		/* Guard against a malicious/buggy guest handing us an index
		 * past the ring. */
		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
				    vsession->name, reqs[i], vq->vring.size);
			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
			continue;
		}

		/* One pre-initialized task per ring slot (alloc_task_pool()). */
		task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
		if (spdk_unlikely(task->used)) {
			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
				    vsession->name, reqs[i]);
			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
			continue;
		}

		vsession->task_cnt++;

		/* Reset the per-request fields before parsing. */
		task->used = true;
		task->iovcnt = SPDK_COUNTOF(task->iovs);
		task->status = NULL;
		task->used_len = 0;

		rc = process_blk_request(task, bvsession, vq);
		if (rc == 0) {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
				      reqs[i]);
		} else {
			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
		}
	}
}
468 
469 static int
470 vdev_worker(void *arg)
471 {
472 	struct spdk_vhost_blk_session *bvsession = arg;
473 	struct spdk_vhost_session *vsession = &bvsession->vsession;
474 
475 	uint16_t q_idx;
476 
477 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
478 		process_vq(bvsession, &vsession->virtqueue[q_idx]);
479 	}
480 
481 	vhost_session_used_signal(vsession);
482 
483 	return -1;
484 }
485 
/*
 * Virtqueue servicing used when no bdev is attached (e.g. after hot-remove):
 * pull one request at a time and complete it with VIRTIO_BLK_S_IOERR.
 */
static void
no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
	uint32_t length;
	uint16_t iovcnt, req_idx;

	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
		/* Nothing pending on this queue. */
		return;
	}

	iovcnt = SPDK_COUNTOF(iovs);
	if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
		/* Last iov of a valid chain is the guest's 1-byte status buffer. */
		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
	}

	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
}
506 
507 static int
508 no_bdev_vdev_worker(void *arg)
509 {
510 	struct spdk_vhost_blk_session *bvsession = arg;
511 	struct spdk_vhost_session *vsession = &bvsession->vsession;
512 	uint16_t q_idx;
513 
514 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
515 		no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
516 	}
517 
518 	vhost_session_used_signal(vsession);
519 
520 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
521 		spdk_put_io_channel(bvsession->io_channel);
522 		bvsession->io_channel = NULL;
523 	}
524 
525 	return -1;
526 }
527 
/* Downcast a generic session; safe because vsession is the first member of
 * struct spdk_vhost_blk_session. Asserts the device really is vhost-blk. */
static struct spdk_vhost_blk_session *
to_blk_session(struct spdk_vhost_session *vsession)
{
	assert(vsession->vdev->backend == &vhost_blk_device_backend);
	return (struct spdk_vhost_blk_session *)vsession;
}
534 
535 static struct spdk_vhost_blk_dev *
536 to_blk_dev(struct spdk_vhost_dev *vdev)
537 {
538 	if (vdev == NULL) {
539 		return NULL;
540 	}
541 
542 	if (vdev->backend != &vhost_blk_device_backend) {
543 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
544 		return NULL;
545 	}
546 
547 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
548 }
549 
550 struct spdk_bdev *
551 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
552 {
553 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
554 
555 	assert(bvdev != NULL);
556 	return bvdev->bdev;
557 }
558 
559 static void
560 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
561 {
562 
563 	/* All sessions have been notified, time to close the bdev */
564 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
565 
566 	assert(bvdev != NULL);
567 	spdk_bdev_close(bvdev->bdev_desc);
568 	bvdev->bdev_desc = NULL;
569 	bvdev->bdev = NULL;
570 }
571 
572 static int
573 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
574 			     struct spdk_vhost_session *vsession,
575 			     void *ctx)
576 {
577 	struct spdk_vhost_blk_session *bvsession;
578 
579 	bvsession = (struct spdk_vhost_blk_session *)vsession;
580 	if (bvsession->requestq_poller) {
581 		spdk_poller_unregister(&bvsession->requestq_poller);
582 		bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
583 	}
584 
585 	return 0;
586 }
587 
/*
 * Hot-remove callback registered with spdk_bdev_open(). Retargets every
 * session's poller to no_bdev_vdev_worker, then closes the bdev descriptor
 * in the completion callback, all under the vhost lock.
 */
static void
bdev_remove_cb(void *remove_ctx)
{
	struct spdk_vhost_blk_dev *bvdev = remove_ctx;

	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
		     bvdev->vdev.name);

	spdk_vhost_lock();
	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
				  vhost_dev_bdev_remove_cpl_cb, NULL);
	spdk_vhost_unlock();
}
601 
602 static void
603 free_task_pool(struct spdk_vhost_blk_session *bvsession)
604 {
605 	struct spdk_vhost_session *vsession = &bvsession->vsession;
606 	struct spdk_vhost_virtqueue *vq;
607 	uint16_t i;
608 
609 	for (i = 0; i < vsession->max_queues; i++) {
610 		vq = &vsession->virtqueue[i];
611 		if (vq->tasks == NULL) {
612 			continue;
613 		}
614 
615 		spdk_free(vq->tasks);
616 		vq->tasks = NULL;
617 	}
618 }
619 
620 static int
621 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
622 {
623 	struct spdk_vhost_session *vsession = &bvsession->vsession;
624 	struct spdk_vhost_virtqueue *vq;
625 	struct spdk_vhost_blk_task *task;
626 	uint32_t task_cnt;
627 	uint16_t i;
628 	uint32_t j;
629 
630 	for (i = 0; i < vsession->max_queues; i++) {
631 		vq = &vsession->virtqueue[i];
632 		if (vq->vring.desc == NULL) {
633 			continue;
634 		}
635 
636 		task_cnt = vq->vring.size;
637 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
638 			/* sanity check */
639 			SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
640 				    vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
641 			free_task_pool(bvsession);
642 			return -1;
643 		}
644 		vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
645 					 SPDK_CACHE_LINE_SIZE, NULL,
646 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
647 		if (vq->tasks == NULL) {
648 			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
649 				    vsession->name, task_cnt, i);
650 			free_task_pool(bvsession);
651 			return -1;
652 		}
653 
654 		for (j = 0; j < task_cnt; j++) {
655 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
656 			task->bvsession = bvsession;
657 			task->req_idx = j;
658 			task->vq = vq;
659 		}
660 	}
661 
662 	return 0;
663 }
664 
/*
 * Session start handler, executed on the session's poll group thread.
 * Validates the virtqueues, allocates the task pool, acquires an I/O
 * channel (when a bdev is attached) and registers the request poller.
 * Always reports completion via vhost_session_start_done().
 */
static int
vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
		   struct spdk_vhost_session *vsession, void *unused)
{
	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
	struct spdk_vhost_blk_dev *bvdev;
	int i, rc = 0;

	bvdev = to_blk_dev(vdev);
	assert(bvdev != NULL);
	bvsession->bvdev = bvdev;

	/* validate all I/O queues are in a contiguous index range */
	for (i = 0; i < vsession->max_queues; i++) {
		if (vsession->virtqueue[i].vring.desc == NULL) {
			/* NOTE(review): i is int but printed with PRIu32 - same
			 * width on supported platforms, but %d would be exact. */
			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
			rc = -1;
			goto out;
		}
	}

	rc = alloc_task_pool(bvsession);
	if (rc != 0) {
		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
		goto out;
	}

	/* With no bdev (hot-removed before start) skip the channel and use
	 * the poller that fails all I/O. */
	if (bvdev->bdev) {
		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
		if (!bvsession->io_channel) {
			free_task_pool(bvsession);
			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
			rc = -1;
			goto out;
		}
	}

	bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
				     bvsession, 0);
	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
		     vsession->name, spdk_env_get_current_core());
out:
	vhost_session_start_done(vsession, rc);
	return rc;
}
710 
/*
 * Backend start_session hook: pick a poll group matching the device's
 * cpumask and run vhost_blk_start_cb there (timeout arg 3 - presumably
 * seconds; see vhost_session_send_event).
 */
static int
vhost_blk_start(struct spdk_vhost_session *vsession)
{
	struct vhost_poll_group *pg;

	pg = vhost_get_poll_group(vsession->vdev->cpumask);
	return vhost_session_send_event(pg, vsession, vhost_blk_start_cb,
					3, "start session");
}
720 
/*
 * 1ms poller armed by vhost_blk_stop_cb. Keeps running until all in-flight
 * tasks have completed and the vhost lock can be taken, then flushes the
 * used rings, releases the session's resources and signals stop completion.
 */
static int
destroy_session_poller_cb(void *arg)
{
	struct spdk_vhost_blk_session *bvsession = arg;
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	int i;

	if (vsession->task_cnt > 0) {
		/* Still draining - try again on the next poll. */
		return -1;
	}

	if (spdk_vhost_trylock() != 0) {
		/* Avoid deadlock with threads holding the vhost lock. */
		return -1;
	}

	/* Force an immediate used-ring signal for every queue. */
	for (i = 0; i < vsession->max_queues; i++) {
		vsession->virtqueue[i].next_event_time = 0;
		vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
		     vsession->name, spdk_env_get_current_core());

	if (bvsession->io_channel) {
		spdk_put_io_channel(bvsession->io_channel);
		bvsession->io_channel = NULL;
	}

	free_task_pool(bvsession);
	spdk_poller_unregister(&bvsession->stop_poller);
	vhost_session_stop_done(vsession, 0);

	spdk_vhost_unlock();
	return -1;
}
756 
757 static int
758 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
759 		  struct spdk_vhost_session *vsession, void *unused)
760 {
761 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
762 
763 	spdk_poller_unregister(&bvsession->requestq_poller);
764 	bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb,
765 				 bvsession, 1000);
766 	return 0;
767 }
768 
/*
 * Backend stop_session hook: run vhost_blk_stop_cb on the session's poll
 * group (timeout arg 3 - presumably seconds; see vhost_session_send_event).
 */
static int
vhost_blk_stop(struct spdk_vhost_session *vsession)
{
	return vhost_session_send_event(vsession->poll_group, vsession,
					vhost_blk_stop_cb, 3, "stop session");
}
775 
776 static void
777 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
778 {
779 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
780 	struct spdk_vhost_blk_dev *bvdev;
781 
782 	bvdev = to_blk_dev(vdev);
783 	assert(bvdev != NULL);
784 	spdk_json_write_named_object_begin(w, "block");
785 
786 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
787 
788 	spdk_json_write_name(w, "bdev");
789 	if (bdev) {
790 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
791 	} else {
792 		spdk_json_write_null(w);
793 	}
794 
795 	spdk_json_write_object_end(w);
796 }
797 
798 static void
799 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
800 {
801 	struct spdk_vhost_blk_dev *bvdev;
802 
803 	bvdev = to_blk_dev(vdev);
804 	assert(bvdev != NULL);
805 	if (!bvdev->bdev) {
806 		return;
807 	}
808 
809 	spdk_json_write_object_begin(w);
810 	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
811 
812 	spdk_json_write_named_object_begin(w, "params");
813 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
814 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
815 	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
816 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
817 	spdk_json_write_object_end(w);
818 
819 	spdk_json_write_object_end(w);
820 }
821 
822 static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
823 
824 static int
825 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
826 		     uint32_t len)
827 {
828 	struct virtio_blk_config blkcfg;
829 	struct spdk_vhost_blk_dev *bvdev;
830 	struct spdk_bdev *bdev;
831 	uint32_t blk_size;
832 	uint64_t blkcnt;
833 
834 	bvdev = to_blk_dev(vdev);
835 	assert(bvdev != NULL);
836 	bdev = bvdev->bdev;
837 	if (bdev == NULL) {
838 		/* We can't just return -1 here as this GET_CONFIG message might
839 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
840 		 * error to QEMU, who might then decide to terminate itself.
841 		 * We don't want that. A simple reboot shouldn't break the system.
842 		 *
843 		 * Presenting a block device with block size 0 and block count 0
844 		 * doesn't cause any problems on QEMU side and the virtio-pci
845 		 * device is even still available inside the VM, but there will
846 		 * be no block device created for it - the kernel drivers will
847 		 * silently reject it.
848 		 */
849 		blk_size = 0;
850 		blkcnt = 0;
851 	} else {
852 		blk_size = spdk_bdev_get_block_size(bdev);
853 		blkcnt = spdk_bdev_get_num_blocks(bdev);
854 		if (spdk_bdev_get_buf_align(bdev) > 1) {
855 			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
856 			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
857 		} else {
858 			blkcfg.size_max = 131072;
859 			/*  -2 for REQ and RESP and -1 for region boundary splitting */
860 			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
861 		}
862 	}
863 
864 	memset(&blkcfg, 0, sizeof(blkcfg));
865 	blkcfg.blk_size = blk_size;
866 	/* minimum I/O size in blocks */
867 	blkcfg.min_io_size = 1;
868 	/* expressed in 512 Bytes sectors */
869 	blkcfg.capacity = (blkcnt * blk_size) / 512;
870 	/* QEMU can overwrite this value when started */
871 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
872 
873 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
874 		/* 16MiB, expressed in 512 Bytes */
875 		blkcfg.max_discard_sectors = 32768;
876 		blkcfg.max_discard_seg = 1;
877 		blkcfg.discard_sector_alignment = blk_size / 512;
878 	}
879 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
880 		blkcfg.max_write_zeroes_sectors = 32768;
881 		blkcfg.max_write_zeroes_seg = 1;
882 	}
883 
884 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
885 
886 	return 0;
887 }
888 
/* Backend callbacks and virtio feature sets for vhost-blk controllers.
 * Features listed in disabled_features are advertised only when explicitly
 * re-enabled (see the per-bdev enabling in spdk_vhost_blk_construct()). */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
	.virtio_features = SPDK_VHOST_FEATURES |
	(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) |
	(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) |
	(1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
	(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI) |
	(1ULL << VIRTIO_BLK_F_FLUSH)    | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_MQ)       | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	.disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
	(1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	/* Extra bytes the session framework allocates for our session type. */
	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
	.start_session =  vhost_blk_start,
	.stop_session = vhost_blk_stop,
	.vhost_get_config = vhost_blk_get_config,
	.dump_info_json = vhost_blk_dump_info_json,
	.write_config_json = vhost_blk_write_config_json,
	.remove_device = vhost_blk_destroy,
};
910 
911 int
912 vhost_blk_controller_construct(void)
913 {
914 	struct spdk_conf_section *sp;
915 	unsigned ctrlr_num;
916 	char *bdev_name;
917 	char *cpumask;
918 	char *name;
919 	bool readonly;
920 
921 	for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
922 		if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
923 			continue;
924 		}
925 
926 		if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
927 			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
928 				    spdk_conf_section_get_name(sp));
929 			return -1;
930 		}
931 
932 		name = spdk_conf_section_get_val(sp, "Name");
933 		if (name == NULL) {
934 			SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
935 			return -1;
936 		}
937 
938 		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
939 		readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
940 
941 		bdev_name = spdk_conf_section_get_val(sp, "Dev");
942 		if (bdev_name == NULL) {
943 			continue;
944 		}
945 
946 		if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) {
947 			return -1;
948 		}
949 	}
950 
951 	return 0;
952 }
953 
/*
 * Create a vhost-blk controller named @name backed by bdev @dev_name.
 * Takes the vhost lock for the whole operation. Returns 0 on success or a
 * negative errno; on any failure the partially constructed device is torn
 * down (bdev descriptor closed, bvdev freed).
 */
int
spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
{
	struct spdk_vhost_blk_dev *bvdev = NULL;
	struct spdk_bdev *bdev;
	uint64_t features = 0;
	int ret = 0;

	spdk_vhost_lock();
	bdev = spdk_bdev_get_by_name(dev_name);
	if (bdev == NULL) {
		SPDK_ERRLOG("%s: bdev '%s' not found\n",
			    name, dev_name);
		ret = -ENODEV;
		goto out;
	}

	bvdev = calloc(1, sizeof(*bvdev));
	if (bvdev == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	/* Open writable; bdev_remove_cb handles hot-removal later. */
	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
	if (ret != 0) {
		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
			    name, dev_name, ret);
		goto out;
	}

	bvdev->bdev = bdev;
	bvdev->readonly = readonly;
	ret = vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend);
	if (ret != 0) {
		spdk_bdev_close(bvdev->bdev_desc);
		goto out;
	}

	/* Advertise optional features only when the bdev supports them. */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
		features |= (1ULL << VIRTIO_BLK_F_DISCARD);
	}
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
	}
	if (readonly) {
		features |= (1ULL << VIRTIO_BLK_F_RO);
	}
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
		features |= (1ULL << VIRTIO_BLK_F_FLUSH);
	}

	if (features && rte_vhost_driver_enable_features(bvdev->vdev.path, features)) {
		SPDK_ERRLOG("%s: failed to enable features 0x%"PRIx64"\n", name, features);

		if (vhost_dev_unregister(&bvdev->vdev) != 0) {
			SPDK_ERRLOG("%s: failed to remove device\n", name);
		}

		spdk_bdev_close(bvdev->bdev_desc);
		ret = -1;
		goto out;
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name);
out:
	/* On any error path bvdev (if allocated) is ours to free. */
	if (ret != 0 && bvdev) {
		free(bvdev);
	}
	spdk_vhost_unlock();
	return ret;
}
1025 
1026 static int
1027 vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1028 {
1029 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1030 	int rc;
1031 
1032 	assert(bvdev != NULL);
1033 	rc = vhost_dev_unregister(&bvdev->vdev);
1034 	if (rc != 0) {
1035 		return rc;
1036 	}
1037 
1038 	if (bvdev->bdev_desc) {
1039 		spdk_bdev_close(bvdev->bdev_desc);
1040 		bvdev->bdev_desc = NULL;
1041 	}
1042 	bvdev->bdev = NULL;
1043 
1044 	free(bvdev);
1045 	return 0;
1046 }
1047 
1048 SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
1049 SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
1050