xref: /spdk/lib/vhost/vhost_blk.c (revision c4d9daeb7bf491bc0eb6e8d417b75d44773cb009)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/conf.h"
39 #include "spdk/thread.h"
40 #include "spdk/likely.h"
41 #include "spdk/string.h"
42 #include "spdk/util.h"
43 #include "spdk/vhost.h"
44 
45 #include "vhost_internal.h"
46 
47 struct spdk_vhost_blk_task {
48 	struct spdk_bdev_io *bdev_io;
49 	struct spdk_vhost_blk_session *bvsession;
50 	struct spdk_vhost_virtqueue *vq;
51 
52 	volatile uint8_t *status;
53 
54 	uint16_t req_idx;
55 
56 	/* for io wait */
57 	struct spdk_bdev_io_wait_entry bdev_io_wait;
58 
59 	/* If set, the task is currently used for I/O processing. */
60 	bool used;
61 
62 	/** Number of bytes that were written. */
63 	uint32_t used_len;
64 	uint16_t iovcnt;
65 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
66 };
67 
68 struct spdk_vhost_blk_dev {
69 	struct spdk_vhost_dev vdev;
70 	struct spdk_bdev *bdev;
71 	struct spdk_bdev_desc *bdev_desc;
72 	bool readonly;
73 };
74 
75 struct spdk_vhost_blk_session {
76 	/* The parent session must be the very first field in this struct */
77 	struct spdk_vhost_session vsession;
78 	struct spdk_vhost_blk_dev *bvdev;
79 	struct spdk_poller *requestq_poller;
80 	struct spdk_io_channel *io_channel;
81 	struct spdk_poller *stop_poller;
82 };
83 
84 /* forward declaration */
85 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
86 
87 static int
88 process_blk_request(struct spdk_vhost_blk_task *task,
89 		    struct spdk_vhost_blk_session *bvsession,
90 		    struct spdk_vhost_virtqueue *vq);
91 
92 static void
93 blk_task_finish(struct spdk_vhost_blk_task *task)
94 {
95 	assert(task->bvsession->vsession.task_cnt > 0);
96 	task->bvsession->vsession.task_cnt--;
97 	task->used = false;
98 }
99 
100 static void
101 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
102 {
103 	if (task->status) {
104 		*task->status = status;
105 	}
106 
107 	spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
108 					task->used_len);
109 	blk_task_finish(task);
110 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
111 }
112 
113 /*
114  * Process task's descriptor chain and setup data related fields.
115  * Return
116  *   total size of suplied buffers
117  *
118  *   FIXME: Make this function return to rd_cnt and wr_cnt
119  */
120 static int
121 blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
122 	       uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
123 {
124 	struct spdk_vhost_session *vsession = &bvsession->vsession;
125 	struct spdk_vhost_dev *vdev = vsession->vdev;
126 	struct vring_desc *desc, *desc_table;
127 	uint16_t out_cnt = 0, cnt = 0;
128 	uint32_t desc_table_size, len = 0;
129 	uint32_t desc_handled_cnt;
130 	int rc;
131 
132 	rc = spdk_vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
133 	if (rc != 0) {
134 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
135 		return -1;
136 	}
137 
138 	desc_handled_cnt = 0;
139 	while (1) {
140 		/*
141 		 * Maximum cnt reached?
142 		 * Should not happen if request is well formatted, otherwise this is a BUG.
143 		 */
144 		if (spdk_unlikely(cnt == *iovs_cnt)) {
145 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Max IOVs in request reached (req_idx = %"PRIu16").\n",
146 				      req_idx);
147 			return -1;
148 		}
149 
150 		if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
151 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
152 				      req_idx, cnt);
153 			return -1;
154 		}
155 
156 		len += desc->len;
157 
158 		out_cnt += spdk_vhost_vring_desc_is_wr(desc);
159 
160 		rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
161 		if (rc != 0) {
162 			SPDK_ERRLOG("%s: Descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
163 				    vdev->name, req_idx);
164 			return -1;
165 		} else if (desc == NULL) {
166 			break;
167 		}
168 
169 		desc_handled_cnt++;
170 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
171 			/* Break a cycle and report an error, if any. */
172 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
173 				    vdev->name, desc_table_size, desc_handled_cnt);
174 			return -1;
175 		}
176 	}
177 
178 	/*
179 	 * There must be least two descriptors.
180 	 * First contain request so it must be readable.
181 	 * Last descriptor contain buffer for response so it must be writable.
182 	 */
183 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
184 		return -1;
185 	}
186 
187 	*length = len;
188 	*iovs_cnt = cnt;
189 	return 0;
190 }
191 
192 static void
193 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
194 {
195 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
196 	spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
197 					task->used_len);
198 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
199 		      task->req_idx, success ? "OK" : "FAIL");
200 	blk_task_finish(task);
201 }
202 
/*
 * bdev completion callback: frees the bdev I/O and then finishes the task
 * passed as cb_arg with the reported result.
 */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
	blk_request_finish(success, (struct spdk_vhost_blk_task *)cb_arg);
}
211 
212 static void
213 blk_request_resubmit(void *arg)
214 {
215 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
216 	int rc = 0;
217 
218 	rc = process_blk_request(task, task->bvsession, task->vq);
219 	if (rc == 0) {
220 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
221 	} else {
222 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
223 	}
224 }
225 
226 static inline void
227 blk_request_queue_io(struct spdk_vhost_blk_task *task)
228 {
229 	int rc;
230 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
231 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
232 
233 	task->bdev_io_wait.bdev = bdev;
234 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
235 	task->bdev_io_wait.cb_arg = task;
236 
237 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
238 	if (rc != 0) {
239 		SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc);
240 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
241 	}
242 }
243 
244 static int
245 process_blk_request(struct spdk_vhost_blk_task *task,
246 		    struct spdk_vhost_blk_session *bvsession,
247 		    struct spdk_vhost_virtqueue *vq)
248 {
249 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
250 	const struct virtio_blk_outhdr *req;
251 	struct virtio_blk_discard_write_zeroes *desc;
252 	struct iovec *iov;
253 	uint32_t type;
254 	uint32_t payload_len;
255 	uint64_t flush_bytes;
256 	int rc;
257 
258 	if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
259 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
260 		/* Only READ and WRITE are supported for now. */
261 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
262 		return -1;
263 	}
264 
265 	iov = &task->iovs[0];
266 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
267 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
268 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
269 			      iov->iov_len, sizeof(*req), task->req_idx);
270 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
271 		return -1;
272 	}
273 
274 	req = iov->iov_base;
275 
276 	iov = &task->iovs[task->iovcnt - 1];
277 	if (spdk_unlikely(iov->iov_len != 1)) {
278 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
279 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
280 			      iov->iov_len, 1, task->req_idx);
281 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
282 		return -1;
283 	}
284 
285 	task->status = iov->iov_base;
286 	payload_len -= sizeof(*req) + sizeof(*task->status);
287 	task->iovcnt -= 2;
288 
289 	type = req->type;
290 #ifdef VIRTIO_BLK_T_BARRIER
291 	/* Don't care about barier for now (as QEMU's virtio-blk do). */
292 	type &= ~VIRTIO_BLK_T_BARRIER;
293 #endif
294 
295 	switch (type) {
296 	case VIRTIO_BLK_T_IN:
297 	case VIRTIO_BLK_T_OUT:
298 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
299 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
300 				    type ? "WRITE" : "READ", task->req_idx);
301 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
302 			return -1;
303 		}
304 
305 		if (type == VIRTIO_BLK_T_IN) {
306 			task->used_len = payload_len + sizeof(*task->status);
307 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
308 					     &task->iovs[1], task->iovcnt, req->sector * 512,
309 					     payload_len, blk_request_complete_cb, task);
310 		} else if (!bvdev->readonly) {
311 			task->used_len = sizeof(*task->status);
312 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
313 					      &task->iovs[1], task->iovcnt, req->sector * 512,
314 					      payload_len, blk_request_complete_cb, task);
315 		} else {
316 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
317 			rc = -1;
318 		}
319 
320 		if (rc) {
321 			if (rc == -ENOMEM) {
322 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
323 				blk_request_queue_io(task);
324 			} else {
325 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
326 				return -1;
327 			}
328 		}
329 		break;
330 	case VIRTIO_BLK_T_DISCARD:
331 		desc = task->iovs[1].iov_base;
332 		if (payload_len != sizeof(*desc)) {
333 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
334 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
335 			return -1;
336 		}
337 
338 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
339 				     desc->sector * 512, desc->num_sectors * 512,
340 				     blk_request_complete_cb, task);
341 		if (rc) {
342 			if (rc == -ENOMEM) {
343 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
344 				blk_request_queue_io(task);
345 			} else {
346 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
347 				return -1;
348 			}
349 		}
350 		break;
351 	case VIRTIO_BLK_T_WRITE_ZEROES:
352 		desc = task->iovs[1].iov_base;
353 		if (payload_len != sizeof(*desc)) {
354 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
355 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
356 			return -1;
357 		}
358 
359 		/* Zeroed and Unmap the range, SPDK doen't support it. */
360 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
361 			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
362 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
363 			return -1;
364 		}
365 
366 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
367 					    desc->sector * 512, desc->num_sectors * 512,
368 					    blk_request_complete_cb, task);
369 		if (rc) {
370 			if (rc == -ENOMEM) {
371 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
372 				blk_request_queue_io(task);
373 			} else {
374 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
375 				return -1;
376 			}
377 		}
378 		break;
379 	case VIRTIO_BLK_T_FLUSH:
380 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
381 		if (req->sector != 0) {
382 			SPDK_NOTICELOG("sector must be zero for flush command\n");
383 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
384 			return -1;
385 		}
386 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
387 				     0, flush_bytes,
388 				     blk_request_complete_cb, task);
389 		if (rc) {
390 			if (rc == -ENOMEM) {
391 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
392 				blk_request_queue_io(task);
393 			} else {
394 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
395 				return -1;
396 			}
397 		}
398 		break;
399 	case VIRTIO_BLK_T_GET_ID:
400 		if (!task->iovcnt || !payload_len) {
401 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
402 			return -1;
403 		}
404 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
405 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
406 				task->used_len, ' ');
407 		blk_request_finish(true, task);
408 		break;
409 	default:
410 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
411 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
412 		return -1;
413 	}
414 
415 	return 0;
416 }
417 
418 static void
419 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
420 {
421 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
422 	struct spdk_vhost_blk_task *task;
423 	struct spdk_vhost_session *vsession = &bvsession->vsession;
424 	int rc;
425 	uint16_t reqs[32];
426 	uint16_t reqs_cnt, i;
427 
428 	reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
429 	if (!reqs_cnt) {
430 		return;
431 	}
432 
433 	for (i = 0; i < reqs_cnt; i++) {
434 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
435 			      reqs[i]);
436 
437 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
438 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
439 				    bvdev->vdev.name, reqs[i], vq->vring.size);
440 			spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
441 			continue;
442 		}
443 
444 		task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
445 		if (spdk_unlikely(task->used)) {
446 			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
447 				    bvdev->vdev.name, reqs[i]);
448 			spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
449 			continue;
450 		}
451 
452 		vsession->task_cnt++;
453 
454 		task->used = true;
455 		task->iovcnt = SPDK_COUNTOF(task->iovs);
456 		task->status = NULL;
457 		task->used_len = 0;
458 
459 		rc = process_blk_request(task, bvsession, vq);
460 		if (rc == 0) {
461 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
462 				      reqs[i]);
463 		} else {
464 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
465 		}
466 	}
467 }
468 
469 static int
470 vdev_worker(void *arg)
471 {
472 	struct spdk_vhost_blk_session *bvsession = arg;
473 	struct spdk_vhost_session *vsession = &bvsession->vsession;
474 
475 	uint16_t q_idx;
476 
477 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
478 		process_vq(bvsession, &vsession->virtqueue[q_idx]);
479 	}
480 
481 	spdk_vhost_session_used_signal(vsession);
482 
483 	return -1;
484 }
485 
486 static void
487 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
488 {
489 	struct spdk_vhost_session *vsession = &bvsession->vsession;
490 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
491 	uint32_t length;
492 	uint16_t iovcnt, req_idx;
493 
494 	if (spdk_vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
495 		return;
496 	}
497 
498 	iovcnt = SPDK_COUNTOF(iovs);
499 	if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
500 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
501 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
502 	}
503 
504 	spdk_vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
505 }
506 
507 static int
508 no_bdev_vdev_worker(void *arg)
509 {
510 	struct spdk_vhost_blk_session *bvsession = arg;
511 	struct spdk_vhost_session *vsession = &bvsession->vsession;
512 	uint16_t q_idx;
513 
514 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
515 		no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
516 	}
517 
518 	spdk_vhost_session_used_signal(vsession);
519 
520 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
521 		spdk_put_io_channel(bvsession->io_channel);
522 		bvsession->io_channel = NULL;
523 	}
524 
525 	return -1;
526 }
527 
528 static struct spdk_vhost_blk_session *
529 to_blk_session(struct spdk_vhost_session *vsession)
530 {
531 	if (vsession == NULL) {
532 		return NULL;
533 	}
534 
535 	if (vsession->vdev->backend != &vhost_blk_device_backend) {
536 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vsession->vdev->name);
537 		return NULL;
538 	}
539 
540 	return (struct spdk_vhost_blk_session *)vsession;
541 }
542 
543 static struct spdk_vhost_blk_dev *
544 to_blk_dev(struct spdk_vhost_dev *vdev)
545 {
546 	if (vdev == NULL) {
547 		return NULL;
548 	}
549 
550 	if (vdev->backend != &vhost_blk_device_backend) {
551 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
552 		return NULL;
553 	}
554 
555 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
556 }
557 
558 struct spdk_bdev *
559 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
560 {
561 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
562 
563 	assert(bvdev != NULL);
564 	return bvdev->bdev;
565 }
566 
567 static int
568 _spdk_vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, struct spdk_vhost_session *vsession,
569 				   void *ctx)
570 {
571 	struct spdk_vhost_blk_session *bvsession;
572 
573 	if (vdev == NULL) {
574 		/* Nothing to do */
575 		return 0;
576 	}
577 
578 	if (vsession == NULL) {
579 		/* All sessions have been notified, time to close the bdev */
580 		struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
581 
582 		assert(bvdev != NULL);
583 
584 		spdk_bdev_close(bvdev->bdev_desc);
585 		bvdev->bdev_desc = NULL;
586 		bvdev->bdev = NULL;
587 		return 0;
588 	}
589 
590 	bvsession = (struct spdk_vhost_blk_session *)vsession;
591 	if (bvsession->requestq_poller) {
592 		spdk_poller_unregister(&bvsession->requestq_poller);
593 		bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
594 	}
595 
596 	return 0;
597 }
598 
599 static void
600 bdev_remove_cb(void *remove_ctx)
601 {
602 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
603 
604 	SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
605 		     bvdev->vdev.name);
606 
607 	spdk_vhost_lock();
608 	spdk_vhost_dev_foreach_session(&bvdev->vdev, _spdk_vhost_session_bdev_remove_cb, NULL);
609 	spdk_vhost_unlock();
610 }
611 
612 static void
613 free_task_pool(struct spdk_vhost_blk_session *bvsession)
614 {
615 	struct spdk_vhost_session *vsession = &bvsession->vsession;
616 	struct spdk_vhost_virtqueue *vq;
617 	uint16_t i;
618 
619 	for (i = 0; i < vsession->max_queues; i++) {
620 		vq = &vsession->virtqueue[i];
621 		if (vq->tasks == NULL) {
622 			continue;
623 		}
624 
625 		spdk_free(vq->tasks);
626 		vq->tasks = NULL;
627 	}
628 }
629 
630 static int
631 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
632 {
633 	struct spdk_vhost_session *vsession = &bvsession->vsession;
634 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
635 	struct spdk_vhost_virtqueue *vq;
636 	struct spdk_vhost_blk_task *task;
637 	uint32_t task_cnt;
638 	uint16_t i;
639 	uint32_t j;
640 
641 	for (i = 0; i < vsession->max_queues; i++) {
642 		vq = &vsession->virtqueue[i];
643 		if (vq->vring.desc == NULL) {
644 			continue;
645 		}
646 
647 		task_cnt = vq->vring.size;
648 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
649 			/* sanity check */
650 			SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
651 				    bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
652 			free_task_pool(bvsession);
653 			return -1;
654 		}
655 		vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
656 					 SPDK_CACHE_LINE_SIZE, NULL,
657 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
658 		if (vq->tasks == NULL) {
659 			SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
660 				    bvdev->vdev.name, task_cnt, i);
661 			free_task_pool(bvsession);
662 			return -1;
663 		}
664 
665 		for (j = 0; j < task_cnt; j++) {
666 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
667 			task->bvsession = bvsession;
668 			task->req_idx = j;
669 			task->vq = vq;
670 		}
671 	}
672 
673 	return 0;
674 }
675 
676 static int
677 spdk_vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
678 			struct spdk_vhost_session *vsession, void *unused)
679 {
680 	struct spdk_vhost_blk_dev *bvdev;
681 	struct spdk_vhost_blk_session *bvsession;
682 	int i, rc = 0;
683 
684 	bvsession = to_blk_session(vsession);
685 	if (bvsession == NULL) {
686 		SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n");
687 		rc = -1;
688 		goto out;
689 	}
690 
691 	bvdev = to_blk_dev(vdev);
692 	assert(bvdev != NULL);
693 	bvsession->bvdev = bvdev;
694 
695 	/* validate all I/O queues are in a contiguous index range */
696 	for (i = 0; i < vsession->max_queues; i++) {
697 		if (vsession->virtqueue[i].vring.desc == NULL) {
698 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i);
699 			rc = -1;
700 			goto out;
701 		}
702 	}
703 
704 	rc = alloc_task_pool(bvsession);
705 	if (rc != 0) {
706 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name);
707 		goto out;
708 	}
709 
710 	if (bvdev->bdev) {
711 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
712 		if (!bvsession->io_channel) {
713 			free_task_pool(bvsession);
714 			SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name);
715 			rc = -1;
716 			goto out;
717 		}
718 	}
719 
720 	bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
721 				     bvsession, 0);
722 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
723 		     vdev->name, spdk_env_get_current_core());
724 out:
725 	spdk_vhost_session_start_done(vsession, rc);
726 	return rc;
727 }
728 
729 static int
730 spdk_vhost_blk_start(struct spdk_vhost_session *vsession)
731 {
732 	int32_t lcore;
733 	int rc;
734 
735 	lcore = spdk_vhost_allocate_reactor(vsession->vdev->cpumask);
736 	rc = spdk_vhost_session_send_event(lcore, vsession, spdk_vhost_blk_start_cb,
737 					   3, "start session");
738 
739 	if (rc != 0) {
740 		spdk_vhost_free_reactor(lcore);
741 	}
742 
743 	return rc;
744 }
745 
746 static int
747 destroy_session_poller_cb(void *arg)
748 {
749 	struct spdk_vhost_blk_session *bvsession = arg;
750 	struct spdk_vhost_session *vsession = &bvsession->vsession;
751 	int i;
752 
753 	if (vsession->task_cnt > 0) {
754 		return -1;
755 	}
756 
757 	if (spdk_vhost_trylock() != 0) {
758 		return -1;
759 	}
760 
761 	for (i = 0; i < vsession->max_queues; i++) {
762 		vsession->virtqueue[i].next_event_time = 0;
763 		spdk_vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
764 	}
765 
766 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", vsession->vdev->name);
767 
768 	if (bvsession->io_channel) {
769 		spdk_put_io_channel(bvsession->io_channel);
770 		bvsession->io_channel = NULL;
771 	}
772 
773 	free_task_pool(bvsession);
774 	spdk_poller_unregister(&bvsession->stop_poller);
775 	spdk_vhost_session_stop_done(vsession, 0);
776 
777 	spdk_vhost_unlock();
778 	return -1;
779 }
780 
781 static int
782 spdk_vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
783 		       struct spdk_vhost_session *vsession, void *unused)
784 {
785 	struct spdk_vhost_blk_session *bvsession;
786 
787 	bvsession = to_blk_session(vsession);
788 	if (bvsession == NULL) {
789 		SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n");
790 		goto err;
791 	}
792 
793 	spdk_poller_unregister(&bvsession->requestq_poller);
794 	bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb,
795 				 bvsession, 1000);
796 	return 0;
797 
798 err:
799 	spdk_vhost_session_stop_done(vsession, -1);
800 	return -1;
801 }
802 
803 static int
804 spdk_vhost_blk_stop(struct spdk_vhost_session *vsession)
805 {
806 	return spdk_vhost_session_send_event(vsession->lcore, vsession,
807 					     spdk_vhost_blk_stop_cb, 3, "stop session");
808 }
809 
810 static void
811 spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
812 {
813 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
814 	struct spdk_vhost_blk_dev *bvdev;
815 
816 	bvdev = to_blk_dev(vdev);
817 	if (bvdev == NULL) {
818 		return;
819 	}
820 
821 	assert(bvdev != NULL);
822 	spdk_json_write_named_object_begin(w, "block");
823 
824 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
825 
826 	spdk_json_write_name(w, "bdev");
827 	if (bdev) {
828 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
829 	} else {
830 		spdk_json_write_null(w);
831 	}
832 
833 	spdk_json_write_object_end(w);
834 }
835 
836 static void
837 spdk_vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
838 {
839 	struct spdk_vhost_blk_dev *bvdev;
840 
841 	bvdev = to_blk_dev(vdev);
842 	if (bvdev == NULL) {
843 		return;
844 	}
845 
846 	if (!bvdev->bdev) {
847 		return;
848 	}
849 
850 	spdk_json_write_object_begin(w);
851 	spdk_json_write_named_string(w, "method", "construct_vhost_blk_controller");
852 
853 	spdk_json_write_named_object_begin(w, "params");
854 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
855 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
856 	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
857 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
858 	spdk_json_write_object_end(w);
859 
860 	spdk_json_write_object_end(w);
861 }
862 
/* Declared here so that the backend table below can reference it. */
static int
spdk_vhost_blk_destroy(struct spdk_vhost_dev *dev);
864 
865 static int
866 spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
867 			  uint32_t len)
868 {
869 	struct virtio_blk_config blkcfg;
870 	struct spdk_vhost_blk_dev *bvdev;
871 	struct spdk_bdev *bdev;
872 	uint32_t blk_size;
873 	uint64_t blkcnt;
874 
875 	bvdev = to_blk_dev(vdev);
876 	if (bvdev == NULL) {
877 		SPDK_ERRLOG("Trying to get virito_blk configuration failed\n");
878 		return -1;
879 	}
880 
881 	bdev = bvdev->bdev;
882 	if (bdev == NULL) {
883 		/* We can't just return -1 here as this GET_CONFIG message might
884 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
885 		 * error to QEMU, who might then decide to terminate itself.
886 		 * We don't want that. A simple reboot shouldn't break the system.
887 		 *
888 		 * Presenting a block device with block size 0 and block count 0
889 		 * doesn't cause any problems on QEMU side and the virtio-pci
890 		 * device is even still available inside the VM, but there will
891 		 * be no block device created for it - the kernel drivers will
892 		 * silently reject it.
893 		 */
894 		blk_size = 0;
895 		blkcnt = 0;
896 	} else {
897 		blk_size = spdk_bdev_get_block_size(bdev);
898 		blkcnt = spdk_bdev_get_num_blocks(bdev);
899 	}
900 
901 	memset(&blkcfg, 0, sizeof(blkcfg));
902 	blkcfg.blk_size = blk_size;
903 	/* minimum I/O size in blocks */
904 	blkcfg.min_io_size = 1;
905 	/* expressed in 512 Bytes sectors */
906 	blkcfg.capacity = (blkcnt * blk_size) / 512;
907 	blkcfg.size_max = 131072;
908 	/*  -2 for REQ and RESP and -1 for region boundary splitting */
909 	blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
910 	/* QEMU can overwrite this value when started */
911 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
912 
913 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
914 		/* 16MiB, expressed in 512 Bytes */
915 		blkcfg.max_discard_sectors = 32768;
916 		blkcfg.max_discard_seg = 1;
917 		blkcfg.discard_sector_alignment = blk_size / 512;
918 	}
919 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
920 		blkcfg.max_write_zeroes_sectors = 32768;
921 		blkcfg.max_write_zeroes_seg = 1;
922 	}
923 
924 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
925 
926 	return 0;
927 }
928 
929 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
930 	.virtio_features = SPDK_VHOST_FEATURES |
931 	(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) |
932 	(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) |
933 	(1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
934 	(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI) |
935 	(1ULL << VIRTIO_BLK_F_FLUSH)    | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
936 	(1ULL << VIRTIO_BLK_F_MQ)       | (1ULL << VIRTIO_BLK_F_DISCARD) |
937 	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
938 	.disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
939 	(1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
940 	(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_DISCARD) |
941 	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
942 	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
943 	.start_session =  spdk_vhost_blk_start,
944 	.stop_session = spdk_vhost_blk_stop,
945 	.vhost_get_config = spdk_vhost_blk_get_config,
946 	.dump_info_json = spdk_vhost_blk_dump_info_json,
947 	.write_config_json = spdk_vhost_blk_write_config_json,
948 	.remove_device = spdk_vhost_blk_destroy,
949 };
950 
/*
 * Create a vhost-blk controller for every "VhostBlk<N>" section of the
 * configuration. Sections without a "Dev" entry are skipped.
 * Returns 0 on success and -1 on the first malformed section or failed
 * controller construction.
 */
int
spdk_vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *sp;
	unsigned ctrlr_num;
	char *name, *cpumask, *bdev_name;
	bool readonly;

	for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
		if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
			continue;
		}

		/* The section name has to end in a number. */
		if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
				    spdk_conf_section_get_name(sp));
			return -1;
		}

		name = spdk_conf_section_get_val(sp, "Name");
		if (name == NULL) {
			SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
			return -1;
		}

		cpumask = spdk_conf_section_get_val(sp, "Cpumask");
		readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
		bdev_name = spdk_conf_section_get_val(sp, "Dev");

		if (bdev_name != NULL &&
		    spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) {
			return -1;
		}
	}

	return 0;
}
993 
994 int
995 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
996 {
997 	struct spdk_vhost_blk_dev *bvdev = NULL;
998 	struct spdk_bdev *bdev;
999 	uint64_t features = 0;
1000 	int ret = 0;
1001 
1002 	spdk_vhost_lock();
1003 	bdev = spdk_bdev_get_by_name(dev_name);
1004 	if (bdev == NULL) {
1005 		SPDK_ERRLOG("Controller %s: bdev '%s' not found\n",
1006 			    name, dev_name);
1007 		ret = -ENODEV;
1008 		goto out;
1009 	}
1010 
1011 	bvdev = calloc(1, sizeof(*bvdev));
1012 	if (bvdev == NULL) {
1013 		ret = -ENOMEM;
1014 		goto out;
1015 	}
1016 
1017 	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
1018 	if (ret != 0) {
1019 		SPDK_ERRLOG("Controller %s: could not open bdev '%s', error=%d\n",
1020 			    name, dev_name, ret);
1021 		goto out;
1022 	}
1023 
1024 	bvdev->bdev = bdev;
1025 	bvdev->readonly = readonly;
1026 	ret = spdk_vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend);
1027 	if (ret != 0) {
1028 		spdk_bdev_close(bvdev->bdev_desc);
1029 		goto out;
1030 	}
1031 
1032 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1033 		features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1034 	}
1035 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1036 		features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1037 	}
1038 	if (readonly) {
1039 		features |= (1ULL << VIRTIO_BLK_F_RO);
1040 	}
1041 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1042 		features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1043 	}
1044 
1045 	if (features && rte_vhost_driver_enable_features(bvdev->vdev.path, features)) {
1046 		SPDK_ERRLOG("Controller %s: failed to enable features 0x%"PRIx64"\n", name, features);
1047 
1048 		if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) {
1049 			SPDK_ERRLOG("Controller %s: failed to remove controller\n", name);
1050 		}
1051 
1052 		spdk_bdev_close(bvdev->bdev_desc);
1053 		ret = -1;
1054 		goto out;
1055 	}
1056 
1057 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name);
1058 out:
1059 	if (ret != 0 && bvdev) {
1060 		free(bvdev);
1061 	}
1062 	spdk_vhost_unlock();
1063 	return ret;
1064 }
1065 
1066 static int
1067 spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1068 {
1069 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1070 	int rc;
1071 
1072 	if (!bvdev) {
1073 		return -EINVAL;
1074 	}
1075 
1076 	rc = spdk_vhost_dev_unregister(&bvdev->vdev);
1077 	if (rc != 0) {
1078 		return rc;
1079 	}
1080 
1081 	if (bvdev->bdev_desc) {
1082 		spdk_bdev_close(bvdev->bdev_desc);
1083 		bvdev->bdev_desc = NULL;
1084 	}
1085 	bvdev->bdev = NULL;
1086 
1087 	free(bvdev);
1088 	return 0;
1089 }
1090 
1091 SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
1092 SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
1093