xref: /spdk/lib/vhost/vhost_blk.c (revision be4a5602ce7d3e2d9cc7ff6cde0b0dcb99d647c8)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/conf.h"
40 #include "spdk/thread.h"
41 #include "spdk/likely.h"
42 #include "spdk/string.h"
43 #include "spdk/util.h"
44 #include "spdk/vhost.h"
45 
46 #include "vhost_internal.h"
47 
/*
 * Minimal set of features supported by every SPDK VHOST-BLK device.
 * Composed of the generic SPDK_VHOST_FEATURES plus the virtio-blk feature
 * bits listed below. Note that GEOMETRY, BARRIER, SCSI and CONFIG_WCE are
 * listed here and also in SPDK_VHOST_BLK_DISABLED_FEATURES.
 */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
		(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
		(1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER)  | \
		(1ULL << VIRTIO_BLK_F_SCSI)     | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_MQ))
55 
/*
 * Not supported features: the generic SPDK_VHOST_DISABLED_FEATURES plus the
 * virtio-blk specific GEOMETRY, CONFIG_WCE, BARRIER and SCSI bits.
 */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI))
60 
/* Vhost-user protocol features supported by vhost-blk: CONFIG and INFLIGHT_SHMFD */
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
64 
/*
 * Per-request context. One task exists for every slot of a virtqueue
 * (see alloc_task_pool()) and is looked up by the request's descriptor index.
 */
struct spdk_vhost_blk_task {
	struct spdk_bdev_io *bdev_io;
	/* Owning session; assigned once in alloc_task_pool(). */
	struct spdk_vhost_blk_session *bvsession;
	/* Virtqueue this task belongs to; assigned once in alloc_task_pool(). */
	struct spdk_vhost_virtqueue *vq;

	/*
	 * Points at the one-byte status buffer (last iovec of the request).
	 * NULL until process_blk_request() has validated the request layout.
	 */
	volatile uint8_t *status;

	/* Descriptor index of the request; equals this task's slot in vq->tasks. */
	uint16_t req_idx;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written. */
	uint32_t used_len;
	/*
	 * Before parsing: capacity of iovs[] handed to blk_iovs_setup().
	 * After parsing: number of payload iovecs (header and status excluded).
	 */
	uint16_t iovcnt;
	/* iovs[0] is the request header, the last filled entry is the status byte. */
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
85 
/* vhost-blk controller: a generic vhost device bound to a single bdev. */
struct spdk_vhost_blk_dev {
	/* Generic vhost device; to_blk_dev() recovers this struct from it. */
	struct spdk_vhost_dev vdev;
	/* Backing bdev; reset to NULL once the bdev has been hot-removed. */
	struct spdk_bdev *bdev;
	/* Open descriptor of the backing bdev; closed and reset on hot-remove. */
	struct spdk_bdev_desc *bdev_desc;
	/* When set, VIRTIO_BLK_T_OUT requests are failed instead of submitted. */
	bool readonly;
};
92 
/* Per-connection state of a vhost-blk controller. */
struct spdk_vhost_blk_session {
	/* The parent session must be the very first field in this struct */
	struct spdk_vhost_session vsession;
	/* Controller this session belongs to; set in vhost_blk_start_cb(). */
	struct spdk_vhost_blk_dev *bvdev;
	/* Poller running vdev_worker() or no_bdev_vdev_worker(). */
	struct spdk_poller *requestq_poller;
	/* bdev I/O channel used for submissions; NULL when there is no bdev. */
	struct spdk_io_channel *io_channel;
	/* Poller running destroy_session_poller_cb() while the session stops. */
	struct spdk_poller *stop_poller;
};
101 
102 /* forward declaration */
103 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
104 
105 static int
106 process_blk_request(struct spdk_vhost_blk_task *task,
107 		    struct spdk_vhost_blk_session *bvsession,
108 		    struct spdk_vhost_virtqueue *vq);
109 
110 static void
111 blk_task_finish(struct spdk_vhost_blk_task *task)
112 {
113 	assert(task->bvsession->vsession.task_cnt > 0);
114 	task->bvsession->vsession.task_cnt--;
115 	task->used = false;
116 }
117 
118 static void
119 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
120 {
121 	if (task->status) {
122 		*task->status = status;
123 	}
124 
125 	vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
126 				   task->used_len);
127 	blk_task_finish(task);
128 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
129 }
130 
131 /*
132  * Process task's descriptor chain and setup data related fields.
133  * Return
134  *   total size of suplied buffers
135  *
136  *   FIXME: Make this function return to rd_cnt and wr_cnt
137  */
138 static int
139 blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
140 	       uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
141 {
142 	struct spdk_vhost_session *vsession = &bvsession->vsession;
143 	struct spdk_vhost_dev *vdev = vsession->vdev;
144 	struct vring_desc *desc, *desc_table;
145 	uint16_t out_cnt = 0, cnt = 0;
146 	uint32_t desc_table_size, len = 0;
147 	uint32_t desc_handled_cnt;
148 	int rc;
149 
150 	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
151 	if (rc != 0) {
152 		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
153 		return -1;
154 	}
155 
156 	desc_handled_cnt = 0;
157 	while (1) {
158 		/*
159 		 * Maximum cnt reached?
160 		 * Should not happen if request is well formatted, otherwise this is a BUG.
161 		 */
162 		if (spdk_unlikely(cnt == *iovs_cnt)) {
163 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
164 				      vsession->name, req_idx);
165 			return -1;
166 		}
167 
168 		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
169 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
170 				      vsession->name, req_idx, cnt);
171 			return -1;
172 		}
173 
174 		len += desc->len;
175 
176 		out_cnt += vhost_vring_desc_is_wr(desc);
177 
178 		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
179 		if (rc != 0) {
180 			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
181 				    vsession->name, req_idx);
182 			return -1;
183 		} else if (desc == NULL) {
184 			break;
185 		}
186 
187 		desc_handled_cnt++;
188 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
189 			/* Break a cycle and report an error, if any. */
190 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
191 				    vsession->name, desc_table_size, desc_handled_cnt);
192 			return -1;
193 		}
194 	}
195 
196 	/*
197 	 * There must be least two descriptors.
198 	 * First contain request so it must be readable.
199 	 * Last descriptor contain buffer for response so it must be writable.
200 	 */
201 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
202 		return -1;
203 	}
204 
205 	*length = len;
206 	*iovs_cnt = cnt;
207 	return 0;
208 }
209 
210 static void
211 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
212 {
213 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
214 	vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
215 				   task->used_len);
216 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
217 		      task->req_idx, success ? "OK" : "FAIL");
218 	blk_task_finish(task);
219 }
220 
/*
 * bdev completion callback used for all I/O submitted by
 * process_blk_request(). cb_arg is the spdk_vhost_blk_task of the request.
 */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_vhost_blk_task *blk_task = cb_arg;

	/* Free the bdev I/O first, then report the result to the guest. */
	spdk_bdev_free_io(bdev_io);
	blk_request_finish(success, blk_task);
}
229 
230 static void
231 blk_request_resubmit(void *arg)
232 {
233 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
234 	int rc = 0;
235 
236 	rc = process_blk_request(task, task->bvsession, task->vq);
237 	if (rc == 0) {
238 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
239 	} else {
240 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
241 	}
242 }
243 
244 static inline void
245 blk_request_queue_io(struct spdk_vhost_blk_task *task)
246 {
247 	int rc;
248 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
249 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
250 
251 	task->bdev_io_wait.bdev = bdev;
252 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
253 	task->bdev_io_wait.cb_arg = task;
254 
255 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
256 	if (rc != 0) {
257 		SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
258 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
259 	}
260 }
261 
262 static int
263 process_blk_request(struct spdk_vhost_blk_task *task,
264 		    struct spdk_vhost_blk_session *bvsession,
265 		    struct spdk_vhost_virtqueue *vq)
266 {
267 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
268 	const struct virtio_blk_outhdr *req;
269 	struct virtio_blk_discard_write_zeroes *desc;
270 	struct iovec *iov;
271 	uint32_t type;
272 	uint32_t payload_len;
273 	uint64_t flush_bytes;
274 	int rc;
275 
276 	if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
277 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
278 		/* Only READ and WRITE are supported for now. */
279 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
280 		return -1;
281 	}
282 
283 	iov = &task->iovs[0];
284 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
285 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
286 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
287 			      iov->iov_len, sizeof(*req), task->req_idx);
288 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
289 		return -1;
290 	}
291 
292 	req = iov->iov_base;
293 
294 	iov = &task->iovs[task->iovcnt - 1];
295 	if (spdk_unlikely(iov->iov_len != 1)) {
296 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
297 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
298 			      iov->iov_len, 1, task->req_idx);
299 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
300 		return -1;
301 	}
302 
303 	task->status = iov->iov_base;
304 	payload_len -= sizeof(*req) + sizeof(*task->status);
305 	task->iovcnt -= 2;
306 
307 	type = req->type;
308 #ifdef VIRTIO_BLK_T_BARRIER
309 	/* Don't care about barier for now (as QEMU's virtio-blk do). */
310 	type &= ~VIRTIO_BLK_T_BARRIER;
311 #endif
312 
313 	switch (type) {
314 	case VIRTIO_BLK_T_IN:
315 	case VIRTIO_BLK_T_OUT:
316 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
317 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
318 				    type ? "WRITE" : "READ", task->req_idx);
319 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
320 			return -1;
321 		}
322 
323 		if (type == VIRTIO_BLK_T_IN) {
324 			task->used_len = payload_len + sizeof(*task->status);
325 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
326 					     &task->iovs[1], task->iovcnt, req->sector * 512,
327 					     payload_len, blk_request_complete_cb, task);
328 		} else if (!bvdev->readonly) {
329 			task->used_len = sizeof(*task->status);
330 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
331 					      &task->iovs[1], task->iovcnt, req->sector * 512,
332 					      payload_len, blk_request_complete_cb, task);
333 		} else {
334 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
335 			rc = -1;
336 		}
337 
338 		if (rc) {
339 			if (rc == -ENOMEM) {
340 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
341 				blk_request_queue_io(task);
342 			} else {
343 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
344 				return -1;
345 			}
346 		}
347 		break;
348 	case VIRTIO_BLK_T_DISCARD:
349 		desc = task->iovs[1].iov_base;
350 		if (payload_len != sizeof(*desc)) {
351 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
352 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
353 			return -1;
354 		}
355 
356 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
357 				     desc->sector * 512, desc->num_sectors * 512,
358 				     blk_request_complete_cb, task);
359 		if (rc) {
360 			if (rc == -ENOMEM) {
361 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
362 				blk_request_queue_io(task);
363 			} else {
364 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
365 				return -1;
366 			}
367 		}
368 		break;
369 	case VIRTIO_BLK_T_WRITE_ZEROES:
370 		desc = task->iovs[1].iov_base;
371 		if (payload_len != sizeof(*desc)) {
372 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
373 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
374 			return -1;
375 		}
376 
377 		/* Zeroed and Unmap the range, SPDK doen't support it. */
378 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
379 			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
380 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
381 			return -1;
382 		}
383 
384 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
385 					    desc->sector * 512, desc->num_sectors * 512,
386 					    blk_request_complete_cb, task);
387 		if (rc) {
388 			if (rc == -ENOMEM) {
389 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
390 				blk_request_queue_io(task);
391 			} else {
392 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
393 				return -1;
394 			}
395 		}
396 		break;
397 	case VIRTIO_BLK_T_FLUSH:
398 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
399 		if (req->sector != 0) {
400 			SPDK_NOTICELOG("sector must be zero for flush command\n");
401 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
402 			return -1;
403 		}
404 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
405 				     0, flush_bytes,
406 				     blk_request_complete_cb, task);
407 		if (rc) {
408 			if (rc == -ENOMEM) {
409 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
410 				blk_request_queue_io(task);
411 			} else {
412 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
413 				return -1;
414 			}
415 		}
416 		break;
417 	case VIRTIO_BLK_T_GET_ID:
418 		if (!task->iovcnt || !payload_len) {
419 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
420 			return -1;
421 		}
422 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
423 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
424 				task->used_len, ' ');
425 		blk_request_finish(true, task);
426 		break;
427 	default:
428 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
429 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
430 		return -1;
431 	}
432 
433 	return 0;
434 }
435 
436 static void
437 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
438 		     struct spdk_vhost_virtqueue *vq)
439 {
440 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
441 	struct spdk_vhost_blk_task *task;
442 	struct spdk_vhost_session *vsession = &bvsession->vsession;
443 	spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight;
444 	spdk_vhost_resubmit_desc *resubmit_list;
445 	int rc;
446 	uint16_t req_idx;
447 
448 	if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) {
449 		return;
450 	}
451 
452 	resubmit_list = resubmit->resubmit_list;
453 	while (resubmit->resubmit_num-- > 0) {
454 		req_idx = resubmit_list[resubmit->resubmit_num].index;
455 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n",
456 			      req_idx);
457 
458 		if (spdk_unlikely(req_idx >= vq->vring.size)) {
459 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
460 				    bvdev->vdev.name, req_idx, vq->vring.size);
461 			vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
462 			continue;
463 		}
464 
465 		task = &((struct spdk_vhost_blk_task *)vq->tasks)[req_idx];
466 		if (spdk_unlikely(task->used)) {
467 			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
468 				    bvdev->vdev.name, req_idx);
469 			vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
470 			continue;
471 		}
472 
473 		vsession->task_cnt++;
474 
475 		task->used = true;
476 		task->iovcnt = SPDK_COUNTOF(task->iovs);
477 		task->status = NULL;
478 		task->used_len = 0;
479 
480 		rc = process_blk_request(task, bvsession, vq);
481 		if (rc == 0) {
482 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
483 				      req_idx);
484 		} else {
485 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task,
486 				      req_idx);
487 		}
488 	}
489 
490 	free(resubmit_list);
491 	resubmit->resubmit_list = NULL;
492 }
493 
494 static void
495 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
496 {
497 	struct spdk_vhost_blk_task *task;
498 	struct spdk_vhost_session *vsession = &bvsession->vsession;
499 	int rc;
500 	uint16_t reqs[32];
501 	uint16_t reqs_cnt, i;
502 	uint16_t vq_idx = vq->vring_idx;
503 
504 	submit_inflight_desc(bvsession, vq);
505 
506 	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
507 	if (!reqs_cnt) {
508 		return;
509 	}
510 
511 	for (i = 0; i < reqs_cnt; i++) {
512 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
513 			      reqs[i]);
514 
515 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
516 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
517 				    vsession->name, reqs[i], vq->vring.size);
518 			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
519 			continue;
520 		}
521 
522 		rte_vhost_set_inflight_desc_split(vsession->vid, vq_idx, reqs[i]);
523 		task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
524 		if (spdk_unlikely(task->used)) {
525 			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
526 				    vsession->name, reqs[i]);
527 			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
528 			continue;
529 		}
530 
531 		vsession->task_cnt++;
532 
533 		task->used = true;
534 		task->iovcnt = SPDK_COUNTOF(task->iovs);
535 		task->status = NULL;
536 		task->used_len = 0;
537 
538 		rc = process_blk_request(task, bvsession, vq);
539 		if (rc == 0) {
540 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
541 				      reqs[i]);
542 		} else {
543 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
544 		}
545 	}
546 }
547 
548 static int
549 vdev_worker(void *arg)
550 {
551 	struct spdk_vhost_blk_session *bvsession = arg;
552 	struct spdk_vhost_session *vsession = &bvsession->vsession;
553 
554 	uint16_t q_idx;
555 
556 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
557 		process_vq(bvsession, &vsession->virtqueue[q_idx]);
558 	}
559 
560 	vhost_session_used_signal(vsession);
561 
562 	return -1;
563 }
564 
565 static void
566 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
567 {
568 	struct spdk_vhost_session *vsession = &bvsession->vsession;
569 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
570 	uint32_t length;
571 	uint16_t iovcnt, req_idx;
572 
573 	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
574 		return;
575 	}
576 
577 	iovcnt = SPDK_COUNTOF(iovs);
578 	if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
579 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
580 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
581 	}
582 
583 	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
584 }
585 
586 static int
587 no_bdev_vdev_worker(void *arg)
588 {
589 	struct spdk_vhost_blk_session *bvsession = arg;
590 	struct spdk_vhost_session *vsession = &bvsession->vsession;
591 	uint16_t q_idx;
592 
593 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
594 		no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
595 	}
596 
597 	vhost_session_used_signal(vsession);
598 
599 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
600 		spdk_put_io_channel(bvsession->io_channel);
601 		bvsession->io_channel = NULL;
602 	}
603 
604 	return -1;
605 }
606 
607 static struct spdk_vhost_blk_session *
608 to_blk_session(struct spdk_vhost_session *vsession)
609 {
610 	assert(vsession->vdev->backend == &vhost_blk_device_backend);
611 	return (struct spdk_vhost_blk_session *)vsession;
612 }
613 
614 static struct spdk_vhost_blk_dev *
615 to_blk_dev(struct spdk_vhost_dev *vdev)
616 {
617 	if (vdev == NULL) {
618 		return NULL;
619 	}
620 
621 	if (vdev->backend != &vhost_blk_device_backend) {
622 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
623 		return NULL;
624 	}
625 
626 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
627 }
628 
629 struct spdk_bdev *
630 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
631 {
632 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
633 
634 	assert(bvdev != NULL);
635 	return bvdev->bdev;
636 }
637 
638 static void
639 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
640 {
641 
642 	/* All sessions have been notified, time to close the bdev */
643 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
644 
645 	assert(bvdev != NULL);
646 	spdk_bdev_close(bvdev->bdev_desc);
647 	bvdev->bdev_desc = NULL;
648 	bvdev->bdev = NULL;
649 }
650 
651 static int
652 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
653 			     struct spdk_vhost_session *vsession,
654 			     void *ctx)
655 {
656 	struct spdk_vhost_blk_session *bvsession;
657 
658 	bvsession = (struct spdk_vhost_blk_session *)vsession;
659 	if (bvsession->requestq_poller) {
660 		spdk_poller_unregister(&bvsession->requestq_poller);
661 		bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
662 	}
663 
664 	return 0;
665 }
666 
667 static void
668 bdev_remove_cb(void *remove_ctx)
669 {
670 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
671 
672 	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
673 		     bvdev->vdev.name);
674 
675 	spdk_vhost_lock();
676 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
677 				  vhost_dev_bdev_remove_cpl_cb, NULL);
678 	spdk_vhost_unlock();
679 }
680 
681 static void
682 free_task_pool(struct spdk_vhost_blk_session *bvsession)
683 {
684 	struct spdk_vhost_session *vsession = &bvsession->vsession;
685 	struct spdk_vhost_virtqueue *vq;
686 	uint16_t i;
687 
688 	for (i = 0; i < vsession->max_queues; i++) {
689 		vq = &vsession->virtqueue[i];
690 		if (vq->tasks == NULL) {
691 			continue;
692 		}
693 
694 		spdk_free(vq->tasks);
695 		vq->tasks = NULL;
696 	}
697 }
698 
699 static int
700 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
701 {
702 	struct spdk_vhost_session *vsession = &bvsession->vsession;
703 	struct spdk_vhost_virtqueue *vq;
704 	struct spdk_vhost_blk_task *task;
705 	uint32_t task_cnt;
706 	uint16_t i;
707 	uint32_t j;
708 
709 	for (i = 0; i < vsession->max_queues; i++) {
710 		vq = &vsession->virtqueue[i];
711 		if (vq->vring.desc == NULL) {
712 			continue;
713 		}
714 
715 		task_cnt = vq->vring.size;
716 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
717 			/* sanity check */
718 			SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
719 				    vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
720 			free_task_pool(bvsession);
721 			return -1;
722 		}
723 		vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
724 					 SPDK_CACHE_LINE_SIZE, NULL,
725 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
726 		if (vq->tasks == NULL) {
727 			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
728 				    vsession->name, task_cnt, i);
729 			free_task_pool(bvsession);
730 			return -1;
731 		}
732 
733 		for (j = 0; j < task_cnt; j++) {
734 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
735 			task->bvsession = bvsession;
736 			task->req_idx = j;
737 			task->vq = vq;
738 		}
739 	}
740 
741 	return 0;
742 }
743 
744 static int
745 vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
746 		   struct spdk_vhost_session *vsession, void *unused)
747 {
748 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
749 	struct spdk_vhost_blk_dev *bvdev;
750 	int i, rc = 0;
751 
752 	bvdev = to_blk_dev(vdev);
753 	assert(bvdev != NULL);
754 	bvsession->bvdev = bvdev;
755 
756 	/* validate all I/O queues are in a contiguous index range */
757 	for (i = 0; i < vsession->max_queues; i++) {
758 		if (vsession->virtqueue[i].vring.desc == NULL) {
759 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
760 			rc = -1;
761 			goto out;
762 		}
763 	}
764 
765 	rc = alloc_task_pool(bvsession);
766 	if (rc != 0) {
767 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
768 		goto out;
769 	}
770 
771 	if (bvdev->bdev) {
772 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
773 		if (!bvsession->io_channel) {
774 			free_task_pool(bvsession);
775 			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
776 			rc = -1;
777 			goto out;
778 		}
779 	}
780 
781 	bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
782 				     bvsession, 0);
783 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
784 		     vsession->name, spdk_env_get_current_core());
785 out:
786 	vhost_session_start_done(vsession, rc);
787 	return rc;
788 }
789 
790 static int
791 vhost_blk_start(struct spdk_vhost_session *vsession)
792 {
793 	struct vhost_poll_group *pg;
794 
795 	pg = vhost_get_poll_group(&vsession->vdev->cpumask);
796 	return vhost_session_send_event(pg, vsession, vhost_blk_start_cb,
797 					3, "start session");
798 }
799 
800 static int
801 destroy_session_poller_cb(void *arg)
802 {
803 	struct spdk_vhost_blk_session *bvsession = arg;
804 	struct spdk_vhost_session *vsession = &bvsession->vsession;
805 	int i;
806 
807 	if (vsession->task_cnt > 0) {
808 		return -1;
809 	}
810 
811 	if (spdk_vhost_trylock() != 0) {
812 		return -1;
813 	}
814 
815 	for (i = 0; i < vsession->max_queues; i++) {
816 		vsession->virtqueue[i].next_event_time = 0;
817 		vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
818 	}
819 
820 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
821 		     vsession->name, spdk_env_get_current_core());
822 
823 	if (bvsession->io_channel) {
824 		spdk_put_io_channel(bvsession->io_channel);
825 		bvsession->io_channel = NULL;
826 	}
827 
828 	free_task_pool(bvsession);
829 	spdk_poller_unregister(&bvsession->stop_poller);
830 	vhost_session_stop_done(vsession, 0);
831 
832 	spdk_vhost_unlock();
833 	return -1;
834 }
835 
836 static int
837 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
838 		  struct spdk_vhost_session *vsession, void *unused)
839 {
840 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
841 
842 	spdk_poller_unregister(&bvsession->requestq_poller);
843 	bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb,
844 				 bvsession, 1000);
845 	return 0;
846 }
847 
848 static int
849 vhost_blk_stop(struct spdk_vhost_session *vsession)
850 {
851 	return vhost_session_send_event(vsession->poll_group, vsession,
852 					vhost_blk_stop_cb, 3, "stop session");
853 }
854 
855 static void
856 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
857 {
858 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
859 	struct spdk_vhost_blk_dev *bvdev;
860 
861 	bvdev = to_blk_dev(vdev);
862 	assert(bvdev != NULL);
863 	spdk_json_write_named_object_begin(w, "block");
864 
865 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
866 
867 	spdk_json_write_name(w, "bdev");
868 	if (bdev) {
869 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
870 	} else {
871 		spdk_json_write_null(w);
872 	}
873 
874 	spdk_json_write_object_end(w);
875 }
876 
877 static void
878 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
879 {
880 	struct spdk_vhost_blk_dev *bvdev;
881 
882 	bvdev = to_blk_dev(vdev);
883 	assert(bvdev != NULL);
884 	if (!bvdev->bdev) {
885 		return;
886 	}
887 
888 	spdk_json_write_object_begin(w);
889 	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
890 
891 	spdk_json_write_named_object_begin(w, "params");
892 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
893 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
894 	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(&vdev->cpumask));
895 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
896 	spdk_json_write_object_end(w);
897 
898 	spdk_json_write_object_end(w);
899 }
900 
901 static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
902 
903 static int
904 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
905 		     uint32_t len)
906 {
907 	struct virtio_blk_config blkcfg;
908 	struct spdk_vhost_blk_dev *bvdev;
909 	struct spdk_bdev *bdev;
910 	uint32_t blk_size;
911 	uint64_t blkcnt;
912 
913 	bvdev = to_blk_dev(vdev);
914 	assert(bvdev != NULL);
915 	bdev = bvdev->bdev;
916 	if (bdev == NULL) {
917 		/* We can't just return -1 here as this GET_CONFIG message might
918 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
919 		 * error to QEMU, who might then decide to terminate itself.
920 		 * We don't want that. A simple reboot shouldn't break the system.
921 		 *
922 		 * Presenting a block device with block size 0 and block count 0
923 		 * doesn't cause any problems on QEMU side and the virtio-pci
924 		 * device is even still available inside the VM, but there will
925 		 * be no block device created for it - the kernel drivers will
926 		 * silently reject it.
927 		 */
928 		blk_size = 0;
929 		blkcnt = 0;
930 	} else {
931 		blk_size = spdk_bdev_get_block_size(bdev);
932 		blkcnt = spdk_bdev_get_num_blocks(bdev);
933 		if (spdk_bdev_get_buf_align(bdev) > 1) {
934 			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
935 			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
936 		} else {
937 			blkcfg.size_max = 131072;
938 			/*  -2 for REQ and RESP and -1 for region boundary splitting */
939 			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
940 		}
941 	}
942 
943 	memset(&blkcfg, 0, sizeof(blkcfg));
944 	blkcfg.blk_size = blk_size;
945 	/* minimum I/O size in blocks */
946 	blkcfg.min_io_size = 1;
947 	/* expressed in 512 Bytes sectors */
948 	blkcfg.capacity = (blkcnt * blk_size) / 512;
949 	/* QEMU can overwrite this value when started */
950 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
951 
952 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
953 		/* 16MiB, expressed in 512 Bytes */
954 		blkcfg.max_discard_sectors = 32768;
955 		blkcfg.max_discard_seg = 1;
956 		blkcfg.discard_sector_alignment = blk_size / 512;
957 	}
958 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
959 		blkcfg.max_write_zeroes_sectors = 32768;
960 		blkcfg.max_write_zeroes_seg = 1;
961 	}
962 
963 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
964 
965 	return 0;
966 }
967 
968 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
969 	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
970 	.start_session =  vhost_blk_start,
971 	.stop_session = vhost_blk_stop,
972 	.vhost_get_config = vhost_blk_get_config,
973 	.dump_info_json = vhost_blk_dump_info_json,
974 	.write_config_json = vhost_blk_write_config_json,
975 	.remove_device = vhost_blk_destroy,
976 };
977 
/*
 * Walk all configuration sections and construct a vhost-blk controller for
 * each one whose name starts with "VhostBlk".
 *
 * For a matching section:
 *  - the section name must parse as "VhostBlk<unsigned number>",
 *  - a "Name" value is mandatory,
 *  - "Cpumask" is passed through as returned by the configuration lookup,
 *  - "ReadOnly" defaults to false,
 *  - a section without a "Dev" value is skipped without an error.
 *
 * Returns 0 once every section has been visited, or -1 as soon as a section
 * is malformed or spdk_vhost_blk_construct() reports a failure.
 */
int
vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *section;

	for (section = spdk_conf_first_section(NULL); section != NULL;
	     section = spdk_conf_next_section(section)) {
		const char *section_name;
		unsigned index;
		char *ctrlr_name;
		char *mask;
		char *dev;
		bool ro;

		if (!spdk_conf_section_match_prefix(section, "VhostBlk")) {
			continue;
		}

		section_name = spdk_conf_section_get_name(section);
		if (sscanf(section_name, "VhostBlk%u", &index) != 1) {
			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
				    section_name);
			return -1;
		}

		ctrlr_name = spdk_conf_section_get_val(section, "Name");
		if (ctrlr_name == NULL) {
			SPDK_ERRLOG("VhostBlk%u: missing Name\n", index);
			return -1;
		}

		mask = spdk_conf_section_get_val(section, "Cpumask");
		ro = spdk_conf_section_get_boolval(section, "ReadOnly", false);
		dev = spdk_conf_section_get_val(section, "Dev");

		/* No "Dev" entry: nothing to construct for this section. */
		if (dev != NULL && spdk_vhost_blk_construct(ctrlr_name, mask, dev, ro) < 0) {
			return -1;
		}
	}

	return 0;
}
1020 
1021 int
1022 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
1023 {
1024 	struct spdk_vhost_blk_dev *bvdev = NULL;
1025 	struct spdk_vhost_dev *vdev;
1026 	struct spdk_bdev *bdev;
1027 	int ret = 0;
1028 
1029 	spdk_vhost_lock();
1030 	bdev = spdk_bdev_get_by_name(dev_name);
1031 	if (bdev == NULL) {
1032 		SPDK_ERRLOG("%s: bdev '%s' not found\n",
1033 			    name, dev_name);
1034 		ret = -ENODEV;
1035 		goto out;
1036 	}
1037 
1038 	bvdev = calloc(1, sizeof(*bvdev));
1039 	if (bvdev == NULL) {
1040 		ret = -ENOMEM;
1041 		goto out;
1042 	}
1043 
1044 	vdev = &bvdev->vdev;
1045 	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
1046 	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
1047 	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
1048 
1049 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1050 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1051 	}
1052 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1053 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1054 	}
1055 	if (readonly) {
1056 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
1057 	}
1058 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1059 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1060 	}
1061 
1062 	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
1063 	if (ret != 0) {
1064 		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
1065 			    name, dev_name, ret);
1066 		goto out;
1067 	}
1068 
1069 	bvdev->bdev = bdev;
1070 	bvdev->readonly = readonly;
1071 	ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
1072 	if (ret != 0) {
1073 		spdk_bdev_close(bvdev->bdev_desc);
1074 		goto out;
1075 	}
1076 
1077 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name);
1078 out:
1079 	if (ret != 0 && bvdev) {
1080 		free(bvdev);
1081 	}
1082 	spdk_vhost_unlock();
1083 	return ret;
1084 }
1085 
1086 static int
1087 vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1088 {
1089 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1090 	int rc;
1091 
1092 	assert(bvdev != NULL);
1093 	rc = vhost_dev_unregister(&bvdev->vdev);
1094 	if (rc != 0) {
1095 		return rc;
1096 	}
1097 
1098 	if (bvdev->bdev_desc) {
1099 		spdk_bdev_close(bvdev->bdev_desc);
1100 		bvdev->bdev_desc = NULL;
1101 	}
1102 	bvdev->bdev = NULL;
1103 
1104 	free(bvdev);
1105 	return 0;
1106 }
1107 
/*
 * Log component registrations for this file. Each invocation pairs a
 * component name string with a flag identifier.
 * NOTE(review): presumably these identifiers are the flags passed to
 * SPDK_INFOLOG/SPDK_DEBUGLOG-style macros - confirm against spdk/log.h.
 */
1108 SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
1109 SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
1110