xref: /spdk/lib/vhost/vhost_blk.c (revision 78b696bca594269c6a5e0ac235e44e94fc69e4f8)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/conf.h"
40 #include "spdk/thread.h"
41 #include "spdk/likely.h"
42 #include "spdk/string.h"
43 #include "spdk/util.h"
44 #include "spdk/vhost.h"
45 
46 #include "vhost_internal.h"
47 
/* Minimal set of features supported by every SPDK VHOST-BLK device */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
		(1ULL << VIRTIO_BLK_F_SIZE_MAX) | \
		(1ULL << VIRTIO_BLK_F_SEG_MAX) | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY) | \
		(1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
		(1ULL << VIRTIO_BLK_F_TOPOLOGY) | \
		(1ULL << VIRTIO_BLK_F_BARRIER) | \
		(1ULL << VIRTIO_BLK_F_SCSI) | \
		(1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_MQ))

/* Features that are not supported */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
		(1ULL << VIRTIO_BLK_F_GEOMETRY) | \
		(1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
		(1ULL << VIRTIO_BLK_F_BARRIER) | \
		(1ULL << VIRTIO_BLK_F_SCSI))

/*
 * Vhost-user protocol features supported by vhost-blk. The inflight shmfd
 * feature is only offered when SPDK_CONFIG_VHOST_INTERNAL_LIB is not defined.
 */
#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
#else
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
		(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
#endif
68 
69 struct spdk_vhost_blk_task {
70 	struct spdk_bdev_io *bdev_io;
71 	struct spdk_vhost_blk_session *bvsession;
72 	struct spdk_vhost_virtqueue *vq;
73 
74 	volatile uint8_t *status;
75 
76 	uint16_t req_idx;
77 
78 	/* for io wait */
79 	struct spdk_bdev_io_wait_entry bdev_io_wait;
80 
81 	/* If set, the task is currently used for I/O processing. */
82 	bool used;
83 
84 	/** Number of bytes that were written. */
85 	uint32_t used_len;
86 	uint16_t iovcnt;
87 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
88 };
89 
90 struct spdk_vhost_blk_dev {
91 	struct spdk_vhost_dev vdev;
92 	struct spdk_bdev *bdev;
93 	struct spdk_bdev_desc *bdev_desc;
94 	bool readonly;
95 };
96 
97 struct spdk_vhost_blk_session {
98 	/* The parent session must be the very first field in this struct */
99 	struct spdk_vhost_session vsession;
100 	struct spdk_vhost_blk_dev *bvdev;
101 	struct spdk_poller *requestq_poller;
102 	struct spdk_io_channel *io_channel;
103 	struct spdk_poller *stop_poller;
104 };
105 
106 /* forward declaration */
107 static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
108 
109 static int
110 process_blk_request(struct spdk_vhost_blk_task *task,
111 		    struct spdk_vhost_blk_session *bvsession,
112 		    struct spdk_vhost_virtqueue *vq);
113 
114 static void
115 blk_task_finish(struct spdk_vhost_blk_task *task)
116 {
117 	assert(task->bvsession->vsession.task_cnt > 0);
118 	task->bvsession->vsession.task_cnt--;
119 	task->used = false;
120 }
121 
122 static void
123 blk_task_init(struct spdk_vhost_blk_task *task)
124 {
125 	task->used = true;
126 	task->iovcnt = SPDK_COUNTOF(task->iovs);
127 	task->status = NULL;
128 	task->used_len = 0;
129 }
130 
131 static void
132 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
133 {
134 	if (task->status) {
135 		*task->status = status;
136 	}
137 
138 	vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
139 				   task->used_len);
140 	blk_task_finish(task);
141 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
142 }
143 
144 /*
145  * Process task's descriptor chain and setup data related fields.
146  * Return
147  *   total size of suplied buffers
148  *
149  *   FIXME: Make this function return to rd_cnt and wr_cnt
150  */
151 static int
152 blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
153 	       uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
154 {
155 	struct spdk_vhost_session *vsession = &bvsession->vsession;
156 	struct spdk_vhost_dev *vdev = vsession->vdev;
157 	struct vring_desc *desc, *desc_table;
158 	uint16_t out_cnt = 0, cnt = 0;
159 	uint32_t desc_table_size, len = 0;
160 	uint32_t desc_handled_cnt;
161 	int rc;
162 
163 	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
164 	if (rc != 0) {
165 		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
166 		return -1;
167 	}
168 
169 	desc_handled_cnt = 0;
170 	while (1) {
171 		/*
172 		 * Maximum cnt reached?
173 		 * Should not happen if request is well formatted, otherwise this is a BUG.
174 		 */
175 		if (spdk_unlikely(cnt == *iovs_cnt)) {
176 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
177 				      vsession->name, req_idx);
178 			return -1;
179 		}
180 
181 		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
182 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
183 				      vsession->name, req_idx, cnt);
184 			return -1;
185 		}
186 
187 		len += desc->len;
188 
189 		out_cnt += vhost_vring_desc_is_wr(desc);
190 
191 		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
192 		if (rc != 0) {
193 			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
194 				    vsession->name, req_idx);
195 			return -1;
196 		} else if (desc == NULL) {
197 			break;
198 		}
199 
200 		desc_handled_cnt++;
201 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
202 			/* Break a cycle and report an error, if any. */
203 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
204 				    vsession->name, desc_table_size, desc_handled_cnt);
205 			return -1;
206 		}
207 	}
208 
209 	/*
210 	 * There must be least two descriptors.
211 	 * First contain request so it must be readable.
212 	 * Last descriptor contain buffer for response so it must be writable.
213 	 */
214 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
215 		return -1;
216 	}
217 
218 	*length = len;
219 	*iovs_cnt = cnt;
220 	return 0;
221 }
222 
223 static void
224 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
225 {
226 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
227 	vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
228 				   task->used_len);
229 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
230 		      task->req_idx, success ? "OK" : "FAIL");
231 	blk_task_finish(task);
232 }
233 
/*
 * bdev completion callback: free the bdev I/O and finish the task that was
 * passed as the callback argument.
 */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
	blk_request_finish(success, (struct spdk_vhost_blk_task *)cb_arg);
}
242 
243 static void
244 blk_request_resubmit(void *arg)
245 {
246 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
247 	int rc = 0;
248 
249 	blk_task_init(task);
250 
251 	rc = process_blk_request(task, task->bvsession, task->vq);
252 	if (rc == 0) {
253 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
254 	} else {
255 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
256 	}
257 }
258 
259 static inline void
260 blk_request_queue_io(struct spdk_vhost_blk_task *task)
261 {
262 	int rc;
263 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
264 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
265 
266 	task->bdev_io_wait.bdev = bdev;
267 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
268 	task->bdev_io_wait.cb_arg = task;
269 
270 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
271 	if (rc != 0) {
272 		SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
273 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
274 	}
275 }
276 
277 static int
278 process_blk_request(struct spdk_vhost_blk_task *task,
279 		    struct spdk_vhost_blk_session *bvsession,
280 		    struct spdk_vhost_virtqueue *vq)
281 {
282 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
283 	const struct virtio_blk_outhdr *req;
284 	struct virtio_blk_discard_write_zeroes *desc;
285 	struct iovec *iov;
286 	uint32_t type;
287 	uint32_t payload_len;
288 	uint64_t flush_bytes;
289 	int rc;
290 
291 	if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
292 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
293 		/* Only READ and WRITE are supported for now. */
294 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
295 		return -1;
296 	}
297 
298 	iov = &task->iovs[0];
299 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
300 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
301 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
302 			      iov->iov_len, sizeof(*req), task->req_idx);
303 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
304 		return -1;
305 	}
306 
307 	req = iov->iov_base;
308 
309 	iov = &task->iovs[task->iovcnt - 1];
310 	if (spdk_unlikely(iov->iov_len != 1)) {
311 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
312 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
313 			      iov->iov_len, 1, task->req_idx);
314 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
315 		return -1;
316 	}
317 
318 	task->status = iov->iov_base;
319 	payload_len -= sizeof(*req) + sizeof(*task->status);
320 	task->iovcnt -= 2;
321 
322 	type = req->type;
323 #ifdef VIRTIO_BLK_T_BARRIER
324 	/* Don't care about barier for now (as QEMU's virtio-blk do). */
325 	type &= ~VIRTIO_BLK_T_BARRIER;
326 #endif
327 
328 	switch (type) {
329 	case VIRTIO_BLK_T_IN:
330 	case VIRTIO_BLK_T_OUT:
331 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
332 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
333 				    type ? "WRITE" : "READ", task->req_idx);
334 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
335 			return -1;
336 		}
337 
338 		if (type == VIRTIO_BLK_T_IN) {
339 			task->used_len = payload_len + sizeof(*task->status);
340 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
341 					     &task->iovs[1], task->iovcnt, req->sector * 512,
342 					     payload_len, blk_request_complete_cb, task);
343 		} else if (!bvdev->readonly) {
344 			task->used_len = sizeof(*task->status);
345 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
346 					      &task->iovs[1], task->iovcnt, req->sector * 512,
347 					      payload_len, blk_request_complete_cb, task);
348 		} else {
349 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
350 			rc = -1;
351 		}
352 
353 		if (rc) {
354 			if (rc == -ENOMEM) {
355 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
356 				blk_request_queue_io(task);
357 			} else {
358 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
359 				return -1;
360 			}
361 		}
362 		break;
363 	case VIRTIO_BLK_T_DISCARD:
364 		desc = task->iovs[1].iov_base;
365 		if (payload_len != sizeof(*desc)) {
366 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
367 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
368 			return -1;
369 		}
370 
371 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
372 				     desc->sector * 512, desc->num_sectors * 512,
373 				     blk_request_complete_cb, task);
374 		if (rc) {
375 			if (rc == -ENOMEM) {
376 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
377 				blk_request_queue_io(task);
378 			} else {
379 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
380 				return -1;
381 			}
382 		}
383 		break;
384 	case VIRTIO_BLK_T_WRITE_ZEROES:
385 		desc = task->iovs[1].iov_base;
386 		if (payload_len != sizeof(*desc)) {
387 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
388 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
389 			return -1;
390 		}
391 
392 		/* Zeroed and Unmap the range, SPDK doen't support it. */
393 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
394 			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
395 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
396 			return -1;
397 		}
398 
399 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
400 					    desc->sector * 512, desc->num_sectors * 512,
401 					    blk_request_complete_cb, task);
402 		if (rc) {
403 			if (rc == -ENOMEM) {
404 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
405 				blk_request_queue_io(task);
406 			} else {
407 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
408 				return -1;
409 			}
410 		}
411 		break;
412 	case VIRTIO_BLK_T_FLUSH:
413 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
414 		if (req->sector != 0) {
415 			SPDK_NOTICELOG("sector must be zero for flush command\n");
416 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
417 			return -1;
418 		}
419 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
420 				     0, flush_bytes,
421 				     blk_request_complete_cb, task);
422 		if (rc) {
423 			if (rc == -ENOMEM) {
424 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
425 				blk_request_queue_io(task);
426 			} else {
427 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
428 				return -1;
429 			}
430 		}
431 		break;
432 	case VIRTIO_BLK_T_GET_ID:
433 		if (!task->iovcnt || !payload_len) {
434 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
435 			return -1;
436 		}
437 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
438 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
439 				task->used_len, ' ');
440 		blk_request_finish(true, task);
441 		break;
442 	default:
443 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
444 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
445 		return -1;
446 	}
447 
448 	return 0;
449 }
450 
451 static void
452 process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
453 {
454 	struct spdk_vhost_blk_task *task;
455 
456 	task = &((struct spdk_vhost_blk_task *)vq->tasks)[req_idx];
457 	if (spdk_unlikely(task->used)) {
458 		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
459 			    task->bvsession->vsession.name, req_idx);
460 		task->used_len = 0;
461 		vhost_vq_used_ring_enqueue(&task->bvsession->vsession, vq, req_idx, 0);
462 		return;
463 	}
464 
465 	task->bvsession->vsession.task_cnt++;
466 
467 	blk_task_init(task);
468 
469 	if (process_blk_request(task, task->bvsession, vq) == 0) {
470 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
471 			      req_idx);
472 	} else {
473 		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, req_idx);
474 	}
475 }
476 
477 static void
478 submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
479 		     struct spdk_vhost_virtqueue *vq)
480 {
481 	struct spdk_vhost_session *vsession = &bvsession->vsession;
482 	spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight;
483 	spdk_vhost_resubmit_desc *resubmit_list;
484 	uint16_t req_idx;
485 
486 	if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) {
487 		return;
488 	}
489 
490 	resubmit_list = resubmit->resubmit_list;
491 	while (resubmit->resubmit_num-- > 0) {
492 		req_idx = resubmit_list[resubmit->resubmit_num].index;
493 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n",
494 			      req_idx);
495 
496 		if (spdk_unlikely(req_idx >= vq->vring.size)) {
497 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
498 				    vsession->name, req_idx, vq->vring.size);
499 			vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
500 			continue;
501 		}
502 
503 		process_blk_task(vq, req_idx);
504 	}
505 
506 	free(resubmit_list);
507 	resubmit->resubmit_list = NULL;
508 }
509 
510 static void
511 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
512 {
513 	struct spdk_vhost_session *vsession = &bvsession->vsession;
514 	uint16_t reqs[32];
515 	uint16_t reqs_cnt, i;
516 	uint16_t vq_idx = vq->vring_idx;
517 
518 	submit_inflight_desc(bvsession, vq);
519 
520 	reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
521 	if (!reqs_cnt) {
522 		return;
523 	}
524 
525 	for (i = 0; i < reqs_cnt; i++) {
526 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
527 			      reqs[i]);
528 
529 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
530 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
531 				    vsession->name, reqs[i], vq->vring.size);
532 			vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
533 			continue;
534 		}
535 
536 		rte_vhost_set_inflight_desc_split(vsession->vid, vq_idx, reqs[i]);
537 
538 		process_blk_task(vq, reqs[i]);
539 	}
540 }
541 
542 static int
543 vdev_worker(void *arg)
544 {
545 	struct spdk_vhost_blk_session *bvsession = arg;
546 	struct spdk_vhost_session *vsession = &bvsession->vsession;
547 
548 	uint16_t q_idx;
549 
550 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
551 		process_vq(bvsession, &vsession->virtqueue[q_idx]);
552 	}
553 
554 	vhost_session_used_signal(vsession);
555 
556 	return -1;
557 }
558 
559 static void
560 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
561 {
562 	struct spdk_vhost_session *vsession = &bvsession->vsession;
563 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
564 	uint32_t length;
565 	uint16_t iovcnt, req_idx;
566 
567 	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
568 		return;
569 	}
570 
571 	iovcnt = SPDK_COUNTOF(iovs);
572 	if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
573 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
574 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
575 	}
576 
577 	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
578 }
579 
580 static int
581 no_bdev_vdev_worker(void *arg)
582 {
583 	struct spdk_vhost_blk_session *bvsession = arg;
584 	struct spdk_vhost_session *vsession = &bvsession->vsession;
585 	uint16_t q_idx;
586 
587 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
588 		no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
589 	}
590 
591 	vhost_session_used_signal(vsession);
592 
593 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
594 		spdk_put_io_channel(bvsession->io_channel);
595 		bvsession->io_channel = NULL;
596 	}
597 
598 	return -1;
599 }
600 
601 static struct spdk_vhost_blk_session *
602 to_blk_session(struct spdk_vhost_session *vsession)
603 {
604 	assert(vsession->vdev->backend == &vhost_blk_device_backend);
605 	return (struct spdk_vhost_blk_session *)vsession;
606 }
607 
608 static struct spdk_vhost_blk_dev *
609 to_blk_dev(struct spdk_vhost_dev *vdev)
610 {
611 	if (vdev == NULL) {
612 		return NULL;
613 	}
614 
615 	if (vdev->backend != &vhost_blk_device_backend) {
616 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
617 		return NULL;
618 	}
619 
620 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
621 }
622 
623 struct spdk_bdev *
624 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
625 {
626 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
627 
628 	assert(bvdev != NULL);
629 	return bvdev->bdev;
630 }
631 
632 static void
633 vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
634 {
635 
636 	/* All sessions have been notified, time to close the bdev */
637 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
638 
639 	assert(bvdev != NULL);
640 	spdk_bdev_close(bvdev->bdev_desc);
641 	bvdev->bdev_desc = NULL;
642 	bvdev->bdev = NULL;
643 }
644 
645 static int
646 vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
647 			     struct spdk_vhost_session *vsession,
648 			     void *ctx)
649 {
650 	struct spdk_vhost_blk_session *bvsession;
651 
652 	bvsession = (struct spdk_vhost_blk_session *)vsession;
653 	if (bvsession->requestq_poller) {
654 		spdk_poller_unregister(&bvsession->requestq_poller);
655 		bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
656 	}
657 
658 	return 0;
659 }
660 
661 static void
662 bdev_remove_cb(void *remove_ctx)
663 {
664 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
665 
666 	SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
667 		     bvdev->vdev.name);
668 
669 	spdk_vhost_lock();
670 	vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
671 				  vhost_dev_bdev_remove_cpl_cb, NULL);
672 	spdk_vhost_unlock();
673 }
674 
675 static void
676 free_task_pool(struct spdk_vhost_blk_session *bvsession)
677 {
678 	struct spdk_vhost_session *vsession = &bvsession->vsession;
679 	struct spdk_vhost_virtqueue *vq;
680 	uint16_t i;
681 
682 	for (i = 0; i < vsession->max_queues; i++) {
683 		vq = &vsession->virtqueue[i];
684 		if (vq->tasks == NULL) {
685 			continue;
686 		}
687 
688 		spdk_free(vq->tasks);
689 		vq->tasks = NULL;
690 	}
691 }
692 
693 static int
694 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
695 {
696 	struct spdk_vhost_session *vsession = &bvsession->vsession;
697 	struct spdk_vhost_virtqueue *vq;
698 	struct spdk_vhost_blk_task *task;
699 	uint32_t task_cnt;
700 	uint16_t i;
701 	uint32_t j;
702 
703 	for (i = 0; i < vsession->max_queues; i++) {
704 		vq = &vsession->virtqueue[i];
705 		if (vq->vring.desc == NULL) {
706 			continue;
707 		}
708 
709 		task_cnt = vq->vring.size;
710 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
711 			/* sanity check */
712 			SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
713 				    vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
714 			free_task_pool(bvsession);
715 			return -1;
716 		}
717 		vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
718 					 SPDK_CACHE_LINE_SIZE, NULL,
719 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
720 		if (vq->tasks == NULL) {
721 			SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
722 				    vsession->name, task_cnt, i);
723 			free_task_pool(bvsession);
724 			return -1;
725 		}
726 
727 		for (j = 0; j < task_cnt; j++) {
728 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
729 			task->bvsession = bvsession;
730 			task->req_idx = j;
731 			task->vq = vq;
732 		}
733 	}
734 
735 	return 0;
736 }
737 
738 static int
739 vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
740 		   struct spdk_vhost_session *vsession, void *unused)
741 {
742 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
743 	struct spdk_vhost_blk_dev *bvdev;
744 	int i, rc = 0;
745 
746 	bvdev = to_blk_dev(vdev);
747 	assert(bvdev != NULL);
748 	bvsession->bvdev = bvdev;
749 
750 	/* validate all I/O queues are in a contiguous index range */
751 	for (i = 0; i < vsession->max_queues; i++) {
752 		if (vsession->virtqueue[i].vring.desc == NULL) {
753 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
754 			rc = -1;
755 			goto out;
756 		}
757 	}
758 
759 	rc = alloc_task_pool(bvsession);
760 	if (rc != 0) {
761 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
762 		goto out;
763 	}
764 
765 	if (bvdev->bdev) {
766 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
767 		if (!bvsession->io_channel) {
768 			free_task_pool(bvsession);
769 			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
770 			rc = -1;
771 			goto out;
772 		}
773 	}
774 
775 	bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
776 				     bvsession, 0);
777 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
778 		     vsession->name, spdk_env_get_current_core());
779 out:
780 	vhost_session_start_done(vsession, rc);
781 	return rc;
782 }
783 
784 static int
785 vhost_blk_start(struct spdk_vhost_session *vsession)
786 {
787 	return vhost_session_send_event(vsession, vhost_blk_start_cb,
788 					3, "start session");
789 }
790 
791 static int
792 destroy_session_poller_cb(void *arg)
793 {
794 	struct spdk_vhost_blk_session *bvsession = arg;
795 	struct spdk_vhost_session *vsession = &bvsession->vsession;
796 	int i;
797 
798 	if (vsession->task_cnt > 0) {
799 		return -1;
800 	}
801 
802 	if (spdk_vhost_trylock() != 0) {
803 		return -1;
804 	}
805 
806 	for (i = 0; i < vsession->max_queues; i++) {
807 		vsession->virtqueue[i].next_event_time = 0;
808 		vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
809 	}
810 
811 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
812 		     vsession->name, spdk_env_get_current_core());
813 
814 	if (bvsession->io_channel) {
815 		spdk_put_io_channel(bvsession->io_channel);
816 		bvsession->io_channel = NULL;
817 	}
818 
819 	free_task_pool(bvsession);
820 	spdk_poller_unregister(&bvsession->stop_poller);
821 	vhost_session_stop_done(vsession, 0);
822 
823 	spdk_vhost_unlock();
824 	return -1;
825 }
826 
827 static int
828 vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
829 		  struct spdk_vhost_session *vsession, void *unused)
830 {
831 	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
832 
833 	spdk_poller_unregister(&bvsession->requestq_poller);
834 	bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb,
835 				 bvsession, 1000);
836 	return 0;
837 }
838 
839 static int
840 vhost_blk_stop(struct spdk_vhost_session *vsession)
841 {
842 	return vhost_session_send_event(vsession, vhost_blk_stop_cb,
843 					3, "stop session");
844 }
845 
846 static void
847 vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
848 {
849 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
850 	struct spdk_vhost_blk_dev *bvdev;
851 
852 	bvdev = to_blk_dev(vdev);
853 	assert(bvdev != NULL);
854 	spdk_json_write_named_object_begin(w, "block");
855 
856 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
857 
858 	spdk_json_write_name(w, "bdev");
859 	if (bdev) {
860 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
861 	} else {
862 		spdk_json_write_null(w);
863 	}
864 
865 	spdk_json_write_object_end(w);
866 }
867 
868 static void
869 vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
870 {
871 	struct spdk_vhost_blk_dev *bvdev;
872 
873 	bvdev = to_blk_dev(vdev);
874 	assert(bvdev != NULL);
875 	if (!bvdev->bdev) {
876 		return;
877 	}
878 
879 	spdk_json_write_object_begin(w);
880 	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
881 
882 	spdk_json_write_named_object_begin(w, "params");
883 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
884 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
885 	spdk_json_write_named_string(w, "cpumask",
886 				     spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
887 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
888 	spdk_json_write_object_end(w);
889 
890 	spdk_json_write_object_end(w);
891 }
892 
893 static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
894 
895 static int
896 vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
897 		     uint32_t len)
898 {
899 	struct virtio_blk_config blkcfg;
900 	struct spdk_vhost_blk_dev *bvdev;
901 	struct spdk_bdev *bdev;
902 	uint32_t blk_size;
903 	uint64_t blkcnt;
904 
905 	memset(&blkcfg, 0, sizeof(blkcfg));
906 	bvdev = to_blk_dev(vdev);
907 	assert(bvdev != NULL);
908 	bdev = bvdev->bdev;
909 	if (bdev == NULL) {
910 		/* We can't just return -1 here as this GET_CONFIG message might
911 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
912 		 * error to QEMU, who might then decide to terminate itself.
913 		 * We don't want that. A simple reboot shouldn't break the system.
914 		 *
915 		 * Presenting a block device with block size 0 and block count 0
916 		 * doesn't cause any problems on QEMU side and the virtio-pci
917 		 * device is even still available inside the VM, but there will
918 		 * be no block device created for it - the kernel drivers will
919 		 * silently reject it.
920 		 */
921 		blk_size = 0;
922 		blkcnt = 0;
923 	} else {
924 		blk_size = spdk_bdev_get_block_size(bdev);
925 		blkcnt = spdk_bdev_get_num_blocks(bdev);
926 		if (spdk_bdev_get_buf_align(bdev) > 1) {
927 			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
928 			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
929 		} else {
930 			blkcfg.size_max = 131072;
931 			/*  -2 for REQ and RESP and -1 for region boundary splitting */
932 			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
933 		}
934 	}
935 
936 	blkcfg.blk_size = blk_size;
937 	/* minimum I/O size in blocks */
938 	blkcfg.min_io_size = 1;
939 	/* expressed in 512 Bytes sectors */
940 	blkcfg.capacity = (blkcnt * blk_size) / 512;
941 	/* QEMU can overwrite this value when started */
942 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
943 
944 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
945 		/* 16MiB, expressed in 512 Bytes */
946 		blkcfg.max_discard_sectors = 32768;
947 		blkcfg.max_discard_seg = 1;
948 		blkcfg.discard_sector_alignment = blk_size / 512;
949 	}
950 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
951 		blkcfg.max_write_zeroes_sectors = 32768;
952 		blkcfg.max_write_zeroes_seg = 1;
953 	}
954 
955 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
956 
957 	return 0;
958 }
959 
960 static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
961 	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
962 	.start_session =  vhost_blk_start,
963 	.stop_session = vhost_blk_stop,
964 	.vhost_get_config = vhost_blk_get_config,
965 	.dump_info_json = vhost_blk_dump_info_json,
966 	.write_config_json = vhost_blk_write_config_json,
967 	.remove_device = vhost_blk_destroy,
968 };
969 
/*
 * Build a vhost-blk controller for each "VhostBlk<N>" section found in the
 * configuration.
 *
 * A section must have a numeric suffix and a "Name" entry. "Cpumask" and
 * "ReadOnly" (default: false) are optional. A section without a "Dev" entry
 * is skipped without error.
 *
 * Returns 0 when every matching section was processed, or -1 on the first
 * malformed section or failed controller construction.
 */
int
vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *section;

	for (section = spdk_conf_first_section(NULL); section != NULL;
	     section = spdk_conf_next_section(section)) {
		const char *section_name;
		const char *ctrlr_name;
		const char *mask;
		const char *backing_dev;
		unsigned index;
		bool ro;

		if (!spdk_conf_section_match_prefix(section, "VhostBlk")) {
			continue;
		}

		section_name = spdk_conf_section_get_name(section);
		if (sscanf(section_name, "VhostBlk%u", &index) != 1) {
			SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
				    section_name);
			return -1;
		}

		ctrlr_name = spdk_conf_section_get_val(section, "Name");
		if (ctrlr_name == NULL) {
			SPDK_ERRLOG("VhostBlk%u: missing Name\n", index);
			return -1;
		}

		mask = spdk_conf_section_get_val(section, "Cpumask");
		ro = spdk_conf_section_get_boolval(section, "ReadOnly", false);
		backing_dev = spdk_conf_section_get_val(section, "Dev");

		/* No backing device configured: nothing to construct for this section. */
		if (backing_dev != NULL &&
		    spdk_vhost_blk_construct(ctrlr_name, mask, backing_dev, ro) < 0) {
			return -1;
		}
	}

	return 0;
}
1012 
1013 int
1014 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
1015 {
1016 	struct spdk_vhost_blk_dev *bvdev = NULL;
1017 	struct spdk_vhost_dev *vdev;
1018 	struct spdk_bdev *bdev;
1019 	int ret = 0;
1020 
1021 	spdk_vhost_lock();
1022 	bdev = spdk_bdev_get_by_name(dev_name);
1023 	if (bdev == NULL) {
1024 		SPDK_ERRLOG("%s: bdev '%s' not found\n",
1025 			    name, dev_name);
1026 		ret = -ENODEV;
1027 		goto out;
1028 	}
1029 
1030 	bvdev = calloc(1, sizeof(*bvdev));
1031 	if (bvdev == NULL) {
1032 		ret = -ENOMEM;
1033 		goto out;
1034 	}
1035 
1036 	vdev = &bvdev->vdev;
1037 	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
1038 	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
1039 	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
1040 
1041 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1042 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1043 	}
1044 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1045 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1046 	}
1047 	if (readonly) {
1048 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
1049 	}
1050 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1051 		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1052 	}
1053 
1054 	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
1055 	if (ret != 0) {
1056 		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
1057 			    name, dev_name, ret);
1058 		goto out;
1059 	}
1060 
1061 	bvdev->bdev = bdev;
1062 	bvdev->readonly = readonly;
1063 	ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
1064 	if (ret != 0) {
1065 		spdk_bdev_close(bvdev->bdev_desc);
1066 		goto out;
1067 	}
1068 
1069 	SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name);
1070 out:
1071 	if (ret != 0 && bvdev) {
1072 		free(bvdev);
1073 	}
1074 	spdk_vhost_unlock();
1075 	return ret;
1076 }
1077 
1078 static int
1079 vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1080 {
1081 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1082 	int rc;
1083 
1084 	assert(bvdev != NULL);
1085 	rc = vhost_dev_unregister(&bvdev->vdev);
1086 	if (rc != 0) {
1087 		return rc;
1088 	}
1089 
1090 	if (bvdev->bdev_desc) {
1091 		spdk_bdev_close(bvdev->bdev_desc);
1092 		bvdev->bdev_desc = NULL;
1093 	}
1094 	bvdev->bdev = NULL;
1095 
1096 	free(bvdev);
1097 	return 0;
1098 }
1099 
/*
 * Log components declared by this file: "vhost_blk" and "vhost_blk_data",
 * tied to the SPDK_LOG_VHOST_BLK and SPDK_LOG_VHOST_BLK_DATA flags.
 * NOTE(review): presumably the "_data" flag gates per-request data-path
 * tracing - confirm against the log call sites earlier in the file.
 */
1100 SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
1101 SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
1102