xref: /spdk/lib/vhost/vhost_blk.c (revision fa2d95b3fe66e7f5c543eaef89fa00d4eaa0e6e7)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <linux/virtio_blk.h>
35 
36 #include "spdk/env.h"
37 #include "spdk/bdev.h"
38 #include "spdk/conf.h"
39 #include "spdk/thread.h"
40 #include "spdk/likely.h"
41 #include "spdk/string.h"
42 #include "spdk/util.h"
43 #include "spdk/vhost.h"
44 
45 #include "vhost_internal.h"
46 
/*
 * Per-request context. One task exists for every slot of a virtqueue
 * (see alloc_task_pool()) and is looked up by the request's head index.
 */
struct spdk_vhost_blk_task {
	/* NOTE(review): not assigned or read anywhere in this file - confirm whether it is still needed. */
	struct spdk_bdev_io *bdev_io;
	/* Session that owns this task; set once when the task pool is allocated. */
	struct spdk_vhost_blk_session *bvsession;
	/* Virtqueue this task belongs to; set once when the task pool is allocated. */
	struct spdk_vhost_virtqueue *vq;

	/*
	 * Points at the status byte of the request (base of the last iovec), or is
	 * NULL while the request has not been parsed far enough to locate it.
	 */
	volatile uint8_t *status;

	/* Index of this task inside the virtqueue's task array (== request head index). */
	uint16_t req_idx;

	/* for io wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;

	/* If set, the task is currently used for I/O processing. */
	bool used;

	/** Number of bytes that were written. */
	uint32_t used_len;
	/*
	 * Before parsing: capacity of iovs. After parsing: number of payload
	 * iovecs (request header and status iovecs excluded).
	 */
	uint16_t iovcnt;
	/* iovs[0] is the request header, the last filled entry is the status byte. */
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
};
67 
/* vhost-blk controller: a generic vhost device bound to a single bdev. */
struct spdk_vhost_blk_dev {
	/* Generic vhost device; to_blk_dev() recovers the container from a pointer to it. */
	struct spdk_vhost_dev vdev;
	/* Backing bdev; reset to NULL once the bdev has been hot-removed. */
	struct spdk_bdev *bdev;
	/* Descriptor obtained from spdk_bdev_open(); NULL after it has been closed. */
	struct spdk_bdev_desc *bdev_desc;
	/* When true, VIRTIO_BLK_T_OUT requests are failed instead of being submitted. */
	bool readonly;
};
74 
/* Per-connection state of a vhost-blk controller. */
struct spdk_vhost_blk_session {
	/* The parent session must be the very first field in this struct */
	struct spdk_vhost_session vsession;
	/* Controller this session is attached to; assigned when the session is started. */
	struct spdk_vhost_blk_dev *bvdev;
	/* Poller running vdev_worker() or, when there is no bdev, no_bdev_vdev_worker(). */
	struct spdk_poller *requestq_poller;
	/* I/O channel of the backing bdev; NULL when the session has no bdev channel. */
	struct spdk_io_channel *io_channel;
	/* Poller running destroy_session_poller_cb() while the session is being stopped. */
	struct spdk_poller *stop_poller;
};
83 
/*
 * Forward declarations.
 *
 * The backend descriptor is referenced by to_blk_session()/to_blk_dev() before
 * its initializer appears, and process_blk_request() is called from
 * blk_request_resubmit() ahead of its definition.
 */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend;

static int
process_blk_request(struct spdk_vhost_blk_task *task,
		    struct spdk_vhost_blk_session *bvsession,
		    struct spdk_vhost_virtqueue *vq);
91 
92 static void
93 blk_task_finish(struct spdk_vhost_blk_task *task)
94 {
95 	assert(task->bvsession->vsession.task_cnt > 0);
96 	task->bvsession->vsession.task_cnt--;
97 	task->used = false;
98 }
99 
100 static void
101 invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
102 {
103 	if (task->status) {
104 		*task->status = status;
105 	}
106 
107 	spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
108 					task->used_len);
109 	blk_task_finish(task);
110 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
111 }
112 
113 /*
114  * Process task's descriptor chain and setup data related fields.
115  * Return
116  *   total size of suplied buffers
117  *
118  *   FIXME: Make this function return to rd_cnt and wr_cnt
119  */
120 static int
121 blk_iovs_setup(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq,
122 	       uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
123 {
124 	struct spdk_vhost_session *vsession = &bvsession->vsession;
125 	struct spdk_vhost_dev *vdev = vsession->vdev;
126 	struct vring_desc *desc, *desc_table;
127 	uint16_t out_cnt = 0, cnt = 0;
128 	uint32_t desc_table_size, len = 0;
129 	uint32_t desc_handled_cnt;
130 	int rc;
131 
132 	rc = spdk_vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
133 	if (rc != 0) {
134 		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
135 		return -1;
136 	}
137 
138 	desc_handled_cnt = 0;
139 	while (1) {
140 		/*
141 		 * Maximum cnt reached?
142 		 * Should not happen if request is well formatted, otherwise this is a BUG.
143 		 */
144 		if (spdk_unlikely(cnt == *iovs_cnt)) {
145 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Max IOVs in request reached (req_idx = %"PRIu16").\n",
146 				      req_idx);
147 			return -1;
148 		}
149 
150 		if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
151 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
152 				      req_idx, cnt);
153 			return -1;
154 		}
155 
156 		len += desc->len;
157 
158 		out_cnt += spdk_vhost_vring_desc_is_wr(desc);
159 
160 		rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
161 		if (rc != 0) {
162 			SPDK_ERRLOG("%s: Descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
163 				    vdev->name, req_idx);
164 			return -1;
165 		} else if (desc == NULL) {
166 			break;
167 		}
168 
169 		desc_handled_cnt++;
170 		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
171 			/* Break a cycle and report an error, if any. */
172 			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
173 				    vdev->name, desc_table_size, desc_handled_cnt);
174 			return -1;
175 		}
176 	}
177 
178 	/*
179 	 * There must be least two descriptors.
180 	 * First contain request so it must be readable.
181 	 * Last descriptor contain buffer for response so it must be writable.
182 	 */
183 	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
184 		return -1;
185 	}
186 
187 	*length = len;
188 	*iovs_cnt = cnt;
189 	return 0;
190 }
191 
192 static void
193 blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
194 {
195 	*task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
196 	spdk_vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, task->req_idx,
197 					task->used_len);
198 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
199 		      task->req_idx, success ? "OK" : "FAIL");
200 	blk_task_finish(task);
201 }
202 
/*
 * bdev completion callback. cb_arg is the task that issued the I/O; the
 * bdev_io is released first and the request is then completed towards the guest.
 */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
	blk_request_finish(success, (struct spdk_vhost_blk_task *)cb_arg);
}
211 
212 static void
213 blk_request_resubmit(void *arg)
214 {
215 	struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
216 	int rc = 0;
217 
218 	rc = process_blk_request(task, task->bvsession, task->vq);
219 	if (rc == 0) {
220 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
221 	} else {
222 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
223 	}
224 }
225 
226 static inline void
227 blk_request_queue_io(struct spdk_vhost_blk_task *task)
228 {
229 	int rc;
230 	struct spdk_vhost_blk_session *bvsession = task->bvsession;
231 	struct spdk_bdev *bdev = bvsession->bvdev->bdev;
232 
233 	task->bdev_io_wait.bdev = bdev;
234 	task->bdev_io_wait.cb_fn = blk_request_resubmit;
235 	task->bdev_io_wait.cb_arg = task;
236 
237 	rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
238 	if (rc != 0) {
239 		SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc);
240 		invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
241 	}
242 }
243 
244 static int
245 process_blk_request(struct spdk_vhost_blk_task *task,
246 		    struct spdk_vhost_blk_session *bvsession,
247 		    struct spdk_vhost_virtqueue *vq)
248 {
249 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
250 	const struct virtio_blk_outhdr *req;
251 	struct virtio_blk_discard_write_zeroes *desc;
252 	struct iovec *iov;
253 	uint32_t type;
254 	uint32_t payload_len;
255 	uint64_t flush_bytes;
256 	int rc;
257 
258 	if (blk_iovs_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) {
259 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
260 		/* Only READ and WRITE are supported for now. */
261 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
262 		return -1;
263 	}
264 
265 	iov = &task->iovs[0];
266 	if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
267 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
268 			      "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
269 			      iov->iov_len, sizeof(*req), task->req_idx);
270 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
271 		return -1;
272 	}
273 
274 	req = iov->iov_base;
275 
276 	iov = &task->iovs[task->iovcnt - 1];
277 	if (spdk_unlikely(iov->iov_len != 1)) {
278 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
279 			      "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
280 			      iov->iov_len, 1, task->req_idx);
281 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
282 		return -1;
283 	}
284 
285 	task->status = iov->iov_base;
286 	payload_len -= sizeof(*req) + sizeof(*task->status);
287 	task->iovcnt -= 2;
288 
289 	type = req->type;
290 #ifdef VIRTIO_BLK_T_BARRIER
291 	/* Don't care about barier for now (as QEMU's virtio-blk do). */
292 	type &= ~VIRTIO_BLK_T_BARRIER;
293 #endif
294 
295 	switch (type) {
296 	case VIRTIO_BLK_T_IN:
297 	case VIRTIO_BLK_T_OUT:
298 		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
299 			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
300 				    type ? "WRITE" : "READ", task->req_idx);
301 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
302 			return -1;
303 		}
304 
305 		if (type == VIRTIO_BLK_T_IN) {
306 			task->used_len = payload_len + sizeof(*task->status);
307 			rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
308 					     &task->iovs[1], task->iovcnt, req->sector * 512,
309 					     payload_len, blk_request_complete_cb, task);
310 		} else if (!bvdev->readonly) {
311 			task->used_len = sizeof(*task->status);
312 			rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
313 					      &task->iovs[1], task->iovcnt, req->sector * 512,
314 					      payload_len, blk_request_complete_cb, task);
315 		} else {
316 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
317 			rc = -1;
318 		}
319 
320 		if (rc) {
321 			if (rc == -ENOMEM) {
322 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
323 				blk_request_queue_io(task);
324 			} else {
325 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
326 				return -1;
327 			}
328 		}
329 		break;
330 	case VIRTIO_BLK_T_DISCARD:
331 		desc = task->iovs[1].iov_base;
332 		if (payload_len != sizeof(*desc)) {
333 			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
334 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
335 			return -1;
336 		}
337 
338 		rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
339 				     desc->sector * 512, desc->num_sectors * 512,
340 				     blk_request_complete_cb, task);
341 		if (rc) {
342 			if (rc == -ENOMEM) {
343 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
344 				blk_request_queue_io(task);
345 			} else {
346 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
347 				return -1;
348 			}
349 		}
350 		break;
351 	case VIRTIO_BLK_T_WRITE_ZEROES:
352 		desc = task->iovs[1].iov_base;
353 		if (payload_len != sizeof(*desc)) {
354 			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
355 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
356 			return -1;
357 		}
358 
359 		/* Zeroed and Unmap the range, SPDK doen't support it. */
360 		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
361 			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
362 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
363 			return -1;
364 		}
365 
366 		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
367 					    desc->sector * 512, desc->num_sectors * 512,
368 					    blk_request_complete_cb, task);
369 		if (rc) {
370 			if (rc == -ENOMEM) {
371 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
372 				blk_request_queue_io(task);
373 			} else {
374 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
375 				return -1;
376 			}
377 		}
378 		break;
379 	case VIRTIO_BLK_T_FLUSH:
380 		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
381 		if (req->sector != 0) {
382 			SPDK_NOTICELOG("sector must be zero for flush command\n");
383 			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
384 			return -1;
385 		}
386 		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
387 				     0, flush_bytes,
388 				     blk_request_complete_cb, task);
389 		if (rc) {
390 			if (rc == -ENOMEM) {
391 				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
392 				blk_request_queue_io(task);
393 			} else {
394 				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
395 				return -1;
396 			}
397 		}
398 		break;
399 	case VIRTIO_BLK_T_GET_ID:
400 		if (!task->iovcnt || !payload_len) {
401 			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
402 			return -1;
403 		}
404 		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
405 		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
406 				task->used_len, ' ');
407 		blk_request_finish(true, task);
408 		break;
409 	default:
410 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
411 		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
412 		return -1;
413 	}
414 
415 	return 0;
416 }
417 
418 static void
419 process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
420 {
421 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
422 	struct spdk_vhost_blk_task *task;
423 	struct spdk_vhost_session *vsession = &bvsession->vsession;
424 	int rc;
425 	uint16_t reqs[32];
426 	uint16_t reqs_cnt, i;
427 
428 	reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
429 	if (!reqs_cnt) {
430 		return;
431 	}
432 
433 	for (i = 0; i < reqs_cnt; i++) {
434 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
435 			      reqs[i]);
436 
437 		if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
438 			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
439 				    bvdev->vdev.name, reqs[i], vq->vring.size);
440 			spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
441 			continue;
442 		}
443 
444 		task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]];
445 		if (spdk_unlikely(task->used)) {
446 			SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
447 				    bvdev->vdev.name, reqs[i]);
448 			spdk_vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
449 			continue;
450 		}
451 
452 		vsession->task_cnt++;
453 
454 		task->used = true;
455 		task->iovcnt = SPDK_COUNTOF(task->iovs);
456 		task->status = NULL;
457 		task->used_len = 0;
458 
459 		rc = process_blk_request(task, bvsession, vq);
460 		if (rc == 0) {
461 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
462 				      reqs[i]);
463 		} else {
464 			SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]);
465 		}
466 	}
467 }
468 
469 static int
470 vdev_worker(void *arg)
471 {
472 	struct spdk_vhost_blk_session *bvsession = arg;
473 	struct spdk_vhost_session *vsession = &bvsession->vsession;
474 
475 	uint16_t q_idx;
476 
477 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
478 		process_vq(bvsession, &vsession->virtqueue[q_idx]);
479 	}
480 
481 	spdk_vhost_session_used_signal(vsession);
482 
483 	return -1;
484 }
485 
486 static void
487 no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
488 {
489 	struct spdk_vhost_session *vsession = &bvsession->vsession;
490 	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
491 	uint32_t length;
492 	uint16_t iovcnt, req_idx;
493 
494 	if (spdk_vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
495 		return;
496 	}
497 
498 	iovcnt = SPDK_COUNTOF(iovs);
499 	if (blk_iovs_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
500 		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
501 		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
502 	}
503 
504 	spdk_vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
505 }
506 
507 static int
508 no_bdev_vdev_worker(void *arg)
509 {
510 	struct spdk_vhost_blk_session *bvsession = arg;
511 	struct spdk_vhost_session *vsession = &bvsession->vsession;
512 	uint16_t q_idx;
513 
514 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
515 		no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
516 	}
517 
518 	spdk_vhost_session_used_signal(vsession);
519 
520 	if (vsession->task_cnt == 0 && bvsession->io_channel) {
521 		spdk_put_io_channel(bvsession->io_channel);
522 		bvsession->io_channel = NULL;
523 	}
524 
525 	return -1;
526 }
527 
528 static struct spdk_vhost_blk_session *
529 to_blk_session(struct spdk_vhost_session *vsession)
530 {
531 	if (vsession == NULL) {
532 		return NULL;
533 	}
534 
535 	if (vsession->vdev->backend != &vhost_blk_device_backend) {
536 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vsession->vdev->name);
537 		return NULL;
538 	}
539 
540 	return (struct spdk_vhost_blk_session *)vsession;
541 }
542 
543 static struct spdk_vhost_blk_dev *
544 to_blk_dev(struct spdk_vhost_dev *vdev)
545 {
546 	if (vdev == NULL) {
547 		return NULL;
548 	}
549 
550 	if (vdev->backend != &vhost_blk_device_backend) {
551 		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
552 		return NULL;
553 	}
554 
555 	return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
556 }
557 
558 struct spdk_bdev *
559 spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev)
560 {
561 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
562 
563 	assert(bvdev != NULL);
564 	return bvdev->bdev;
565 }
566 
567 static int
568 _spdk_vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, struct spdk_vhost_session *vsession,
569 				   void *ctx)
570 {
571 	struct spdk_vhost_blk_session *bvsession;
572 
573 	if (vdev == NULL) {
574 		/* Nothing to do */
575 		return 0;
576 	}
577 
578 	if (vsession == NULL) {
579 		/* All sessions have been notified, time to close the bdev */
580 		struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
581 
582 		assert(bvdev != NULL);
583 
584 		spdk_bdev_close(bvdev->bdev_desc);
585 		bvdev->bdev_desc = NULL;
586 		bvdev->bdev = NULL;
587 		return 0;
588 	}
589 
590 	bvsession = (struct spdk_vhost_blk_session *)vsession;
591 	if (bvsession->requestq_poller) {
592 		spdk_poller_unregister(&bvsession->requestq_poller);
593 		bvsession->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvsession, 0);
594 	}
595 
596 	return 0;
597 }
598 
599 static void
600 bdev_remove_cb(void *remove_ctx)
601 {
602 	struct spdk_vhost_blk_dev *bvdev = remove_ctx;
603 
604 	SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n",
605 		     bvdev->vdev.name);
606 
607 	spdk_vhost_lock();
608 	spdk_vhost_dev_foreach_session(&bvdev->vdev, _spdk_vhost_session_bdev_remove_cb, NULL);
609 	spdk_vhost_unlock();
610 }
611 
612 static void
613 free_task_pool(struct spdk_vhost_blk_session *bvsession)
614 {
615 	struct spdk_vhost_session *vsession = &bvsession->vsession;
616 	struct spdk_vhost_virtqueue *vq;
617 	uint16_t i;
618 
619 	for (i = 0; i < vsession->max_queues; i++) {
620 		vq = &vsession->virtqueue[i];
621 		if (vq->tasks == NULL) {
622 			continue;
623 		}
624 
625 		spdk_dma_free(vq->tasks);
626 		vq->tasks = NULL;
627 	}
628 }
629 
630 static int
631 alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
632 {
633 	struct spdk_vhost_session *vsession = &bvsession->vsession;
634 	struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
635 	struct spdk_vhost_virtqueue *vq;
636 	struct spdk_vhost_blk_task *task;
637 	uint32_t task_cnt;
638 	uint16_t i;
639 	uint32_t j;
640 
641 	for (i = 0; i < vsession->max_queues; i++) {
642 		vq = &vsession->virtqueue[i];
643 		if (vq->vring.desc == NULL) {
644 			continue;
645 		}
646 
647 		task_cnt = vq->vring.size;
648 		if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
649 			/* sanity check */
650 			SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
651 				    bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
652 			free_task_pool(bvsession);
653 			return -1;
654 		}
655 		vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
656 					     SPDK_CACHE_LINE_SIZE, NULL);
657 		if (vq->tasks == NULL) {
658 			SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
659 				    bvdev->vdev.name, task_cnt, i);
660 			free_task_pool(bvsession);
661 			return -1;
662 		}
663 
664 		for (j = 0; j < task_cnt; j++) {
665 			task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
666 			task->bvsession = bvsession;
667 			task->req_idx = j;
668 			task->vq = vq;
669 		}
670 	}
671 
672 	return 0;
673 }
674 
675 static int
676 spdk_vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
677 			struct spdk_vhost_session *vsession, void *unused)
678 {
679 	struct spdk_vhost_blk_dev *bvdev;
680 	struct spdk_vhost_blk_session *bvsession;
681 	int i, rc = 0;
682 
683 	bvsession = to_blk_session(vsession);
684 	if (bvsession == NULL) {
685 		SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n");
686 		rc = -1;
687 		goto out;
688 	}
689 
690 	bvdev = to_blk_dev(vdev);
691 	assert(bvdev != NULL);
692 	bvsession->bvdev = bvdev;
693 
694 	/* validate all I/O queues are in a contiguous index range */
695 	for (i = 0; i < vsession->max_queues; i++) {
696 		if (vsession->virtqueue[i].vring.desc == NULL) {
697 			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i);
698 			rc = -1;
699 			goto out;
700 		}
701 	}
702 
703 	rc = alloc_task_pool(bvsession);
704 	if (rc != 0) {
705 		SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name);
706 		goto out;
707 	}
708 
709 	if (bvdev->bdev) {
710 		bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
711 		if (!bvsession->io_channel) {
712 			free_task_pool(bvsession);
713 			SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name);
714 			rc = -1;
715 			goto out;
716 		}
717 	}
718 
719 	bvsession->requestq_poller = spdk_poller_register(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
720 				     bvsession, 0);
721 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n",
722 		     vdev->name, vsession->lcore);
723 out:
724 	spdk_vhost_session_event_done(vsession, rc);
725 	return rc;
726 }
727 
728 static int
729 spdk_vhost_blk_start(struct spdk_vhost_session *vsession)
730 {
731 	int rc;
732 
733 	vsession->lcore = spdk_vhost_allocate_reactor(vsession->vdev->cpumask);
734 	rc = spdk_vhost_session_send_event(vsession, spdk_vhost_blk_start_cb,
735 					   3, "start session");
736 
737 	if (rc != 0) {
738 		spdk_vhost_free_reactor(vsession->lcore);
739 		vsession->lcore = -1;
740 	}
741 
742 	return rc;
743 }
744 
745 static int
746 destroy_session_poller_cb(void *arg)
747 {
748 	struct spdk_vhost_blk_session *bvsession = arg;
749 	struct spdk_vhost_session *vsession = &bvsession->vsession;
750 	int i;
751 
752 	if (vsession->task_cnt > 0) {
753 		return -1;
754 	}
755 
756 	for (i = 0; i < vsession->max_queues; i++) {
757 		vsession->virtqueue[i].next_event_time = 0;
758 		spdk_vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
759 	}
760 
761 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", vsession->vdev->name);
762 
763 	if (bvsession->io_channel) {
764 		spdk_put_io_channel(bvsession->io_channel);
765 		bvsession->io_channel = NULL;
766 	}
767 
768 	free_task_pool(bvsession);
769 	spdk_poller_unregister(&bvsession->stop_poller);
770 	spdk_vhost_session_event_done(vsession, 0);
771 
772 	return -1;
773 }
774 
775 static int
776 spdk_vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
777 		       struct spdk_vhost_session *vsession, void *unused)
778 {
779 	struct spdk_vhost_blk_session *bvsession;
780 
781 	bvsession = to_blk_session(vsession);
782 	if (bvsession == NULL) {
783 		SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n");
784 		goto err;
785 	}
786 
787 	spdk_poller_unregister(&bvsession->requestq_poller);
788 	bvsession->stop_poller = spdk_poller_register(destroy_session_poller_cb,
789 				 bvsession, 1000);
790 	return 0;
791 
792 err:
793 	spdk_vhost_session_event_done(vsession, -1);
794 	return -1;
795 }
796 
797 static int
798 spdk_vhost_blk_stop(struct spdk_vhost_session *vsession)
799 {
800 	int rc;
801 
802 	rc = spdk_vhost_session_send_event(vsession, spdk_vhost_blk_stop_cb,
803 					   3, "stop session");
804 	if (rc != 0) {
805 		return rc;
806 	}
807 
808 	spdk_vhost_free_reactor(vsession->lcore);
809 	vsession->lcore = -1;
810 	return 0;
811 }
812 
813 static void
814 spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
815 {
816 	struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev);
817 	struct spdk_vhost_blk_dev *bvdev;
818 
819 	bvdev = to_blk_dev(vdev);
820 	if (bvdev == NULL) {
821 		return;
822 	}
823 
824 	assert(bvdev != NULL);
825 	spdk_json_write_named_object_begin(w, "block");
826 
827 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
828 
829 	spdk_json_write_name(w, "bdev");
830 	if (bdev) {
831 		spdk_json_write_string(w, spdk_bdev_get_name(bdev));
832 	} else {
833 		spdk_json_write_null(w);
834 	}
835 
836 	spdk_json_write_object_end(w);
837 }
838 
839 static void
840 spdk_vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
841 {
842 	struct spdk_vhost_blk_dev *bvdev;
843 
844 	bvdev = to_blk_dev(vdev);
845 	if (bvdev == NULL) {
846 		return;
847 	}
848 
849 	if (!bvdev->bdev) {
850 		return;
851 	}
852 
853 	spdk_json_write_object_begin(w);
854 	spdk_json_write_named_string(w, "method", "construct_vhost_blk_controller");
855 
856 	spdk_json_write_named_object_begin(w, "params");
857 	spdk_json_write_named_string(w, "ctrlr", vdev->name);
858 	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
859 	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask));
860 	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
861 	spdk_json_write_object_end(w);
862 
863 	spdk_json_write_object_end(w);
864 }
865 
/* Forward declaration: referenced by the backend descriptor before its definition. */
static int spdk_vhost_blk_destroy(struct spdk_vhost_dev *dev);
867 
868 static int
869 spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
870 			  uint32_t len)
871 {
872 	struct virtio_blk_config blkcfg;
873 	struct spdk_vhost_blk_dev *bvdev;
874 	struct spdk_bdev *bdev;
875 	uint32_t blk_size;
876 	uint64_t blkcnt;
877 
878 	bvdev = to_blk_dev(vdev);
879 	if (bvdev == NULL) {
880 		SPDK_ERRLOG("Trying to get virito_blk configuration failed\n");
881 		return -1;
882 	}
883 
884 	bdev = bvdev->bdev;
885 	if (bdev == NULL) {
886 		/* We can't just return -1 here as this GET_CONFIG message might
887 		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
888 		 * error to QEMU, who might then decide to terminate itself.
889 		 * We don't want that. A simple reboot shouldn't break the system.
890 		 *
891 		 * Presenting a block device with block size 0 and block count 0
892 		 * doesn't cause any problems on QEMU side and the virtio-pci
893 		 * device is even still available inside the VM, but there will
894 		 * be no block device created for it - the kernel drivers will
895 		 * silently reject it.
896 		 */
897 		blk_size = 0;
898 		blkcnt = 0;
899 	} else {
900 		blk_size = spdk_bdev_get_block_size(bdev);
901 		blkcnt = spdk_bdev_get_num_blocks(bdev);
902 	}
903 
904 	memset(&blkcfg, 0, sizeof(blkcfg));
905 	blkcfg.blk_size = blk_size;
906 	/* minimum I/O size in blocks */
907 	blkcfg.min_io_size = 1;
908 	/* expressed in 512 Bytes sectors */
909 	blkcfg.capacity = (blkcnt * blk_size) / 512;
910 	blkcfg.size_max = 131072;
911 	/*  -2 for REQ and RESP and -1 for region boundary splitting */
912 	blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
913 	/* QEMU can overwrite this value when started */
914 	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
915 
916 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
917 		/* 16MiB, expressed in 512 Bytes */
918 		blkcfg.max_discard_sectors = 32768;
919 		blkcfg.max_discard_seg = 1;
920 		blkcfg.discard_sector_alignment = blk_size / 512;
921 	}
922 	if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
923 		blkcfg.max_write_zeroes_sectors = 32768;
924 		blkcfg.max_write_zeroes_seg = 1;
925 	}
926 
927 	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
928 
929 	return 0;
930 }
931 
/*
 * Backend descriptor for vhost-blk devices. Its address doubles as the type
 * tag checked by to_blk_session() and to_blk_dev().
 */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
	/* Generic vhost features plus every virtio-blk feature bit this backend knows about. */
	.virtio_features = SPDK_VHOST_FEATURES |
	(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) |
	(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) |
	(1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) |
	(1ULL << VIRTIO_BLK_F_BARRIER)  | (1ULL << VIRTIO_BLK_F_SCSI) |
	(1ULL << VIRTIO_BLK_F_FLUSH)    | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_MQ)       | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	/*
	 * NOTE(review): presumably the bits switched off by default; RO, FLUSH,
	 * DISCARD and WRITE_ZEROES are enabled per controller in
	 * spdk_vhost_blk_construct() - confirm against vhost_internal.h.
	 */
	.disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) |
	(1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) |
	(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_DISCARD) |
	(1ULL << VIRTIO_BLK_F_WRITE_ZEROES),
	/* Size of the blk-specific fields that follow the generic session. */
	.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
	.start_session =  spdk_vhost_blk_start,
	.stop_session = spdk_vhost_blk_stop,
	.vhost_get_config = spdk_vhost_blk_get_config,
	.dump_info_json = spdk_vhost_blk_dump_info_json,
	.write_config_json = spdk_vhost_blk_write_config_json,
	.remove_device = spdk_vhost_blk_destroy,
};
953 
/*
 * Create a vhost-blk controller for every "VhostBlk<N>" section of the legacy
 * configuration file. Sections without a "Dev" entry are skipped.
 *
 * Returns 0 on success, -1 as soon as a section is malformed or a controller
 * cannot be constructed.
 */
int
spdk_vhost_blk_controller_construct(void)
{
	struct spdk_conf_section *section;
	unsigned section_num;
	char *ctrlr_name;
	char *mask;
	char *backing_dev;
	bool read_only;

	section = spdk_conf_first_section(NULL);
	while (section != NULL) {
		if (spdk_conf_section_match_prefix(section, "VhostBlk")) {
			if (sscanf(spdk_conf_section_get_name(section), "VhostBlk%u", &section_num) != 1) {
				SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
					    spdk_conf_section_get_name(section));
				return -1;
			}

			ctrlr_name = spdk_conf_section_get_val(section, "Name");
			if (ctrlr_name == NULL) {
				SPDK_ERRLOG("VhostBlk%u: missing Name\n", section_num);
				return -1;
			}

			mask = spdk_conf_section_get_val(section, "Cpumask");
			read_only = spdk_conf_section_get_boolval(section, "ReadOnly", false);

			backing_dev = spdk_conf_section_get_val(section, "Dev");
			if (backing_dev != NULL &&
			    spdk_vhost_blk_construct(ctrlr_name, mask, backing_dev, read_only) < 0) {
				return -1;
			}
		}

		section = spdk_conf_next_section(section);
	}

	return 0;
}
996 
997 int
998 spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly)
999 {
1000 	struct spdk_vhost_blk_dev *bvdev = NULL;
1001 	struct spdk_bdev *bdev;
1002 	uint64_t features = 0;
1003 	int ret = 0;
1004 
1005 	spdk_vhost_lock();
1006 	bdev = spdk_bdev_get_by_name(dev_name);
1007 	if (bdev == NULL) {
1008 		SPDK_ERRLOG("Controller %s: bdev '%s' not found\n",
1009 			    name, dev_name);
1010 		ret = -ENODEV;
1011 		goto out;
1012 	}
1013 
1014 	bvdev = spdk_dma_zmalloc(sizeof(*bvdev), SPDK_CACHE_LINE_SIZE, NULL);
1015 	if (bvdev == NULL) {
1016 		ret = -ENOMEM;
1017 		goto out;
1018 	}
1019 
1020 	ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc);
1021 	if (ret != 0) {
1022 		SPDK_ERRLOG("Controller %s: could not open bdev '%s', error=%d\n",
1023 			    name, dev_name, ret);
1024 		goto out;
1025 	}
1026 
1027 	bvdev->bdev = bdev;
1028 	bvdev->readonly = readonly;
1029 	ret = spdk_vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend);
1030 	if (ret != 0) {
1031 		spdk_bdev_close(bvdev->bdev_desc);
1032 		goto out;
1033 	}
1034 
1035 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1036 		features |= (1ULL << VIRTIO_BLK_F_DISCARD);
1037 	}
1038 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1039 		features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
1040 	}
1041 	if (readonly) {
1042 		features |= (1ULL << VIRTIO_BLK_F_RO);
1043 	}
1044 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1045 		features |= (1ULL << VIRTIO_BLK_F_FLUSH);
1046 	}
1047 
1048 	if (features && rte_vhost_driver_enable_features(bvdev->vdev.path, features)) {
1049 		SPDK_ERRLOG("Controller %s: failed to enable features 0x%"PRIx64"\n", name, features);
1050 
1051 		if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) {
1052 			SPDK_ERRLOG("Controller %s: failed to remove controller\n", name);
1053 		}
1054 
1055 		spdk_bdev_close(bvdev->bdev_desc);
1056 		ret = -1;
1057 		goto out;
1058 	}
1059 
1060 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name);
1061 out:
1062 	if (ret != 0 && bvdev) {
1063 		spdk_dma_free(bvdev);
1064 	}
1065 	spdk_vhost_unlock();
1066 	return ret;
1067 }
1068 
1069 static int
1070 spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev)
1071 {
1072 	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
1073 	int rc;
1074 
1075 	if (!bvdev) {
1076 		return -EINVAL;
1077 	}
1078 
1079 	rc = spdk_vhost_dev_unregister(&bvdev->vdev);
1080 	if (rc != 0) {
1081 		return rc;
1082 	}
1083 
1084 	if (bvdev->bdev_desc) {
1085 		spdk_bdev_close(bvdev->bdev_desc);
1086 		bvdev->bdev_desc = NULL;
1087 	}
1088 	bvdev->bdev = NULL;
1089 
1090 	spdk_dma_free(bvdev);
1091 	return 0;
1092 }
1093 
/* Log flags behind the SPDK_LOG_VHOST_BLK and SPDK_LOG_VHOST_BLK_DATA debug messages in this file. */
SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
1096