xref: /dpdk/examples/vhost_blk/vhost_blk.c (revision f69ed1044230c218c9afd8f1b47b6fe6aa1eeec5)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2019 Intel Corporation
 */

#include <stdint.h>
#include <unistd.h>
#include <stdbool.h>
#include <signal.h>
#include <assert.h>
#include <semaphore.h>
#include <linux/virtio_blk.h>
#include <linux/virtio_ring.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_log.h>
#include <rte_malloc.h>
#include <rte_vhost.h>

#include "vhost_blk.h"
#include "blk_spec.h"

#define VIRTQ_DESC_F_NEXT	1
#define VIRTQ_DESC_F_AVAIL	(1 << 7)
#define VIRTQ_DESC_F_USED	(1 << 15)

#define MAX_TASK		12

#define VHOST_BLK_FEATURES ((1ULL << VIRTIO_F_RING_PACKED) | \
			    (1ULL << VIRTIO_F_VERSION_1) | \
			    (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
			    (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))

/* Path of the vhost-user socket file; filled in when the controller is constructed. */
static char dev_pathname[PATH_MAX] = "";
static sem_t exit_sem;
static int g_should_stop = -1;

struct vhost_blk_ctrlr *
vhost_blk_ctrlr_find(const char *ctrlr_name)
{
	if (ctrlr_name == NULL)
		return NULL;

	/* currently we only support 1 socket file fd */
	return g_vhost_ctrlr;
}

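/*
 * Translate a guest physical address into a host virtual address using the
 * controller's memory table. *len is updated to the length of the region
 * that is contiguously mapped.
 */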
static uint64_t gpa_to_vva(int vid, uint64_t gpa, uint64_t *len)
{
	char path[PATH_MAX];
	struct vhost_blk_ctrlr *ctrlr;
	int ret = 0;

	ret = rte_vhost_get_ifname(vid, path, PATH_MAX);
	if (ret) {
		fprintf(stderr, "Cannot get socket name\n");
		assert(ret == 0);
	}

	ctrlr = vhost_blk_ctrlr_find(path);
	if (!ctrlr) {
		fprintf(stderr, "Controller is not ready\n");
		assert(ctrlr != NULL);
	}

	assert(ctrlr->mem != NULL);

	return rte_vhost_va_from_guest_pa(ctrlr->mem, gpa, len);
}

static struct vring_packed_desc *
descriptor_get_next_packed(struct rte_vhost_vring *vq,
			     uint16_t *idx)
{
	if (vq->desc_packed[*idx % vq->size].flags & VIRTQ_DESC_F_NEXT) {
		*idx += 1;
		return &vq->desc_packed[*idx % vq->size];
	}

	return NULL;
}

static bool
descriptor_has_next_packed(struct vring_packed_desc *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}

static bool
descriptor_is_wr_packed(struct vring_packed_desc *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_WRITE);
}

static struct rte_vhost_inflight_desc_packed *
inflight_desc_get_next(struct rte_vhost_inflight_info_packed *inflight_packed,
			       struct rte_vhost_inflight_desc_packed *cur_desc)
{
	if (!!(cur_desc->flags & VIRTQ_DESC_F_NEXT))
		return &inflight_packed->desc[cur_desc->next];

	return NULL;
}

static bool
inflight_desc_has_next(struct rte_vhost_inflight_desc_packed *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}

static bool
inflight_desc_is_wr(struct rte_vhost_inflight_desc_packed *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_WRITE);
}

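/*
 * Walk the payload descriptors of an inflight (resubmitted) request,
 * turning each buffer into an iovec entry, then map the final descriptor
 * as the one-byte status field written back to the guest.
 */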
static void
inflight_process_payload_chain_packed(struct inflight_blk_task *task)
{
	void *data;
	uint64_t chunck_len;
	struct vhost_blk_task *blk_task;
	struct rte_vhost_inflight_desc_packed *desc;

	blk_task = &task->blk_task;
	blk_task->iovs_cnt = 0;

	do {
		desc = task->inflight_desc;
		chunck_len = desc->len;
		data = (void *)(uintptr_t)gpa_to_vva(blk_task->bdev->vid,
						     desc->addr,
						     &chunck_len);
		if (!data || chunck_len != desc->len) {
			fprintf(stderr, "failed to translate desc address.\n");
			return;
		}

		blk_task->iovs[blk_task->iovs_cnt].iov_base = data;
		blk_task->iovs[blk_task->iovs_cnt].iov_len = desc->len;
		blk_task->data_len += desc->len;
		blk_task->iovs_cnt++;
		task->inflight_desc = inflight_desc_get_next(
					task->inflight_packed, desc);
	} while (inflight_desc_has_next(task->inflight_desc));

	chunck_len = task->inflight_desc->len;
	blk_task->status = (void *)(uintptr_t)gpa_to_vva(
		blk_task->bdev->vid, task->inflight_desc->addr, &chunck_len);
	if (!blk_task->status || chunck_len != task->inflight_desc->len)
		fprintf(stderr, "failed to translate desc address.\n");
}

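/*
 * Complete a resubmitted request on a packed ring: record it as the last
 * inflight I/O, publish the used descriptor by flipping its AVAIL/USED
 * flags according to the wrap counter, clear its inflight entry and
 * interrupt the guest.
 */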
static void
inflight_submit_completion_packed(struct inflight_blk_task *task,
					      uint32_t q_idx, uint16_t *used_id,
					      bool *used_wrap_counter)
{
	struct vhost_blk_ctrlr *ctrlr;
	struct rte_vhost_vring *vq;
	struct vring_packed_desc *desc;
	int ret;

	ctrlr = vhost_blk_ctrlr_find(dev_pathname);
	vq = task->blk_task.vq;

	ret = rte_vhost_set_last_inflight_io_packed(ctrlr->bdev->vid, q_idx,
						    task->blk_task.head_idx);
	if (ret != 0)
		fprintf(stderr, "failed to set last inflight io\n");

	desc = &vq->desc_packed[*used_id];
	desc->id = task->blk_task.buffer_id;
	rte_smp_mb();
	if (*used_wrap_counter)
		desc->flags |= VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED;
	else
		desc->flags &= ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED);
	rte_smp_mb();

	*used_id += task->blk_task.iovs_cnt + 2;
	if (*used_id >= vq->size) {
		*used_id -= vq->size;
		*used_wrap_counter = !(*used_wrap_counter);
	}

	ret = rte_vhost_clr_inflight_desc_packed(ctrlr->bdev->vid, q_idx,
						 task->blk_task.head_idx);
	if (ret != 0)
		fprintf(stderr, "failed to clear inflight io\n");

	/* Send an interrupt back to the guest VM so that it knows
	 * a completion is ready to be processed.
	 */
	rte_vhost_vring_call(task->blk_task.bdev->vid, q_idx);
}

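/*
 * Complete a newly processed request on a packed ring: record it as the
 * last inflight I/O, publish the used descriptor (id plus AVAIL/USED
 * flags), clear its inflight entry and interrupt the guest.
 */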
static void
submit_completion_packed(struct vhost_blk_task *task, uint32_t q_idx,
				  uint16_t *used_id, bool *used_wrap_counter)
{
	struct vhost_blk_ctrlr *ctrlr;
	struct rte_vhost_vring *vq;
	struct vring_packed_desc *desc;
	int ret;

	ctrlr = vhost_blk_ctrlr_find(dev_pathname);
	vq = task->vq;

	ret = rte_vhost_set_last_inflight_io_packed(ctrlr->bdev->vid, q_idx,
						    task->inflight_idx);
	if (ret != 0)
		fprintf(stderr, "failed to set last inflight io\n");

	desc = &vq->desc_packed[*used_id];
	desc->id = task->buffer_id;
	rte_smp_mb();
	if (*used_wrap_counter)
		desc->flags |= VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED;
	else
		desc->flags &= ~(VIRTQ_DESC_F_AVAIL | VIRTQ_DESC_F_USED);
	rte_smp_mb();

	*used_id += task->iovs_cnt + 2;
	if (*used_id >= vq->size) {
		*used_id -= vq->size;
		*used_wrap_counter = !(*used_wrap_counter);
	}

	ret = rte_vhost_clr_inflight_desc_packed(ctrlr->bdev->vid, q_idx,
						 task->inflight_idx);
	if (ret != 0)
		fprintf(stderr, "failed to clear inflight io\n");

	/* Send an interrupt back to the guest VM so that it knows
	 * a completion is ready to be processed.
	 */
	rte_vhost_vring_call(task->bdev->vid, q_idx);
}

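/*
 * Gather the payload descriptors of a packed-ring request into the task's
 * iovec array and map the trailing descriptor as the status byte.
 */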
static void
vhost_process_payload_chain_packed(struct vhost_blk_task *task,
	uint16_t *idx)
{
	void *data;
	uint64_t chunck_len;

	task->iovs_cnt = 0;

	do {
		chunck_len = task->desc_packed->len;
		data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
						     task->desc_packed->addr,
						     &chunck_len);
		if (!data || chunck_len != task->desc_packed->len) {
			fprintf(stderr, "failed to translate desc address.\n");
			return;
		}

		task->iovs[task->iovs_cnt].iov_base = data;
		task->iovs[task->iovs_cnt].iov_len = task->desc_packed->len;
		task->data_len += task->desc_packed->len;
		task->iovs_cnt++;
		task->desc_packed = descriptor_get_next_packed(task->vq, idx);
	} while (descriptor_has_next_packed(task->desc_packed));

	task->last_idx = *idx % task->vq->size;
	chunck_len = task->desc_packed->len;
	task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
						     task->desc_packed->addr,
						     &chunck_len);
	if (!task->status || chunck_len != task->desc_packed->len)
		fprintf(stderr, "failed to translate desc address.\n");
}

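/*
 * A packed-ring descriptor is available when its AVAIL flag matches the
 * driver's wrap counter and its USED flag does not.
 */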
static int
descriptor_is_available(struct rte_vhost_vring *vring, uint16_t idx,
					bool avail_wrap_counter)
{
	uint16_t flags = vring->desc_packed[idx].flags;

	return ((!!(flags & VIRTQ_DESC_F_AVAIL) == avail_wrap_counter) &&
		(!!(flags & VIRTQ_DESC_F_USED) != avail_wrap_counter));
}

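/*
 * Poll one packed virtqueue: for each available descriptor chain, map the
 * request header, payload and status buffers, record the request as
 * inflight, execute it and post the completion back to the guest.
 */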
static void
process_requestq_packed(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx)
{
	bool avail_wrap_counter, used_wrap_counter;
	uint16_t avail_idx, used_idx;
	int ret;
	uint64_t chunck_len;
	struct vhost_blk_queue *blk_vq;
	struct rte_vhost_vring *vq;
	struct vhost_blk_task *task;

	blk_vq = &ctrlr->bdev->queues[q_idx];
	vq = &blk_vq->vq;

	avail_idx = blk_vq->last_avail_idx;
	avail_wrap_counter = blk_vq->avail_wrap_counter;
	used_idx = blk_vq->last_used_idx;
	used_wrap_counter = blk_vq->used_wrap_counter;

	task = rte_zmalloc(NULL, sizeof(*task), 0);
	assert(task != NULL);
	task->vq = vq;
	task->bdev = ctrlr->bdev;

	while (descriptor_is_available(vq, avail_idx, avail_wrap_counter)) {
		task->head_idx = avail_idx;
		task->desc_packed = &task->vq->desc_packed[task->head_idx];
		task->iovs_cnt = 0;
		task->data_len = 0;
		task->req = NULL;
		task->status = NULL;

		/* indirect descriptors are not supported */
		assert((task->desc_packed->flags & VRING_DESC_F_INDIRECT) == 0);

		chunck_len = task->desc_packed->len;
		task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
			task->desc_packed->addr, &chunck_len);
		if (!task->req || chunck_len != task->desc_packed->len) {
			fprintf(stderr, "failed to translate desc address.\n");
			rte_free(task);
			return;
		}

		task->desc_packed = descriptor_get_next_packed(task->vq,
								&avail_idx);
		assert(task->desc_packed != NULL);
		if (!descriptor_has_next_packed(task->desc_packed)) {
			task->dxfer_dir = BLK_DIR_NONE;
			task->last_idx = avail_idx % vq->size;
			chunck_len = task->desc_packed->len;
			task->status = (void *)(uintptr_t)
					      gpa_to_vva(task->bdev->vid,
							task->desc_packed->addr,
							&chunck_len);
			if (!task->status ||
				chunck_len != task->desc_packed->len) {
				fprintf(stderr,
					"failed to translate desc address.\n");
				rte_free(task);
				return;
			}
		} else {
			task->readtype = descriptor_is_wr_packed(
							task->desc_packed);
			vhost_process_payload_chain_packed(task, &avail_idx);
		}
		task->buffer_id = vq->desc_packed[task->last_idx].id;
		rte_vhost_set_inflight_desc_packed(ctrlr->bdev->vid, q_idx,
						   task->head_idx,
						   task->last_idx,
						   &task->inflight_idx);

		if (++avail_idx >= vq->size) {
			avail_idx -= vq->size;
			avail_wrap_counter = !avail_wrap_counter;
		}
		blk_vq->last_avail_idx = avail_idx;
		blk_vq->avail_wrap_counter = avail_wrap_counter;

		ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task);
		if (ret) {
			/* invalid response */
			*task->status = VIRTIO_BLK_S_IOERR;
		} else {
			/* success */
			*task->status = VIRTIO_BLK_S_OK;
		}

		submit_completion_packed(task, q_idx, &used_idx,
						&used_wrap_counter);
		blk_vq->last_used_idx = used_idx;
		blk_vq->used_wrap_counter = used_wrap_counter;
	}

	rte_free(task);
}

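/*
 * Resubmit the requests that were still inflight when the previous
 * connection went down (packed ring). The descriptors are taken from the
 * shared inflight region instead of the avail ring.
 */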
static void
submit_inflight_vq_packed(struct vhost_blk_ctrlr *ctrlr,
	uint16_t q_idx)
{
	bool used_wrap_counter;
	int req_idx, ret;
	uint16_t used_idx;
	uint64_t chunck_len;
	struct vhost_blk_queue *blk_vq;
	struct rte_vhost_ring_inflight *inflight_vq;
	struct rte_vhost_resubmit_info *resubmit_info;
	struct rte_vhost_vring *vq;
	struct inflight_blk_task *task;
	struct vhost_blk_task *blk_task;
	struct rte_vhost_inflight_info_packed *inflight_info;

	blk_vq = &ctrlr->bdev->queues[q_idx];
	vq = &blk_vq->vq;
	inflight_vq = &blk_vq->inflight_vq;
	resubmit_info = inflight_vq->resubmit_inflight;
	inflight_info = inflight_vq->inflight_packed;
	used_idx = blk_vq->last_used_idx;
	used_wrap_counter = blk_vq->used_wrap_counter;

	task = rte_malloc(NULL, sizeof(*task), 0);
	if (!task) {
		fprintf(stderr, "failed to allocate memory\n");
		return;
	}
	blk_task = &task->blk_task;
	blk_task->vq = vq;
	blk_task->bdev = ctrlr->bdev;
	task->inflight_packed = inflight_vq->inflight_packed;

	while (resubmit_info->resubmit_num-- > 0) {
		req_idx = resubmit_info->resubmit_num;
		blk_task->head_idx =
			resubmit_info->resubmit_list[req_idx].index;
		task->inflight_desc =
			&inflight_info->desc[blk_task->head_idx];
		task->blk_task.iovs_cnt = 0;
		task->blk_task.data_len = 0;
		task->blk_task.req = NULL;
		task->blk_task.status = NULL;

		/* update the avail index too,
		 * since its initial value equals the used index
		 */
		blk_vq->last_avail_idx += task->inflight_desc->num;
		if (blk_vq->last_avail_idx >= vq->size) {
			blk_vq->last_avail_idx -= vq->size;
			blk_vq->avail_wrap_counter =
				!blk_vq->avail_wrap_counter;
		}

		/* indirect descriptors are not supported */
		assert(task->inflight_desc != NULL);
		assert((task->inflight_desc->flags &
			VRING_DESC_F_INDIRECT) == 0);

		chunck_len = task->inflight_desc->len;
		blk_task->req = (void *)(uintptr_t)
				     gpa_to_vva(blk_task->bdev->vid,
						task->inflight_desc->addr,
						&chunck_len);
		if (!blk_task->req ||
			chunck_len != task->inflight_desc->len) {
			fprintf(stderr, "failed to translate desc address.\n");
			rte_free(task);
			return;
		}

		task->inflight_desc = inflight_desc_get_next(
			task->inflight_packed, task->inflight_desc);
		assert(task->inflight_desc != NULL);
		if (!inflight_desc_has_next(task->inflight_desc)) {
			blk_task->dxfer_dir = BLK_DIR_NONE;
			chunck_len = task->inflight_desc->len;
			blk_task->status = (void *)(uintptr_t)
				gpa_to_vva(blk_task->bdev->vid,
						task->inflight_desc->addr,
						&chunck_len);
			if (!blk_task->status ||
			    chunck_len != task->inflight_desc->len) {
				fprintf(stderr,
					"failed to translate desc address.\n");
				rte_free(task);
				return;
			}
		} else {
			blk_task->readtype =
				inflight_desc_is_wr(task->inflight_desc);
			inflight_process_payload_chain_packed(task);
		}

		blk_task->buffer_id = task->inflight_desc->id;

		ret = vhost_bdev_process_blk_commands(ctrlr->bdev, blk_task);
		if (ret)
			/* invalid response */
			*blk_task->status = VIRTIO_BLK_S_IOERR;
		else
			/* success */
			*blk_task->status = VIRTIO_BLK_S_OK;

		inflight_submit_completion_packed(task, q_idx, &used_idx,
						  &used_wrap_counter);

		blk_vq->last_used_idx = used_idx;
		blk_vq->used_wrap_counter = used_wrap_counter;
	}

	rte_free(task);
}

static struct vring_desc *
descriptor_get_next_split(struct vring_desc *vq_desc,
				   struct vring_desc *cur_desc)
{
	return &vq_desc[cur_desc->next];
}

static bool
descriptor_has_next_split(struct vring_desc *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_NEXT);
}

static bool
descriptor_is_wr_split(struct vring_desc *cur_desc)
{
	return !!(cur_desc->flags & VRING_DESC_F_WRITE);
}

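/*
 * Split-ring counterpart of the payload walk above: collect the data
 * descriptors into iovecs and map the last descriptor as the status byte.
 */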
static void
vhost_process_payload_chain_split(struct vhost_blk_task *task)
{
	void *data;
	uint64_t chunck_len;

	task->iovs_cnt = 0;

	do {
		chunck_len = task->desc_split->len;
		data = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
						     task->desc_split->addr,
						     &chunck_len);
		if (!data || chunck_len != task->desc_split->len) {
			fprintf(stderr, "failed to translate desc address.\n");
			return;
		}

		task->iovs[task->iovs_cnt].iov_base = data;
		task->iovs[task->iovs_cnt].iov_len = task->desc_split->len;
		task->data_len += task->desc_split->len;
		task->iovs_cnt++;
		task->desc_split =
			descriptor_get_next_split(task->vq->desc,
						  task->desc_split);
	} while (descriptor_has_next_split(task->desc_split));

	chunck_len = task->desc_split->len;
	task->status = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
						     task->desc_split->addr,
						     &chunck_len);
	if (!task->status || chunck_len != task->desc_split->len)
		fprintf(stderr, "failed to translate desc address.\n");
}

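/*
 * Post a completion on a split ring: fill the next used-ring entry,
 * advance used->idx, clear the inflight entry and interrupt the guest.
 */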
static void
submit_completion_split(struct vhost_blk_task *task, uint32_t vid,
	uint32_t q_idx)
{
	struct rte_vhost_vring *vq;
	struct vring_used *used;

	vq = task->vq;
	used = vq->used;

	rte_vhost_set_last_inflight_io_split(vid, q_idx, task->req_idx);

	/* Fill out the next entry in the "used" ring.  id = the
	 * index of the descriptor that contained the blk request.
	 * len = the total amount of data transferred for the blk
	 * request. We must report the correct len, for variable
	 * length blk requests, where we may return less data than
	 * allocated by the guest VM.
	 */
	used->ring[used->idx & (vq->size - 1)].id = task->req_idx;
	used->ring[used->idx & (vq->size - 1)].len = task->data_len;
	rte_smp_mb();
	used->idx++;
	rte_smp_mb();

	rte_vhost_clr_inflight_desc_split(vid, q_idx, used->idx, task->req_idx);

	/* Send an interrupt back to the guest VM so that it knows
	 * a completion is ready to be processed.
	 */
	rte_vhost_vring_call(task->bdev->vid, q_idx);
}

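/*
 * Resubmit the requests recorded in the split-ring inflight region after a
 * reconnection, processing and completing each one in turn.
 */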
static void
submit_inflight_vq_split(struct vhost_blk_ctrlr *ctrlr,
	uint32_t q_idx)
{
	struct vhost_blk_queue *blk_vq;
	struct rte_vhost_ring_inflight *inflight_vq;
	struct rte_vhost_resubmit_info *resubmit_inflight;
	struct rte_vhost_resubmit_desc *resubmit_list;
	struct vhost_blk_task *task;
	int req_idx;
	uint64_t chunck_len;
	int ret;

	blk_vq = &ctrlr->bdev->queues[q_idx];
	inflight_vq = &blk_vq->inflight_vq;
	resubmit_inflight = inflight_vq->resubmit_inflight;
	resubmit_list = resubmit_inflight->resubmit_list;

	task = rte_zmalloc(NULL, sizeof(*task), 0);
	assert(task != NULL);

	task->ctrlr = ctrlr;
	task->bdev = ctrlr->bdev;
	task->vq = &blk_vq->vq;

	while (resubmit_inflight->resubmit_num-- > 0) {
		req_idx = resubmit_list[resubmit_inflight->resubmit_num].index;
		task->req_idx = req_idx;
		task->desc_split = &task->vq->desc[task->req_idx];
		task->iovs_cnt = 0;
		task->data_len = 0;
		task->req = NULL;
		task->status = NULL;

		/* indirect descriptors are not supported */
		assert(task->desc_split != NULL);
		assert((task->desc_split->flags & VRING_DESC_F_INDIRECT) == 0);

		chunck_len = task->desc_split->len;
		task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
				task->desc_split->addr, &chunck_len);
		if (!task->req || chunck_len != task->desc_split->len) {
			fprintf(stderr, "failed to translate desc address.\n");
			rte_free(task);
			return;
		}

		task->desc_split = descriptor_get_next_split(task->vq->desc,
							     task->desc_split);
		if (!descriptor_has_next_split(task->desc_split)) {
			task->dxfer_dir = BLK_DIR_NONE;
			chunck_len = task->desc_split->len;
			task->status = (void *)(uintptr_t)
				       gpa_to_vva(task->bdev->vid,
						  task->desc_split->addr,
						  &chunck_len);
			if (!task->status ||
				chunck_len != task->desc_split->len) {
				fprintf(stderr,
					"failed to translate desc address.\n");
				rte_free(task);
				return;
			}
		} else {
			task->readtype =
				descriptor_is_wr_split(task->desc_split);
			vhost_process_payload_chain_split(task);
		}

		ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task);
		if (ret) {
			/* invalid response */
			*task->status = VIRTIO_BLK_S_IOERR;
		} else {
			/* success */
			*task->status = VIRTIO_BLK_S_OK;
		}
		submit_completion_split(task, ctrlr->bdev->vid, q_idx);
	}

	rte_free(task);
}

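/*
 * Poll one split virtqueue: walk every new entry in the avail ring, map
 * its descriptor chain, mark it inflight, execute the block command and
 * post the completion.
 */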
static void
process_requestq_split(struct vhost_blk_ctrlr *ctrlr, uint32_t q_idx)
{
	int ret;
	int req_idx;
	uint16_t last_idx;
	uint64_t chunck_len;
	struct vhost_blk_queue *blk_vq;
	struct rte_vhost_vring *vq;
	struct vhost_blk_task *task;

	blk_vq = &ctrlr->bdev->queues[q_idx];
	vq = &blk_vq->vq;

	task = rte_zmalloc(NULL, sizeof(*task), 0);
	assert(task != NULL);
	task->ctrlr = ctrlr;
	task->bdev = ctrlr->bdev;
	task->vq = vq;

	while (vq->avail->idx != blk_vq->last_avail_idx) {
		last_idx = blk_vq->last_avail_idx & (vq->size - 1);
		req_idx = vq->avail->ring[last_idx];
		task->req_idx = req_idx;
		task->desc_split = &task->vq->desc[task->req_idx];
		task->iovs_cnt = 0;
		task->data_len = 0;
		task->req = NULL;
		task->status = NULL;

		rte_vhost_set_inflight_desc_split(ctrlr->bdev->vid, q_idx,
							task->req_idx);

		/* indirect descriptors are not supported */
		assert((task->desc_split->flags & VRING_DESC_F_INDIRECT) == 0);

		chunck_len = task->desc_split->len;
		task->req = (void *)(uintptr_t)gpa_to_vva(task->bdev->vid,
				task->desc_split->addr, &chunck_len);
		if (!task->req || chunck_len != task->desc_split->len) {
			fprintf(stderr, "failed to translate desc address.\n");
			rte_free(task);
			return;
		}

		task->desc_split = descriptor_get_next_split(task->vq->desc,
							     task->desc_split);
		if (!descriptor_has_next_split(task->desc_split)) {
			task->dxfer_dir = BLK_DIR_NONE;
			chunck_len = task->desc_split->len;
			task->status = (void *)(uintptr_t)
					      gpa_to_vva(task->bdev->vid,
							 task->desc_split->addr,
							 &chunck_len);
			if (!task->status ||
				chunck_len != task->desc_split->len) {
				fprintf(stderr,
					"failed to translate desc address.\n");
				rte_free(task);
				return;
			}
		} else {
			task->readtype =
				descriptor_is_wr_split(task->desc_split);
			vhost_process_payload_chain_split(task);
		}
		blk_vq->last_avail_idx++;

		ret = vhost_bdev_process_blk_commands(ctrlr->bdev, task);
		if (ret) {
			/* invalid response */
			*task->status = VIRTIO_BLK_S_IOERR;
		} else {
			/* success */
			*task->status = VIRTIO_BLK_S_OK;
		}

		submit_completion_split(task, ctrlr->bdev->vid, q_idx);
	}

	rte_free(task);
}

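/*
 * Per-controller worker thread: first resubmit any requests left inflight
 * by a previous connection, then poll all request queues until the device
 * is stopped.
 */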
static void *
ctrlr_worker(void *arg)
{
	struct vhost_blk_ctrlr *ctrlr = (struct vhost_blk_ctrlr *)arg;
	struct vhost_blk_queue *blk_vq;
	struct rte_vhost_ring_inflight *inflight_vq;
	cpu_set_t cpuset;
	pthread_t thread;
	int i;

	fprintf(stdout, "Ctrlr Worker Thread start\n");

	if (ctrlr == NULL || ctrlr->bdev == NULL) {
		fprintf(stderr,
			"%s: Error, invalid argument passed to worker thread\n",
			__func__);
		exit(1);
	}

	thread = pthread_self();
	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);
	pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);

	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		blk_vq = &ctrlr->bdev->queues[i];
		inflight_vq = &blk_vq->inflight_vq;
		if (inflight_vq->resubmit_inflight != NULL &&
		    inflight_vq->resubmit_inflight->resubmit_num != 0) {
			if (ctrlr->packed_ring)
				submit_inflight_vq_packed(ctrlr, i);
			else
				submit_inflight_vq_split(ctrlr, i);
		}
	}

	while (!g_should_stop && ctrlr->bdev != NULL) {
		for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
			if (ctrlr->packed_ring)
				process_requestq_packed(ctrlr, i);
			else
				process_requestq_split(ctrlr, i);
		}
	}

	g_should_stop = 2;
	fprintf(stdout, "Ctrlr Worker Thread Exiting\n");
	sem_post(&exit_sem);
	return NULL;
}

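/*
 * vhost-user "new device" callback: fetch the negotiated features and the
 * memory table, restore the vring and inflight state of every queue, and
 * start the polling worker thread.
 */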
static int
new_device(int vid)
{
	struct vhost_blk_ctrlr *ctrlr;
	struct vhost_blk_queue *blk_vq;
	struct rte_vhost_vring *vq;
	uint64_t features;
	pthread_t tid;
	int i, ret;

	ctrlr = vhost_blk_ctrlr_find(dev_pathname);
	if (!ctrlr) {
		fprintf(stderr, "Controller is not ready\n");
		return -1;
	}

	if (ctrlr->started)
		return 0;

	ctrlr->bdev->vid = vid;
	ret = rte_vhost_get_negotiated_features(vid, &features);
	if (ret) {
		fprintf(stderr, "failed to get the negotiated features\n");
		return -1;
	}
	ctrlr->packed_ring = !!(features & (1ULL << VIRTIO_F_RING_PACKED));

	ret = rte_vhost_get_mem_table(vid, &ctrlr->mem);
	if (ret)
		fprintf(stderr, "Get Controller memory region failed\n");
	assert(ctrlr->mem != NULL);

	/* Disable notifications and init the last indexes */
	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		blk_vq = &ctrlr->bdev->queues[i];
		vq = &blk_vq->vq;

		ret = rte_vhost_get_vhost_vring(ctrlr->bdev->vid, i, vq);
		assert(ret == 0);

		ret = rte_vhost_get_vring_base(ctrlr->bdev->vid, i,
					       &blk_vq->last_avail_idx,
					       &blk_vq->last_used_idx);
		assert(ret == 0);

		ret = rte_vhost_get_vhost_ring_inflight(ctrlr->bdev->vid, i,
							&blk_vq->inflight_vq);
		assert(ret == 0);

		if (ctrlr->packed_ring) {
			/* for the reconnection */
			ret = rte_vhost_get_vring_base_from_inflight(
				ctrlr->bdev->vid, i,
				&blk_vq->last_avail_idx,
				&blk_vq->last_used_idx);
			assert(ret == 0);

			blk_vq->avail_wrap_counter = blk_vq->last_avail_idx &
				(1 << 15);
			blk_vq->last_avail_idx = blk_vq->last_avail_idx &
				0x7fff;
			blk_vq->used_wrap_counter = blk_vq->last_used_idx &
				(1 << 15);
			blk_vq->last_used_idx = blk_vq->last_used_idx &
				0x7fff;
		}

		rte_vhost_enable_guest_notification(vid, i, 0);
	}

	/* start polling the vrings */
	g_should_stop = 0;
	fprintf(stdout, "New Device %s, Device ID %d\n", dev_pathname, vid);
	if (pthread_create(&tid, NULL, &ctrlr_worker, ctrlr) < 0) {
		fprintf(stderr, "Failed to start worker thread\n");
		return -1;
	}

	/* device has been started */
	ctrlr->started = 1;
	pthread_detach(tid);
	return 0;
}

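/*
 * vhost-user "destroy device" callback: stop the worker thread, save the
 * ring indexes (with the wrap counters folded in for packed rings) so a
 * reconnect can resume, and release the memory table.
 */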
static void
destroy_device(int vid)
{
	char path[PATH_MAX];
	struct vhost_blk_ctrlr *ctrlr;
	struct vhost_blk_queue *blk_vq;
	int i, ret;

	ret = rte_vhost_get_ifname(vid, path, PATH_MAX);
	if (ret) {
		fprintf(stderr, "Destroy Ctrlr Failed\n");
		return;
	}

	fprintf(stdout, "Destroy %s Device ID %d\n", path, vid);
	ctrlr = vhost_blk_ctrlr_find(path);
	if (!ctrlr) {
		fprintf(stderr, "Destroy Ctrlr Failed\n");
		return;
	}

	if (!ctrlr->started)
		return;

	g_should_stop = 1;
	while (g_should_stop != 2)
		;

	for (i = 0; i < NUM_OF_BLK_QUEUES; i++) {
		blk_vq = &ctrlr->bdev->queues[i];
		if (ctrlr->packed_ring) {
			blk_vq->last_avail_idx |= (blk_vq->avail_wrap_counter <<
				15);
			blk_vq->last_used_idx |= (blk_vq->used_wrap_counter <<
				15);
		}
		rte_vhost_set_vring_base(ctrlr->bdev->vid, i,
					 blk_vq->last_avail_idx,
					 blk_vq->last_used_idx);
	}

	free(ctrlr->mem);

	ctrlr->started = 0;
	sem_wait(&exit_sem);
}

static int
new_connection(int vid)
{
	/* install the rte_vhost message hooks used by the block device */
	vhost_session_install_rte_compat_hooks(vid);

	return 0;
}

struct vhost_device_ops vhost_blk_device_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
	.new_connection = new_connection,
};

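/*
 * Create the emulated block device. The disk contents live entirely in a
 * buffer of blk_cnt * blk_size bytes allocated from the DPDK heap.
 */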
static struct vhost_block_dev *
vhost_blk_bdev_construct(const char *bdev_name,
	const char *bdev_serial, uint32_t blk_size, uint64_t blk_cnt,
	bool wce_enable)
{
	struct vhost_block_dev *bdev;

	bdev = rte_zmalloc(NULL, sizeof(*bdev), RTE_CACHE_LINE_SIZE);
	if (!bdev)
		return NULL;

	strncpy(bdev->name, bdev_name, sizeof(bdev->name));
	strncpy(bdev->product_name, bdev_serial, sizeof(bdev->product_name));
	bdev->blocklen = blk_size;
	bdev->blockcnt = blk_cnt;
	bdev->write_cache = wce_enable;

	fprintf(stdout, "blocklen=%d, blockcnt=%"PRIx64"\n", bdev->blocklen,
		bdev->blockcnt);

	/* use memory as the disk storage space */
	bdev->data = rte_zmalloc(NULL, blk_cnt * blk_size, 0);
	if (!bdev->data) {
		fprintf(stderr,
			"not enough reserved hugepage memory for the disk\n");
		rte_free(bdev);
		return NULL;
	}

	return bdev;
}

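/*
 * Create the vhost-blk controller: register the vhost-user socket in the
 * current working directory, advertise the supported features and attach
 * a 128 MiB in-memory block device.
 */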
static struct vhost_blk_ctrlr *
vhost_blk_ctrlr_construct(const char *ctrlr_name)
{
	int ret;
	struct vhost_blk_ctrlr *ctrlr;
	char *path;
	char cwd[PATH_MAX];

	/* always use the current directory */
	path = getcwd(cwd, PATH_MAX);
	if (!path) {
		fprintf(stderr, "Cannot get current working directory\n");
		return NULL;
	}
	snprintf(dev_pathname, sizeof(dev_pathname), "%s/%s", path, ctrlr_name);

	unlink(dev_pathname);

	if (rte_vhost_driver_register(dev_pathname, 0) != 0) {
		fprintf(stderr, "Failed to register vhost driver on %s\n",
			dev_pathname);
		return NULL;
	}

	ret = rte_vhost_driver_set_features(dev_pathname, VHOST_BLK_FEATURES);
	if (ret != 0) {
		fprintf(stderr, "Set vhost driver features failed\n");
		rte_vhost_driver_unregister(dev_pathname);
		return NULL;
	}

	/* set the proper features */
	vhost_dev_install_rte_compat_hooks(dev_pathname);

	ctrlr = rte_zmalloc(NULL, sizeof(*ctrlr), RTE_CACHE_LINE_SIZE);
	if (!ctrlr) {
		rte_vhost_driver_unregister(dev_pathname);
		return NULL;
	}

	/* hardcoded block device: 32768 blocks of 4096 bytes (128 MiB) */
	ctrlr->bdev = vhost_blk_bdev_construct("malloc0", "vhost_blk_malloc0",
						4096, 32768, 0);
	if (!ctrlr->bdev) {
		rte_free(ctrlr);
		rte_vhost_driver_unregister(dev_pathname);
		return NULL;
	}

	rte_vhost_driver_callback_register(dev_pathname,
					   &vhost_blk_device_ops);

	return ctrlr;
}

static void
signal_handler(__rte_unused int signum)
{
	struct vhost_blk_ctrlr *ctrlr;

	unlink(dev_pathname);

	if (g_should_stop != -1) {
		g_should_stop = 1;
		while (g_should_stop != 2)
			;
	}

	ctrlr = vhost_blk_ctrlr_find(dev_pathname);
	if (ctrlr != NULL) {
		if (ctrlr->bdev != NULL) {
			rte_free(ctrlr->bdev->data);
			rte_free(ctrlr->bdev);
		}
		rte_free(ctrlr);
	}

	rte_vhost_driver_unregister(dev_pathname);
	exit(0);
}

int main(int argc, char *argv[])
{
	int ret;

	signal(SIGINT, signal_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");

	g_vhost_ctrlr = vhost_blk_ctrlr_construct("vhost.socket");
	if (g_vhost_ctrlr == NULL) {
		fprintf(stderr, "Construct vhost blk controller failed\n");
		return -1;
	}

	if (sem_init(&exit_sem, 0, 0) < 0) {
		fprintf(stderr, "Error init exit_sem\n");
		return -1;
	}

	rte_vhost_driver_start(dev_pathname);

	/* loop until the application is terminated by a signal */
	while (1)
		sleep(1);

	return 0;
}