xref: /spdk/lib/vhost/vhost.c (revision b94d358a498a9c4f2b3416aeb6a73200d9fdb514)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/env.h"
37 #include "spdk/likely.h"
38 #include "spdk/string.h"
39 #include "spdk/util.h"
40 #include "spdk/memory.h"
41 #include "spdk/barrier.h"
42 #include "spdk/vhost.h"
43 #include "vhost_internal.h"
44 
45 static struct spdk_cpuset g_vhost_core_mask;
46 
47 /* Path to the directory where the vhost sockets will be created. Can be set by the user. */
48 static char dev_dirname[PATH_MAX] = "";
49 
50 /* Thread performing all vhost management operations */
51 static struct spdk_thread *g_vhost_init_thread;
52 
53 static spdk_vhost_fini_cb g_fini_cpl_cb;
54 
55 /**
56  * DPDK calls our callbacks synchronously but the work those callbacks
57  * perform needs to be async. Luckily, all DPDK callbacks are called on
58  * a DPDK-internal pthread, so we'll just wait on a semaphore in there.
59  */
60 static sem_t g_dpdk_sem;
61 
62 /** Return code for the current DPDK callback */
63 static int g_dpdk_response;
64 
65 struct vhost_session_fn_ctx {
66 	/** Device pointer obtained before enqueuing the event */
67 	struct spdk_vhost_dev *vdev;
68 
69 	/** ID of the session to send event to. */
70 	uint32_t vsession_id;
71 
72 	/** User provided function to be executed on session's thread. */
73 	spdk_vhost_session_fn cb_fn;
74 
75 	/**
76 	 * User provided function to be called on the init thread
77 	 * after iterating through all sessions.
78 	 */
79 	spdk_vhost_dev_fn cpl_fn;
80 
81 	/** Custom user context */
82 	void *user_ctx;
83 };
84 
85 static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER(
86 			g_vhost_devices);
87 static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
88 
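/* Translate a guest physical address range to a host virtual address.
 * rte_vhost_va_from_guest_pa() shrinks newlen if the range is not fully
 * mapped within a single guest memory region; in that case NULL is returned
 * rather than a partial mapping.
 */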
89 void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len)
90 {
91 	void *vva;
92 	uint64_t newlen;
93 
94 	newlen = len;
95 	vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &newlen);
96 	if (newlen != len) {
97 		return NULL;
98 	}
99 
100 	return vva;
101 
102 }
103 
104 static void
105 vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
106 		   uint16_t req_id)
107 {
108 	struct vring_desc *desc, *desc_table;
109 	uint32_t desc_table_size;
110 	int rc;
111 
112 	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
113 		return;
114 	}
115 
116 	rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
117 	if (spdk_unlikely(rc != 0)) {
118 		SPDK_ERRLOG("Can't log used ring descriptors!\n");
119 		return;
120 	}
121 
122 	do {
123 		if (vhost_vring_desc_is_wr(desc)) {
124 			/* Strictly speaking, only the pages really touched should be logged, but
125 			 * doing so would require tracking those changes in each backend.
126 			 * The backend will most likely touch all or most of those pages anyway,
127 			 * so let's assume we touched all pages passed to us as writable buffers. */
128 			rte_vhost_log_write(vsession->vid, desc->addr, desc->len);
129 		}
130 		vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
131 	} while (desc);
132 }
133 
134 static void
135 vhost_log_used_vring_elem(struct spdk_vhost_session *vsession,
136 			  struct spdk_vhost_virtqueue *virtqueue,
137 			  uint16_t idx)
138 {
139 	uint64_t offset, len;
140 
141 	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
142 		return;
143 	}
144 
145 	if (spdk_unlikely(virtqueue->packed.packed_ring)) {
146 		offset = idx * sizeof(struct vring_packed_desc);
147 		len = sizeof(struct vring_packed_desc);
148 	} else {
149 		offset = offsetof(struct vring_used, ring[idx]);
150 		len = sizeof(virtqueue->vring.used->ring[idx]);
151 	}
152 
153 	rte_vhost_log_used_vring(vsession->vid, virtqueue->vring_idx, offset, len);
154 }
155 
156 static void
157 vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
158 			 struct spdk_vhost_virtqueue *virtqueue)
159 {
160 	uint64_t offset, len;
161 	uint16_t vq_idx;
162 
163 	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
164 		return;
165 	}
166 
167 	offset = offsetof(struct vring_used, idx);
168 	len = sizeof(virtqueue->vring.used->idx);
169 	vq_idx = virtqueue - vsession->virtqueue;
170 
171 	rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len);
172 }
173 
174 /*
175  * Get available requests from avail ring.
176  */
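/* Note: avail->idx and last_avail_idx are free-running 16-bit counters, so
 * (avail_idx - last_idx) yields the number of new entries even across
 * wrap-around. Ring slots are indexed with (idx & (vring->size - 1)), which
 * assumes the split-ring size is a power of two.
 */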
177 uint16_t
178 vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
179 			uint16_t reqs_len)
180 {
181 	struct rte_vhost_vring *vring = &virtqueue->vring;
182 	struct vring_avail *avail = vring->avail;
183 	uint16_t size_mask = vring->size - 1;
184 	uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx;
185 	uint16_t count, i;
186 
187 	spdk_smp_rmb();
188 
189 	count = avail_idx - last_idx;
190 	if (spdk_likely(count == 0)) {
191 		return 0;
192 	}
193 
194 	if (spdk_unlikely(count > vring->size)) {
195 		/* TODO: the queue is unrecoverably broken and should be marked so.
196 		 * For now we will fail silently and report there are no new avail entries.
197 		 */
198 		return 0;
199 	}
200 
201 	count = spdk_min(count, reqs_len);
202 	if (virtqueue->vsession && virtqueue->vsession->interrupt_mode) {
203 		/* In interrupt mode the kickfd must be acknowledged by reading it. If more
204 		 * requests arrived than we can take now, the remainder is written back to
205 		 * the kickfd below so that another notification is raised for them. */
206 		int rc;
207 		uint64_t num_events;
208 
209 		rc = read(vring->kickfd, &num_events, sizeof(num_events));
210 		if (rc < 0) {
211 			SPDK_ERRLOG("failed to acknowledge kickfd: %s.\n", spdk_strerror(errno));
212 			return -errno;
213 		}
214 
215 		if ((uint16_t)(avail_idx - last_idx) != num_events) {
216 			SPDK_DEBUGLOG(vhost_ring,
217 				      "virtqueue gets %d reqs, but kickfd shows %lu reqs\n",
218 				      avail_idx - last_idx, num_events);
219 		}
220 
221 		if (num_events > count) {
222 			SPDK_DEBUGLOG(vhost_ring,
223 				      "virtqueue kickfd shows %lu reqs, take %d, send notice for other reqs\n",
224 				      num_events, reqs_len);
225 			num_events -= count;
226 			rc = write(vring->kickfd, &num_events, sizeof(num_events));
227 			if (rc < 0) {
228 				SPDK_ERRLOG("failed to kick vring: %s.\n", spdk_strerror(errno));
229 				return -errno;
230 			}
231 		}
232 	}
233 
234 	virtqueue->last_avail_idx += count;
235 	for (i = 0; i < count; i++) {
236 		reqs[i] = vring->avail->ring[(last_idx + i) & size_mask];
237 	}
238 
239 	SPDK_DEBUGLOG(vhost_ring,
240 		      "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
241 		      last_idx, avail_idx, count);
242 
243 	return count;
244 }
245 
246 static bool
247 vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
248 {
249 	return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
250 }
251 
252 static bool
253 vhost_vring_packed_desc_is_indirect(struct vring_packed_desc *cur_desc)
254 {
255 	return (cur_desc->flags & VRING_DESC_F_INDIRECT) != 0;
256 }
257 
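/* Fetch the first descriptor of the request at req_idx. For an indirect
 * descriptor the whole indirect table is mapped into host memory and returned
 * via desc_table/desc_table_size; otherwise the vring's own descriptor table
 * is returned.
 */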
258 int
259 vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
260 		  uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
261 		  uint32_t *desc_table_size)
262 {
263 	if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
264 		return -1;
265 	}
266 
267 	*desc = &virtqueue->vring.desc[req_idx];
268 
269 	if (vhost_vring_desc_is_indirect(*desc)) {
270 		*desc_table_size = (*desc)->len / sizeof(**desc);
271 		*desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
272 					       sizeof(**desc) * *desc_table_size);
273 		*desc = *desc_table;
274 		if (*desc == NULL) {
275 			return -1;
276 		}
277 
278 		return 0;
279 	}
280 
281 	*desc_table = virtqueue->vring.desc;
282 	*desc_table_size = virtqueue->vring.size;
283 
284 	return 0;
285 }
286 
287 int
288 vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession,
289 			 struct spdk_vhost_virtqueue *virtqueue,
290 			 uint16_t req_idx, struct vring_packed_desc **desc,
291 			 struct vring_packed_desc **desc_table, uint32_t *desc_table_size)
292 {
293 	*desc =  &virtqueue->vring.desc_packed[req_idx];
294 
295 	/* In a packed ring, when the desc is non-indirect the next desc is found
296 	 * by checking (desc->flags & VRING_DESC_F_NEXT) != 0. When the desc is
297 	 * indirect, the next desc is found by index and desc_table_size. This
298 	 * differs from the split ring.
299 	 */
300 	if (vhost_vring_packed_desc_is_indirect(*desc)) {
301 		*desc_table_size = (*desc)->len / sizeof(struct vring_packed_desc);
302 		*desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
303 					       (*desc)->len);
304 		*desc = *desc_table;
305 		if (spdk_unlikely(*desc == NULL)) {
306 			return -1;
307 		}
308 	} else {
309 		*desc_table = NULL;
310 		*desc_table_size  = 0;
311 	}
312 
313 	return 0;
314 }
315 
316 int
317 vhost_vq_used_signal(struct spdk_vhost_session *vsession,
318 		     struct spdk_vhost_virtqueue *virtqueue)
319 {
320 	if (virtqueue->used_req_cnt == 0) {
321 		return 0;
322 	}
323 
324 	virtqueue->req_cnt += virtqueue->used_req_cnt;
325 	virtqueue->used_req_cnt = 0;
326 
327 	SPDK_DEBUGLOG(vhost_ring,
328 		      "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n",
329 		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx);
330 
331 	if (rte_vhost_vring_call(vsession->vid, virtqueue->vring_idx) == 0) {
332 		/* interrupt signalled */
333 		return 1;
334 	} else {
335 		/* interrupt not signalled */
336 		return 0;
337 	}
338 }
339 
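/* Adjust the interrupt coalescing delay based on the observed request rate.
 * The delay grows linearly with how far req_cnt exceeds io_threshold; for
 * example, a request count of twice the threshold yields one full
 * irq_delay_base of delay.
 */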
340 static void
341 session_vq_io_stats_update(struct spdk_vhost_session *vsession,
342 			   struct spdk_vhost_virtqueue *virtqueue, uint64_t now)
343 {
344 	uint32_t irq_delay_base = vsession->coalescing_delay_time_base;
345 	uint32_t io_threshold = vsession->coalescing_io_rate_threshold;
346 	int32_t irq_delay;
347 	uint32_t req_cnt;
348 
349 	req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt;
350 	if (req_cnt <= io_threshold) {
351 		return;
352 	}
353 
354 	irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold;
355 	virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay);
356 
357 	virtqueue->req_cnt = 0;
358 	virtqueue->next_event_time = now;
359 }
360 
361 static void
362 check_session_vq_io_stats(struct spdk_vhost_session *vsession,
363 			  struct spdk_vhost_virtqueue *virtqueue, uint64_t now)
364 {
365 	if (now < vsession->next_stats_check_time) {
366 		return;
367 	}
368 
369 	vsession->next_stats_check_time = now + vsession->stats_check_interval;
370 	session_vq_io_stats_update(vsession, virtqueue, now);
371 }
372 
373 static inline bool
374 vhost_vq_event_is_suppressed(struct spdk_vhost_virtqueue *vq)
375 {
376 	if (spdk_unlikely(vq->packed.packed_ring)) {
377 		if (vq->vring.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE) {
378 			return true;
379 		}
380 	} else {
381 		if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
382 			return true;
383 		}
384 	}
385 
386 	return false;
387 }
388 
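/* Signal the guest for this virtqueue, honoring interrupt coalescing: with
 * coalescing disabled the interrupt is sent immediately (unless suppressed by
 * the driver); otherwise it is sent no more often than once per
 * irq_delay_time ticks.
 */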
389 void
390 vhost_session_vq_used_signal(struct spdk_vhost_virtqueue *virtqueue)
391 {
392 	struct spdk_vhost_session *vsession = virtqueue->vsession;
393 	uint64_t now;
394 
395 	if (vsession->coalescing_delay_time_base == 0) {
396 		if (virtqueue->vring.desc == NULL) {
397 			return;
398 		}
399 
400 		if (vhost_vq_event_is_suppressed(virtqueue)) {
401 			return;
402 		}
403 
404 		vhost_vq_used_signal(vsession, virtqueue);
405 	} else {
406 		now = spdk_get_ticks();
407 		check_session_vq_io_stats(vsession, virtqueue, now);
408 
409 		/* No need for event right now */
410 		if (now < virtqueue->next_event_time) {
411 			return;
412 		}
413 
414 		if (vhost_vq_event_is_suppressed(virtqueue)) {
415 			return;
416 		}
417 
418 		if (!vhost_vq_used_signal(vsession, virtqueue)) {
419 			return;
420 		}
421 
422 		/* The syscall above can take a while, so refresh the current time. */
423 		now = spdk_get_ticks();
424 		virtqueue->next_event_time = now + virtqueue->irq_delay_time;
425 	}
426 }
427 
428 void
429 vhost_session_used_signal(struct spdk_vhost_session *vsession)
430 {
431 	struct spdk_vhost_virtqueue *virtqueue;
432 	uint16_t q_idx;
433 
434 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
435 		virtqueue = &vsession->virtqueue[q_idx];
436 		vhost_session_vq_used_signal(virtqueue);
437 	}
438 }
439 
440 static int
441 vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
442 			     struct spdk_vhost_session *vsession, void *ctx)
443 {
444 	vsession->coalescing_delay_time_base =
445 		vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL;
446 	vsession->coalescing_io_rate_threshold =
447 		vdev->coalescing_iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
448 	return 0;
449 }
450 
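/* Convert the user-facing coalescing parameters into internal units: the
 * delay from microseconds to TSC ticks, and the IOPS threshold to a request
 * count per stats-check interval. A worked example, assuming a 2.4 GHz tick
 * rate and a 10 ms SPDK_VHOST_STATS_CHECK_INTERVAL_MS: delay_base_us = 100
 * gives 100 * 2.4e9 / 1e6 = 240000 ticks, and iops_threshold = 60000 gives
 * 60000 * 10 / 1000 = 600 requests per interval.
 */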
451 static int
452 vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
453 			 uint32_t iops_threshold)
454 {
455 	uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
456 	uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
457 
458 	if (delay_time_base >= UINT32_MAX) {
459 		SPDK_ERRLOG("Delay time of %"PRIu32" is too big\n", delay_base_us);
460 		return -EINVAL;
461 	} else if (io_rate == 0) {
462 		SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate,
463 			    1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS);
464 		return -EINVAL;
465 	}
466 
467 	vdev->coalescing_delay_us = delay_base_us;
468 	vdev->coalescing_iops_threshold = iops_threshold;
469 	return 0;
470 }
471 
472 int
473 spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
474 			  uint32_t iops_threshold)
475 {
476 	int rc;
477 
478 	rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold);
479 	if (rc != 0) {
480 		return rc;
481 	}
482 
483 	vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL);
484 	return 0;
485 }
486 
487 void
488 spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
489 			  uint32_t *iops_threshold)
490 {
491 	if (delay_base_us) {
492 		*delay_base_us = vdev->coalescing_delay_us;
493 	}
494 
495 	if (iops_threshold) {
496 		*iops_threshold = vdev->coalescing_iops_threshold;
497 	}
498 }
499 
500 /*
501  * Enqueue id and len to used ring.
502  */
503 void
504 vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
505 			   struct spdk_vhost_virtqueue *virtqueue,
506 			   uint16_t id, uint32_t len)
507 {
508 	struct rte_vhost_vring *vring = &virtqueue->vring;
509 	struct vring_used *used = vring->used;
510 	uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1);
511 	uint16_t vq_idx = virtqueue->vring_idx;
512 
513 	SPDK_DEBUGLOG(vhost_ring,
514 		      "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
515 		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len);
516 
517 	vhost_log_req_desc(vsession, virtqueue, id);
518 
519 	virtqueue->last_used_idx++;
520 	used->ring[last_idx].id = id;
521 	used->ring[last_idx].len = len;
522 
523 	/* Ensure the used ring is updated before we log it or increment used->idx. */
524 	spdk_smp_wmb();
525 
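	/* Record this request as in-flight (and clear it below once used->idx is
	 * updated) so a reconnecting vhost backend can recover unfinished requests.
	 */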
526 	rte_vhost_set_last_inflight_io_split(vsession->vid, vq_idx, id);
527 
528 	vhost_log_used_vring_elem(vsession, virtqueue, last_idx);
529 	* (volatile uint16_t *) &used->idx = virtqueue->last_used_idx;
530 	vhost_log_used_vring_idx(vsession, virtqueue);
531 
532 	rte_vhost_clr_inflight_desc_split(vsession->vid, vq_idx, virtqueue->last_used_idx, id);
533 
534 	virtqueue->used_req_cnt++;
535 
536 	if (vsession->interrupt_mode) {
537 		if (virtqueue->vring.desc == NULL || vhost_vq_event_is_suppressed(virtqueue)) {
538 			return;
539 		}
540 
541 		vhost_vq_used_signal(vsession, virtqueue);
542 	}
543 }
544 
545 void
546 vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
547 			     struct spdk_vhost_virtqueue *virtqueue,
548 			     uint16_t num_descs, uint16_t buffer_id,
549 			     uint32_t length)
550 {
551 	struct vring_packed_desc *desc = &virtqueue->vring.desc_packed[virtqueue->last_used_idx];
552 	bool used, avail;
553 
554 	SPDK_DEBUGLOG(vhost_ring,
555 		      "Queue %td - RING: buffer_id=%"PRIu16"\n",
556 		      virtqueue - vsession->virtqueue, buffer_id);
557 
558 	/* When a descriptor has been used, two of its flags,
559 	 * the avail flag and the used flag, are set equal to each other,
560 	 * and the used flag value == used_wrap_counter.
561 	 */
562 	used = !!(desc->flags & VRING_DESC_F_USED);
563 	avail = !!(desc->flags & VRING_DESC_F_AVAIL);
564 	if (spdk_unlikely(used == virtqueue->packed.used_phase && used == avail)) {
565 		SPDK_ERRLOG("descriptor has been used before\n");
566 		return;
567 	}
568 
569 	/* In a used descriptor, addr is unused and len specifies the length of
570 	 * the buffer that has been written to by the device.
571 	 */
572 	desc->addr = 0;
573 	desc->len = length;
574 
575 	/* This bit specifies whether any data has been written by the device */
576 	if (length != 0) {
577 		desc->flags |= VRING_DESC_F_WRITE;
578 	}
579 
580 	/* Buffer ID is included in the last descriptor in the list.
581 	 * The driver needs to keep track of the size of the list corresponding
582 	 * to each buffer ID.
583 	 */
584 	desc->id = buffer_id;
585 
586 	/* A device MUST NOT make the descriptor used before buffer_id is
587 	 * written to the descriptor.
588 	 */
589 	spdk_smp_wmb();
590 	/* To mark a desc as used, the device sets the F_USED bit in flags to match
591 	 * the internal Device ring wrap counter. It also sets the F_AVAIL bit to
592 	 * match the same value.
593 	 */
594 	if (virtqueue->packed.used_phase) {
595 		desc->flags |= VRING_DESC_F_AVAIL_USED;
596 	} else {
597 		desc->flags &= ~VRING_DESC_F_AVAIL_USED;
598 	}
599 
600 	vhost_log_used_vring_elem(vsession, virtqueue, virtqueue->last_used_idx);
601 	virtqueue->last_used_idx += num_descs;
602 	if (virtqueue->last_used_idx >= virtqueue->vring.size) {
603 		virtqueue->last_used_idx -= virtqueue->vring.size;
604 		virtqueue->packed.used_phase = !virtqueue->packed.used_phase;
605 	}
606 
607 	virtqueue->used_req_cnt++;
608 }
609 
610 bool
611 vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue)
612 {
613 	uint16_t flags = virtqueue->vring.desc_packed[virtqueue->last_avail_idx].flags;
614 
615 	/* To mark a desc as available, the driver sets the F_AVAIL bit in flags
616 	 * to match the internal avail wrap counter. It also sets the F_USED bit to
617 	 * match the inverse value but it's not mandatory.
618 	 */
619 	return (!!(flags & VRING_DESC_F_AVAIL) == virtqueue->packed.avail_phase);
620 }
621 
622 bool
623 vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc)
624 {
625 	return (cur_desc->flags & VRING_DESC_F_WRITE) != 0;
626 }
627 
628 int
629 vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx,
630 				 struct spdk_vhost_virtqueue *vq,
631 				 struct vring_packed_desc *desc_table,
632 				 uint32_t desc_table_size)
633 {
634 	if (desc_table != NULL) {
635 		/* A non-NULL desc_table means the chain is indirect, so the next desc
636 		 * is found by req_idx and desc_table_size. *desc set to NULL means we
637 		 * reached the last desc of this request.
638 		 */
639 		(*req_idx)++;
640 		if (*req_idx < desc_table_size) {
641 			*desc = &desc_table[*req_idx];
642 		} else {
643 			*desc = NULL;
644 		}
645 	} else {
646 		/* A NULL desc_table means the chain is non-indirect, so the next desc
647 		 * is found via req_idx and the F_NEXT flag. *desc set to NULL means we
648 		 * reached the last desc of this request. When a new desc is returned,
649 		 * req_idx is updated as well.
650 		 */
651 		if (((*desc)->flags & VRING_DESC_F_NEXT) == 0) {
652 			*desc = NULL;
653 			return 0;
654 		}
655 
656 		*req_idx = (*req_idx + 1) % vq->vring.size;
657 		*desc = &vq->vring.desc_packed[*req_idx];
658 	}
659 
660 	return 0;
661 }
662 
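/* Translate a guest-physical payload into host iovecs. A single descriptor
 * may span multiple guest memory regions, so the buffer is split into as many
 * iovec entries as needed, up to SPDK_VHOST_IOVS_MAX.
 */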
663 static int
664 vhost_vring_desc_payload_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
665 				uint16_t *iov_index, uintptr_t payload, uint64_t remaining)
666 {
667 	uintptr_t vva;
668 	uint64_t len;
669 
670 	do {
671 		if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
672 			SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
673 			return -1;
674 		}
675 		len = remaining;
676 		vva = (uintptr_t)rte_vhost_va_from_guest_pa(vsession->mem, payload, &len);
677 		if (vva == 0 || len == 0) {
678 			SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
679 			return -1;
680 		}
681 		iov[*iov_index].iov_base = (void *)vva;
682 		iov[*iov_index].iov_len = len;
683 		remaining -= len;
684 		payload += len;
685 		(*iov_index)++;
686 	} while (remaining);
687 
688 	return 0;
689 }
690 
691 int
692 vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
693 			       uint16_t *iov_index, const struct vring_packed_desc *desc)
694 {
695 	return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index,
696 					       desc->addr, desc->len);
697 }
698 
699 /* 1. Traverse the desc chain to get the buffer_id and return it as task_idx.
700  * 2. Update vq->last_avail_idx to point to the next available desc chain.
701  * 3. Flip the avail_wrap_counter if last_avail_idx wraps around.
702  */
703 uint16_t
704 vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
705 				      uint16_t *num_descs)
706 {
707 	struct vring_packed_desc *desc;
708 	uint16_t desc_head = req_idx;
709 
710 	*num_descs = 1;
711 
712 	desc =  &vq->vring.desc_packed[req_idx];
713 	if (!vhost_vring_packed_desc_is_indirect(desc)) {
714 		while ((desc->flags & VRING_DESC_F_NEXT) != 0) {
715 			req_idx = (req_idx + 1) % vq->vring.size;
716 			desc = &vq->vring.desc_packed[req_idx];
717 			(*num_descs)++;
718 		}
719 	}
720 
721 	/* The queue size doesn't have to be a power of 2.
722 	 * The device maintains last_avail_idx, so make sure the value stays
723 	 * within the valid range (0 to vring.size - 1).
724 	 */
725 	vq->last_avail_idx = (req_idx + 1) % vq->vring.size;
726 	if (vq->last_avail_idx < desc_head) {
727 		vq->packed.avail_phase = !vq->packed.avail_phase;
728 	}
729 
730 	return desc->id;
731 }
732 
733 int
734 vhost_vring_desc_get_next(struct vring_desc **desc,
735 			  struct vring_desc *desc_table, uint32_t desc_table_size)
736 {
737 	struct vring_desc *old_desc = *desc;
738 	uint16_t next_idx;
739 
740 	if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
741 		*desc = NULL;
742 		return 0;
743 	}
744 
745 	next_idx = old_desc->next;
746 	if (spdk_unlikely(next_idx >= desc_table_size)) {
747 		*desc = NULL;
748 		return -1;
749 	}
750 
751 	*desc = &desc_table[next_idx];
752 	return 0;
753 }
754 
755 int
756 vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
757 			uint16_t *iov_index, const struct vring_desc *desc)
758 {
759 	return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index,
760 					       desc->addr, desc->len);
761 }
762 
763 static struct spdk_vhost_session *
764 vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
765 {
766 	struct spdk_vhost_session *vsession;
767 
768 	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
769 		if (vsession->id == id) {
770 			return vsession;
771 		}
772 	}
773 
774 	return NULL;
775 }
776 
777 struct spdk_vhost_session *
778 vhost_session_find_by_vid(int vid)
779 {
780 	struct spdk_vhost_dev *vdev;
781 	struct spdk_vhost_session *vsession;
782 
783 	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
784 		TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
785 			if (vsession->vid == vid) {
786 				return vsession;
787 			}
788 		}
789 	}
790 
791 	return NULL;
792 }
793 
794 struct spdk_vhost_dev *
795 spdk_vhost_dev_next(struct spdk_vhost_dev *vdev)
796 {
797 	if (vdev == NULL) {
798 		return TAILQ_FIRST(&g_vhost_devices);
799 	}
800 
801 	return TAILQ_NEXT(vdev, tailq);
802 }
803 
804 struct spdk_vhost_dev *
805 spdk_vhost_dev_find(const char *ctrlr_name)
806 {
807 	struct spdk_vhost_dev *vdev;
808 	size_t dev_dirname_len = strlen(dev_dirname);
809 
810 	if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) {
811 		ctrlr_name += dev_dirname_len;
812 	}
813 
814 	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
815 		if (strcmp(vdev->name, ctrlr_name) == 0) {
816 			return vdev;
817 		}
818 	}
819 
820 	return NULL;
821 }
822 
823 static int
824 vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
825 {
826 	int rc;
827 
828 	if (cpumask == NULL) {
829 		return -1;
830 	}
831 
832 	if (mask == NULL) {
833 		spdk_cpuset_copy(cpumask, &g_vhost_core_mask);
834 		return 0;
835 	}
836 
837 	rc = spdk_cpuset_parse(cpumask, mask);
838 	if (rc < 0) {
839 		SPDK_ERRLOG("invalid cpumask %s\n", mask);
840 		return -1;
841 	}
842 
843 	spdk_cpuset_and(cpumask, &g_vhost_core_mask);
844 
845 	if (spdk_cpuset_count(cpumask) == 0) {
846 		SPDK_ERRLOG("no CPU is selected from core mask (%s)\n",
847 			    spdk_cpuset_fmt(&g_vhost_core_mask));
848 		return -1;
849 	}
850 
851 	return 0;
852 }
853 
854 static void
855 vhost_setup_core_mask(void *ctx)
856 {
857 	struct spdk_thread *thread = spdk_get_thread();
858 	spdk_cpuset_or(&g_vhost_core_mask, spdk_thread_get_cpumask(thread));
859 }
860 
861 static void
862 vhost_setup_core_mask_done(void *ctx)
863 {
864 	spdk_vhost_init_cb init_cb = ctx;
865 
866 	if (spdk_cpuset_count(&g_vhost_core_mask) == 0) {
867 		init_cb(-ECHILD);
868 		return;
869 	}
870 
871 	init_cb(0);
872 }
873 
874 static void
875 vhost_dev_thread_exit(void *arg1)
876 {
877 	spdk_thread_exit(spdk_get_thread());
878 }
879 
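/* Register a new vhost controller: create a dedicated SPDK thread on the
 * requested cpumask and expose a vhost-user Unix domain socket at
 * dev_dirname/name for a driver such as QEMU to connect to.
 */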
880 int
881 vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
882 		   const struct spdk_vhost_dev_backend *backend)
883 {
884 	char path[PATH_MAX];
885 	struct spdk_cpuset cpumask = {};
886 	int rc;
887 
888 	assert(vdev);
889 	if (name == NULL) {
890 		SPDK_ERRLOG("Can't register controller with no name\n");
891 		return -EINVAL;
892 	}
893 
894 	if (vhost_parse_core_mask(mask_str, &cpumask) != 0) {
895 		SPDK_ERRLOG("cpumask %s is invalid (core mask is 0x%s)\n",
896 			    mask_str, spdk_cpuset_fmt(&g_vhost_core_mask));
897 		return -EINVAL;
898 	}
899 
900 	if (spdk_vhost_dev_find(name)) {
901 		SPDK_ERRLOG("vhost controller %s already exists.\n", name);
902 		return -EEXIST;
903 	}
904 
905 	if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
906 		SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
907 			    name);
908 		return -EINVAL;
909 	}
910 
911 	vdev->name = strdup(name);
912 	vdev->path = strdup(path);
913 	if (vdev->name == NULL || vdev->path == NULL) {
914 		rc = -EIO;
915 		goto out;
916 	}
917 
918 	vdev->thread = spdk_thread_create(vdev->name, &cpumask);
919 	if (vdev->thread == NULL) {
920 		SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name);
921 		rc = -EIO;
922 		goto out;
923 	}
924 
925 	vdev->registered = true;
926 	vdev->backend = backend;
927 	TAILQ_INIT(&vdev->vsessions);
928 
929 	vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
930 				 SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);
931 
932 	if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features,
933 				       vdev->protocol_features)) {
934 		spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
935 		rc = -EIO;
936 		goto out;
937 	}
938 
939 	TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq);
940 
941 	SPDK_INFOLOG(vhost, "Controller %s: new controller added\n", vdev->name);
942 	return 0;
943 
944 out:
945 	free(vdev->name);
946 	free(vdev->path);
947 	return rc;
948 }
949 
950 int
951 vhost_dev_unregister(struct spdk_vhost_dev *vdev)
952 {
953 	if (!TAILQ_EMPTY(&vdev->vsessions)) {
954 		SPDK_ERRLOG("Controller %s still has a valid connection.\n", vdev->name);
955 		return -EBUSY;
956 	}
957 
958 	if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) {
959 		SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
960 			    "Check if domain socket %s still exists\n",
961 			    vdev->name, vdev->path);
962 		return -EIO;
963 	}
964 
965 	SPDK_INFOLOG(vhost, "Controller %s: removed\n", vdev->name);
966 
967 	spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
968 
969 	free(vdev->name);
970 	free(vdev->path);
971 	TAILQ_REMOVE(&g_vhost_devices, vdev, tailq);
972 	return 0;
973 }
974 
975 const char *
976 spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
977 {
978 	assert(vdev != NULL);
979 	return vdev->name;
980 }
981 
982 const struct spdk_cpuset *
983 spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
984 {
985 	assert(vdev != NULL);
986 	return spdk_thread_get_cpumask(vdev->thread);
987 }
988 
989 static void
990 wait_for_semaphore(int timeout_sec, const char *errmsg)
991 {
992 	struct timespec timeout;
993 	int rc;
994 
995 	clock_gettime(CLOCK_REALTIME, &timeout);
996 	timeout.tv_sec += timeout_sec;
997 	rc = sem_timedwait(&g_dpdk_sem, &timeout);
998 	if (rc != 0) {
999 		SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
1000 		sem_wait(&g_dpdk_sem);
1001 	}
1002 }
1003 
1004 static void
1005 vhost_session_cb_done(int rc)
1006 {
1007 	g_dpdk_response = rc;
1008 	sem_post(&g_dpdk_sem);
1009 }
1010 
1011 void
1012 vhost_session_start_done(struct spdk_vhost_session *vsession, int response)
1013 {
1014 	if (response == 0) {
1015 		vsession->started = true;
1016 
1017 		assert(vsession->vdev->active_session_num < UINT32_MAX);
1018 		vsession->vdev->active_session_num++;
1019 	}
1020 
1021 	vhost_session_cb_done(response);
1022 }
1023 
1024 void
1025 vhost_session_stop_done(struct spdk_vhost_session *vsession, int response)
1026 {
1027 	if (response == 0) {
1028 		vsession->started = false;
1029 
1030 		assert(vsession->vdev->active_session_num > 0);
1031 		vsession->vdev->active_session_num--;
1032 	}
1033 
1034 	vhost_session_cb_done(response);
1035 }
1036 
1037 static void
1038 vhost_event_cb(void *arg1)
1039 {
1040 	struct vhost_session_fn_ctx *ctx = arg1;
1041 	struct spdk_vhost_session *vsession;
1042 
1043 	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
1044 		spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1);
1045 		return;
1046 	}
1047 
1048 	vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id);
1049 	ctx->cb_fn(ctx->vdev, vsession, NULL);
1050 	pthread_mutex_unlock(&g_vhost_mutex);
1051 }
1052 
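/* Send an event to the session's device thread and block the calling
 * DPDK/rte_vhost thread on g_dpdk_sem until the backend reports completion
 * via vhost_session_start_done()/vhost_session_stop_done(). g_vhost_mutex is
 * dropped while waiting so that vhost_event_cb() can take it on the other
 * thread.
 */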
1053 int
1054 vhost_session_send_event(struct spdk_vhost_session *vsession,
1055 			 spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
1056 			 const char *errmsg)
1057 {
1058 	struct vhost_session_fn_ctx ev_ctx = {0};
1059 	struct spdk_vhost_dev *vdev = vsession->vdev;
1060 
1061 	ev_ctx.vdev = vdev;
1062 	ev_ctx.vsession_id = vsession->id;
1063 	ev_ctx.cb_fn = cb_fn;
1064 
1065 	spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx);
1066 
1067 	pthread_mutex_unlock(&g_vhost_mutex);
1068 	wait_for_semaphore(timeout_sec, errmsg);
1069 	pthread_mutex_lock(&g_vhost_mutex);
1070 
1071 	return g_dpdk_response;
1072 }
1073 
1074 static void
1075 foreach_session_finish_cb(void *arg1)
1076 {
1077 	struct vhost_session_fn_ctx *ev_ctx = arg1;
1078 	struct spdk_vhost_dev *vdev = ev_ctx->vdev;
1079 
1080 	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
1081 		spdk_thread_send_msg(spdk_get_thread(),
1082 				     foreach_session_finish_cb, arg1);
1083 		return;
1084 	}
1085 
1086 	assert(vdev->pending_async_op_num > 0);
1087 	vdev->pending_async_op_num--;
1088 	if (ev_ctx->cpl_fn != NULL) {
1089 		ev_ctx->cpl_fn(vdev, ev_ctx->user_ctx);
1090 	}
1091 
1092 	pthread_mutex_unlock(&g_vhost_mutex);
1093 	free(ev_ctx);
1094 }
1095 
1096 static void
1097 foreach_session(void *arg1)
1098 {
1099 	struct vhost_session_fn_ctx *ev_ctx = arg1;
1100 	struct spdk_vhost_session *vsession;
1101 	struct spdk_vhost_dev *vdev = ev_ctx->vdev;
1102 	int rc;
1103 
1104 	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
1105 		spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1);
1106 		return;
1107 	}
1108 
1109 	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
1110 		if (vsession->initialized) {
1111 			rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx);
1112 			if (rc < 0) {
1113 				goto out;
1114 			}
1115 		}
1116 	}
1117 
1118 out:
1119 	pthread_mutex_unlock(&g_vhost_mutex);
1120 
1121 	spdk_thread_send_msg(g_vhost_init_thread, foreach_session_finish_cb, arg1);
1122 }
1123 
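/* Asynchronously run fn on the device thread for every initialized session of
 * vdev. When the iteration finishes (or fn returns an error), cpl_fn is
 * invoked on the vhost init thread via foreach_session_finish_cb(), which
 * also frees the event context.
 */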
1124 void
1125 vhost_dev_foreach_session(struct spdk_vhost_dev *vdev,
1126 			  spdk_vhost_session_fn fn,
1127 			  spdk_vhost_dev_fn cpl_fn,
1128 			  void *arg)
1129 {
1130 	struct vhost_session_fn_ctx *ev_ctx;
1131 
1132 	ev_ctx = calloc(1, sizeof(*ev_ctx));
1133 	if (ev_ctx == NULL) {
1134 		SPDK_ERRLOG("Failed to alloc vhost event.\n");
1135 		assert(false);
1136 		return;
1137 	}
1138 
1139 	ev_ctx->vdev = vdev;
1140 	ev_ctx->cb_fn = fn;
1141 	ev_ctx->cpl_fn = cpl_fn;
1142 	ev_ctx->user_ctx = arg;
1143 
1144 	assert(vdev->pending_async_op_num < UINT32_MAX);
1145 	vdev->pending_async_op_num++;
1146 
1147 	spdk_thread_send_msg(vdev->thread, foreach_session, ev_ctx);
1148 }
1149 
1150 static int
1151 _stop_session(struct spdk_vhost_session *vsession)
1152 {
1153 	struct spdk_vhost_dev *vdev = vsession->vdev;
1154 	struct spdk_vhost_virtqueue *q;
1155 	int rc;
1156 	uint16_t i;
1157 
1158 	rc = vdev->backend->stop_session(vsession);
1159 	if (rc != 0) {
1160 		SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid);
1161 		pthread_mutex_unlock(&g_vhost_mutex);
1162 		return rc;
1163 	}
1164 
1165 	for (i = 0; i < vsession->max_queues; i++) {
1166 		q = &vsession->virtqueue[i];
1167 
1168 		/* vring.desc and vring.desc_packed are in a union struct
1169 		 * so q->vring.desc can replace q->vring.desc_packed.
1170 		 */
1171 		if (q->vring.desc == NULL) {
1172 			continue;
1173 		}
1174 
1175 		/* Packed virtqueues support up to 2^15 entries each,
1176 		 * so the remaining top bit can be used as the wrap counter.
1177 		 */
1178 		if (q->packed.packed_ring) {
1179 			q->last_avail_idx = q->last_avail_idx |
1180 					    ((uint16_t)q->packed.avail_phase << 15);
1181 			q->last_used_idx = q->last_used_idx |
1182 					   ((uint16_t)q->packed.used_phase << 15);
1183 		}
1184 
1185 		rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx);
1186 	}
1187 
1188 	vhost_session_mem_unregister(vsession->mem);
1189 	free(vsession->mem);
1190 
1191 	return 0;
1192 }
1193 
1194 int
1195 vhost_stop_device_cb(int vid)
1196 {
1197 	struct spdk_vhost_session *vsession;
1198 	int rc;
1199 
1200 	pthread_mutex_lock(&g_vhost_mutex);
1201 	vsession = vhost_session_find_by_vid(vid);
1202 	if (vsession == NULL) {
1203 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1204 		pthread_mutex_unlock(&g_vhost_mutex);
1205 		return -EINVAL;
1206 	}
1207 
1208 	if (!vsession->started) {
1209 		/* already stopped, nothing to do */
1210 		pthread_mutex_unlock(&g_vhost_mutex);
1211 		return -EALREADY;
1212 	}
1213 
1214 	rc = _stop_session(vsession);
1215 	pthread_mutex_unlock(&g_vhost_mutex);
1216 
1217 	return rc;
1218 }
1219 
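/* Device start callback, invoked from rte_vhost when the driver is ready:
 * read the negotiated features, set up every split or packed virtqueue the
 * driver configured, register the guest memory table and hand the session
 * over to the backend's start_session().
 */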
1220 int
1221 vhost_start_device_cb(int vid)
1222 {
1223 	struct spdk_vhost_dev *vdev;
1224 	struct spdk_vhost_session *vsession;
1225 	int rc = -1;
1226 	uint16_t i;
1227 	bool packed_ring;
1228 
1229 	pthread_mutex_lock(&g_vhost_mutex);
1230 
1231 	vsession = vhost_session_find_by_vid(vid);
1232 	if (vsession == NULL) {
1233 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1234 		goto out;
1235 	}
1236 
1237 	if (spdk_interrupt_mode_is_enabled()) {
1238 		vsession->interrupt_mode = true;
1239 	}
1240 
1241 	vdev = vsession->vdev;
1242 	if (vsession->started) {
1243 		/* already started, nothing to do */
1244 		rc = 0;
1245 		goto out;
1246 	}
1247 
1248 	if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) {
1249 		SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
1250 		goto out;
1251 	}
1252 
1253 	packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0);
1254 
1255 	vsession->max_queues = 0;
1256 	memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue));
1257 	for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
1258 		struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
1259 
1260 		q->vsession = vsession;
1261 		q->vring_idx = -1;
1262 		if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) {
1263 			continue;
1264 		}
1265 		q->vring_idx = i;
1266 		rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight);
1267 
1268 		/* vring.desc and vring.desc_packed are in a union struct
1269 		 * so q->vring.desc can replace q->vring.desc_packed.
1270 		 */
1271 		if (q->vring.desc == NULL || q->vring.size == 0) {
1272 			continue;
1273 		}
1274 
1275 		if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) {
1276 			q->vring.desc = NULL;
1277 			continue;
1278 		}
1279 
1280 		if (packed_ring) {
1281 			/* Packed virtqueues support up to 2^15 entries each,
1282 			 * so the remaining top bit can be used as the wrap counter.
1283 			 */
1284 			q->packed.avail_phase = q->last_avail_idx >> 15;
1285 			q->last_avail_idx = q->last_avail_idx & 0x7FFF;
1286 			q->packed.used_phase = q->last_used_idx >> 15;
1287 			q->last_used_idx = q->last_used_idx & 0x7FFF;
1288 
1289 			if (!vsession->interrupt_mode) {
1290 				/* Disable I/O submission notifications, we'll be polling. */
1291 				q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE;
1292 			}
1293 		} else {
1294 			if (!vsession->interrupt_mode) {
1295 				/* Disable I/O submission notifications, we'll be polling. */
1296 				q->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1297 			}
1298 		}
1299 
1300 		q->packed.packed_ring = packed_ring;
1301 		vsession->max_queues = i + 1;
1302 	}
1303 
1304 	if (vhost_get_mem_table(vid, &vsession->mem) != 0) {
1305 		SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
1306 		goto out;
1307 	}
1308 
1309 	/*
1310 	 * Not sure right now, but this looks like some kind of QEMU bug: guest I/O
1311 	 * might be frozen after live migration unless all queues are kicked. It looks
1312 	 * like the previous vhost instance failed to effectively deliver all interrupts
1313 	 * before the GET_VRING_BASE message. This shouldn't harm the guest, since
1314 	 * spurious interrupts should be ignored by the guest virtio driver.
1315 	 *
1316 	 * Tested on QEMU 2.10.91 and 2.11.50.
1317 	 */
1318 	for (i = 0; i < vsession->max_queues; i++) {
1319 		struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
1320 
1321 		/* vring.desc and vring.desc_packed are in a union struct
1322 		 * so q->vring.desc can replace q->vring.desc_packed.
1323 		 */
1324 		if (q->vring.desc != NULL && q->vring.size > 0) {
1325 			rte_vhost_vring_call(vsession->vid, q->vring_idx);
1326 		}
1327 	}
1328 
1329 	vhost_session_set_coalescing(vdev, vsession, NULL);
1330 	vhost_session_mem_register(vsession->mem);
1331 	vsession->initialized = true;
1332 	rc = vdev->backend->start_session(vsession);
1333 	if (rc != 0) {
1334 		vhost_session_mem_unregister(vsession->mem);
1335 		free(vsession->mem);
1336 		goto out;
1337 	}
1338 
1339 out:
1340 	pthread_mutex_unlock(&g_vhost_mutex);
1341 	return rc;
1342 }
1343 
1344 int
1345 spdk_vhost_set_socket_path(const char *basename)
1346 {
1347 	int ret;
1348 
1349 	if (basename && strlen(basename) > 0) {
1350 		ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename);
1351 		if (ret <= 0) {
1352 			return -EINVAL;
1353 		}
1354 		if ((size_t)ret >= sizeof(dev_dirname) - 2) {
1355 			SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret);
1356 			return -EINVAL;
1357 		}
1358 
1359 		if (dev_dirname[ret - 1] != '/') {
1360 			dev_dirname[ret] = '/';
1361 			dev_dirname[ret + 1]  = '\0';
1362 		}
1363 	}
1364 
1365 	return 0;
1366 }
1367 
1368 void
1369 vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1370 {
1371 	assert(vdev->backend->dump_info_json != NULL);
1372 	vdev->backend->dump_info_json(vdev, w);
1373 }
1374 
1375 int
1376 spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
1377 {
1378 	if (vdev->pending_async_op_num) {
1379 		return -EBUSY;
1380 	}
1381 
1382 	return vdev->backend->remove_device(vdev);
1383 }
1384 
1385 int
1386 vhost_new_connection_cb(int vid, const char *ifname)
1387 {
1388 	struct spdk_vhost_dev *vdev;
1389 	struct spdk_vhost_session *vsession;
1390 
1391 	pthread_mutex_lock(&g_vhost_mutex);
1392 
1393 	vdev = spdk_vhost_dev_find(ifname);
1394 	if (vdev == NULL) {
1395 		SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
1396 		pthread_mutex_unlock(&g_vhost_mutex);
1397 		return -1;
1398 	}
1399 
1400 	/* We expect sessions inside vdev->vsessions to be sorted in ascending
1401 	 * order of vsession->id. For now we always set id = vsessions_num++
1402 	 * and append each session to the very end of the vsessions list.
1403 	 * This is required for spdk_vhost_dev_foreach_session() to work.
1404 	 */
1405 	if (vdev->vsessions_num == UINT_MAX) {
1406 		assert(false);
1407 		return -EINVAL;
1408 	}
1409 
1410 	if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) +
1411 			   vdev->backend->session_ctx_size)) {
1412 		SPDK_ERRLOG("vsession alloc failed\n");
1413 		pthread_mutex_unlock(&g_vhost_mutex);
1414 		return -1;
1415 	}
1416 	memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size);
1417 
1418 	vsession->vdev = vdev;
1419 	vsession->vid = vid;
1420 	vsession->id = vdev->vsessions_num++;
1421 	vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid);
1422 	if (vsession->name == NULL) {
1423 		SPDK_ERRLOG("vsession alloc failed\n");
1424 		pthread_mutex_unlock(&g_vhost_mutex);
1425 		free(vsession);
1426 		return -1;
1427 	}
1428 	vsession->started = false;
1429 	vsession->initialized = false;
1430 	vsession->next_stats_check_time = 0;
1431 	vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS *
1432 					 spdk_get_ticks_hz() / 1000UL;
1433 	TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq);
1434 
1435 	vhost_session_install_rte_compat_hooks(vsession);
1436 	pthread_mutex_unlock(&g_vhost_mutex);
1437 	return 0;
1438 }
1439 
1440 int
1441 vhost_destroy_connection_cb(int vid)
1442 {
1443 	struct spdk_vhost_session *vsession;
1444 	int rc = 0;
1445 
1446 	pthread_mutex_lock(&g_vhost_mutex);
1447 	vsession = vhost_session_find_by_vid(vid);
1448 	if (vsession == NULL) {
1449 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1450 		pthread_mutex_unlock(&g_vhost_mutex);
1451 		return -EINVAL;
1452 	}
1453 
1454 	if (vsession->started) {
1455 		rc = _stop_session(vsession);
1456 	}
1457 
1458 	TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq);
1459 	free(vsession->name);
1460 	free(vsession);
1461 	pthread_mutex_unlock(&g_vhost_mutex);
1462 
1463 	return rc;
1464 }
1465 
1466 void
1467 spdk_vhost_lock(void)
1468 {
1469 	pthread_mutex_lock(&g_vhost_mutex);
1470 }
1471 
1472 int
1473 spdk_vhost_trylock(void)
1474 {
1475 	return -pthread_mutex_trylock(&g_vhost_mutex);
1476 }
1477 
1478 void
1479 spdk_vhost_unlock(void)
1480 {
1481 	pthread_mutex_unlock(&g_vhost_mutex);
1482 }
1483 
1484 void
1485 spdk_vhost_init(spdk_vhost_init_cb init_cb)
1486 {
1487 	size_t len;
1488 	int ret;
1489 
1490 	g_vhost_init_thread = spdk_get_thread();
1491 	assert(g_vhost_init_thread != NULL);
1492 
1493 	if (dev_dirname[0] == '\0') {
1494 		if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
1495 			SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
1496 			ret = -1;
1497 			goto out;
1498 		}
1499 
1500 		len = strlen(dev_dirname);
1501 		if (dev_dirname[len - 1] != '/') {
1502 			dev_dirname[len] = '/';
1503 			dev_dirname[len + 1] = '\0';
1504 		}
1505 	}
1506 
1507 	ret = sem_init(&g_dpdk_sem, 0, 0);
1508 	if (ret != 0) {
1509 		SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n");
1510 		ret = -1;
1511 		goto out;
1512 	}
1513 
1514 	spdk_cpuset_zero(&g_vhost_core_mask);
1515 
1516 	/* Iterate over the existing threads instead of using SPDK_ENV_FOREACH_CORE to
1517 	 * ensure that the threads have really been created.
1518 	 */
1519 	spdk_for_each_thread(vhost_setup_core_mask, init_cb, vhost_setup_core_mask_done);
1520 	return;
1521 out:
1522 	init_cb(ret);
1523 }
1524 
1525 static void
1526 vhost_fini(void *arg1)
1527 {
1528 	struct spdk_vhost_dev *vdev, *tmp;
1529 
1530 	spdk_vhost_lock();
1531 	vdev = spdk_vhost_dev_next(NULL);
1532 	while (vdev != NULL) {
1533 		tmp = spdk_vhost_dev_next(vdev);
1534 		spdk_vhost_dev_remove(vdev);
1535 		/* don't care if it fails, there's nothing we can do for now */
1536 		vdev = tmp;
1537 	}
1538 	spdk_vhost_unlock();
1539 
1540 	spdk_cpuset_zero(&g_vhost_core_mask);
1541 
1542 	/* All devices are removed now. */
1543 	sem_destroy(&g_dpdk_sem);
1544 
1545 	g_fini_cpl_cb();
1546 }
1547 
1548 static void *
1549 session_shutdown(void *arg)
1550 {
1551 	struct spdk_vhost_dev *vdev = NULL;
1552 
1553 	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
1554 		vhost_driver_unregister(vdev->path);
1555 		vdev->registered = false;
1556 	}
1557 
1558 	SPDK_INFOLOG(vhost, "Exiting\n");
1559 	spdk_thread_send_msg(g_vhost_init_thread, vhost_fini, NULL);
1560 	return NULL;
1561 }
1562 
1563 void
1564 spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
1565 {
1566 	pthread_t tid;
1567 	int rc;
1568 
1569 	assert(spdk_get_thread() == g_vhost_init_thread);
1570 	g_fini_cpl_cb = fini_cb;
1571 
1572 	/* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
1573 	 * ops for stopping a device or removing a connection, we need to call it from
1574 	 * a separate thread to avoid deadlock.
1575 	 */
1576 	rc = pthread_create(&tid, NULL, &session_shutdown, NULL);
1577 	if (rc != 0) {
1578 		SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc));
1579 		abort();
1580 	}
1581 	pthread_detach(tid);
1582 }
1583 
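/* Write the full vhost subsystem configuration as a JSON-RPC array. For each
 * controller with non-default coalescing, an entry like the following is
 * emitted (the values here are illustrative only):
 * { "method": "vhost_controller_set_coalescing",
 *   "params": { "ctrlr": "vhost.0", "delay_base_us": 100, "iops_threshold": 60000 } }
 */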
1584 void
1585 spdk_vhost_config_json(struct spdk_json_write_ctx *w)
1586 {
1587 	struct spdk_vhost_dev *vdev;
1588 	uint32_t delay_base_us;
1589 	uint32_t iops_threshold;
1590 
1591 	spdk_json_write_array_begin(w);
1592 
1593 	spdk_vhost_lock();
1594 	vdev = spdk_vhost_dev_next(NULL);
1595 	while (vdev != NULL) {
1596 		vdev->backend->write_config_json(vdev, w);
1597 
1598 		spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
1599 		if (delay_base_us) {
1600 			spdk_json_write_object_begin(w);
1601 			spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing");
1602 
1603 			spdk_json_write_named_object_begin(w, "params");
1604 			spdk_json_write_named_string(w, "ctrlr", vdev->name);
1605 			spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
1606 			spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
1607 			spdk_json_write_object_end(w);
1608 
1609 			spdk_json_write_object_end(w);
1610 		}
1611 		vdev = spdk_vhost_dev_next(vdev);
1612 	}
1613 	spdk_vhost_unlock();
1614 
1615 	spdk_json_write_array_end(w);
1616 }
1617 
1618 SPDK_LOG_REGISTER_COMPONENT(vhost)
1619 SPDK_LOG_REGISTER_COMPONENT(vhost_ring)
1620