xref: /spdk/lib/vhost/vhost.c (revision 7961de43413e7f818f7499bf8518909beb59c82f)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/env.h"
37 #include "spdk/likely.h"
38 #include "spdk/string.h"
39 #include "spdk/util.h"
40 #include "spdk/barrier.h"
41 #include "spdk/vhost.h"
42 #include "vhost_internal.h"
43 
44 #include "spdk_internal/memory.h"
45 
46 static TAILQ_HEAD(, vhost_poll_group) g_poll_groups = TAILQ_HEAD_INITIALIZER(g_poll_groups);
47 
48 /* Temporary cpuset for poll group assignment */
49 static struct spdk_cpuset *g_tmp_cpuset;
50 
51 /* Path to folder where character device will be created. Can be set by user. */
52 static char dev_dirname[PATH_MAX] = "";
53 
54 /* Thread performing all vhost management operations */
55 static struct spdk_thread *g_vhost_init_thread;
56 
57 static spdk_vhost_fini_cb g_fini_cpl_cb;
58 
59 /**
60  * DPDK calls our callbacks synchronously but the work those callbacks
61  * perform needs to be async. Luckily, all DPDK callbacks are called on
62  * a DPDK-internal pthread, so we'll just wait on a semaphore in there.
63  */
64 static sem_t g_dpdk_sem;
65 
66 /** Return code for the current DPDK callback */
67 static int g_dpdk_response;
68 
69 struct vhost_session_fn_ctx {
70 	/** Device pointer obtained before enqueuing the event */
71 	struct spdk_vhost_dev *vdev;
72 
73 	/** ID of the session to send event to. */
74 	uint32_t vsession_id;
75 
76 	/** User provided function to be executed on session's thread. */
77 	spdk_vhost_session_fn cb_fn;
78 
79 	/**
80 	 * User provided function to be called on the init thread
81 	 * after iterating through all sessions.
82 	 */
83 	spdk_vhost_dev_fn cpl_fn;
84 
85 	/** Custom user context */
86 	void *user_ctx;
87 };
88 
89 static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER(
90 			g_vhost_devices);
91 static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
92 
93 void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len)
94 {
95 	void *vva;
96 	uint64_t newlen;
97 
98 	newlen = len;
99 	vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &newlen);
100 	if (newlen != len) {
101 		return NULL;
102 	}
103 
104 	return vva;
105 
106 }
107 
108 static void
109 vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
110 		   uint16_t req_id)
111 {
112 	struct vring_desc *desc, *desc_table;
113 	uint32_t desc_table_size;
114 	int rc;
115 
116 	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
117 		return;
118 	}
119 
120 	rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
121 	if (spdk_unlikely(rc != 0)) {
122 		SPDK_ERRLOG("Can't log used ring descriptors!\n");
123 		return;
124 	}
125 
126 	do {
127 		if (vhost_vring_desc_is_wr(desc)) {
128 			/* To be honest, only pages realy touched should be logged, but
129 			 * doing so would require tracking those changes in each backed.
130 			 * Also backend most likely will touch all/most of those pages so
131 			 * for lets assume we touched all pages passed to as writeable buffers. */
132 			rte_vhost_log_write(vsession->vid, desc->addr, desc->len);
133 		}
134 		vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
135 	} while (desc);
136 }
137 
138 static void
139 vhost_log_used_vring_elem(struct spdk_vhost_session *vsession,
140 			  struct spdk_vhost_virtqueue *virtqueue,
141 			  uint16_t idx)
142 {
143 	uint64_t offset, len;
144 	uint16_t vq_idx;
145 
146 	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
147 		return;
148 	}
149 
150 	offset = offsetof(struct vring_used, ring[idx]);
151 	len = sizeof(virtqueue->vring.used->ring[idx]);
152 	vq_idx = virtqueue - vsession->virtqueue;
153 
154 	rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len);
155 }
156 
157 static void
158 vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
159 			 struct spdk_vhost_virtqueue *virtqueue)
160 {
161 	uint64_t offset, len;
162 	uint16_t vq_idx;
163 
164 	if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
165 		return;
166 	}
167 
168 	offset = offsetof(struct vring_used, idx);
169 	len = sizeof(virtqueue->vring.used->idx);
170 	vq_idx = virtqueue - vsession->virtqueue;
171 
172 	rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len);
173 }
174 
/*
 * Get available requests from avail ring.
 */
uint16_t
vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
			uint16_t reqs_len)
{
	struct rte_vhost_vring *vring = &virtqueue->vring;
	struct vring_avail *avail = vring->avail;
	/* Ring sizes are powers of two, so (size - 1) acts as an index mask. */
	uint16_t size_mask = vring->size - 1;
	uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx;
	uint16_t count, i;

	/* Both indexes are free-running 16-bit counters; unsigned
	 * subtraction handles wrap-around naturally. */
	count = avail_idx - last_idx;
	if (spdk_likely(count == 0)) {
		return 0;
	}

	if (spdk_unlikely(count > vring->size)) {
		/* TODO: the queue is unrecoverably broken and should be marked so.
		 * For now we will fail silently and report there are no new avail entries.
		 */
		return 0;
	}

	/* Take no more entries than the caller's buffer can hold. */
	count = spdk_min(count, reqs_len);
	virtqueue->last_avail_idx += count;
	for (i = 0; i < count; i++) {
		reqs[i] = vring->avail->ring[(last_idx + i) & size_mask];
	}

	SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
		      "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
		      last_idx, avail_idx, count);

	return count;
}
212 
213 static bool
214 vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
215 {
216 	return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
217 }
218 
219 int
220 vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
221 		  uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
222 		  uint32_t *desc_table_size)
223 {
224 	if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
225 		return -1;
226 	}
227 
228 	*desc = &virtqueue->vring.desc[req_idx];
229 
230 	if (vhost_vring_desc_is_indirect(*desc)) {
231 		*desc_table_size = (*desc)->len / sizeof(**desc);
232 		*desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
233 					       sizeof(**desc) * *desc_table_size);
234 		*desc = *desc_table;
235 		if (*desc == NULL) {
236 			return -1;
237 		}
238 
239 		return 0;
240 	}
241 
242 	*desc_table = virtqueue->vring.desc;
243 	*desc_table_size = virtqueue->vring.size;
244 
245 	return 0;
246 }
247 
248 int
249 vhost_vq_used_signal(struct spdk_vhost_session *vsession,
250 		     struct spdk_vhost_virtqueue *virtqueue)
251 {
252 	if (virtqueue->used_req_cnt == 0) {
253 		return 0;
254 	}
255 
256 	virtqueue->req_cnt += virtqueue->used_req_cnt;
257 	virtqueue->used_req_cnt = 0;
258 
259 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
260 		      "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n",
261 		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx);
262 
263 	if (rte_vhost_vring_call(vsession->vid, virtqueue->vring_idx) == 0) {
264 		/* interrupt signalled */
265 		return 1;
266 	} else {
267 		/* interrupt not signalled */
268 		return 0;
269 	}
270 }
271 
272 
273 static void
274 check_session_io_stats(struct spdk_vhost_session *vsession, uint64_t now)
275 {
276 	struct spdk_vhost_virtqueue *virtqueue;
277 	uint32_t irq_delay_base = vsession->coalescing_delay_time_base;
278 	uint32_t io_threshold = vsession->coalescing_io_rate_threshold;
279 	int32_t irq_delay;
280 	uint32_t req_cnt;
281 	uint16_t q_idx;
282 
283 	if (now < vsession->next_stats_check_time) {
284 		return;
285 	}
286 
287 	vsession->next_stats_check_time = now + vsession->stats_check_interval;
288 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
289 		virtqueue = &vsession->virtqueue[q_idx];
290 
291 		req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt;
292 		if (req_cnt <= io_threshold) {
293 			continue;
294 		}
295 
296 		irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold;
297 		virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay);
298 
299 		virtqueue->req_cnt = 0;
300 		virtqueue->next_event_time = now;
301 	}
302 }
303 
/*
 * Deliver completion interrupts for every queue of the session, honoring
 * the interrupt coalescing settings when they are enabled.
 */
void
vhost_session_used_signal(struct spdk_vhost_session *vsession)
{
	struct spdk_vhost_virtqueue *virtqueue;
	uint64_t now;
	uint16_t q_idx;

	if (vsession->coalescing_delay_time_base == 0) {
		/* Coalescing disabled - signal every live queue immediately,
		 * unless the guest opted out of interrupts for it. */
		for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
			virtqueue = &vsession->virtqueue[q_idx];

			if (virtqueue->vring.desc == NULL ||
			    (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
				continue;
			}

			vhost_vq_used_signal(vsession, virtqueue);
		}
	} else {
		now = spdk_get_ticks();
		check_session_io_stats(vsession, now);

		for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
			virtqueue = &vsession->virtqueue[q_idx];

			/* No need for event right now */
			if (now < virtqueue->next_event_time ||
			    (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
				continue;
			}

			if (!vhost_vq_used_signal(vsession, virtqueue)) {
				continue;
			}

			/* Syscall is quite long so update time */
			now = spdk_get_ticks();
			virtqueue->next_event_time = now + virtqueue->irq_delay_time;
		}
	}
}
345 
346 static int
347 vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
348 			     struct spdk_vhost_session *vsession, void *ctx)
349 {
350 	vsession->coalescing_delay_time_base =
351 		vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL;
352 	vsession->coalescing_io_rate_threshold =
353 		vdev->coalescing_iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
354 	return 0;
355 }
356 
357 static int
358 vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
359 			 uint32_t iops_threshold)
360 {
361 	uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
362 	uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
363 
364 	if (delay_time_base >= UINT32_MAX) {
365 		SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", delay_base_us);
366 		return -EINVAL;
367 	} else if (io_rate == 0) {
368 		SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate,
369 			    1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS);
370 		return -EINVAL;
371 	}
372 
373 	vdev->coalescing_delay_us = delay_base_us;
374 	vdev->coalescing_iops_threshold = iops_threshold;
375 	return 0;
376 }
377 
378 int
379 spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
380 			  uint32_t iops_threshold)
381 {
382 	int rc;
383 
384 	rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold);
385 	if (rc != 0) {
386 		return rc;
387 	}
388 
389 	vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL);
390 	return 0;
391 }
392 
393 void
394 spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
395 			  uint32_t *iops_threshold)
396 {
397 	if (delay_base_us) {
398 		*delay_base_us = vdev->coalescing_delay_us;
399 	}
400 
401 	if (iops_threshold) {
402 		*iops_threshold = vdev->coalescing_iops_threshold;
403 	}
404 }
405 
/*
 * Enqueue id and len to used ring.
 */
void
vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
			   struct spdk_vhost_virtqueue *virtqueue,
			   uint16_t id, uint32_t len)
{
	struct rte_vhost_vring *vring = &virtqueue->vring;
	struct vring_used *used = vring->used;
	/* Ring sizes are powers of two, so masking implements wrap-around. */
	uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1);

	SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
		      "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len);

	/* Dirty-log the request's writable buffers for live migration
	 * (no-op unless VHOST_F_LOG_ALL was negotiated). */
	vhost_log_req_desc(vsession, virtqueue, id);

	virtqueue->last_used_idx++;
	used->ring[last_idx].id = id;
	used->ring[last_idx].len = len;

	/* Ensure the used ring is updated before we log it or increment used->idx. */
	spdk_smp_wmb();

	vhost_log_used_vring_elem(vsession, virtqueue, last_idx);
	/* Volatile store: the guest concurrently reads used->idx, so the
	 * compiler must not defer or tear this write. */
	* (volatile uint16_t *) &used->idx = virtqueue->last_used_idx;
	vhost_log_used_vring_idx(vsession, virtqueue);

	/* Counted by the interrupt-signalling/coalescing logic. */
	virtqueue->used_req_cnt++;
}
437 
438 int
439 vhost_vring_desc_get_next(struct vring_desc **desc,
440 			  struct vring_desc *desc_table, uint32_t desc_table_size)
441 {
442 	struct vring_desc *old_desc = *desc;
443 	uint16_t next_idx;
444 
445 	if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
446 		*desc = NULL;
447 		return 0;
448 	}
449 
450 	next_idx = old_desc->next;
451 	if (spdk_unlikely(next_idx >= desc_table_size)) {
452 		*desc = NULL;
453 		return -1;
454 	}
455 
456 	*desc = &desc_table[next_idx];
457 	return 0;
458 }
459 
460 bool
461 vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
462 {
463 	return !!(cur_desc->flags & VRING_DESC_F_WRITE);
464 }
465 
466 int
467 vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
468 			uint16_t *iov_index, const struct vring_desc *desc)
469 {
470 	uint64_t len;
471 	uint64_t remaining = desc->len;
472 	uintptr_t payload = desc->addr;
473 	uintptr_t vva;
474 
475 	do {
476 		if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
477 			SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
478 			return -1;
479 		}
480 		len = remaining;
481 		vva = (uintptr_t)rte_vhost_va_from_guest_pa(vsession->mem, payload, &len);
482 		if (vva == 0 || len == 0) {
483 			SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
484 			return -1;
485 		}
486 		iov[*iov_index].iov_base = (void *)vva;
487 		iov[*iov_index].iov_len = len;
488 		remaining -= len;
489 		payload += len;
490 		(*iov_index)++;
491 	} while (remaining);
492 
493 	return 0;
494 }
495 
496 static struct spdk_vhost_session *
497 vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
498 {
499 	struct spdk_vhost_session *vsession;
500 
501 	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
502 		if (vsession->id == id) {
503 			return vsession;
504 		}
505 	}
506 
507 	return NULL;
508 }
509 
510 struct spdk_vhost_session *
511 vhost_session_find_by_vid(int vid)
512 {
513 	struct spdk_vhost_dev *vdev;
514 	struct spdk_vhost_session *vsession;
515 
516 	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
517 		TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
518 			if (vsession->vid == vid) {
519 				return vsession;
520 			}
521 		}
522 	}
523 
524 	return NULL;
525 }
526 
527 struct spdk_vhost_dev *
528 spdk_vhost_dev_next(struct spdk_vhost_dev *vdev)
529 {
530 	if (vdev == NULL) {
531 		return TAILQ_FIRST(&g_vhost_devices);
532 	}
533 
534 	return TAILQ_NEXT(vdev, tailq);
535 }
536 
537 struct spdk_vhost_dev *
538 spdk_vhost_dev_find(const char *ctrlr_name)
539 {
540 	struct spdk_vhost_dev *vdev;
541 	size_t dev_dirname_len = strlen(dev_dirname);
542 
543 	if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) {
544 		ctrlr_name += dev_dirname_len;
545 	}
546 
547 	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
548 		if (strcmp(vdev->name, ctrlr_name) == 0) {
549 			return vdev;
550 		}
551 	}
552 
553 	return NULL;
554 }
555 
556 static int
557 vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
558 {
559 	int rc;
560 
561 	if (cpumask == NULL) {
562 		return -1;
563 	}
564 
565 	if (mask == NULL) {
566 		spdk_cpuset_copy(cpumask, spdk_app_get_core_mask());
567 		return 0;
568 	}
569 
570 	rc = spdk_app_parse_core_mask(mask, cpumask);
571 	if (rc < 0) {
572 		SPDK_ERRLOG("invalid cpumask %s\n", mask);
573 		return -1;
574 	}
575 
576 	if (spdk_cpuset_count(cpumask) == 0) {
577 		SPDK_ERRLOG("no cpu is selected among reactor mask(=%s)\n",
578 			    spdk_cpuset_fmt(spdk_app_get_core_mask()));
579 		return -1;
580 	}
581 
582 	return 0;
583 }
584 
/*
 * Register a new vhost controller: validate the name and cpumask, build the
 * socket path, take ownership of name/path copies, link the device into the
 * global list and expose its vhost-user unix socket.
 *
 * On success the device owns `cpumask`; on any failure the cpumask is freed
 * and the device is left unlinked. Returns 0 or a negative errno.
 */
int
vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
		   const struct spdk_vhost_dev_backend *backend)
{
	char path[PATH_MAX];
	struct spdk_cpuset *cpumask;
	int rc;

	assert(vdev);
	if (name == NULL) {
		SPDK_ERRLOG("Can't register controller with no name\n");
		return -EINVAL;
	}

	cpumask = spdk_cpuset_alloc();
	if (!cpumask) {
		SPDK_ERRLOG("spdk_cpuset_alloc failed\n");
		return -ENOMEM;
	}

	if (vhost_parse_core_mask(mask_str, cpumask) != 0) {
		SPDK_ERRLOG("cpumask %s is invalid (app mask is 0x%s)\n",
			    mask_str, spdk_cpuset_fmt(spdk_app_get_core_mask()));
		rc = -EINVAL;
		goto out;
	}

	if (spdk_vhost_dev_find(name)) {
		SPDK_ERRLOG("vhost controller %s already exists.\n", name);
		rc = -EEXIST;
		goto out;
	}

	/* The socket lives at dev_dirname + name; reject names whose
	 * resulting path would not fit in PATH_MAX. */
	if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
		SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
			    name);
		rc = -EINVAL;
		goto out;
	}

	vdev->name = strdup(name);
	vdev->path = strdup(path);
	if (vdev->name == NULL || vdev->path == NULL) {
		free(vdev->name);
		free(vdev->path);
		rc = -EIO;
		goto out;
	}

	vdev->cpumask = cpumask;
	vdev->registered = true;
	vdev->backend = backend;
	TAILQ_INIT(&vdev->vsessions);
	TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq);

	/* Start with the library-default interrupt coalescing settings. */
	vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
				 SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);

	if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features)) {
		/* Roll back the list insertion and string copies made above. */
		TAILQ_REMOVE(&g_vhost_devices, vdev, tailq);
		free(vdev->name);
		free(vdev->path);
		rc = -EIO;
		goto out;
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name);
	return 0;

out:
	spdk_cpuset_free(cpumask);
	return rc;
}
658 
/*
 * Unregister a controller: tear down its vhost-user socket, free the
 * resources taken in vhost_dev_register() and unlink it from the global
 * device list. Fails with -EBUSY while any session is still connected.
 */
int
vhost_dev_unregister(struct spdk_vhost_dev *vdev)
{
	if (!TAILQ_EMPTY(&vdev->vsessions)) {
		SPDK_ERRLOG("Controller %s has still valid connection.\n", vdev->name);
		return -EBUSY;
	}

	if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) {
		SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
			    "Check if domain socket %s still exists\n",
			    vdev->name, vdev->path);
		return -EIO;
	}

	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name);

	/* Release everything vhost_dev_register() allocated. */
	free(vdev->name);
	free(vdev->path);
	spdk_cpuset_free(vdev->cpumask);
	TAILQ_REMOVE(&g_vhost_devices, vdev, tailq);
	return 0;
}
682 
683 static struct spdk_vhost_session *
684 vhost_session_next(struct spdk_vhost_dev *vdev, unsigned prev_id)
685 {
686 	struct spdk_vhost_session *vsession;
687 
688 	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
689 		if (vsession->id > prev_id) {
690 			return vsession;
691 		}
692 	}
693 
694 	return NULL;
695 }
696 
697 const char *
698 spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
699 {
700 	assert(vdev != NULL);
701 	return vdev->name;
702 }
703 
704 const struct spdk_cpuset *
705 spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
706 {
707 	assert(vdev != NULL);
708 	return vdev->cpumask;
709 }
710 
711 struct vhost_poll_group *
712 vhost_get_poll_group(struct spdk_cpuset *cpumask)
713 {
714 	struct vhost_poll_group *pg, *selected_pg;
715 	uint32_t min_ctrlrs;
716 
717 	min_ctrlrs = INT_MAX;
718 	selected_pg = TAILQ_FIRST(&g_poll_groups);
719 
720 	TAILQ_FOREACH(pg, &g_poll_groups, tailq) {
721 		spdk_cpuset_copy(g_tmp_cpuset, cpumask);
722 		spdk_cpuset_and(g_tmp_cpuset, spdk_thread_get_cpumask(pg->thread));
723 
724 		/* ignore threads which could be relocated to a non-masked cpu. */
725 		if (!spdk_cpuset_equal(g_tmp_cpuset, spdk_thread_get_cpumask(pg->thread))) {
726 			continue;
727 		}
728 
729 		if (pg->ref < min_ctrlrs) {
730 			selected_pg = pg;
731 			min_ctrlrs = pg->ref;
732 		}
733 	}
734 
735 	assert(selected_pg != NULL);
736 	return selected_pg;
737 }
738 
739 static struct vhost_poll_group *
740 _get_current_poll_group(void)
741 {
742 	struct vhost_poll_group *pg;
743 	struct spdk_thread *cur_thread = spdk_get_thread();
744 
745 	TAILQ_FOREACH(pg, &g_poll_groups, tailq) {
746 		if (pg->thread == cur_thread) {
747 			return pg;
748 		}
749 	}
750 
751 	return NULL;
752 }
753 
754 static void
755 wait_for_semaphore(int timeout_sec, const char *errmsg)
756 {
757 	struct timespec timeout;
758 	int rc;
759 
760 	clock_gettime(CLOCK_REALTIME, &timeout);
761 	timeout.tv_sec += timeout_sec;
762 	rc = sem_timedwait(&g_dpdk_sem, &timeout);
763 	if (rc != 0) {
764 		SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
765 		sem_wait(&g_dpdk_sem);
766 	}
767 }
768 
/*
 * Publish a callback's result and wake the DPDK thread blocked in
 * vhost_session_send_event(). The response must be stored before
 * sem_post() so the waiter reads a valid value.
 */
static void
vhost_session_cb_done(int rc)
{
	g_dpdk_response = rc;
	sem_post(&g_dpdk_sem);
}
775 
776 void
777 vhost_session_start_done(struct spdk_vhost_session *vsession, int response)
778 {
779 	if (response == 0) {
780 		vsession->started = true;
781 		vsession->poll_group = _get_current_poll_group();
782 		assert(vsession->poll_group != NULL);
783 		assert(vsession->poll_group->ref < UINT_MAX);
784 		vsession->poll_group->ref++;
785 
786 		assert(vsession->vdev->active_session_num < UINT32_MAX);
787 		vsession->vdev->active_session_num++;
788 	}
789 
790 	vhost_session_cb_done(response);
791 }
792 
793 void
794 vhost_session_stop_done(struct spdk_vhost_session *vsession, int response)
795 {
796 	if (response == 0) {
797 		vsession->started = false;
798 		assert(vsession->poll_group != NULL);
799 		assert(vsession->poll_group->ref > 0);
800 		vsession->poll_group->ref--;
801 		vsession->poll_group = NULL;
802 
803 		assert(vsession->vdev->active_session_num > 0);
804 		vsession->vdev->active_session_num--;
805 	}
806 
807 	vhost_session_cb_done(response);
808 }
809 
/*
 * Runs on the target poll group's thread on behalf of
 * vhost_session_send_event(); executes the user callback under the global
 * vhost mutex.
 */
static void
vhost_event_cb(void *arg1)
{
	struct vhost_session_fn_ctx *ctx = arg1;
	struct spdk_vhost_session *vsession;

	/* Never block the reactor on the mutex - if it is contended,
	 * re-queue this callback and try again later. */
	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
		spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1);
		return;
	}

	/* The session may have been removed while the message was in
	 * flight; the callback must tolerate a NULL vsession. */
	vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id);
	ctx->cb_fn(ctx->vdev, vsession, NULL);
	pthread_mutex_unlock(&g_vhost_mutex);
}
825 
/*
 * Synchronously run cb_fn for `vsession` on the given poll group's thread.
 * Called on the DPDK thread with g_vhost_mutex held; blocks until the
 * callback signals completion via vhost_session_*_done().
 * Returns the callback's response code.
 */
int
vhost_session_send_event(struct vhost_poll_group *pg,
			 struct spdk_vhost_session *vsession,
			 spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
			 const char *errmsg)
{
	/* Stack allocation is safe: we block below until the target thread
	 * has consumed the context and posted g_dpdk_sem. */
	struct vhost_session_fn_ctx ev_ctx = {0};

	ev_ctx.vdev = vsession->vdev;
	ev_ctx.vsession_id = vsession->id;
	ev_ctx.cb_fn = cb_fn;

	spdk_thread_send_msg(pg->thread, vhost_event_cb, &ev_ctx);

	/* Drop the mutex while waiting - vhost_event_cb() takes the same
	 * mutex, so holding it here would deadlock. */
	pthread_mutex_unlock(&g_vhost_mutex);
	wait_for_semaphore(timeout_sec, errmsg);
	pthread_mutex_lock(&g_vhost_mutex);

	/* Set by vhost_session_cb_done() before the semaphore was posted. */
	return g_dpdk_response;
}
846 
847 static void foreach_session_continue(struct vhost_session_fn_ctx *ev_ctx,
848 				     struct spdk_vhost_session *vsession);
849 
/*
 * Final step of vhost_dev_foreach_session(): runs on the init thread, drops
 * the device's pending-async-op reference, invokes the user's completion
 * callback and frees the iteration context.
 */
static void
foreach_session_finish_cb(void *arg1)
{
	struct vhost_session_fn_ctx *ctx = arg1;
	struct spdk_vhost_dev *vdev = ctx->vdev;

	/* Never block the reactor on the mutex - re-queue on contention. */
	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
		spdk_thread_send_msg(spdk_get_thread(),
				     foreach_session_finish_cb, arg1);
		return;
	}

	assert(vdev->pending_async_op_num > 0);
	vdev->pending_async_op_num--;
	if (ctx->cpl_fn != NULL) {
		ctx->cpl_fn(vdev, ctx->user_ctx);
	}

	pthread_mutex_unlock(&g_vhost_mutex);
	free(ctx);
}
871 
/*
 * Per-session step of vhost_dev_foreach_session() for *started* sessions.
 * Runs on (what is believed to be) the session's poll group thread, invokes
 * the user callback there, then resumes iteration with the next session.
 */
static void
foreach_session_continue_cb(void *arg1)
{
	struct vhost_session_fn_ctx *ctx = arg1;
	struct spdk_vhost_session *vsession = NULL;
	struct spdk_vhost_dev *vdev = ctx->vdev;
	int rc;

	/* Never block the reactor on the mutex - re-queue on contention. */
	if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
		spdk_thread_send_msg(spdk_get_thread(),
				     foreach_session_continue_cb, arg1);
		return;
	}

	vsession = vhost_session_find_by_id(vdev, ctx->vsession_id);
	if (vsession == NULL || !vsession->initialized) {
		/* The session must have been removed in the meantime, so we
		 * just skip it in our foreach chain
		 */
		goto out_unlock_continue;
	}

	if (vsession->started && vsession->poll_group->thread != spdk_get_thread()) {
		/* if session has been relocated to other thread, it is no longer thread-safe
		 * to access its contents here. Even though we're running under the global
		 * vhost mutex, the session itself (and its pollers) are not. We need to chase
		 * the session thread as many times as necessary.
		 */
		spdk_thread_send_msg(vsession->poll_group->thread,
				     foreach_session_continue_cb, arg1);
		pthread_mutex_unlock(&g_vhost_mutex);
		return;
	}

	rc = ctx->cb_fn(vdev, vsession, ctx->user_ctx);
	if (rc < 0) {
		/* Negative rc aborts the whole iteration: the context is
		 * freed and no completion callback runs.
		 * NOTE(review): vdev->pending_async_op_num is not decremented
		 * on this path - confirm whether that is intentional. */
		pthread_mutex_unlock(&g_vhost_mutex);
		free(ctx);
		return;
	}

out_unlock_continue:
	vsession = vhost_session_next(vdev, ctx->vsession_id);
	foreach_session_continue(ctx, vsession);
	pthread_mutex_unlock(&g_vhost_mutex);
}
918 
/*
 * Advance the iteration started by vhost_dev_foreach_session(). Sessions
 * that are not started are handled inline (they have no active pollers, so
 * g_vhost_mutex suffices); started sessions are chased onto their poll
 * group's thread via foreach_session_continue_cb(). When the list is
 * exhausted, completion runs on the init thread.
 */
static void
foreach_session_continue(struct vhost_session_fn_ctx *ev_ctx,
			 struct spdk_vhost_session *vsession)
{
	struct spdk_vhost_dev *vdev = ev_ctx->vdev;
	int rc;

	while (vsession != NULL && !vsession->started) {
		if (vsession->initialized) {
			rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx);
			if (rc < 0) {
				/* Negative rc aborts the whole iteration.
				 * NOTE(review): ev_ctx is not freed and
				 * pending_async_op_num is not decremented on
				 * this path - confirm this is intentional. */
				return;
			}
		}

		vsession = vhost_session_next(vdev, vsession->id);
	}

	if (vsession != NULL) {
		/* Started session - continue on its poll group's thread. */
		ev_ctx->vsession_id = vsession->id;
		spdk_thread_send_msg(vsession->poll_group->thread,
				     foreach_session_continue_cb, ev_ctx);
	} else {
		/* All sessions visited - finish on the init thread. */
		ev_ctx->vsession_id = UINT32_MAX;
		spdk_thread_send_msg(g_vhost_init_thread,
				     foreach_session_finish_cb, ev_ctx);
	}
}
947 
948 void
949 vhost_dev_foreach_session(struct spdk_vhost_dev *vdev,
950 			  spdk_vhost_session_fn fn,
951 			  spdk_vhost_dev_fn cpl_fn,
952 			  void *arg)
953 {
954 	struct spdk_vhost_session *vsession = TAILQ_FIRST(&vdev->vsessions);
955 	struct vhost_session_fn_ctx *ev_ctx;
956 
957 	ev_ctx = calloc(1, sizeof(*ev_ctx));
958 	if (ev_ctx == NULL) {
959 		SPDK_ERRLOG("Failed to alloc vhost event.\n");
960 		assert(false);
961 		return;
962 	}
963 
964 	ev_ctx->vdev = vdev;
965 	ev_ctx->cb_fn = fn;
966 	ev_ctx->cpl_fn = cpl_fn;
967 	ev_ctx->user_ctx = arg;
968 
969 	assert(vdev->pending_async_op_num < UINT32_MAX);
970 	vdev->pending_async_op_num++;
971 	foreach_session_continue(ev_ctx, vsession);
972 }
973 
974 static int
975 _stop_session(struct spdk_vhost_session *vsession)
976 {
977 	struct spdk_vhost_dev *vdev = vsession->vdev;
978 	struct spdk_vhost_virtqueue *q;
979 	int rc;
980 	uint16_t i;
981 
982 	rc = vdev->backend->stop_session(vsession);
983 	if (rc != 0) {
984 		SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid);
985 		pthread_mutex_unlock(&g_vhost_mutex);
986 		return rc;
987 	}
988 
989 	for (i = 0; i < vsession->max_queues; i++) {
990 		q = &vsession->virtqueue[i];
991 		if (q->vring.desc == NULL) {
992 			continue;
993 		}
994 		rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx);
995 	}
996 
997 	vhost_session_mem_unregister(vsession->mem);
998 	free(vsession->mem);
999 
1000 	return 0;
1001 }
1002 
1003 int
1004 vhost_stop_device_cb(int vid)
1005 {
1006 	struct spdk_vhost_session *vsession;
1007 	int rc;
1008 
1009 	pthread_mutex_lock(&g_vhost_mutex);
1010 	vsession = vhost_session_find_by_vid(vid);
1011 	if (vsession == NULL) {
1012 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1013 		pthread_mutex_unlock(&g_vhost_mutex);
1014 		return -EINVAL;
1015 	}
1016 
1017 	if (!vsession->started) {
1018 		/* already stopped, nothing to do */
1019 		pthread_mutex_unlock(&g_vhost_mutex);
1020 		return -EALREADY;
1021 	}
1022 
1023 	rc = _stop_session(vsession);
1024 	pthread_mutex_unlock(&g_vhost_mutex);
1025 
1026 	return rc;
1027 }
1028 
/*
 * DPDK "new_device" callback: a guest driver is ready. Discover the vrings,
 * fetch negotiated features and the guest memory table, then hand the
 * session over to the backend. Returns 0 on success, negative on failure.
 */
int
vhost_start_device_cb(int vid)
{
	struct spdk_vhost_dev *vdev;
	struct spdk_vhost_session *vsession;
	int rc = -1;
	uint16_t i;

	pthread_mutex_lock(&g_vhost_mutex);

	vsession = vhost_session_find_by_vid(vid);
	if (vsession == NULL) {
		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
		goto out;
	}

	vdev = vsession->vdev;
	if (vsession->started) {
		/* already started, nothing to do */
		rc = 0;
		goto out;
	}

	/* Re-discover every vring from scratch; the layout may have changed
	 * since the last start (e.g. after a reconnect). */
	vsession->max_queues = 0;
	memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue));
	for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
		struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];

		q->vring_idx = -1;
		if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) {
			continue;
		}
		q->vring_idx = i;

		if (q->vring.desc == NULL || q->vring.size == 0) {
			continue;
		}

		if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) {
			/* Treat a queue whose base indexes cannot be fetched
			 * as unusable. */
			q->vring.desc = NULL;
			continue;
		}

		/* Disable I/O submission notifications, we'll be polling. */
		q->vring.used->flags = VRING_USED_F_NO_NOTIFY;
		vsession->max_queues = i + 1;
	}

	if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) {
		SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
		goto out;
	}

	if (vhost_get_mem_table(vid, &vsession->mem) != 0) {
		SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
		goto out;
	}

	/*
	 * Not sure right now, but this looks like some kind of QEMU bug:
	 * guest I/O might be frozen after live migration unless all queues are
	 * kicked. It appears the previous vhost instance failed to effectively
	 * deliver all interrupts before the GET_VRING_BASE message. This
	 * shouldn't harm the guest since spurious interrupts should be ignored
	 * by the guest virtio driver.
	 *
	 * Tested on QEMU 2.10.91 and 2.11.50.
	 */
	for (i = 0; i < vsession->max_queues; i++) {
		struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];

		if (q->vring.desc != NULL && q->vring.size > 0) {
			rte_vhost_vring_call(vsession->vid, q->vring_idx);
		}
	}

	vhost_session_set_coalescing(vdev, vsession, NULL);
	vhost_session_mem_register(vsession->mem);
	vsession->initialized = true;
	rc = vdev->backend->start_session(vsession);
	if (rc != 0) {
		/* Roll back the memory registration on backend failure. */
		vhost_session_mem_unregister(vsession->mem);
		free(vsession->mem);
		goto out;
	}

out:
	pthread_mutex_unlock(&g_vhost_mutex);
	return rc;
}
1118 
1119 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
1120 int
1121 vhost_get_config_cb(int vid, uint8_t *config, uint32_t len)
1122 {
1123 	struct spdk_vhost_session *vsession;
1124 	struct spdk_vhost_dev *vdev;
1125 	int rc = -1;
1126 
1127 	pthread_mutex_lock(&g_vhost_mutex);
1128 	vsession = vhost_session_find_by_vid(vid);
1129 	if (vsession == NULL) {
1130 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1131 		goto out;
1132 	}
1133 
1134 	vdev = vsession->vdev;
1135 	if (vdev->backend->vhost_get_config) {
1136 		rc = vdev->backend->vhost_get_config(vdev, config, len);
1137 	}
1138 
1139 out:
1140 	pthread_mutex_unlock(&g_vhost_mutex);
1141 	return rc;
1142 }
1143 
1144 int
1145 vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags)
1146 {
1147 	struct spdk_vhost_session *vsession;
1148 	struct spdk_vhost_dev *vdev;
1149 	int rc = -1;
1150 
1151 	pthread_mutex_lock(&g_vhost_mutex);
1152 	vsession = vhost_session_find_by_vid(vid);
1153 	if (vsession == NULL) {
1154 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1155 		goto out;
1156 	}
1157 
1158 	vdev = vsession->vdev;
1159 	if (vdev->backend->vhost_set_config) {
1160 		rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags);
1161 	}
1162 
1163 out:
1164 	pthread_mutex_unlock(&g_vhost_mutex);
1165 	return rc;
1166 }
1167 #endif
1168 
1169 int
1170 spdk_vhost_set_socket_path(const char *basename)
1171 {
1172 	int ret;
1173 
1174 	if (basename && strlen(basename) > 0) {
1175 		ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename);
1176 		if (ret <= 0) {
1177 			return -EINVAL;
1178 		}
1179 		if ((size_t)ret >= sizeof(dev_dirname) - 2) {
1180 			SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret);
1181 			return -EINVAL;
1182 		}
1183 
1184 		if (dev_dirname[ret - 1] != '/') {
1185 			dev_dirname[ret] = '/';
1186 			dev_dirname[ret + 1]  = '\0';
1187 		}
1188 	}
1189 
1190 	return 0;
1191 }
1192 
/*
 * Write backend-specific information about vdev into the JSON context w.
 * Every backend must supply a dump_info_json callback.
 */
void
vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
	assert(vdev->backend->dump_info_json != NULL);
	vdev->backend->dump_info_json(vdev, w);
}
1199 
1200 int
1201 spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
1202 {
1203 	if (vdev->pending_async_op_num) {
1204 		return -EBUSY;
1205 	}
1206 
1207 	return vdev->backend->remove_device(vdev);
1208 }
1209 
1210 int
1211 vhost_new_connection_cb(int vid, const char *ifname)
1212 {
1213 	struct spdk_vhost_dev *vdev;
1214 	struct spdk_vhost_session *vsession;
1215 
1216 	pthread_mutex_lock(&g_vhost_mutex);
1217 
1218 	vdev = spdk_vhost_dev_find(ifname);
1219 	if (vdev == NULL) {
1220 		SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
1221 		pthread_mutex_unlock(&g_vhost_mutex);
1222 		return -1;
1223 	}
1224 
1225 	/* We expect sessions inside vdev->vsessions to be sorted in ascending
1226 	 * order in regard of vsession->id. For now we always set id = vsessions_cnt++
1227 	 * and append each session to the very end of the vsessions list.
1228 	 * This is required for spdk_vhost_dev_foreach_session() to work.
1229 	 */
1230 	if (vdev->vsessions_num == UINT_MAX) {
1231 		assert(false);
1232 		return -EINVAL;
1233 	}
1234 
1235 	if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) +
1236 			   vdev->backend->session_ctx_size)) {
1237 		SPDK_ERRLOG("vsession alloc failed\n");
1238 		pthread_mutex_unlock(&g_vhost_mutex);
1239 		return -1;
1240 	}
1241 	memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size);
1242 
1243 	vsession->vdev = vdev;
1244 	vsession->vid = vid;
1245 	vsession->id = vdev->vsessions_num++;
1246 	vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid);
1247 	if (vsession->name == NULL) {
1248 		SPDK_ERRLOG("vsession alloc failed\n");
1249 		pthread_mutex_unlock(&g_vhost_mutex);
1250 		free(vsession);
1251 		return -1;
1252 	}
1253 	vsession->poll_group = NULL;
1254 	vsession->started = false;
1255 	vsession->initialized = false;
1256 	vsession->next_stats_check_time = 0;
1257 	vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS *
1258 					 spdk_get_ticks_hz() / 1000UL;
1259 	TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq);
1260 
1261 	vhost_session_install_rte_compat_hooks(vsession);
1262 	pthread_mutex_unlock(&g_vhost_mutex);
1263 	return 0;
1264 }
1265 
1266 int
1267 vhost_destroy_connection_cb(int vid)
1268 {
1269 	struct spdk_vhost_session *vsession;
1270 	int rc = 0;
1271 
1272 	pthread_mutex_lock(&g_vhost_mutex);
1273 	vsession = vhost_session_find_by_vid(vid);
1274 	if (vsession == NULL) {
1275 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1276 		pthread_mutex_unlock(&g_vhost_mutex);
1277 		return -EINVAL;
1278 	}
1279 
1280 	if (vsession->started) {
1281 		rc = _stop_session(vsession);
1282 	}
1283 
1284 	TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq);
1285 	free(vsession->name);
1286 	free(vsession);
1287 	pthread_mutex_unlock(&g_vhost_mutex);
1288 
1289 	return rc;
1290 }
1291 
/* Acquire the global vhost mutex, blocking until it is available. */
void
spdk_vhost_lock(void)
{
	pthread_mutex_lock(&g_vhost_mutex);
}
1297 
/*
 * Try to acquire the global vhost mutex without blocking.
 * Returns 0 on success, or a negated errno (e.g. -EBUSY) on failure,
 * matching the SPDK convention of negative error codes.
 */
int
spdk_vhost_trylock(void)
{
	return -pthread_mutex_trylock(&g_vhost_mutex);
}
1303 
/* Release the global vhost mutex. */
void
spdk_vhost_unlock(void)
{
	pthread_mutex_unlock(&g_vhost_mutex);
}
1309 
1310 static void
1311 vhost_create_poll_group_done(void *ctx)
1312 {
1313 	spdk_vhost_init_cb init_cb = ctx;
1314 	int ret;
1315 
1316 	if (TAILQ_EMPTY(&g_poll_groups)) {
1317 		/* No threads? Iteration failed? */
1318 		init_cb(-ECHILD);
1319 		return;
1320 	}
1321 
1322 	ret = vhost_scsi_controller_construct();
1323 	if (ret != 0) {
1324 		SPDK_ERRLOG("Cannot construct vhost controllers\n");
1325 		goto out;
1326 	}
1327 
1328 	ret = vhost_blk_controller_construct();
1329 	if (ret != 0) {
1330 		SPDK_ERRLOG("Cannot construct vhost block controllers\n");
1331 		goto out;
1332 	}
1333 
1334 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
1335 	ret = vhost_nvme_controller_construct();
1336 	if (ret != 0) {
1337 		SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n");
1338 		goto out;
1339 	}
1340 #endif
1341 
1342 out:
1343 	init_cb(ret);
1344 }
1345 
1346 static void
1347 vhost_create_poll_group(void *ctx)
1348 {
1349 	struct vhost_poll_group *pg;
1350 
1351 	pg = calloc(1, sizeof(*pg));
1352 	if (!pg) {
1353 		SPDK_ERRLOG("Not enough memory to allocate poll groups\n");
1354 		spdk_app_stop(-ENOMEM);
1355 		return;
1356 	}
1357 
1358 	pg->thread = spdk_get_thread();
1359 	TAILQ_INSERT_TAIL(&g_poll_groups, pg, tailq);
1360 }
1361 
1362 void
1363 spdk_vhost_init(spdk_vhost_init_cb init_cb)
1364 {
1365 	size_t len;
1366 	int ret;
1367 
1368 	g_vhost_init_thread = spdk_get_thread();
1369 	assert(g_vhost_init_thread != NULL);
1370 
1371 	if (dev_dirname[0] == '\0') {
1372 		if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
1373 			SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
1374 			ret = -1;
1375 			goto err_out;
1376 		}
1377 
1378 		len = strlen(dev_dirname);
1379 		if (dev_dirname[len - 1] != '/') {
1380 			dev_dirname[len] = '/';
1381 			dev_dirname[len + 1] = '\0';
1382 		}
1383 	}
1384 
1385 	g_tmp_cpuset = spdk_cpuset_alloc();
1386 	if (g_tmp_cpuset == NULL) {
1387 		ret = -1;
1388 		goto err_out;
1389 	}
1390 
1391 	ret = sem_init(&g_dpdk_sem, 0, 0);
1392 	if (ret != 0) {
1393 		SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n");
1394 		spdk_cpuset_free(g_tmp_cpuset);
1395 		ret = -1;
1396 		goto err_out;
1397 	}
1398 
1399 	spdk_for_each_thread(vhost_create_poll_group,
1400 			     init_cb,
1401 			     vhost_create_poll_group_done);
1402 	return;
1403 err_out:
1404 	init_cb(ret);
1405 }
1406 
1407 static void
1408 _spdk_vhost_fini(void *arg1)
1409 {
1410 	struct spdk_vhost_dev *vdev, *tmp;
1411 	struct vhost_poll_group *pg, *tpg;
1412 
1413 	spdk_vhost_lock();
1414 	vdev = spdk_vhost_dev_next(NULL);
1415 	while (vdev != NULL) {
1416 		tmp = spdk_vhost_dev_next(vdev);
1417 		spdk_vhost_dev_remove(vdev);
1418 		/* don't care if it fails, there's nothing we can do for now */
1419 		vdev = tmp;
1420 	}
1421 	spdk_vhost_unlock();
1422 
1423 	/* All devices are removed now. */
1424 	sem_destroy(&g_dpdk_sem);
1425 	spdk_cpuset_free(g_tmp_cpuset);
1426 	TAILQ_FOREACH_SAFE(pg, &g_poll_groups, tailq, tpg) {
1427 		TAILQ_REMOVE(&g_poll_groups, pg, tailq);
1428 		free(pg);
1429 	}
1430 	g_fini_cpl_cb();
1431 }
1432 
1433 static void *
1434 session_shutdown(void *arg)
1435 {
1436 	struct spdk_vhost_dev *vdev = NULL;
1437 
1438 	TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
1439 		vhost_driver_unregister(vdev->path);
1440 		vdev->registered = false;
1441 	}
1442 
1443 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n");
1444 	spdk_thread_send_msg(g_vhost_init_thread, _spdk_vhost_fini, NULL);
1445 	return NULL;
1446 }
1447 
1448 void
1449 spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
1450 {
1451 	pthread_t tid;
1452 	int rc;
1453 
1454 	assert(spdk_get_thread() == g_vhost_init_thread);
1455 	g_fini_cpl_cb = fini_cb;
1456 
1457 	/* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
1458 	 * ops for stopping a device or removing a connection, we need to call it from
1459 	 * a separate thread to avoid deadlock.
1460 	 */
1461 	rc = pthread_create(&tid, NULL, &session_shutdown, NULL);
1462 	if (rc < 0) {
1463 		SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc));
1464 		abort();
1465 	}
1466 	pthread_detach(tid);
1467 }
1468 
1469 void
1470 spdk_vhost_config_json(struct spdk_json_write_ctx *w)
1471 {
1472 	struct spdk_vhost_dev *vdev;
1473 	uint32_t delay_base_us;
1474 	uint32_t iops_threshold;
1475 
1476 	spdk_json_write_array_begin(w);
1477 
1478 	spdk_vhost_lock();
1479 	vdev = spdk_vhost_dev_next(NULL);
1480 	while (vdev != NULL) {
1481 		vdev->backend->write_config_json(vdev, w);
1482 
1483 		spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
1484 		if (delay_base_us) {
1485 			spdk_json_write_object_begin(w);
1486 			spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing");
1487 
1488 			spdk_json_write_named_object_begin(w, "params");
1489 			spdk_json_write_named_string(w, "ctrlr", vdev->name);
1490 			spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
1491 			spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
1492 			spdk_json_write_object_end(w);
1493 
1494 			spdk_json_write_object_end(w);
1495 		}
1496 		vdev = spdk_vhost_dev_next(vdev);
1497 	}
1498 	spdk_vhost_unlock();
1499 
1500 	spdk_json_write_array_end(w);
1501 }
1502 
/* Register the "vhost" and "vhost_ring" log components with SPDK's logging framework. */
SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST)
SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING)
1505