xref: /spdk/lib/vhost/vhost.c (revision f93b6fb0a4ebcee203e7c44c9e170c20bbce96cc)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/env.h"
37 #include "spdk/likely.h"
38 #include "spdk/string.h"
39 #include "spdk/util.h"
40 #include "spdk/barrier.h"
41 #include "spdk/vhost.h"
42 #include "vhost_internal.h"
43 
44 #include "spdk_internal/memory.h"
45 
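/* Per-lcore count of vhost controllers hosted on each reactor core; used by
 * spdk_vhost_allocate_reactor() to pick the least-loaded core.
 */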
46 static uint32_t *g_num_ctrlrs;
47 
48 /* Path to the directory where vhost-user domain sockets will be created. Can be set by the user. */
49 static char dev_dirname[PATH_MAX] = "";
50 
51 struct spdk_vhost_session_fn_ctx {
52 	/** Device pointer obtained before enqueuing the event */
53 	struct spdk_vhost_dev *vdev;
54 
55 	/** ID of the session to send event to. */
56 	uint32_t vsession_id;
57 
58 	/** User callback function to be executed on given lcore. */
59 	spdk_vhost_session_fn cb_fn;
60 
61 	/** Semaphore used to signal that event is done. */
62 	sem_t sem;
63 
64 	/** Response to be written by enqueued event. */
65 	int response;
66 };
67 
68 static int new_connection(int vid);
69 static int start_device(int vid);
70 static void stop_device(int vid);
71 static void destroy_connection(int vid);
72 
73 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
74 static int get_config(int vid, uint8_t *config, uint32_t len);
75 static int set_config(int vid, uint8_t *config, uint32_t offset,
76 		      uint32_t size, uint32_t flags);
77 #endif
78 
79 const struct vhost_device_ops g_spdk_vhost_ops = {
80 	.new_device =  start_device,
81 	.destroy_device = stop_device,
82 	.new_connection = new_connection,
83 	.destroy_connection = destroy_connection,
84 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
85 	.get_config = get_config,
86 	.set_config = set_config,
87 	.vhost_nvme_admin_passthrough = spdk_vhost_nvme_admin_passthrough,
88 	.vhost_nvme_set_cq_call = spdk_vhost_nvme_set_cq_call,
89 	.vhost_nvme_get_cap = spdk_vhost_nvme_get_cap,
90 	.vhost_nvme_set_bar_mr = spdk_vhost_nvme_set_bar_mr,
91 #endif
92 };
93 
94 static TAILQ_HEAD(, spdk_vhost_dev) g_spdk_vhost_devices = TAILQ_HEAD_INITIALIZER(
95 			g_spdk_vhost_devices);
96 static pthread_mutex_t g_spdk_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
97 
98 void *spdk_vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len)
99 {
100 	void *vva;
101 	uint64_t newlen;
102 
103 	newlen = len;
104 	vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &newlen);
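	/* rte_vhost_va_from_guest_pa() shrinks newlen to the number of bytes that
	 * are contiguously mapped starting at addr. If it no longer matches the
	 * requested length, the buffer spans discontiguous memory regions and
	 * cannot be returned as a single virtual address.
	 */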
105 	if (newlen != len) {
106 		return NULL;
107 	}
108 
109 	return vva;
110 
111 }
112 
113 static void
114 spdk_vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
115 			uint16_t req_id)
116 {
117 	struct vring_desc *desc, *desc_table;
118 	uint32_t desc_table_size;
119 	int rc;
120 
121 	if (spdk_likely(!spdk_vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
122 		return;
123 	}
124 
125 	rc = spdk_vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
126 	if (spdk_unlikely(rc != 0)) {
127 		SPDK_ERRLOG("Can't log used ring descriptors!\n");
128 		return;
129 	}
130 
131 	do {
132 		if (spdk_vhost_vring_desc_is_wr(desc)) {
133 			/* Strictly speaking, only the pages actually touched should be
134 			 * logged, but that would require tracking those writes in each
135 			 * backend. The backend will most likely touch all or most of these
136 			 * pages anyway, so assume every buffer passed to us as writable was written. */
137 			rte_vhost_log_write(vsession->vid, desc->addr, desc->len);
138 		}
139 		spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
140 	} while (desc);
141 }
142 
143 static void
144 spdk_vhost_log_used_vring_elem(struct spdk_vhost_session *vsession,
145 			       struct spdk_vhost_virtqueue *virtqueue,
146 			       uint16_t idx)
147 {
148 	uint64_t offset, len;
149 	uint16_t vq_idx;
150 
151 	if (spdk_likely(!spdk_vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
152 		return;
153 	}
154 
155 	offset = offsetof(struct vring_used, ring[idx]);
156 	len = sizeof(virtqueue->vring.used->ring[idx]);
157 	vq_idx = virtqueue - vsession->virtqueue;
158 
159 	rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len);
160 }
161 
162 static void
163 spdk_vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
164 			      struct spdk_vhost_virtqueue *virtqueue)
165 {
166 	uint64_t offset, len;
167 	uint16_t vq_idx;
168 
169 	if (spdk_likely(!spdk_vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
170 		return;
171 	}
172 
173 	offset = offsetof(struct vring_used, idx);
174 	len = sizeof(virtqueue->vring.used->idx);
175 	vq_idx = virtqueue - vsession->virtqueue;
176 
177 	rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len);
178 }
179 
180 /*
181  * Get available requests from avail ring.
182  */
183 uint16_t
184 spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
185 			     uint16_t reqs_len)
186 {
187 	struct rte_vhost_vring *vring = &virtqueue->vring;
188 	struct vring_avail *avail = vring->avail;
189 	uint16_t size_mask = vring->size - 1;
190 	uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx;
191 	uint16_t count, i;
192 
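	/* The avail index is a free-running uint16_t counter, so this subtraction
	 * yields the number of new entries even across index wrap-around. When
	 * polling, the ring is most often empty, hence the spdk_likely() on the
	 * count == 0 check. Note that vring sizes are powers of two, which is
	 * what makes the size_mask trick below valid.
	 */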
193 	count = avail_idx - last_idx;
194 	if (spdk_likely(count == 0)) {
195 		return 0;
196 	}
197 
198 	if (spdk_unlikely(count > vring->size)) {
199 		/* TODO: the queue is unrecoverably broken and should be marked as such.
200 		 * For now we fail silently and report that there are no new avail entries.
201 		 */
202 		return 0;
203 	}
204 
205 	count = spdk_min(count, reqs_len);
206 	virtqueue->last_avail_idx += count;
207 	for (i = 0; i < count; i++) {
208 		reqs[i] = vring->avail->ring[(last_idx + i) & size_mask];
209 	}
210 
211 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
212 		      "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
213 		      last_idx, avail_idx, count);
214 
215 	return count;
216 }
217 
218 static bool
219 spdk_vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
220 {
221 	return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
222 }
223 
224 int
225 spdk_vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
226 		       uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
227 		       uint32_t *desc_table_size)
228 {
229 	if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
230 		return -1;
231 	}
232 
233 	*desc = &virtqueue->vring.desc[req_idx];
234 
235 	if (spdk_vhost_vring_desc_is_indirect(*desc)) {
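		/* An indirect descriptor points to a separate table of descriptors
		 * located in guest memory; translate that table and walk it instead
		 * of the main descriptor ring.
		 */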
236 		*desc_table_size = (*desc)->len / sizeof(**desc);
237 		*desc_table = spdk_vhost_gpa_to_vva(vsession, (*desc)->addr,
238 						    sizeof(**desc) * *desc_table_size);
239 		*desc = *desc_table;
240 		if (*desc == NULL) {
241 			return -1;
242 		}
243 
244 		return 0;
245 	}
246 
247 	*desc_table = virtqueue->vring.desc;
248 	*desc_table_size = virtqueue->vring.size;
249 
250 	return 0;
251 }
252 
253 int
254 spdk_vhost_vq_used_signal(struct spdk_vhost_session *vsession,
255 			  struct spdk_vhost_virtqueue *virtqueue)
256 {
257 	if (virtqueue->used_req_cnt == 0) {
258 		return 0;
259 	}
260 
261 	virtqueue->req_cnt += virtqueue->used_req_cnt;
262 	virtqueue->used_req_cnt = 0;
263 
264 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
265 		      "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n",
266 		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx);
267 
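	/* Kick the guest. The vring's callfd is an eventfd, typically wired up by
	 * QEMU/KVM as an irqfd, so writing to it injects the virtqueue interrupt.
	 */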
268 	eventfd_write(virtqueue->vring.callfd, (eventfd_t)1);
269 	return 1;
270 }
271 
272 
273 static void
274 check_session_io_stats(struct spdk_vhost_session *vsession, uint64_t now)
275 {
276 	struct spdk_vhost_virtqueue *virtqueue;
277 	uint32_t irq_delay_base = vsession->coalescing_delay_time_base;
278 	uint32_t io_threshold = vsession->coalescing_io_rate_threshold;
279 	int32_t irq_delay;
280 	uint32_t req_cnt;
281 	uint16_t q_idx;
282 
283 	if (now < vsession->next_stats_check_time) {
284 		return;
285 	}
286 
287 	vsession->next_stats_check_time = now + vsession->stats_check_interval;
288 	for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
289 		virtqueue = &vsession->virtqueue[q_idx];
290 
291 		req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt;
292 		if (req_cnt <= io_threshold) {
293 			continue;
294 		}
295 
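		/* Scale the interrupt delay linearly with how far this queue's I/O
		 * rate exceeds the coalescing threshold.
		 */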
296 		irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold;
297 		virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay);
298 
299 		virtqueue->req_cnt = 0;
300 		virtqueue->next_event_time = now;
301 	}
302 }
303 
304 void
305 spdk_vhost_session_used_signal(struct spdk_vhost_session *vsession)
306 {
307 	struct spdk_vhost_virtqueue *virtqueue;
308 	uint64_t now;
309 	uint16_t q_idx;
310 
311 	if (vsession->coalescing_delay_time_base == 0) {
312 		for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
313 			virtqueue = &vsession->virtqueue[q_idx];
314 
315 			if (virtqueue->vring.desc == NULL ||
316 			    (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
317 				continue;
318 			}
319 
320 			spdk_vhost_vq_used_signal(vsession, virtqueue);
321 		}
322 	} else {
323 		now = spdk_get_ticks();
324 		check_session_io_stats(vsession, now);
325 
326 		for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
327 			virtqueue = &vsession->virtqueue[q_idx];
328 
329 			/* No need for event right now */
330 			if (now < virtqueue->next_event_time ||
331 			    (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
332 				continue;
333 			}
334 
335 			if (!spdk_vhost_vq_used_signal(vsession, virtqueue)) {
336 				continue;
337 			}
338 
339 		/* The signalling syscall above is relatively slow, so refresh the time. */
340 			now = spdk_get_ticks();
341 			virtqueue->next_event_time = now + virtqueue->irq_delay_time;
342 		}
343 	}
344 }
345 
346 static int
347 spdk_vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
348 				  struct spdk_vhost_session *vsession, void *ctx)
349 {
350 	if (vdev == NULL || vsession == NULL) {
351 		/* nothing to do */
352 		return 0;
353 	}
354 
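	/* Convert the user-facing units to internal ones: the delay is kept in
	 * CPU ticks, and the IOPS threshold is rescaled to I/Os per
	 * stats-check interval.
	 */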
355 	vsession->coalescing_delay_time_base =
356 		vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL;
357 	vsession->coalescing_io_rate_threshold =
358 		vdev->coalescing_iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
359 	return 0;
360 }
361 
362 int
363 spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
364 			  uint32_t iops_threshold)
365 {
366 	uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
367 	uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
368 
369 	if (delay_time_base >= UINT32_MAX) {
370 		SPDK_ERRLOG("Delay time of %"PRIu32" us is too big\n", delay_base_us);
371 		return -EINVAL;
372 	} else if (io_rate == 0) {
373 		SPDK_ERRLOG("IOPS threshold of %"PRIu32" is too low. Min is %u\n", iops_threshold,
374 			    1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS);
375 		return -EINVAL;
376 	}
377 
378 	vdev->coalescing_delay_us = delay_base_us;
379 	vdev->coalescing_iops_threshold = iops_threshold;
380 
381 	spdk_vhost_dev_foreach_session(vdev, spdk_vhost_session_set_coalescing, NULL);
382 	return 0;
383 }
384 
385 void
386 spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
387 			  uint32_t *iops_threshold)
388 {
389 	if (delay_base_us) {
390 		*delay_base_us = vdev->coalescing_delay_us;
391 	}
392 
393 	if (iops_threshold) {
394 		*iops_threshold = vdev->coalescing_iops_threshold;
395 	}
396 }
397 
398 /*
399  * Enqueue id and len to used ring.
400  */
401 void
402 spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
403 				struct spdk_vhost_virtqueue *virtqueue,
404 				uint16_t id, uint32_t len)
405 {
406 	struct rte_vhost_vring *vring = &virtqueue->vring;
407 	struct vring_used *used = vring->used;
408 	uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1);
409 
410 	SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
411 		      "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
412 		      virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len);
413 
414 	spdk_vhost_log_req_desc(vsession, virtqueue, id);
415 
416 	virtqueue->last_used_idx++;
417 	used->ring[last_idx].id = id;
418 	used->ring[last_idx].len = len;
419 
420 	/* Ensure the used ring is updated before we log it or increment used->idx. */
421 	spdk_smp_wmb();
422 
423 	spdk_vhost_log_used_vring_elem(vsession, virtqueue, last_idx);
424 	* (volatile uint16_t *) &used->idx = virtqueue->last_used_idx;
425 	spdk_vhost_log_used_vring_idx(vsession, virtqueue);
426 
427 	/* Ensure all our used ring changes are visible to the guest at the time
428 	 * of interrupt.
429 	 * TODO: this is currently an sfence on x86. For other architectures we
430 	 * will most likely need an smp_mb(), but smp_mb() is an overkill for x86.
431 	 */
432 	spdk_wmb();
433 
434 	virtqueue->used_req_cnt++;
435 }
436 
437 int
438 spdk_vhost_vring_desc_get_next(struct vring_desc **desc,
439 			       struct vring_desc *desc_table, uint32_t desc_table_size)
440 {
441 	struct vring_desc *old_desc = *desc;
442 	uint16_t next_idx;
443 
444 	if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
445 		*desc = NULL;
446 		return 0;
447 	}
448 
449 	next_idx = old_desc->next;
450 	if (spdk_unlikely(next_idx >= desc_table_size)) {
451 		*desc = NULL;
452 		return -1;
453 	}
454 
455 	*desc = &desc_table[next_idx];
456 	return 0;
457 }
458 
459 bool
460 spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
461 {
462 	return !!(cur_desc->flags & VRING_DESC_F_WRITE);
463 }
464 
465 int
466 spdk_vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
467 			     uint16_t *iov_index, const struct vring_desc *desc)
468 {
469 	uint32_t remaining = desc->len;
470 	uint32_t to_boundary;
471 	uint32_t len;
472 	uintptr_t payload = desc->addr;
473 	uintptr_t vva;
474 
475 	do {
476 		if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
477 			SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
478 			return -1;
479 		}
480 		vva = (uintptr_t)rte_vhost_gpa_to_vva(vsession->mem, payload);
481 		if (vva == 0) {
482 			SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
483 			return -1;
484 		}
485 		to_boundary = VALUE_2MB - _2MB_OFFSET(payload);
486 		if (spdk_likely(remaining <= to_boundary)) {
487 			len = remaining;
488 		} else {
489 			/*
490 			 * Descriptor crosses a 2MB hugepage boundary.  vhost memory regions are allocated
491 			 *  from hugepage memory, so this means this descriptor may be described by
492 			 *  discontiguous vhost memory regions.  Do not blindly split on the 2MB boundary,
493 			 *  only split it if the two sides of the boundary do not map to the same vhost
494 			 *  memory region.  This helps ensure we do not exceed the max number of IOVs
495 			 *  defined by SPDK_VHOST_IOVS_MAX.
496 			 */
497 			len = to_boundary;
498 			while (len < remaining) {
499 				if (vva + len != (uintptr_t)rte_vhost_gpa_to_vva(vsession->mem, payload + len)) {
500 					break;
501 				}
502 				len += spdk_min(remaining - len, VALUE_2MB);
503 			}
504 		}
505 		iov[*iov_index].iov_base = (void *)vva;
506 		iov[*iov_index].iov_len = len;
507 		remaining -= len;
508 		payload += len;
509 		(*iov_index)++;
510 	} while (remaining);
511 
512 	return 0;
513 }
514 
515 static struct spdk_vhost_session *
516 spdk_vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
517 {
518 	struct spdk_vhost_session *vsession;
519 
520 	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
521 		if (vsession->id == id) {
522 			return vsession;
523 		}
524 	}
525 
526 	return NULL;
527 }
528 
529 struct spdk_vhost_session *
530 spdk_vhost_session_find_by_vid(int vid)
531 {
532 	struct spdk_vhost_dev *vdev;
533 	struct spdk_vhost_session *vsession;
534 
535 	TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
536 		TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
537 			if (vsession->vid == vid) {
538 				return vsession;
539 			}
540 		}
541 	}
542 
543 	return NULL;
544 }
545 
546 #define SHIFT_2MB	21
547 #define SIZE_2MB	(1ULL << SHIFT_2MB)
548 #define FLOOR_2MB(x)	((((uintptr_t)(x)) / SIZE_2MB) << SHIFT_2MB)
549 #define CEIL_2MB(x)	(((((uintptr_t)(x)) + SIZE_2MB - 1) / SIZE_2MB) << SHIFT_2MB)
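/* vhost memory regions are backed by hugepage memory, so round each region
 * out to 2MB boundaries before registering it with SPDK's memory map used
 * for vtophys translation.
 */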
550 
551 static void
552 spdk_vhost_session_mem_register(struct spdk_vhost_session *vsession)
553 {
554 	struct rte_vhost_mem_region *region;
555 	uint32_t i;
556 
557 	for (i = 0; i < vsession->mem->nregions; i++) {
558 		uint64_t start, end, len;
559 		region = &vsession->mem->regions[i];
560 		start = FLOOR_2MB(region->mmap_addr);
561 		end = CEIL_2MB(region->mmap_addr + region->mmap_size);
562 		len = end - start;
563 		SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
564 			     start, len);
565 
566 		if (spdk_mem_register((void *)start, len) != 0) {
567 			SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n",
568 				     i);
569 			continue;
570 		}
571 	}
572 }
573 
574 static void
575 spdk_vhost_session_mem_unregister(struct spdk_vhost_session *vsession)
576 {
577 	struct rte_vhost_mem_region *region;
578 	uint32_t i;
579 
580 	for (i = 0; i < vsession->mem->nregions; i++) {
581 		uint64_t start, end, len;
582 		region = &vsession->mem->regions[i];
583 		start = FLOOR_2MB(region->mmap_addr);
584 		end = CEIL_2MB(region->mmap_addr + region->mmap_size);
585 		len = end - start;
586 
587 		if (spdk_vtophys((void *) start, NULL) == SPDK_VTOPHYS_ERROR) {
588 			continue; /* region has not been registered */
589 		}
590 
591 		if (spdk_mem_unregister((void *)start, len) != 0) {
592 			assert(false);
593 		}
594 	}
595 
596 }
597 
598 void
599 spdk_vhost_free_reactor(uint32_t lcore)
600 {
601 	g_num_ctrlrs[lcore]--;
602 }
603 
604 struct spdk_vhost_dev *
605 spdk_vhost_dev_next(struct spdk_vhost_dev *vdev)
606 {
607 	if (vdev == NULL) {
608 		return TAILQ_FIRST(&g_spdk_vhost_devices);
609 	}
610 
611 	return TAILQ_NEXT(vdev, tailq);
612 }
613 
614 struct spdk_vhost_dev *
615 spdk_vhost_dev_find(const char *ctrlr_name)
616 {
617 	struct spdk_vhost_dev *vdev;
618 	size_t dev_dirname_len = strlen(dev_dirname);
619 
620 	if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) {
621 		ctrlr_name += dev_dirname_len;
622 	}
623 
624 	TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
625 		if (strcmp(vdev->name, ctrlr_name) == 0) {
626 			return vdev;
627 		}
628 	}
629 
630 	return NULL;
631 }
632 
633 static int
634 spdk_vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
635 {
636 	int rc;
637 
638 	if (cpumask == NULL) {
639 		return -1;
640 	}
641 
642 	if (mask == NULL) {
643 		spdk_cpuset_copy(cpumask, spdk_app_get_core_mask());
644 		return 0;
645 	}
646 
647 	rc = spdk_app_parse_core_mask(mask, cpumask);
648 	if (rc < 0) {
649 		SPDK_ERRLOG("invalid cpumask %s\n", mask);
650 		return -1;
651 	}
652 
653 	if (spdk_cpuset_count(cpumask) == 0) {
654 		SPDK_ERRLOG("no CPU is selected from the reactor mask (=%s)\n",
655 			    spdk_cpuset_fmt(spdk_app_get_core_mask()));
656 		return -1;
657 	}
658 
659 	return 0;
660 }
661 
662 static void *
663 _start_rte_driver(void *arg)
664 {
665 	char *path = arg;
666 
667 	if (rte_vhost_driver_start(path) != 0) {
668 		return NULL;
669 	}
670 
671 	return path;
672 }
673 
674 int
675 spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
676 			const struct spdk_vhost_dev_backend *backend)
677 {
678 	char path[PATH_MAX];
679 	struct stat file_stat;
680 	struct spdk_cpuset *cpumask;
681 	int rc;
682 
683 	assert(vdev);
684 	if (name == NULL) {
685 		SPDK_ERRLOG("Can't register controller with no name\n");
686 		return -EINVAL;
687 	}
688 
689 	cpumask = spdk_cpuset_alloc();
690 	if (!cpumask) {
691 		SPDK_ERRLOG("spdk_cpuset_alloc failed\n");
692 		return -ENOMEM;
693 	}
694 
695 	if (spdk_vhost_parse_core_mask(mask_str, cpumask) != 0) {
696 		SPDK_ERRLOG("cpumask %s is invalid (app mask is 0x%s)\n",
697 			    mask_str, spdk_cpuset_fmt(spdk_app_get_core_mask()));
698 		rc = -EINVAL;
699 		goto out;
700 	}
701 
702 	if (spdk_vhost_dev_find(name)) {
703 		SPDK_ERRLOG("vhost controller %s already exists.\n", name);
704 		rc = -EEXIST;
705 		goto out;
706 	}
707 
708 	if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
709 		SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
710 			    name);
711 		rc = -EINVAL;
712 		goto out;
713 	}
714 
715 	/* Register vhost driver to handle vhost messages. */
716 	if (stat(path, &file_stat) != -1) {
717 		if (!S_ISSOCK(file_stat.st_mode)) {
718 			SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
719 				    "The file already exists and is not a socket.\n",
720 				    path);
721 			rc = -EIO;
722 			goto out;
723 		} else if (unlink(path) != 0) {
724 			SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
725 				    "The socket already exists and could not be unlinked.\n",
726 				    path);
727 			rc = -EIO;
728 			goto out;
729 		}
730 	}
731 
732 	if (rte_vhost_driver_register(path, 0) != 0) {
733 		SPDK_ERRLOG("Could not register controller %s with vhost library\n", name);
734 		SPDK_ERRLOG("Check if domain socket %s already exists\n", path);
735 		rc = -EIO;
736 		goto out;
737 	}
738 	if (rte_vhost_driver_set_features(path, backend->virtio_features) ||
739 	    rte_vhost_driver_disable_features(path, backend->disabled_features)) {
740 		SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", name);
741 
742 		rte_vhost_driver_unregister(path);
743 		rc = -EIO;
744 		goto out;
745 	}
746 
747 	if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) {
748 		rte_vhost_driver_unregister(path);
749 		SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", name);
750 		rc = -EIO;
751 		goto out;
752 	}
753 
754 	vdev->name = strdup(name);
755 	vdev->path = strdup(path);
756 	if (vdev->name == NULL || vdev->path == NULL) {
757 		free(vdev->name);
758 		free(vdev->path);
759 		rte_vhost_driver_unregister(path);
760 		rc = -EIO;
761 		goto out;
762 	}
763 
764 	vdev->cpumask = cpumask;
765 	vdev->registered = true;
766 	vdev->backend = backend;
767 	TAILQ_INIT(&vdev->vsessions);
768 	TAILQ_INSERT_TAIL(&g_spdk_vhost_devices, vdev, tailq);
769 
770 	spdk_vhost_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
771 				  SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);
772 
773 	spdk_vhost_dev_install_rte_compat_hooks(vdev);
774 
775 	/* The following might start a POSIX thread that polls for incoming
776 	 * socket connections and calls backend->start/stop_device. These backend
777 	 * callbacks are also protected by the global SPDK vhost mutex, so it is
778 	 * safe that the vdev is not fully initialized just yet.
779 	 */
780 	if (spdk_call_unaffinitized(_start_rte_driver, path) == NULL) {
781 		SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n",
782 			    name, errno, spdk_strerror(errno));
783 		rte_vhost_driver_unregister(path);
784 		TAILQ_REMOVE(&g_spdk_vhost_devices, vdev, tailq);
785 		free(vdev->name);
786 		free(vdev->path);
787 		rc = -EIO;
788 		goto out;
789 	}
790 
791 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name);
792 	return 0;
793 
794 out:
795 	spdk_cpuset_free(cpumask);
796 	return rc;
797 }
798 
799 int
800 spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev)
801 {
802 	if (!TAILQ_EMPTY(&vdev->vsessions)) {
803 		SPDK_ERRLOG("Controller %s still has valid connections.\n", vdev->name);
804 		return -EBUSY;
805 	}
806 
807 	if (vdev->registered && rte_vhost_driver_unregister(vdev->path) != 0) {
808 		SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
809 			    "Check if domain socket %s still exists\n",
810 			    vdev->name, vdev->path);
811 		return -EIO;
812 	}
813 
814 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name);
815 
816 	free(vdev->name);
817 	free(vdev->path);
818 	spdk_cpuset_free(vdev->cpumask);
819 	TAILQ_REMOVE(&g_spdk_vhost_devices, vdev, tailq);
820 	return 0;
821 }
822 
823 static struct spdk_vhost_session *
824 spdk_vhost_session_next(struct spdk_vhost_dev *vdev, unsigned prev_id)
825 {
826 	struct spdk_vhost_session *vsession;
827 
828 	TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
829 		if (vsession->id > prev_id) {
830 			return vsession;
831 		}
832 	}
833 
834 	return NULL;
835 }
836 
837 const char *
838 spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
839 {
840 	assert(vdev != NULL);
841 	return vdev->name;
842 }
843 
844 const struct spdk_cpuset *
845 spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
846 {
847 	assert(vdev != NULL);
848 	return vdev->cpumask;
849 }
850 
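/* Pick the least-loaded core from the given cpumask, i.e. the allowed core
 * currently hosting the fewest vhost controllers.
 */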
851 uint32_t
852 spdk_vhost_allocate_reactor(struct spdk_cpuset *cpumask)
853 {
854 	uint32_t i, selected_core;
855 	uint32_t min_ctrlrs;
856 
857 	min_ctrlrs = INT_MAX;
858 	selected_core = spdk_env_get_first_core();
859 
860 	SPDK_ENV_FOREACH_CORE(i) {
861 		if (!spdk_cpuset_get_cpu(cpumask, i)) {
862 			continue;
863 		}
864 
865 		if (g_num_ctrlrs[i] < min_ctrlrs) {
866 			selected_core = i;
867 			min_ctrlrs = g_num_ctrlrs[i];
868 		}
869 	}
870 
871 	g_num_ctrlrs[selected_core]++;
872 	return selected_core;
873 }
874 
875 static void
876 complete_session_event(struct spdk_vhost_session *vsession, int response)
877 {
878 	struct spdk_vhost_session_fn_ctx *ctx = vsession->event_ctx;
879 
880 	ctx->response = response;
881 	sem_post(&ctx->sem);
882 }
883 
884 void
885 spdk_vhost_session_start_done(struct spdk_vhost_session *vsession, int response)
886 {
887 	if (response == 0) {
888 		vsession->lcore = spdk_env_get_current_core();
889 		assert(vsession->vdev->active_session_num < UINT32_MAX);
890 		vsession->vdev->active_session_num++;
891 	}
892 	complete_session_event(vsession, response);
893 }
894 
895 void
896 spdk_vhost_session_stop_done(struct spdk_vhost_session *vsession, int response)
897 {
898 	if (response == 0) {
899 		vsession->lcore = -1;
900 		assert(vsession->vdev->active_session_num > 0);
901 		vsession->vdev->active_session_num--;
902 	}
903 	complete_session_event(vsession, response);
904 }
905 
906 static void
907 spdk_vhost_event_cb(void *arg1, void *arg2)
908 {
909 	struct spdk_vhost_session_fn_ctx *ctx = arg1;
910 	struct spdk_vhost_session *vsession;
911 	struct spdk_event *ev;
912 
913 	if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) {
914 		ev = spdk_event_allocate(spdk_env_get_current_core(),
915 					 spdk_vhost_event_cb, arg1, arg2);
916 		spdk_event_call(ev);
917 		return;
918 	}
919 
920 	vsession = spdk_vhost_session_find_by_id(ctx->vdev, ctx->vsession_id);
921 	ctx->cb_fn(ctx->vdev, vsession, NULL);
922 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
923 }
924 
925 static void spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev,
926 		struct spdk_vhost_session *vsession,
927 		spdk_vhost_session_fn fn, void *arg);
928 
929 static void
930 spdk_vhost_event_async_foreach_fn(void *arg1, void *arg2)
931 {
932 	struct spdk_vhost_session_fn_ctx *ctx = arg1;
933 	struct spdk_vhost_session *vsession = NULL;
934 	struct spdk_vhost_dev *vdev = ctx->vdev;
935 	struct spdk_event *ev;
936 	int rc;
937 
938 	if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) {
939 		ev = spdk_event_allocate(spdk_env_get_current_core(),
940 					 spdk_vhost_event_async_foreach_fn, arg1, arg2);
941 		spdk_event_call(ev);
942 		return;
943 	}
944 
945 	vsession = spdk_vhost_session_find_by_id(vdev, ctx->vsession_id);
946 	if (vsession == NULL || !vsession->initialized) {
947 		/* The session must have been removed in the meantime, so we
948 		 * just skip it in our foreach chain
949 		 */
950 		goto out_unlock_continue;
951 	}
952 
953 	if (vsession->lcore >= 0 &&
954 	    (uint32_t)vsession->lcore != spdk_env_get_current_core()) {
955 		/* If the session has been relocated to another core, it is no longer
956 		 * thread-safe to access its contents here. Even though we're running
957 		 * under the global vhost mutex, the session (and its pollers) is not.
958 		 * Chase the session onto its current thread as many times as necessary.
959 		 */
960 		ev = spdk_event_allocate(vsession->lcore,
961 					 spdk_vhost_event_async_foreach_fn, arg1, arg2);
962 		spdk_event_call(ev);
963 		pthread_mutex_unlock(&g_spdk_vhost_mutex);
964 		return;
965 	}
966 
967 	rc = ctx->cb_fn(vdev, vsession, arg2);
968 	if (rc < 0) {
969 		goto out_unlock;
970 	}
971 
972 out_unlock_continue:
973 	vsession = spdk_vhost_session_next(vdev, ctx->vsession_id);
974 	spdk_vhost_external_event_foreach_continue(vdev, vsession, ctx->cb_fn, arg2);
975 out_unlock:
976 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
977 	free(ctx);
978 }
979 
980 int
981 spdk_vhost_session_send_event(int32_t lcore, struct spdk_vhost_session *vsession,
982 			      spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
983 			      const char *errmsg)
984 {
985 	struct spdk_vhost_session_fn_ctx ev_ctx = {0};
986 	struct spdk_event *ev;
987 	struct timespec timeout;
988 	int rc;
989 
990 	rc = sem_init(&ev_ctx.sem, 0, 0);
991 	if (rc != 0) {
992 		SPDK_ERRLOG("Failed to initialize semaphore for vhost timed event\n");
993 		return -errno;
994 	}
995 
996 	ev_ctx.vdev = vsession->vdev;
997 	ev_ctx.vsession_id = vsession->id;
998 	ev_ctx.cb_fn = cb_fn;
999 
1000 	vsession->event_ctx = &ev_ctx;
1001 	ev = spdk_event_allocate(lcore, spdk_vhost_event_cb, &ev_ctx, NULL);
1002 	assert(ev);
1003 	spdk_event_call(ev);
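	/* Drop the global vhost lock while waiting. The event callback on the
	 * target lcore must take this same lock before it can invoke cb_fn and
	 * post the semaphore, so waiting with the lock held would deadlock.
	 */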
1004 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
1005 
1006 	clock_gettime(CLOCK_REALTIME, &timeout);
1007 	timeout.tv_sec += timeout_sec;
1008 
1009 	rc = sem_timedwait(&ev_ctx.sem, &timeout);
1010 	if (rc != 0) {
1011 		SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
1012 		sem_wait(&ev_ctx.sem);
1013 	}
1014 
1015 	sem_destroy(&ev_ctx.sem);
1016 	pthread_mutex_lock(&g_spdk_vhost_mutex);
1017 	vsession->event_ctx = NULL;
1018 	return ev_ctx.response;
1019 }
1020 
1021 static int
1022 spdk_vhost_event_async_send_foreach_continue(struct spdk_vhost_session *vsession,
1023 		spdk_vhost_session_fn cb_fn, void *arg)
1024 {
1025 	struct spdk_vhost_dev *vdev = vsession->vdev;
1026 	struct spdk_vhost_session_fn_ctx *ev_ctx;
1027 	struct spdk_event *ev;
1028 
1029 	ev_ctx = calloc(1, sizeof(*ev_ctx));
1030 	if (ev_ctx == NULL) {
1031 		SPDK_ERRLOG("Failed to alloc vhost event.\n");
1032 		assert(false);
1033 		return -ENOMEM;
1034 	}
1035 
1036 	ev_ctx->vdev = vdev;
1037 	ev_ctx->vsession_id = vsession->id;
1038 	ev_ctx->cb_fn = cb_fn;
1039 
1040 	ev = spdk_event_allocate(vsession->lcore,
1041 				 spdk_vhost_event_async_foreach_fn, ev_ctx, arg);
1042 	assert(ev);
1043 	spdk_event_call(ev);
1044 
1045 	return 0;
1046 }
1047 
1048 static void
1049 _stop_session(struct spdk_vhost_session *vsession)
1050 {
1051 	struct spdk_vhost_dev *vdev = vsession->vdev;
1052 	struct spdk_vhost_virtqueue *q;
1053 	int rc;
1054 	uint16_t i;
1055 
1056 	rc = vdev->backend->stop_session(vsession);
1057 	if (rc != 0) {
1058 		SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid);
1059 		/* Don't unlock here; the callers hold g_spdk_vhost_mutex and unlock it themselves. */
1060 		return;
1061 	}
1062 
1063 	for (i = 0; i < vsession->max_queues; i++) {
1064 		q = &vsession->virtqueue[i];
1065 		if (q->vring.desc == NULL) {
1066 			continue;
1067 		}
1068 		rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx);
1069 	}
1070 
1071 	spdk_vhost_session_mem_unregister(vsession);
1072 	free(vsession->mem);
1073 }
1074 
1075 static void
1076 stop_device(int vid)
1077 {
1078 	struct spdk_vhost_session *vsession;
1079 
1080 	pthread_mutex_lock(&g_spdk_vhost_mutex);
1081 	vsession = spdk_vhost_session_find_by_vid(vid);
1082 	if (vsession == NULL) {
1083 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1084 		pthread_mutex_unlock(&g_spdk_vhost_mutex);
1085 		return;
1086 	}
1087 
1088 	if (vsession->lcore == -1) {
1089 		/* already stopped, nothing to do */
1090 		pthread_mutex_unlock(&g_spdk_vhost_mutex);
1091 		return;
1092 	}
1093 
1094 	_stop_session(vsession);
1095 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
1096 }
1097 
1098 static int
1099 start_device(int vid)
1100 {
1101 	struct spdk_vhost_dev *vdev;
1102 	struct spdk_vhost_session *vsession;
1103 	int rc = -1;
1104 	uint16_t i;
1105 
1106 	pthread_mutex_lock(&g_spdk_vhost_mutex);
1107 
1108 	vsession = spdk_vhost_session_find_by_vid(vid);
1109 	if (vsession == NULL) {
1110 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1111 		goto out;
1112 	}
1113 
1114 	vdev = vsession->vdev;
1115 	if (vsession->lcore != -1) {
1116 		/* already started, nothing to do */
1117 		rc = 0;
1118 		goto out;
1119 	}
1120 
1121 	vsession->max_queues = 0;
1122 	memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue));
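	/* Probe every possible virtqueue. Queues that the guest driver did not
	 * set up keep vring.desc == NULL and are skipped by the pollers;
	 * max_queues ends up as the highest initialized queue index plus one.
	 */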
1123 	for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
1124 		struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
1125 
1126 		if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) {
1127 			continue;
1128 		}
1129 
1130 		if (q->vring.desc == NULL || q->vring.size == 0) {
1131 			continue;
1132 		}
1133 
1134 		if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) {
1135 			q->vring.desc = NULL;
1136 			continue;
1137 		}
1138 
1139 		/* Disable notifications. */
1140 		if (rte_vhost_enable_guest_notification(vid, i, 0) != 0) {
1141 			SPDK_ERRLOG("vhost device %d: Failed to disable guest notification on queue %"PRIu16"\n", vid, i);
1142 			goto out;
1143 		}
1144 
1145 		vsession->max_queues = i + 1;
1146 	}
1147 
1148 	if (rte_vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) {
1149 		SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
1150 		goto out;
1151 	}
1152 
1153 	if (rte_vhost_get_mem_table(vid, &vsession->mem) != 0) {
1154 		SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
1155 		goto out;
1156 	}
1157 
1158 	for (i = 0; i < vsession->mem->nregions; i++) {
1159 		uint64_t mmap_size = vsession->mem->regions[i].mmap_size;
1160 
1161 		if (mmap_size & MASK_2MB) {
1162 			SPDK_ERRLOG("vhost device %d: Guest mmapped memory size 0x%" PRIx64
1163 				    " is not a 2MB multiple\n", vid, mmap_size);
1164 			free(vsession->mem);
1165 			goto out;
1166 		}
1167 	}
1168 
1169 	/*
1170 	 * Not sure right now, but this looks like some kind of QEMU bug: guest I/O
1171 	 * might be frozen after live migration unless all queues are kicked. It
1172 	 * appears the previous vhost instance failed to effectively deliver all
1173 	 * interrupts before the GET_VRING_BASE message. This shouldn't harm the
1174 	 * guest, since spurious interrupts should be ignored by the guest virtio driver.
1175 	 *
1176 	 * Tested on QEMU 2.10.91 and 2.11.50.
1177 	 */
1178 	for (i = 0; i < vsession->max_queues; i++) {
1179 		if (vsession->virtqueue[i].vring.callfd != -1) {
1180 			eventfd_write(vsession->virtqueue[i].vring.callfd, (eventfd_t)1);
1181 		}
1182 	}
1183 
1184 	spdk_vhost_session_set_coalescing(vdev, vsession, NULL);
1185 	spdk_vhost_session_mem_register(vsession);
1186 	vsession->initialized = true;
1187 	rc = vdev->backend->start_session(vsession);
1188 	if (rc != 0) {
1189 		spdk_vhost_session_mem_unregister(vsession);
1190 		free(vsession->mem);
1191 		goto out;
1192 	}
1193 
1194 out:
1195 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
1196 	return rc;
1197 }
1198 
1199 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
1200 static int
1201 get_config(int vid, uint8_t *config, uint32_t len)
1202 {
1203 	struct spdk_vhost_session *vsession;
1204 	struct spdk_vhost_dev *vdev;
1205 	int rc = -1;
1206 
1207 	pthread_mutex_lock(&g_spdk_vhost_mutex);
1208 	vsession = spdk_vhost_session_find_by_vid(vid);
1209 	if (vsession == NULL) {
1210 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1211 		goto out;
1212 	}
1213 
1214 	vdev = vsession->vdev;
1215 	if (vdev->backend->vhost_get_config) {
1216 		rc = vdev->backend->vhost_get_config(vdev, config, len);
1217 	}
1218 
1219 out:
1220 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
1221 	return rc;
1222 }
1223 
1224 static int
1225 set_config(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags)
1226 {
1227 	struct spdk_vhost_session *vsession;
1228 	struct spdk_vhost_dev *vdev;
1229 	int rc = -1;
1230 
1231 	pthread_mutex_lock(&g_spdk_vhost_mutex);
1232 	vsession = spdk_vhost_session_find_by_vid(vid);
1233 	if (vsession == NULL) {
1234 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1235 		goto out;
1236 	}
1237 
1238 	vdev = vsession->vdev;
1239 	if (vdev->backend->vhost_set_config) {
1240 		rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags);
1241 	}
1242 
1243 out:
1244 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
1245 	return rc;
1246 }
1247 #endif
1248 
1249 int
1250 spdk_vhost_set_socket_path(const char *basename)
1251 {
1252 	int ret;
1253 
1254 	if (basename && strlen(basename) > 0) {
1255 		ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename);
1256 		if (ret <= 0) {
1257 			return -EINVAL;
1258 		}
1259 		if ((size_t)ret >= sizeof(dev_dirname) - 2) {
1260 			SPDK_ERRLOG("Socket dir path length %d is too long\n", ret);
1261 			return -EINVAL;
1262 		}
1263 
1264 		if (dev_dirname[ret - 1] != '/') {
1265 			dev_dirname[ret] = '/';
1266 			dev_dirname[ret + 1]  = '\0';
1267 		}
1268 	}
1269 
1270 	return 0;
1271 }
1272 
1273 static void *
1274 session_shutdown(void *arg)
1275 {
1276 	struct spdk_vhost_dev *vdev = NULL;
1277 
1278 	TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
1279 		rte_vhost_driver_unregister(vdev->path);
1280 		vdev->registered = false;
1281 	}
1282 
1283 	SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n");
1284 	spdk_event_call((struct spdk_event *)arg);
1285 	return NULL;
1286 }
1287 
1288 void
1289 spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1290 {
1291 	assert(vdev->backend->dump_info_json != NULL);
1292 	vdev->backend->dump_info_json(vdev, w);
1293 }
1294 
1295 int
1296 spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
1297 {
1298 	if (vdev->pending_async_op_num) {
1299 		return -EBUSY;
1300 	}
1301 
1302 	return vdev->backend->remove_device(vdev);
1303 }
1304 
1305 static int
1306 new_connection(int vid)
1307 {
1308 	struct spdk_vhost_dev *vdev;
1309 	struct spdk_vhost_session *vsession;
1310 	char ifname[PATH_MAX];
1311 
1312 	pthread_mutex_lock(&g_spdk_vhost_mutex);
1313 
1314 	if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) {
1315 		SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid);
1316 		pthread_mutex_unlock(&g_spdk_vhost_mutex);
1317 		return -1;
1318 	}
1319 
1320 	vdev = spdk_vhost_dev_find(ifname);
1321 	if (vdev == NULL) {
1322 		SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
1323 		pthread_mutex_unlock(&g_spdk_vhost_mutex);
1324 		return -1;
1325 	}
1326 
1327 	/* We expect sessions inside vdev->vsessions to be sorted in ascending
1328 	 * order of vsession->id. For now we always set id = vsessions_num++
1329 	 * and append each session to the very end of the vsessions list.
1330 	 * This is required for spdk_vhost_dev_foreach_session() to work.
1331 	 */
1332 	if (vdev->vsessions_num == UINT_MAX) {
1333 		assert(false);
		pthread_mutex_unlock(&g_spdk_vhost_mutex);
1334 		return -EINVAL;
1335 	}
1336 
1337 	if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) +
1338 			   vdev->backend->session_ctx_size)) {
1339 		SPDK_ERRLOG("vsession alloc failed\n");
1340 		pthread_mutex_unlock(&g_spdk_vhost_mutex);
1341 		return -1;
1342 	}
1343 	memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size);
1344 
1345 	vsession->vdev = vdev;
1346 	vsession->id = vdev->vsessions_num++;
1347 	vsession->vid = vid;
1348 	vsession->lcore = -1;
1349 	vsession->initialized = false;
1350 	vsession->next_stats_check_time = 0;
1351 	vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS *
1352 					 spdk_get_ticks_hz() / 1000UL;
1353 	TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq);
1354 
1355 	spdk_vhost_session_install_rte_compat_hooks(vsession);
1356 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
1357 	return 0;
1358 }
1359 
1360 static void
1361 destroy_connection(int vid)
1362 {
1363 	struct spdk_vhost_session *vsession;
1364 
1365 	pthread_mutex_lock(&g_spdk_vhost_mutex);
1366 	vsession = spdk_vhost_session_find_by_vid(vid);
1367 	if (vsession == NULL) {
1368 		SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
1369 		pthread_mutex_unlock(&g_spdk_vhost_mutex);
1370 		return;
1371 	}
1372 
1373 	if (vsession->lcore != -1) {
1374 		_stop_session(vsession);
1375 	}
1376 
1377 	TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq);
1378 	free(vsession);
1379 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
1380 }
1381 
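/* Call fn for each of the device's sessions. Stopped sessions (lcore == -1)
 * are handled inline under the global vhost lock; running sessions are chased
 * onto their owning lcore via an async event, so fn always executes on the
 * thread that currently owns the session.
 */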
1382 static void
1383 spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev,
1384 		struct spdk_vhost_session *vsession,
1385 		spdk_vhost_session_fn fn, void *arg)
1386 {
1387 	int rc;
1388 
1389 	if (vsession == NULL) {
1390 		goto out_finish_foreach;
1391 	}
1392 
1393 	while (vsession->lcore == -1) {
1394 		if (vsession->initialized) {
1395 			rc = fn(vdev, vsession, arg);
1396 			if (rc < 0) {
1397 				return;
1398 			}
1399 		}
1400 
1401 		vsession = spdk_vhost_session_next(vdev, vsession->id);
1402 		if (vsession == NULL) {
1403 			goto out_finish_foreach;
1404 		}
1405 	}
1406 
1407 	spdk_vhost_event_async_send_foreach_continue(vsession, fn, arg);
1408 	return;
1409 
1410 out_finish_foreach:
1411 	/* there are no more sessions to iterate through, so call the
1412 	 * fn one last time with vsession == NULL
1413 	 */
1414 	assert(vdev->pending_async_op_num > 0);
1415 	vdev->pending_async_op_num--;
1416 	fn(vdev, NULL, arg);
1417 }
1418 
1419 void
1420 spdk_vhost_dev_foreach_session(struct spdk_vhost_dev *vdev,
1421 			       spdk_vhost_session_fn fn, void *arg)
1422 {
1423 	struct spdk_vhost_session *vsession = TAILQ_FIRST(&vdev->vsessions);
1424 
1425 	assert(vdev->pending_async_op_num < UINT32_MAX);
1426 	vdev->pending_async_op_num++;
1427 	spdk_vhost_external_event_foreach_continue(vdev, vsession, fn, arg);
1428 }
1429 
1430 void
1431 spdk_vhost_lock(void)
1432 {
1433 	pthread_mutex_lock(&g_spdk_vhost_mutex);
1434 }
1435 
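/* Returns 0 on success or -EBUSY if the mutex is already locked. */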
1436 int
1437 spdk_vhost_trylock(void)
1438 {
1439 	return -pthread_mutex_trylock(&g_spdk_vhost_mutex);
1440 }
1441 
1442 void
1443 spdk_vhost_unlock(void)
1444 {
1445 	pthread_mutex_unlock(&g_spdk_vhost_mutex);
1446 }
1447 
1448 int
1449 spdk_vhost_init(void)
1450 {
1451 	uint32_t last_core;
1452 	size_t len;
1453 	int ret;
1454 
1455 	if (dev_dirname[0] == '\0') {
1456 		if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
1457 			SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
1458 			return -1;
1459 		}
1460 
1461 		len = strlen(dev_dirname);
1462 		if (dev_dirname[len - 1] != '/') {
1463 			dev_dirname[len] = '/';
1464 			dev_dirname[len + 1] = '\0';
1465 		}
1466 	}
1467 
1468 	last_core = spdk_env_get_last_core();
1469 	g_num_ctrlrs = calloc(last_core + 1, sizeof(uint32_t));
1470 	if (!g_num_ctrlrs) {
1471 		SPDK_ERRLOG("Could not allocate array size=%u for g_num_ctrlrs\n",
1472 			    last_core + 1);
1473 		return -1;
1474 	}
1475 
1476 	ret = spdk_vhost_scsi_controller_construct();
1477 	if (ret != 0) {
1478 		SPDK_ERRLOG("Cannot construct vhost controllers\n");
1479 		return -1;
1480 	}
1481 
1482 	ret = spdk_vhost_blk_controller_construct();
1483 	if (ret != 0) {
1484 		SPDK_ERRLOG("Cannot construct vhost block controllers\n");
1485 		return -1;
1486 	}
1487 
1488 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
1489 	ret = spdk_vhost_nvme_controller_construct();
1490 	if (ret != 0) {
1491 		SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n");
1492 		return -1;
1493 	}
1494 #endif
1495 
1496 	return 0;
1497 }
1498 
1499 static void
1500 _spdk_vhost_fini(void *arg1, void *arg2)
1501 {
1502 	spdk_vhost_fini_cb fini_cb = arg1;
1503 	struct spdk_vhost_dev *vdev, *tmp;
1504 
1505 	spdk_vhost_lock();
1506 	vdev = spdk_vhost_dev_next(NULL);
1507 	while (vdev != NULL) {
1508 		tmp = spdk_vhost_dev_next(vdev);
1509 		spdk_vhost_dev_remove(vdev);
1510 		/* don't care if it fails, there's nothing we can do for now */
1511 		vdev = tmp;
1512 	}
1513 	spdk_vhost_unlock();
1514 
1515 	/* All devices are removed now. */
1516 	free(g_num_ctrlrs);
1517 	fini_cb();
1518 }
1519 
1520 void
1521 spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
1522 {
1523 	pthread_t tid;
1524 	int rc;
1525 	struct spdk_event *fini_ev;
1526 
1527 	fini_ev = spdk_event_allocate(spdk_env_get_current_core(), _spdk_vhost_fini, fini_cb, NULL);
1528 
1529 	/* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
1530 	 * ops for stopping a device or removing a connection, we need to call it from
1531 	 * a separate thread to avoid deadlock.
1532 	 */
1533 	rc = pthread_create(&tid, NULL, &session_shutdown, fini_ev);
1534 	if (rc != 0) {
1535 		SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc));
1536 		abort();
1537 	}
1538 	pthread_detach(tid);
1539 }
1540 
1541 void
1542 spdk_vhost_config_json(struct spdk_json_write_ctx *w)
1543 {
1544 	struct spdk_vhost_dev *vdev;
1545 	uint32_t delay_base_us;
1546 	uint32_t iops_threshold;
1547 
1548 	spdk_json_write_array_begin(w);
1549 
1550 	spdk_vhost_lock();
1551 	vdev = spdk_vhost_dev_next(NULL);
1552 	while (vdev != NULL) {
1553 		vdev->backend->write_config_json(vdev, w);
1554 
1555 		spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
1556 		if (delay_base_us) {
1557 			spdk_json_write_object_begin(w);
1558 			spdk_json_write_named_string(w, "method", "set_vhost_controller_coalescing");
1559 
1560 			spdk_json_write_named_object_begin(w, "params");
1561 			spdk_json_write_named_string(w, "ctrlr", vdev->name);
1562 			spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
1563 			spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
1564 			spdk_json_write_object_end(w);
1565 
1566 			spdk_json_write_object_end(w);
1567 		}
1568 		vdev = spdk_vhost_dev_next(vdev);
1569 	}
1570 	spdk_vhost_unlock();
1571 
1572 	spdk_json_write_array_end(w);
1573 }
1574 
1575 SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST)
1576 SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING)
1577