xref: /dpdk/drivers/net/vhost/rte_eth_vhost.c (revision 1cde1b9a9b4dbf31cb5e5ccdfc5da3cb079f43a2)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8 
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_memcpy.h>
14 #include <rte_bus_vdev.h>
15 #include <rte_kvargs.h>
16 #include <rte_vhost.h>
17 #include <rte_spinlock.h>
18 
19 #include "rte_eth_vhost.h"
20 
21 static int vhost_logtype;
22 
23 #define VHOST_LOG(level, ...) \
24 	rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
25 
26 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
27 
28 #define ETH_VHOST_IFACE_ARG		"iface"
29 #define ETH_VHOST_QUEUES_ARG		"queues"
30 #define ETH_VHOST_CLIENT_ARG		"client"
31 #define ETH_VHOST_DEQUEUE_ZERO_COPY	"dequeue-zero-copy"
32 #define ETH_VHOST_IOMMU_SUPPORT		"iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT	"postcopy-support"
34 #define VHOST_MAX_PKT_BURST 32
35 
36 static const char *valid_arguments[] = {
37 	ETH_VHOST_IFACE_ARG,
38 	ETH_VHOST_QUEUES_ARG,
39 	ETH_VHOST_CLIENT_ARG,
40 	ETH_VHOST_DEQUEUE_ZERO_COPY,
41 	ETH_VHOST_IOMMU_SUPPORT,
42 	ETH_VHOST_POSTCOPY_SUPPORT,
43 	NULL
44 };
45 
46 static struct rte_ether_addr base_eth_addr = {
47 	.addr_bytes = {
48 		0x56 /* V */,
49 		0x48 /* H */,
50 		0x4F /* O */,
51 		0x53 /* S */,
52 		0x54 /* T */,
53 		0x00
54 	}
55 };
56 
57 enum vhost_xstats_pkts {
58 	VHOST_UNDERSIZE_PKT = 0,
59 	VHOST_64_PKT,
60 	VHOST_65_TO_127_PKT,
61 	VHOST_128_TO_255_PKT,
62 	VHOST_256_TO_511_PKT,
63 	VHOST_512_TO_1023_PKT,
64 	VHOST_1024_TO_1522_PKT,
65 	VHOST_1523_TO_MAX_PKT,
66 	VHOST_BROADCAST_PKT,
67 	VHOST_MULTICAST_PKT,
68 	VHOST_UNICAST_PKT,
69 	VHOST_ERRORS_PKT,
70 	VHOST_ERRORS_FRAGMENTED,
71 	VHOST_ERRORS_JABBER,
72 	VHOST_UNKNOWN_PROTOCOL,
73 	VHOST_XSTATS_MAX,
74 };
75 
76 struct vhost_stats {
77 	uint64_t pkts;
78 	uint64_t bytes;
79 	uint64_t missed_pkts;
80 	uint64_t xstats[VHOST_XSTATS_MAX];
81 };
82 
83 struct vhost_queue {
84 	int vid;
85 	rte_atomic32_t allow_queuing;
86 	rte_atomic32_t while_queuing;
87 	struct pmd_internal *internal;
88 	struct rte_mempool *mb_pool;
89 	uint16_t port;
90 	uint16_t virtqueue_id;
91 	struct vhost_stats stats;
92 };
93 
94 struct pmd_internal {
95 	rte_atomic32_t dev_attached;
96 	char *dev_name;
97 	char *iface_name;
98 	uint16_t max_queues;
99 	int vid;
100 	rte_atomic32_t started;
101 	uint8_t vlan_strip;
102 };
103 
104 struct internal_list {
105 	TAILQ_ENTRY(internal_list) next;
106 	struct rte_eth_dev *eth_dev;
107 };
108 
109 TAILQ_HEAD(internal_list_head, internal_list);
110 static struct internal_list_head internal_list =
111 	TAILQ_HEAD_INITIALIZER(internal_list);
112 
113 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
114 
115 static struct rte_eth_link pmd_link = {
116 		.link_speed = 10000,
117 		.link_duplex = ETH_LINK_FULL_DUPLEX,
118 		.link_status = ETH_LINK_DOWN
119 };
120 
121 struct rte_vhost_vring_state {
122 	rte_spinlock_t lock;
123 
124 	bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
125 	bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
126 	unsigned int index;
127 	unsigned int max_vring;
128 };
129 
130 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
131 
132 #define VHOST_XSTATS_NAME_SIZE 64
133 
134 struct vhost_xstats_name_off {
135 	char name[VHOST_XSTATS_NAME_SIZE];
136 	uint64_t offset;
137 };
138 
139 /* [rx]_ is prepended to the name string here */
140 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
141 	{"good_packets",
142 	 offsetof(struct vhost_queue, stats.pkts)},
143 	{"total_bytes",
144 	 offsetof(struct vhost_queue, stats.bytes)},
145 	{"missed_pkts",
146 	 offsetof(struct vhost_queue, stats.missed_pkts)},
147 	{"broadcast_packets",
148 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
149 	{"multicast_packets",
150 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
151 	{"unicast_packets",
152 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
153 	{"undersize_packets",
154 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
155 	{"size_64_packets",
156 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
157 	{"size_65_to_127_packets",
158 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
159 	{"size_128_to_255_packets",
160 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
161 	{"size_256_to_511_packets",
162 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
163 	{"size_512_to_1023_packets",
164 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
165 	{"size_1024_to_1522_packets",
166 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
167 	{"size_1523_to_max_packets",
168 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
169 	{"errors_with_bad_CRC",
170 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
171 	{"fragmented_errors",
172 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
173 	{"jabber_errors",
174 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
175 	{"unknown_protos_packets",
176 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
177 };
178 
179 /* [tx]_ is prepended to the name string here */
180 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
181 	{"good_packets",
182 	 offsetof(struct vhost_queue, stats.pkts)},
183 	{"total_bytes",
184 	 offsetof(struct vhost_queue, stats.bytes)},
185 	{"missed_pkts",
186 	 offsetof(struct vhost_queue, stats.missed_pkts)},
187 	{"broadcast_packets",
188 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
189 	{"multicast_packets",
190 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
191 	{"unicast_packets",
192 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
193 	{"undersize_packets",
194 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
195 	{"size_64_packets",
196 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
197 	{"size_65_to_127_packets",
198 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
199 	{"size_128_to_255_packets",
200 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
201 	{"size_256_to_511_packets",
202 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
203 	{"size_512_to_1023_packets",
204 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
205 	{"size_1024_to_1522_packets",
206 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
207 	{"size_1523_to_max_packets",
208 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
209 	{"errors_with_bad_CRC",
210 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
211 };
212 
213 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
214 				sizeof(vhost_rxport_stat_strings[0]))
215 
216 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
217 				sizeof(vhost_txport_stat_strings[0]))
218 
219 static int
220 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
221 {
222 	struct vhost_queue *vq = NULL;
223 	unsigned int i = 0;
224 
225 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
226 		vq = dev->data->rx_queues[i];
227 		if (!vq)
228 			continue;
229 		memset(&vq->stats, 0, sizeof(vq->stats));
230 	}
231 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
232 		vq = dev->data->tx_queues[i];
233 		if (!vq)
234 			continue;
235 		memset(&vq->stats, 0, sizeof(vq->stats));
236 	}
237 
238 	return 0;
239 }
240 
241 static int
242 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
243 			   struct rte_eth_xstat_name *xstats_names,
244 			   unsigned int limit __rte_unused)
245 {
246 	unsigned int t = 0;
247 	int count = 0;
248 	int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
249 
250 	if (!xstats_names)
251 		return nstats;
252 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
253 		snprintf(xstats_names[count].name,
254 			 sizeof(xstats_names[count].name),
255 			 "rx_%s", vhost_rxport_stat_strings[t].name);
256 		count++;
257 	}
258 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
259 		snprintf(xstats_names[count].name,
260 			 sizeof(xstats_names[count].name),
261 			 "tx_%s", vhost_txport_stat_strings[t].name);
262 		count++;
263 	}
264 	return count;
265 }
266 
267 static int
268 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
269 		     unsigned int n)
270 {
271 	unsigned int i;
272 	unsigned int t;
273 	unsigned int count = 0;
274 	struct vhost_queue *vq = NULL;
275 	unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
276 
277 	if (n < nxstats)
278 		return nxstats;
279 
280 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
281 		vq = dev->data->rx_queues[i];
282 		if (!vq)
283 			continue;
284 		vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
285 				- (vq->stats.xstats[VHOST_BROADCAST_PKT]
286 				+ vq->stats.xstats[VHOST_MULTICAST_PKT]);
287 	}
288 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
289 		vq = dev->data->tx_queues[i];
290 		if (!vq)
291 			continue;
292 		vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
293 				+ vq->stats.missed_pkts
294 				- (vq->stats.xstats[VHOST_BROADCAST_PKT]
295 				+ vq->stats.xstats[VHOST_MULTICAST_PKT]);
296 	}
297 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
298 		xstats[count].value = 0;
299 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
300 			vq = dev->data->rx_queues[i];
301 			if (!vq)
302 				continue;
303 			xstats[count].value +=
304 				*(uint64_t *)(((char *)vq)
305 				+ vhost_rxport_stat_strings[t].offset);
306 		}
307 		xstats[count].id = count;
308 		count++;
309 	}
310 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
311 		xstats[count].value = 0;
312 		for (i = 0; i < dev->data->nb_tx_queues; i++) {
313 			vq = dev->data->tx_queues[i];
314 			if (!vq)
315 				continue;
316 			xstats[count].value +=
317 				*(uint64_t *)(((char *)vq)
318 				+ vhost_txport_stat_strings[t].offset);
319 		}
320 		xstats[count].id = count;
321 		count++;
322 	}
323 	return count;
324 }
325 
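/* Classify a packet as broadcast or multicast from its destination MAC
 * address and bump the matching xstats counter; unicast counts are
 * derived later in vhost_dev_xstats_get() as pkts minus these two.
 */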
326 static inline void
327 vhost_count_multicast_broadcast(struct vhost_queue *vq,
328 				struct rte_mbuf *mbuf)
329 {
330 	struct rte_ether_addr *ea = NULL;
331 	struct vhost_stats *pstats = &vq->stats;
332 
333 	ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
334 	if (rte_is_multicast_ether_addr(ea)) {
335 		if (rte_is_broadcast_ether_addr(ea))
336 			pstats->xstats[VHOST_BROADCAST_PKT]++;
337 		else
338 			pstats->xstats[VHOST_MULTICAST_PKT]++;
339 	}
340 }
341 
342 static void
343 vhost_update_packet_xstats(struct vhost_queue *vq,
344 			   struct rte_mbuf **bufs,
345 			   uint16_t count)
346 {
347 	uint32_t pkt_len = 0;
348 	uint64_t i = 0;
349 	uint64_t index;
350 	struct vhost_stats *pstats = &vq->stats;
351 
352 	for (i = 0; i < count ; i++) {
353 		pkt_len = bufs[i]->pkt_len;
354 		if (pkt_len == 64) {
355 			pstats->xstats[VHOST_64_PKT]++;
356 		} else if (pkt_len > 64 && pkt_len < 1024) {
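			/* For 65..1023 byte packets the bucket index is
			 * derived from the bit length of pkt_len:
			 * 65-127 maps to VHOST_65_TO_127_PKT, 128-255 to the
			 * next bucket, and so on up to 512-1023.
			 */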
357 			index = (sizeof(pkt_len) * 8)
358 				- __builtin_clz(pkt_len) - 5;
359 			pstats->xstats[index]++;
360 		} else {
361 			if (pkt_len < 64)
362 				pstats->xstats[VHOST_UNDERSIZE_PKT]++;
363 			else if (pkt_len <= 1522)
364 				pstats->xstats[VHOST_1024_TO_1522_PKT]++;
365 			else if (pkt_len > 1522)
366 				pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
367 		}
368 		vhost_count_multicast_broadcast(vq, bufs[i]);
369 	}
370 }
371 
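/* Rx burst: an ethdev Rx queue maps to the guest's TX virtqueue, so
 * packets are pulled with rte_vhost_dequeue_burst(). The allow_queuing
 * and while_queuing flags form the fence used by update_queuing_status()
 * to keep this path off the vhost device while it is attached or detached.
 */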
372 static uint16_t
373 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
374 {
375 	struct vhost_queue *r = q;
376 	uint16_t i, nb_rx = 0;
377 	uint16_t nb_receive = nb_bufs;
378 
379 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
380 		return 0;
381 
382 	rte_atomic32_set(&r->while_queuing, 1);
383 
384 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
385 		goto out;
386 
387 	/* Dequeue packets from guest TX queue */
388 	while (nb_receive) {
389 		uint16_t nb_pkts;
390 		uint16_t num = (uint16_t)RTE_MIN(nb_receive,
391 						 VHOST_MAX_PKT_BURST);
392 
393 		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
394 						  r->mb_pool, &bufs[nb_rx],
395 						  num);
396 
397 		nb_rx += nb_pkts;
398 		nb_receive -= nb_pkts;
399 		if (nb_pkts < num)
400 			break;
401 	}
402 
403 	r->stats.pkts += nb_rx;
404 
405 	for (i = 0; likely(i < nb_rx); i++) {
406 		bufs[i]->port = r->port;
407 		bufs[i]->vlan_tci = 0;
408 
409 		if (r->internal->vlan_strip)
410 			rte_vlan_strip(bufs[i]);
411 
412 		r->stats.bytes += bufs[i]->pkt_len;
413 	}
414 
415 	vhost_update_packet_xstats(r, bufs, nb_rx);
416 
417 out:
418 	rte_atomic32_set(&r->while_queuing, 0);
419 
420 	return nb_rx;
421 }
422 
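/* Tx burst: an ethdev Tx queue maps to the guest's RX virtqueue. VLAN
 * tags are inserted in software when requested, packets are handed to
 * rte_vhost_enqueue_burst(), transmitted mbufs are freed here and
 * packets the guest could not accept are accounted as missed.
 */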
423 static uint16_t
424 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
425 {
426 	struct vhost_queue *r = q;
427 	uint16_t i, nb_tx = 0;
428 	uint16_t nb_send = 0;
429 
430 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
431 		return 0;
432 
433 	rte_atomic32_set(&r->while_queuing, 1);
434 
435 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
436 		goto out;
437 
438 	for (i = 0; i < nb_bufs; i++) {
439 		struct rte_mbuf *m = bufs[i];
440 
441 		/* Do VLAN tag insertion */
442 		if (m->ol_flags & PKT_TX_VLAN_PKT) {
443 			int error = rte_vlan_insert(&m);
444 			if (unlikely(error)) {
445 				rte_pktmbuf_free(m);
446 				continue;
447 			}
448 		}
449 
450 		bufs[nb_send] = m;
451 		++nb_send;
452 	}
453 
454 	/* Enqueue packets to guest RX queue */
455 	while (nb_send) {
456 		uint16_t nb_pkts;
457 		uint16_t num = (uint16_t)RTE_MIN(nb_send,
458 						 VHOST_MAX_PKT_BURST);
459 
460 		nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
461 						  &bufs[nb_tx], num);
462 
463 		nb_tx += nb_pkts;
464 		nb_send -= nb_pkts;
465 		if (nb_pkts < num)
466 			break;
467 	}
468 
469 	r->stats.pkts += nb_tx;
470 	r->stats.missed_pkts += nb_bufs - nb_tx;
471 
472 	for (i = 0; likely(i < nb_tx); i++)
473 		r->stats.bytes += bufs[i]->pkt_len;
474 
475 	vhost_update_packet_xstats(r, bufs, nb_tx);
476 
477 	/* Per RFC 2863 (ifHCOutMulticastPkts and ifHCOutBroadcastPkts,
478 	 * page 42), the "multicast" and "broadcast" counters are incremented
479 	 * even for packets that were not transmitted successfully.
480 	 */
481 	for (i = nb_tx; i < nb_bufs; i++)
482 		vhost_count_multicast_broadcast(r, bufs[i]);
483 
484 	for (i = 0; likely(i < nb_tx); i++)
485 		rte_pktmbuf_free(bufs[i]);
486 out:
487 	rte_atomic32_set(&r->while_queuing, 0);
488 
489 	return nb_tx;
490 }
491 
492 static int
493 eth_dev_configure(struct rte_eth_dev *dev)
494 {
495 	struct pmd_internal *internal = dev->data->dev_private;
496 	const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
497 
498 	internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
499 
500 	return 0;
501 }
502 
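/* Look up the internal_list entry whose vhost interface name matches
 * ifname; returns NULL when no registered ethdev uses that socket path.
 */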
503 static inline struct internal_list *
504 find_internal_resource(char *ifname)
505 {
506 	int found = 0;
507 	struct internal_list *list;
508 	struct pmd_internal *internal;
509 
510 	if (ifname == NULL)
511 		return NULL;
512 
513 	pthread_mutex_lock(&internal_list_lock);
514 
515 	TAILQ_FOREACH(list, &internal_list, next) {
516 		internal = list->eth_dev->data->dev_private;
517 		if (!strcmp(internal->iface_name, ifname)) {
518 			found = 1;
519 			break;
520 		}
521 	}
522 
523 	pthread_mutex_unlock(&internal_list_lock);
524 
525 	if (!found)
526 		return NULL;
527 
528 	return list;
529 }
530 
531 static int
532 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
533 {
534 	struct rte_vhost_vring vring;
535 	struct vhost_queue *vq;
536 	int ret = 0;
537 
538 	vq = dev->data->rx_queues[qid];
539 	if (!vq) {
540 		VHOST_LOG(ERR, "rxq%d is not set up yet\n", qid);
541 		return -1;
542 	}
543 
544 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
545 	if (ret < 0) {
546 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
547 		return ret;
548 	}
549 	VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
550 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
551 	rte_wmb();
552 
553 	return ret;
554 }
555 
556 static int
557 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
558 {
559 	struct rte_vhost_vring vring;
560 	struct vhost_queue *vq;
561 	int ret = 0;
562 
563 	vq = dev->data->rx_queues[qid];
564 	if (!vq) {
565 		VHOST_LOG(ERR, "rxq%d is not set up yet\n", qid);
566 		return -1;
567 	}
568 
569 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
570 	if (ret < 0) {
571 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
572 		return ret;
573 	}
574 	VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
575 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
576 	rte_wmb();
577 
578 	return 0;
579 }
580 
581 static void
582 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
583 {
584 	struct rte_intr_handle *intr_handle = dev->intr_handle;
585 
586 	if (intr_handle) {
587 		if (intr_handle->intr_vec)
588 			free(intr_handle->intr_vec);
589 		free(intr_handle);
590 	}
591 
592 	dev->intr_handle = NULL;
593 }
594 
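/* Build a vdev interrupt handle for Rx interrupt mode: every ready Rx
 * queue contributes its vring kickfd as the event fd, letting
 * applications wait for guest kicks via the ethdev Rx interrupt API.
 */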
595 static int
596 eth_vhost_install_intr(struct rte_eth_dev *dev)
597 {
598 	struct rte_vhost_vring vring;
599 	struct vhost_queue *vq;
600 	int count = 0;
601 	int nb_rxq = dev->data->nb_rx_queues;
602 	int i;
603 	int ret;
604 
605 	/* uninstall first if we are reconnecting */
606 	if (dev->intr_handle)
607 		eth_vhost_uninstall_intr(dev);
608 
609 	dev->intr_handle = malloc(sizeof(*dev->intr_handle));
610 	if (!dev->intr_handle) {
611 		VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
612 		return -ENOMEM;
613 	}
614 	memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
615 
616 	dev->intr_handle->efd_counter_size = sizeof(uint64_t);
617 
618 	dev->intr_handle->intr_vec =
619 		malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
620 
621 	if (!dev->intr_handle->intr_vec) {
622 		VHOST_LOG(ERR,
623 			"Failed to allocate memory for interrupt vector\n");
624 		free(dev->intr_handle);
625 		return -ENOMEM;
626 	}
627 
628 	VHOST_LOG(INFO, "Prepare intr vec\n");
629 	for (i = 0; i < nb_rxq; i++) {
630 		vq = dev->data->rx_queues[i];
631 		if (!vq) {
632 			VHOST_LOG(INFO, "rxq-%d not set up yet, skip!\n", i);
633 			continue;
634 		}
635 
636 		ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
637 		if (ret < 0) {
638 			VHOST_LOG(INFO,
639 				"Failed to get rxq-%d's vring, skip!\n", i);
640 			continue;
641 		}
642 
643 		if (vring.kickfd < 0) {
644 			VHOST_LOG(INFO,
645 				"rxq-%d's kickfd is invalid, skip!\n", i);
646 			continue;
647 		}
648 		dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
649 		dev->intr_handle->efds[i] = vring.kickfd;
650 		count++;
651 		VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
652 	}
653 
654 	dev->intr_handle->nb_efd = count;
655 	dev->intr_handle->max_intr = count + 1;
656 	dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
657 
658 	return 0;
659 }
660 
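/* Propagate the started/attached state to every queue and wait until
 * any in-flight rx/tx burst has stopped touching the vhost device, so
 * the device can be attached, detached or stopped safely.
 */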
661 static void
662 update_queuing_status(struct rte_eth_dev *dev)
663 {
664 	struct pmd_internal *internal = dev->data->dev_private;
665 	struct vhost_queue *vq;
666 	unsigned int i;
667 	int allow_queuing = 1;
668 
669 	if (!dev->data->rx_queues || !dev->data->tx_queues)
670 		return;
671 
672 	if (rte_atomic32_read(&internal->started) == 0 ||
673 	    rte_atomic32_read(&internal->dev_attached) == 0)
674 		allow_queuing = 0;
675 
676 	/* Wait until rx/tx_pkt_burst stops accessing vhost device */
677 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
678 		vq = dev->data->rx_queues[i];
679 		if (vq == NULL)
680 			continue;
681 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
682 		while (rte_atomic32_read(&vq->while_queuing))
683 			rte_pause();
684 	}
685 
686 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
687 		vq = dev->data->tx_queues[i];
688 		if (vq == NULL)
689 			continue;
690 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
691 		while (rte_atomic32_read(&vq->while_queuing))
692 			rte_pause();
693 	}
694 }
695 
696 static void
697 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
698 {
699 	struct vhost_queue *vq;
700 	int i;
701 
702 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
703 		vq = eth_dev->data->rx_queues[i];
704 		if (!vq)
705 			continue;
706 		vq->vid = internal->vid;
707 		vq->internal = internal;
708 		vq->port = eth_dev->data->port_id;
709 	}
710 	for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
711 		vq = eth_dev->data->tx_queues[i];
712 		if (!vq)
713 			continue;
714 		vq->vid = internal->vid;
715 		vq->internal = internal;
716 		vq->port = eth_dev->data->port_id;
717 	}
718 }
719 
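/* vhost-user callback: a frontend connected to our socket. Bind the new
 * vid to the queues, bring the link up and, when Rx interrupts are
 * configured and the port is started, install the interrupt handle.
 */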
720 static int
721 new_device(int vid)
722 {
723 	struct rte_eth_dev *eth_dev;
724 	struct internal_list *list;
725 	struct pmd_internal *internal;
726 	struct rte_eth_conf *dev_conf;
727 	unsigned i;
728 	char ifname[PATH_MAX];
729 #ifdef RTE_LIBRTE_VHOST_NUMA
730 	int newnode;
731 #endif
732 
733 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
734 	list = find_internal_resource(ifname);
735 	if (list == NULL) {
736 		VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
737 		return -1;
738 	}
739 
740 	eth_dev = list->eth_dev;
741 	internal = eth_dev->data->dev_private;
742 	dev_conf = &eth_dev->data->dev_conf;
743 
744 #ifdef RTE_LIBRTE_VHOST_NUMA
745 	newnode = rte_vhost_get_numa_node(vid);
746 	if (newnode >= 0)
747 		eth_dev->data->numa_node = newnode;
748 #endif
749 
750 	internal->vid = vid;
751 	if (rte_atomic32_read(&internal->started) == 1) {
752 		queue_setup(eth_dev, internal);
753 
754 		if (dev_conf->intr_conf.rxq) {
755 			if (eth_vhost_install_intr(eth_dev) < 0) {
756 				VHOST_LOG(INFO,
757 					"Failed to install interrupt handler.\n");
758 				return -1;
759 			}
760 		}
761 	} else {
762 		VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
763 	}
764 
765 	for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
766 		rte_vhost_enable_guest_notification(vid, i, 0);
767 
768 	rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
769 
770 	eth_dev->data->dev_link.link_status = ETH_LINK_UP;
771 
772 	rte_atomic32_set(&internal->dev_attached, 1);
773 	update_queuing_status(eth_dev);
774 
775 	VHOST_LOG(INFO, "Vhost device %d created\n", vid);
776 
777 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
778 
779 	return 0;
780 }
781 
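/* vhost-user callback: the frontend disconnected. Detach the vid from
 * the queues, mark the link down and reset the per-port vring state.
 */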
782 static void
783 destroy_device(int vid)
784 {
785 	struct rte_eth_dev *eth_dev;
786 	struct pmd_internal *internal;
787 	struct vhost_queue *vq;
788 	struct internal_list *list;
789 	char ifname[PATH_MAX];
790 	unsigned i;
791 	struct rte_vhost_vring_state *state;
792 
793 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
794 	list = find_internal_resource(ifname);
795 	if (list == NULL) {
796 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
797 		return;
798 	}
799 	eth_dev = list->eth_dev;
800 	internal = eth_dev->data->dev_private;
801 
802 	rte_atomic32_set(&internal->dev_attached, 0);
803 	update_queuing_status(eth_dev);
804 
805 	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
806 
807 	if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
808 		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
809 			vq = eth_dev->data->rx_queues[i];
810 			if (!vq)
811 				continue;
812 			vq->vid = -1;
813 		}
814 		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
815 			vq = eth_dev->data->tx_queues[i];
816 			if (!vq)
817 				continue;
818 			vq->vid = -1;
819 		}
820 	}
821 
822 	state = vring_states[eth_dev->data->port_id];
823 	rte_spinlock_lock(&state->lock);
824 	for (i = 0; i <= state->max_vring; i++) {
825 		state->cur[i] = false;
826 		state->seen[i] = false;
827 	}
828 	state->max_vring = 0;
829 	rte_spinlock_unlock(&state->lock);
830 
831 	VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
832 	eth_vhost_uninstall_intr(eth_dev);
833 
834 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
835 }
836 
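/* vhost-user callback: a single vring was enabled or disabled. Record
 * the change so rte_eth_vhost_get_queue_event() can report it, then
 * raise a queue-state event towards the application.
 */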
837 static int
838 vring_state_changed(int vid, uint16_t vring, int enable)
839 {
840 	struct rte_vhost_vring_state *state;
841 	struct rte_eth_dev *eth_dev;
842 	struct internal_list *list;
843 	char ifname[PATH_MAX];
844 
845 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
846 	list = find_internal_resource(ifname);
847 	if (list == NULL) {
848 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
849 		return -1;
850 	}
851 
852 	eth_dev = list->eth_dev;
853 	/* won't be NULL */
854 	state = vring_states[eth_dev->data->port_id];
855 	rte_spinlock_lock(&state->lock);
856 	state->cur[vring] = enable;
857 	state->max_vring = RTE_MAX(vring, state->max_vring);
858 	rte_spinlock_unlock(&state->lock);
859 
860 	VHOST_LOG(INFO, "vring%u is %s\n",
861 			vring, enable ? "enabled" : "disabled");
862 
863 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
864 
865 	return 0;
866 }
867 
868 static struct vhost_device_ops vhost_ops = {
869 	.new_device          = new_device,
870 	.destroy_device      = destroy_device,
871 	.vring_state_changed = vring_state_changed,
872 };
873 
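/* Scan the recorded vring states round-robin and report the first
 * queue whose enabled state changed since the last call; returns -1
 * when there is nothing new to report.
 */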
874 int
875 rte_eth_vhost_get_queue_event(uint16_t port_id,
876 		struct rte_eth_vhost_queue_event *event)
877 {
878 	struct rte_vhost_vring_state *state;
879 	unsigned int i;
880 	int idx;
881 
882 	if (port_id >= RTE_MAX_ETHPORTS) {
883 		VHOST_LOG(ERR, "Invalid port id\n");
884 		return -1;
885 	}
886 
887 	state = vring_states[port_id];
888 	if (!state) {
889 		VHOST_LOG(ERR, "Unused port\n");
890 		return -1;
891 	}
892 
893 	rte_spinlock_lock(&state->lock);
894 	for (i = 0; i <= state->max_vring; i++) {
895 		idx = state->index++ % (state->max_vring + 1);
896 
897 		if (state->cur[idx] != state->seen[idx]) {
898 			state->seen[idx] = state->cur[idx];
899 			event->queue_id = idx / 2;
900 			event->rx = idx & 1;
901 			event->enable = state->cur[idx];
902 			rte_spinlock_unlock(&state->lock);
903 			return 0;
904 		}
905 	}
906 	rte_spinlock_unlock(&state->lock);
907 
908 	return -1;
909 }
910 
911 int
912 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
913 {
914 	struct internal_list *list;
915 	struct rte_eth_dev *eth_dev;
916 	struct vhost_queue *vq;
917 	int vid = -1;
918 
919 	if (!rte_eth_dev_is_valid_port(port_id))
920 		return -1;
921 
922 	pthread_mutex_lock(&internal_list_lock);
923 
924 	TAILQ_FOREACH(list, &internal_list, next) {
925 		eth_dev = list->eth_dev;
926 		if (eth_dev->data->port_id == port_id) {
927 			vq = eth_dev->data->rx_queues[0];
928 			if (vq) {
929 				vid = vq->vid;
930 			}
931 			break;
932 		}
933 	}
934 
935 	pthread_mutex_unlock(&internal_list_lock);
936 
937 	return vid;
938 }
939 
940 static int
941 eth_dev_start(struct rte_eth_dev *eth_dev)
942 {
943 	struct pmd_internal *internal = eth_dev->data->dev_private;
944 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
945 
946 	queue_setup(eth_dev, internal);
947 
948 	if (rte_atomic32_read(&internal->dev_attached) == 1) {
949 		if (dev_conf->intr_conf.rxq) {
950 			if (eth_vhost_install_intr(eth_dev) < 0) {
951 				VHOST_LOG(INFO,
952 					"Failed to install interrupt handler.\n");
953 				return -1;
954 			}
955 		}
956 	}
957 
958 	rte_atomic32_set(&internal->started, 1);
959 	update_queuing_status(eth_dev);
960 
961 	return 0;
962 }
963 
964 static void
965 eth_dev_stop(struct rte_eth_dev *dev)
966 {
967 	struct pmd_internal *internal = dev->data->dev_private;
968 
969 	rte_atomic32_set(&internal->started, 0);
970 	update_queuing_status(dev);
971 }
972 
973 static void
974 eth_dev_close(struct rte_eth_dev *dev)
975 {
976 	struct pmd_internal *internal;
977 	struct internal_list *list;
978 	unsigned int i;
979 
980 	internal = dev->data->dev_private;
981 	if (!internal)
982 		return;
983 
984 	eth_dev_stop(dev);
985 
986 	rte_vhost_driver_unregister(internal->iface_name);
987 
988 	list = find_internal_resource(internal->iface_name);
989 	if (!list)
990 		return;
991 
992 	pthread_mutex_lock(&internal_list_lock);
993 	TAILQ_REMOVE(&internal_list, list, next);
994 	pthread_mutex_unlock(&internal_list_lock);
995 	rte_free(list);
996 
997 	if (dev->data->rx_queues)
998 		for (i = 0; i < dev->data->nb_rx_queues; i++)
999 			rte_free(dev->data->rx_queues[i]);
1000 
1001 	if (dev->data->tx_queues)
1002 		for (i = 0; i < dev->data->nb_tx_queues; i++)
1003 			rte_free(dev->data->tx_queues[i]);
1004 
1005 	free(internal->dev_name);
1006 	free(internal->iface_name);
1007 	rte_free(internal);
1008 
1009 	dev->data->dev_private = NULL;
1010 
1011 	rte_free(vring_states[dev->data->port_id]);
1012 	vring_states[dev->data->port_id] = NULL;
1013 }
1014 
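/* Queue setup only records the mapping to the guest virtqueues:
 * ethdev Rx queue N uses the guest TX virtqueue (2 * N + VIRTIO_TXQ)
 * and ethdev Tx queue N uses the guest RX virtqueue (2 * N + VIRTIO_RXQ).
 */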
1015 static int
1016 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1017 		   uint16_t nb_rx_desc __rte_unused,
1018 		   unsigned int socket_id,
1019 		   const struct rte_eth_rxconf *rx_conf __rte_unused,
1020 		   struct rte_mempool *mb_pool)
1021 {
1022 	struct vhost_queue *vq;
1023 
1024 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1025 			RTE_CACHE_LINE_SIZE, socket_id);
1026 	if (vq == NULL) {
1027 		VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1028 		return -ENOMEM;
1029 	}
1030 
1031 	vq->mb_pool = mb_pool;
1032 	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1033 	dev->data->rx_queues[rx_queue_id] = vq;
1034 
1035 	return 0;
1036 }
1037 
1038 static int
1039 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1040 		   uint16_t nb_tx_desc __rte_unused,
1041 		   unsigned int socket_id,
1042 		   const struct rte_eth_txconf *tx_conf __rte_unused)
1043 {
1044 	struct vhost_queue *vq;
1045 
1046 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1047 			RTE_CACHE_LINE_SIZE, socket_id);
1048 	if (vq == NULL) {
1049 		VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1050 		return -ENOMEM;
1051 	}
1052 
1053 	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1054 	dev->data->tx_queues[tx_queue_id] = vq;
1055 
1056 	return 0;
1057 }
1058 
1059 static int
1060 eth_dev_info(struct rte_eth_dev *dev,
1061 	     struct rte_eth_dev_info *dev_info)
1062 {
1063 	struct pmd_internal *internal;
1064 
1065 	internal = dev->data->dev_private;
1066 	if (internal == NULL) {
1067 		VHOST_LOG(ERR, "Invalid device specified\n");
1068 		return -ENODEV;
1069 	}
1070 
1071 	dev_info->max_mac_addrs = 1;
1072 	dev_info->max_rx_pktlen = (uint32_t)-1;
1073 	dev_info->max_rx_queues = internal->max_queues;
1074 	dev_info->max_tx_queues = internal->max_queues;
1075 	dev_info->min_rx_bufsize = 0;
1076 
1077 	dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1078 				DEV_TX_OFFLOAD_VLAN_INSERT;
1079 	dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1080 
1081 	return 0;
1082 }
1083 
1084 static int
1085 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1086 {
1087 	unsigned i;
1088 	unsigned long rx_total = 0, tx_total = 0;
1089 	unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1090 	struct vhost_queue *vq;
1091 
1092 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1093 			i < dev->data->nb_rx_queues; i++) {
1094 		if (dev->data->rx_queues[i] == NULL)
1095 			continue;
1096 		vq = dev->data->rx_queues[i];
1097 		stats->q_ipackets[i] = vq->stats.pkts;
1098 		rx_total += stats->q_ipackets[i];
1099 
1100 		stats->q_ibytes[i] = vq->stats.bytes;
1101 		rx_total_bytes += stats->q_ibytes[i];
1102 	}
1103 
1104 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1105 			i < dev->data->nb_tx_queues; i++) {
1106 		if (dev->data->tx_queues[i] == NULL)
1107 			continue;
1108 		vq = dev->data->tx_queues[i];
1109 		stats->q_opackets[i] = vq->stats.pkts;
1110 		tx_total += stats->q_opackets[i];
1111 
1112 		stats->q_obytes[i] = vq->stats.bytes;
1113 		tx_total_bytes += stats->q_obytes[i];
1114 	}
1115 
1116 	stats->ipackets = rx_total;
1117 	stats->opackets = tx_total;
1118 	stats->ibytes = rx_total_bytes;
1119 	stats->obytes = tx_total_bytes;
1120 
1121 	return 0;
1122 }
1123 
1124 static int
1125 eth_stats_reset(struct rte_eth_dev *dev)
1126 {
1127 	struct vhost_queue *vq;
1128 	unsigned i;
1129 
1130 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
1131 		if (dev->data->rx_queues[i] == NULL)
1132 			continue;
1133 		vq = dev->data->rx_queues[i];
1134 		vq->stats.pkts = 0;
1135 		vq->stats.bytes = 0;
1136 	}
1137 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
1138 		if (dev->data->tx_queues[i] == NULL)
1139 			continue;
1140 		vq = dev->data->tx_queues[i];
1141 		vq->stats.pkts = 0;
1142 		vq->stats.bytes = 0;
1143 		vq->stats.missed_pkts = 0;
1144 	}
1145 
1146 	return 0;
1147 }
1148 
1149 static void
1150 eth_queue_release(void *q)
1151 {
1152 	rte_free(q);
1153 }
1154 
1155 static int
1156 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1157 {
1158 	/*
1159 	 * vhost does not hang onto mbufs: eth_vhost_tx() copies the packet
1160 	 * data and frees the mbuf, so there is nothing to clean up.
1161 	 */
1162 	return 0;
1163 }
1164 
1165 static int
1166 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1167 		int wait_to_complete __rte_unused)
1168 {
1169 	return 0;
1170 }
1171 
1172 static uint32_t
1173 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1174 {
1175 	struct vhost_queue *vq;
1176 
1177 	vq = dev->data->rx_queues[rx_queue_id];
1178 	if (vq == NULL)
1179 		return 0;
1180 
1181 	return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1182 }
1183 
1184 static const struct eth_dev_ops ops = {
1185 	.dev_start = eth_dev_start,
1186 	.dev_stop = eth_dev_stop,
1187 	.dev_close = eth_dev_close,
1188 	.dev_configure = eth_dev_configure,
1189 	.dev_infos_get = eth_dev_info,
1190 	.rx_queue_setup = eth_rx_queue_setup,
1191 	.tx_queue_setup = eth_tx_queue_setup,
1192 	.rx_queue_release = eth_queue_release,
1193 	.tx_queue_release = eth_queue_release,
1194 	.tx_done_cleanup = eth_tx_done_cleanup,
1195 	.rx_queue_count = eth_rx_queue_count,
1196 	.link_update = eth_link_update,
1197 	.stats_get = eth_stats_get,
1198 	.stats_reset = eth_stats_reset,
1199 	.xstats_reset = vhost_dev_xstats_reset,
1200 	.xstats_get = vhost_dev_xstats_get,
1201 	.xstats_get_names = vhost_dev_xstats_get_names,
1202 	.rx_queue_intr_enable = eth_rxq_intr_enable,
1203 	.rx_queue_intr_disable = eth_rxq_intr_disable,
1204 };
1205 
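/* Allocate the ethdev, MAC address and per-port vring state, register
 * the vhost driver and callbacks on iface_name and start it; returns
 * the new port id on success or -1 on failure.
 */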
1206 static int
1207 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1208 	int16_t queues, const unsigned int numa_node, uint64_t flags)
1209 {
1210 	const char *name = rte_vdev_device_name(dev);
1211 	struct rte_eth_dev_data *data;
1212 	struct pmd_internal *internal = NULL;
1213 	struct rte_eth_dev *eth_dev = NULL;
1214 	struct rte_ether_addr *eth_addr = NULL;
1215 	struct rte_vhost_vring_state *vring_state = NULL;
1216 	struct internal_list *list = NULL;
1217 
1218 	VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1219 		numa_node);
1220 
1221 	list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
1222 	if (list == NULL)
1223 		goto error;
1224 
1225 	/* reserve an ethdev entry */
1226 	eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1227 	if (eth_dev == NULL)
1228 		goto error;
1229 	data = eth_dev->data;
1230 
1231 	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1232 	if (eth_addr == NULL)
1233 		goto error;
1234 	data->mac_addrs = eth_addr;
1235 	*eth_addr = base_eth_addr;
1236 	eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1237 
1238 	vring_state = rte_zmalloc_socket(name,
1239 			sizeof(*vring_state), 0, numa_node);
1240 	if (vring_state == NULL)
1241 		goto error;
1242 
1243 	/* now put it all together
1244 	 * - store the device and interface names in internal,
1245 	 * - add this device to the internal list,
1246 	 * - and set up the per-port vring state and device data
1247 	 */
1248 	internal = eth_dev->data->dev_private;
1249 	internal->dev_name = strdup(name);
1250 	if (internal->dev_name == NULL)
1251 		goto error;
1252 	internal->iface_name = strdup(iface_name);
1253 	if (internal->iface_name == NULL)
1254 		goto error;
1255 
1256 	list->eth_dev = eth_dev;
1257 	pthread_mutex_lock(&internal_list_lock);
1258 	TAILQ_INSERT_TAIL(&internal_list, list, next);
1259 	pthread_mutex_unlock(&internal_list_lock);
1260 
1261 	rte_spinlock_init(&vring_state->lock);
1262 	vring_states[eth_dev->data->port_id] = vring_state;
1263 
1264 	data->nb_rx_queues = queues;
1265 	data->nb_tx_queues = queues;
1266 	internal->max_queues = queues;
1267 	internal->vid = -1;
1268 	data->dev_link = pmd_link;
1269 	data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;
1270 
1271 	eth_dev->dev_ops = &ops;
1272 
1273 	/* finally assign rx and tx ops */
1274 	eth_dev->rx_pkt_burst = eth_vhost_rx;
1275 	eth_dev->tx_pkt_burst = eth_vhost_tx;
1276 
1277 	if (rte_vhost_driver_register(iface_name, flags))
1278 		goto error;
1279 
1280 	if (rte_vhost_driver_callback_register(iface_name, &vhost_ops) < 0) {
1281 		VHOST_LOG(ERR, "Can't register callbacks\n");
1282 		goto error;
1283 	}
1284 
1285 	if (rte_vhost_driver_start(iface_name) < 0) {
1286 		VHOST_LOG(ERR, "Failed to start driver for %s\n",
1287 			iface_name);
1288 		goto error;
1289 	}
1290 
1291 	rte_eth_dev_probing_finish(eth_dev);
1292 	return data->port_id;
1293 
1294 error:
1295 	if (internal) {
1296 		free(internal->iface_name);
1297 		free(internal->dev_name);
1298 	}
1299 	rte_free(vring_state);
1300 	rte_eth_dev_release_port(eth_dev);
1301 	rte_free(list);
1302 
1303 	return -1;
1304 }
1305 
1306 static inline int
1307 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1308 {
1309 	const char **iface_name = extra_args;
1310 
1311 	if (value == NULL)
1312 		return -1;
1313 
1314 	*iface_name = value;
1315 
1316 	return 0;
1317 }
1318 
1319 static inline int
1320 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1321 {
1322 	uint16_t *n = extra_args;
1323 
1324 	if (value == NULL || extra_args == NULL)
1325 		return -EINVAL;
1326 
1327 	*n = (uint16_t)strtoul(value, NULL, 0);
1328 	if (*n == USHRT_MAX && errno == ERANGE)
1329 		return -1;
1330 
1331 	return 0;
1332 }
1333 
1334 static int
1335 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1336 {
1337 	struct rte_kvargs *kvlist = NULL;
1338 	int ret = 0;
1339 	char *iface_name;
1340 	uint16_t queues;
1341 	uint64_t flags = 0;
1342 	int client_mode = 0;
1343 	int dequeue_zero_copy = 0;
1344 	int iommu_support = 0;
1345 	int postcopy_support = 0;
1346 	struct rte_eth_dev *eth_dev;
1347 	const char *name = rte_vdev_device_name(dev);
1348 
1349 	VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1350 
1351 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1352 		eth_dev = rte_eth_dev_attach_secondary(name);
1353 		if (!eth_dev) {
1354 			VHOST_LOG(ERR, "Failed to probe %s\n", name);
1355 			return -1;
1356 		}
1357 		/* TODO: request info from primary to set up Rx and Tx */
1358 		eth_dev->dev_ops = &ops;
1359 		eth_dev->device = &dev->device;
1360 		rte_eth_dev_probing_finish(eth_dev);
1361 		return 0;
1362 	}
1363 
1364 	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1365 	if (kvlist == NULL)
1366 		return -1;
1367 
1368 	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1369 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1370 					 &open_iface, &iface_name);
1371 		if (ret < 0)
1372 			goto out_free;
1373 	} else {
1374 		ret = -1;
1375 		goto out_free;
1376 	}
1377 
1378 	if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1379 		ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1380 					 &open_int, &queues);
1381 		if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1382 			goto out_free;
1383 
1384 	} else
1385 		queues = 1;
1386 
1387 	if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1388 		ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1389 					 &open_int, &client_mode);
1390 		if (ret < 0)
1391 			goto out_free;
1392 
1393 		if (client_mode)
1394 			flags |= RTE_VHOST_USER_CLIENT;
1395 	}
1396 
1397 	if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1398 		ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1399 					 &open_int, &dequeue_zero_copy);
1400 		if (ret < 0)
1401 			goto out_free;
1402 
1403 		if (dequeue_zero_copy)
1404 			flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1405 	}
1406 
1407 	if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1408 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1409 					 &open_int, &iommu_support);
1410 		if (ret < 0)
1411 			goto out_free;
1412 
1413 		if (iommu_support)
1414 			flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1415 	}
1416 
1417 	if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1418 		ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1419 					 &open_int, &postcopy_support);
1420 		if (ret < 0)
1421 			goto out_free;
1422 
1423 		if (postcopy_support)
1424 			flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1425 	}
1426 
1427 	if (dev->device.numa_node == SOCKET_ID_ANY)
1428 		dev->device.numa_node = rte_socket_id();
1429 
1430 	if (eth_dev_vhost_create(dev, iface_name, queues,
1431 			dev->device.numa_node, flags) < 0)
1432 		ret = -1;
1433 out_free:
1434 	rte_kvargs_free(kvlist);
1435 	return ret;
1436 }
1437 
1438 static int
1439 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1440 {
1441 	const char *name;
1442 	struct rte_eth_dev *eth_dev = NULL;
1443 
1444 	name = rte_vdev_device_name(dev);
1445 	VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);
1446 
1447 	/* find an ethdev entry */
1448 	eth_dev = rte_eth_dev_allocated(name);
1449 	if (eth_dev == NULL)
1450 		return 0;
1451 
1452 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1453 		return rte_eth_dev_release_port(eth_dev);
1454 
1455 	eth_dev_close(eth_dev);
1456 
1457 	rte_eth_dev_release_port(eth_dev);
1458 
1459 	return 0;
1460 }
1461 
1462 static struct rte_vdev_driver pmd_vhost_drv = {
1463 	.probe = rte_pmd_vhost_probe,
1464 	.remove = rte_pmd_vhost_remove,
1465 };
1466 
1467 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1468 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1469 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1470 	"iface=<ifc> "
1471 	"queues=<int> "
1472 	"client=<0|1> "
1473 	"dequeue-zero-copy=<0|1> "
1474 	"iommu-support=<0|1> "
1475 	"postcopy-support=<0|1>");
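/*
 * Illustrative usage only (example socket path and values, not taken
 * from this file): such a port is typically created at startup with a
 * vdev argument like
 *   --vdev 'net_vhost0,iface=/tmp/vhost0.sock,queues=1,client=0'
 */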
1476 
1477 RTE_INIT(vhost_init_log)
1478 {
1479 	vhost_logtype = rte_log_register("pmd.net.vhost");
1480 	if (vhost_logtype >= 0)
1481 		rte_log_set_level(vhost_logtype, RTE_LOG_NOTICE);
1482 }
1483