xref: /dpdk/drivers/net/vhost/rte_eth_vhost.c (revision 9e991f217fc8719e38a812dc280dba5f84db9f59)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8 
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_memcpy.h>
14 #include <rte_bus_vdev.h>
15 #include <rte_kvargs.h>
16 #include <rte_vhost.h>
17 #include <rte_spinlock.h>
18 
19 #include "rte_eth_vhost.h"
20 
21 static int vhost_logtype;
22 
23 #define VHOST_LOG(level, ...) \
24 	rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
25 
26 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
27 
28 #define ETH_VHOST_IFACE_ARG		"iface"
29 #define ETH_VHOST_QUEUES_ARG		"queues"
30 #define ETH_VHOST_CLIENT_ARG		"client"
31 #define ETH_VHOST_DEQUEUE_ZERO_COPY	"dequeue-zero-copy"
32 #define ETH_VHOST_IOMMU_SUPPORT		"iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT	"postcopy-support"
34 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
35 #define VHOST_MAX_PKT_BURST 32
36 
37 static const char *valid_arguments[] = {
38 	ETH_VHOST_IFACE_ARG,
39 	ETH_VHOST_QUEUES_ARG,
40 	ETH_VHOST_CLIENT_ARG,
41 	ETH_VHOST_DEQUEUE_ZERO_COPY,
42 	ETH_VHOST_IOMMU_SUPPORT,
43 	ETH_VHOST_POSTCOPY_SUPPORT,
44 	ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
45 	NULL
46 };
47 
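/*
 * Illustrative example (not part of this file): a vhost port is normally
 * created through EAL vdev arguments built from the keys above, e.g.
 *
 *   --vdev 'net_vhost0,iface=/tmp/sock0,queues=1,client=0'
 *
 * The device name and socket path here are placeholders; see the
 * RTE_PMD_REGISTER_PARAM_STRING() at the end of this file for the full
 * list of accepted parameters.
 */
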
48 static struct rte_ether_addr base_eth_addr = {
49 	.addr_bytes = {
50 		0x56 /* V */,
51 		0x48 /* H */,
52 		0x4F /* O */,
53 		0x53 /* S */,
54 		0x54 /* T */,
55 		0x00
56 	}
57 };
58 
59 enum vhost_xstats_pkts {
60 	VHOST_UNDERSIZE_PKT = 0,
61 	VHOST_64_PKT,
62 	VHOST_65_TO_127_PKT,
63 	VHOST_128_TO_255_PKT,
64 	VHOST_256_TO_511_PKT,
65 	VHOST_512_TO_1023_PKT,
66 	VHOST_1024_TO_1522_PKT,
67 	VHOST_1523_TO_MAX_PKT,
68 	VHOST_BROADCAST_PKT,
69 	VHOST_MULTICAST_PKT,
70 	VHOST_UNICAST_PKT,
71 	VHOST_ERRORS_PKT,
72 	VHOST_ERRORS_FRAGMENTED,
73 	VHOST_ERRORS_JABBER,
74 	VHOST_UNKNOWN_PROTOCOL,
75 	VHOST_XSTATS_MAX,
76 };
77 
78 struct vhost_stats {
79 	uint64_t pkts;
80 	uint64_t bytes;
81 	uint64_t missed_pkts;
82 	uint64_t xstats[VHOST_XSTATS_MAX];
83 };
84 
85 struct vhost_queue {
86 	int vid;
87 	rte_atomic32_t allow_queuing;
88 	rte_atomic32_t while_queuing;
89 	struct pmd_internal *internal;
90 	struct rte_mempool *mb_pool;
91 	uint16_t port;
92 	uint16_t virtqueue_id;
93 	struct vhost_stats stats;
94 };
95 
96 struct pmd_internal {
97 	rte_atomic32_t dev_attached;
98 	char *iface_name;
99 	uint64_t flags;
100 	uint64_t disable_flags;
101 	uint16_t max_queues;
102 	int vid;
103 	rte_atomic32_t started;
104 	uint8_t vlan_strip;
105 };
106 
107 struct internal_list {
108 	TAILQ_ENTRY(internal_list) next;
109 	struct rte_eth_dev *eth_dev;
110 };
111 
112 TAILQ_HEAD(internal_list_head, internal_list);
113 static struct internal_list_head internal_list =
114 	TAILQ_HEAD_INITIALIZER(internal_list);
115 
116 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
117 
118 static struct rte_eth_link pmd_link = {
119 		.link_speed = 10000,
120 		.link_duplex = ETH_LINK_FULL_DUPLEX,
121 		.link_status = ETH_LINK_DOWN
122 };
123 
124 struct rte_vhost_vring_state {
125 	rte_spinlock_t lock;
126 
127 	bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
128 	bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
129 	unsigned int index;
130 	unsigned int max_vring;
131 };
132 
133 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
134 
135 #define VHOST_XSTATS_NAME_SIZE 64
136 
137 struct vhost_xstats_name_off {
138 	char name[VHOST_XSTATS_NAME_SIZE];
139 	uint64_t offset;
140 };
141 
142 /* [rx]_ is prepended to the name string here */
143 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
144 	{"good_packets",
145 	 offsetof(struct vhost_queue, stats.pkts)},
146 	{"total_bytes",
147 	 offsetof(struct vhost_queue, stats.bytes)},
148 	{"missed_pkts",
149 	 offsetof(struct vhost_queue, stats.missed_pkts)},
150 	{"broadcast_packets",
151 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
152 	{"multicast_packets",
153 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
154 	{"unicast_packets",
155 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
156 	{"undersize_packets",
157 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
158 	{"size_64_packets",
159 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
160 	{"size_65_to_127_packets",
161 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
162 	{"size_128_to_255_packets",
163 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
164 	{"size_256_to_511_packets",
165 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
166 	{"size_512_to_1023_packets",
167 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
168 	{"size_1024_to_1522_packets",
169 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
170 	{"size_1523_to_max_packets",
171 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
172 	{"errors_with_bad_CRC",
173 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
174 	{"fragmented_errors",
175 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
176 	{"jabber_errors",
177 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
178 	{"unknown_protos_packets",
179 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
180 };
181 
182 /* [tx]_ is prepended to the name string here */
183 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
184 	{"good_packets",
185 	 offsetof(struct vhost_queue, stats.pkts)},
186 	{"total_bytes",
187 	 offsetof(struct vhost_queue, stats.bytes)},
188 	{"missed_pkts",
189 	 offsetof(struct vhost_queue, stats.missed_pkts)},
190 	{"broadcast_packets",
191 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
192 	{"multicast_packets",
193 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
194 	{"unicast_packets",
195 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
196 	{"undersize_packets",
197 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
198 	{"size_64_packets",
199 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
200 	{"size_65_to_127_packets",
201 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
202 	{"size_128_to_255_packets",
203 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
204 	{"size_256_to_511_packets",
205 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
206 	{"size_512_to_1023_packets",
207 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
208 	{"size_1024_to_1522_packets",
209 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
210 	{"size_1523_to_max_packets",
211 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
212 	{"errors_with_bad_CRC",
213 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
214 };
215 
216 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
217 				sizeof(vhost_rxport_stat_strings[0]))
218 
219 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
220 				sizeof(vhost_txport_stat_strings[0]))
221 
222 static int
223 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
224 {
225 	struct vhost_queue *vq = NULL;
226 	unsigned int i = 0;
227 
228 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
229 		vq = dev->data->rx_queues[i];
230 		if (!vq)
231 			continue;
232 		memset(&vq->stats, 0, sizeof(vq->stats));
233 	}
234 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
235 		vq = dev->data->tx_queues[i];
236 		if (!vq)
237 			continue;
238 		memset(&vq->stats, 0, sizeof(vq->stats));
239 	}
240 
241 	return 0;
242 }
243 
244 static int
245 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
246 			   struct rte_eth_xstat_name *xstats_names,
247 			   unsigned int limit __rte_unused)
248 {
249 	unsigned int t = 0;
250 	int count = 0;
251 	int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
252 
253 	if (!xstats_names)
254 		return nstats;
255 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
256 		snprintf(xstats_names[count].name,
257 			 sizeof(xstats_names[count].name),
258 			 "rx_%s", vhost_rxport_stat_strings[t].name);
259 		count++;
260 	}
261 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
262 		snprintf(xstats_names[count].name,
263 			 sizeof(xstats_names[count].name),
264 			 "tx_%s", vhost_txport_stat_strings[t].name);
265 		count++;
266 	}
267 	return count;
268 }
269 
270 static int
271 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
272 		     unsigned int n)
273 {
274 	unsigned int i;
275 	unsigned int t;
276 	unsigned int count = 0;
277 	struct vhost_queue *vq = NULL;
278 	unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
279 
280 	if (n < nxstats)
281 		return nxstats;
282 
283 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
284 		vq = dev->data->rx_queues[i];
285 		if (!vq)
286 			continue;
287 		vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
288 				- (vq->stats.xstats[VHOST_BROADCAST_PKT]
289 				+ vq->stats.xstats[VHOST_MULTICAST_PKT]);
290 	}
291 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
292 		vq = dev->data->tx_queues[i];
293 		if (!vq)
294 			continue;
295 		vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
296 				+ vq->stats.missed_pkts
297 				- (vq->stats.xstats[VHOST_BROADCAST_PKT]
298 				+ vq->stats.xstats[VHOST_MULTICAST_PKT]);
299 	}
300 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
301 		xstats[count].value = 0;
302 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
303 			vq = dev->data->rx_queues[i];
304 			if (!vq)
305 				continue;
306 			xstats[count].value +=
307 				*(uint64_t *)(((char *)vq)
308 				+ vhost_rxport_stat_strings[t].offset);
309 		}
310 		xstats[count].id = count;
311 		count++;
312 	}
313 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
314 		xstats[count].value = 0;
315 		for (i = 0; i < dev->data->nb_tx_queues; i++) {
316 			vq = dev->data->tx_queues[i];
317 			if (!vq)
318 				continue;
319 			xstats[count].value +=
320 				*(uint64_t *)(((char *)vq)
321 				+ vhost_txport_stat_strings[t].offset);
322 		}
323 		xstats[count].id = count;
324 		count++;
325 	}
326 	return count;
327 }
328 
329 static inline void
330 vhost_count_multicast_broadcast(struct vhost_queue *vq,
331 				struct rte_mbuf *mbuf)
332 {
333 	struct rte_ether_addr *ea = NULL;
334 	struct vhost_stats *pstats = &vq->stats;
335 
336 	ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
337 	if (rte_is_multicast_ether_addr(ea)) {
338 		if (rte_is_broadcast_ether_addr(ea))
339 			pstats->xstats[VHOST_BROADCAST_PKT]++;
340 		else
341 			pstats->xstats[VHOST_MULTICAST_PKT]++;
342 	}
343 }
344 
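/*
 * Update the per-queue size-bucket and multicast/broadcast counters for a
 * burst of packets.  For 64 < pkt_len < 1024 the bucket index is derived
 * from the position of the most significant bit: (32 - clz(pkt_len) - 5)
 * maps 65..127 to VHOST_65_TO_127_PKT, 128..255 to VHOST_128_TO_255_PKT,
 * 256..511 to VHOST_256_TO_511_PKT and 512..1023 to VHOST_512_TO_1023_PKT.
 */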
345 static void
346 vhost_update_packet_xstats(struct vhost_queue *vq,
347 			   struct rte_mbuf **bufs,
348 			   uint16_t count)
349 {
350 	uint32_t pkt_len = 0;
351 	uint64_t i = 0;
352 	uint64_t index;
353 	struct vhost_stats *pstats = &vq->stats;
354 
355 	for (i = 0; i < count ; i++) {
356 		pkt_len = bufs[i]->pkt_len;
357 		if (pkt_len == 64) {
358 			pstats->xstats[VHOST_64_PKT]++;
359 		} else if (pkt_len > 64 && pkt_len < 1024) {
360 			index = (sizeof(pkt_len) * 8)
361 				- __builtin_clz(pkt_len) - 5;
362 			pstats->xstats[index]++;
363 		} else {
364 			if (pkt_len < 64)
365 				pstats->xstats[VHOST_UNDERSIZE_PKT]++;
366 			else if (pkt_len <= 1522)
367 				pstats->xstats[VHOST_1024_TO_1522_PKT]++;
368 			else if (pkt_len > 1522)
369 				pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
370 		}
371 		vhost_count_multicast_broadcast(vq, bufs[i]);
372 	}
373 }
374 
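/*
 * RX burst handler: dequeue up to nb_bufs packets from the guest TX
 * virtqueue in chunks of VHOST_MAX_PKT_BURST.  The allow_queuing /
 * while_queuing flags fence this path against device start/stop and
 * vhost connect/disconnect (see update_queuing_status()).
 */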
375 static uint16_t
376 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
377 {
378 	struct vhost_queue *r = q;
379 	uint16_t i, nb_rx = 0;
380 	uint16_t nb_receive = nb_bufs;
381 
382 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
383 		return 0;
384 
385 	rte_atomic32_set(&r->while_queuing, 1);
386 
387 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
388 		goto out;
389 
390 	/* Dequeue packets from guest TX queue */
391 	while (nb_receive) {
392 		uint16_t nb_pkts;
393 		uint16_t num = (uint16_t)RTE_MIN(nb_receive,
394 						 VHOST_MAX_PKT_BURST);
395 
396 		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
397 						  r->mb_pool, &bufs[nb_rx],
398 						  num);
399 
400 		nb_rx += nb_pkts;
401 		nb_receive -= nb_pkts;
402 		if (nb_pkts < num)
403 			break;
404 	}
405 
406 	r->stats.pkts += nb_rx;
407 
408 	for (i = 0; likely(i < nb_rx); i++) {
409 		bufs[i]->port = r->port;
410 		bufs[i]->vlan_tci = 0;
411 
412 		if (r->internal->vlan_strip)
413 			rte_vlan_strip(bufs[i]);
414 
415 		r->stats.bytes += bufs[i]->pkt_len;
416 	}
417 
418 	vhost_update_packet_xstats(r, bufs, nb_rx);
419 
420 out:
421 	rte_atomic32_set(&r->while_queuing, 0);
422 
423 	return nb_rx;
424 }
425 
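/*
 * TX burst handler: optionally insert a VLAN tag, then enqueue the burst
 * to the guest RX virtqueue in chunks of VHOST_MAX_PKT_BURST.  The vhost
 * library copies the packet data, so successfully enqueued mbufs are
 * freed here; packets that could not be enqueued are counted as missed
 * and ownership stays with the caller.
 */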
426 static uint16_t
427 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
428 {
429 	struct vhost_queue *r = q;
430 	uint16_t i, nb_tx = 0;
431 	uint16_t nb_send = 0;
432 
433 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
434 		return 0;
435 
436 	rte_atomic32_set(&r->while_queuing, 1);
437 
438 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
439 		goto out;
440 
441 	for (i = 0; i < nb_bufs; i++) {
442 		struct rte_mbuf *m = bufs[i];
443 
444 		/* Do VLAN tag insertion */
445 		if (m->ol_flags & PKT_TX_VLAN_PKT) {
446 			int error = rte_vlan_insert(&m);
447 			if (unlikely(error)) {
448 				rte_pktmbuf_free(m);
449 				continue;
450 			}
451 		}
452 
453 		bufs[nb_send] = m;
454 		++nb_send;
455 	}
456 
457 	/* Enqueue packets to guest RX queue */
458 	while (nb_send) {
459 		uint16_t nb_pkts;
460 		uint16_t num = (uint16_t)RTE_MIN(nb_send,
461 						 VHOST_MAX_PKT_BURST);
462 
463 		nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
464 						  &bufs[nb_tx], num);
465 
466 		nb_tx += nb_pkts;
467 		nb_send -= nb_pkts;
468 		if (nb_pkts < num)
469 			break;
470 	}
471 
472 	r->stats.pkts += nb_tx;
473 	r->stats.missed_pkts += nb_bufs - nb_tx;
474 
475 	for (i = 0; likely(i < nb_tx); i++)
476 		r->stats.bytes += bufs[i]->pkt_len;
477 
478 	vhost_update_packet_xstats(r, bufs, nb_tx);
479 
480 	/* According to RFC 2863 (ifHCOutMulticastPkts and ifHCOutBroadcastPkts),
481 	 * the "multicast" and "broadcast" counters also include packets that
482 	 * were not transmitted successfully, so count those here as well.
483 	 */
484 	for (i = nb_tx; i < nb_bufs; i++)
485 		vhost_count_multicast_broadcast(r, bufs[i]);
486 
487 	for (i = 0; likely(i < nb_tx); i++)
488 		rte_pktmbuf_free(bufs[i]);
489 out:
490 	rte_atomic32_set(&r->while_queuing, 0);
491 
492 	return nb_tx;
493 }
494 
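/*
 * Look up the internal_list entry whose vhost interface name matches
 * ifname.  Used by the vhost callbacks, which only know the socket path,
 * to map a connection back to the owning ethdev.
 */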
495 static inline struct internal_list *
496 find_internal_resource(char *ifname)
497 {
498 	int found = 0;
499 	struct internal_list *list;
500 	struct pmd_internal *internal;
501 
502 	if (ifname == NULL)
503 		return NULL;
504 
505 	pthread_mutex_lock(&internal_list_lock);
506 
507 	TAILQ_FOREACH(list, &internal_list, next) {
508 		internal = list->eth_dev->data->dev_private;
509 		if (!strcmp(internal->iface_name, ifname)) {
510 			found = 1;
511 			break;
512 		}
513 	}
514 
515 	pthread_mutex_unlock(&internal_list_lock);
516 
517 	if (!found)
518 		return NULL;
519 
520 	return list;
521 }
522 
523 static int
524 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
525 {
526 	struct rte_vhost_vring vring;
527 	struct vhost_queue *vq;
528 	int ret = 0;
529 
530 	vq = dev->data->rx_queues[qid];
531 	if (!vq) {
532 		VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
533 		return -1;
534 	}
535 
536 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
537 	if (ret < 0) {
538 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
539 		return ret;
540 	}
541 	VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
542 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
543 	rte_wmb();
544 
545 	return ret;
546 }
547 
548 static int
549 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
550 {
551 	struct rte_vhost_vring vring;
552 	struct vhost_queue *vq;
553 	int ret = 0;
554 
555 	vq = dev->data->rx_queues[qid];
556 	if (!vq) {
557 		VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
558 		return -1;
559 	}
560 
561 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
562 	if (ret < 0) {
563 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
564 		return ret;
565 	}
566 	VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
567 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
568 	rte_wmb();
569 
570 	return 0;
571 }
572 
573 static void
574 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
575 {
576 	struct rte_intr_handle *intr_handle = dev->intr_handle;
577 
578 	if (intr_handle) {
579 		if (intr_handle->intr_vec)
580 			free(intr_handle->intr_vec);
581 		free(intr_handle);
582 	}
583 
584 	dev->intr_handle = NULL;
585 }
586 
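/*
 * Build an RTE_INTR_HANDLE_VDEV interrupt handle for Rx interrupt mode:
 * one efd per Rx queue, taken from the kickfd of the corresponding guest
 * TX vring.  Queues whose vring or kickfd is not available yet are
 * skipped.
 */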
587 static int
588 eth_vhost_install_intr(struct rte_eth_dev *dev)
589 {
590 	struct rte_vhost_vring vring;
591 	struct vhost_queue *vq;
592 	int count = 0;
593 	int nb_rxq = dev->data->nb_rx_queues;
594 	int i;
595 	int ret;
596 
597 	/* uninstall first if we are reconnecting */
598 	if (dev->intr_handle)
599 		eth_vhost_uninstall_intr(dev);
600 
601 	dev->intr_handle = malloc(sizeof(*dev->intr_handle));
602 	if (!dev->intr_handle) {
603 		VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
604 		return -ENOMEM;
605 	}
606 	memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
607 
608 	dev->intr_handle->efd_counter_size = sizeof(uint64_t);
609 
610 	dev->intr_handle->intr_vec =
611 		malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
612 
613 	if (!dev->intr_handle->intr_vec) {
614 		VHOST_LOG(ERR,
615 			"Failed to allocate memory for interrupt vector\n");
616 		free(dev->intr_handle);
617 		return -ENOMEM;
618 	}
619 
620 	VHOST_LOG(INFO, "Prepare intr vec\n");
621 	for (i = 0; i < nb_rxq; i++) {
622 		vq = dev->data->rx_queues[i];
623 		if (!vq) {
624 			VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
625 			continue;
626 		}
627 
628 		ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
629 		if (ret < 0) {
630 			VHOST_LOG(INFO,
631 				"Failed to get rxq-%d's vring, skip!\n", i);
632 			continue;
633 		}
634 
635 		if (vring.kickfd < 0) {
636 			VHOST_LOG(INFO,
637 				"rxq-%d's kickfd is invalid, skip!\n", i);
638 			continue;
639 		}
640 		dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
641 		dev->intr_handle->efds[i] = vring.kickfd;
642 		count++;
643 		VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
644 	}
645 
646 	dev->intr_handle->nb_efd = count;
647 	dev->intr_handle->max_intr = count + 1;
648 	dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
649 
650 	return 0;
651 }
652 
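/*
 * Propagate the current started/dev_attached state to every queue's
 * allow_queuing flag, and spin until any rx/tx burst that is already
 * inside the datapath (while_queuing set) has drained.
 */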
653 static void
654 update_queuing_status(struct rte_eth_dev *dev)
655 {
656 	struct pmd_internal *internal = dev->data->dev_private;
657 	struct vhost_queue *vq;
658 	unsigned int i;
659 	int allow_queuing = 1;
660 
661 	if (!dev->data->rx_queues || !dev->data->tx_queues)
662 		return;
663 
664 	if (rte_atomic32_read(&internal->started) == 0 ||
665 	    rte_atomic32_read(&internal->dev_attached) == 0)
666 		allow_queuing = 0;
667 
668 	/* Wait until rx/tx_pkt_burst stops accessing vhost device */
669 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
670 		vq = dev->data->rx_queues[i];
671 		if (vq == NULL)
672 			continue;
673 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
674 		while (rte_atomic32_read(&vq->while_queuing))
675 			rte_pause();
676 	}
677 
678 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
679 		vq = dev->data->tx_queues[i];
680 		if (vq == NULL)
681 			continue;
682 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
683 		while (rte_atomic32_read(&vq->while_queuing))
684 			rte_pause();
685 	}
686 }
687 
688 static void
689 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
690 {
691 	struct vhost_queue *vq;
692 	int i;
693 
694 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
695 		vq = eth_dev->data->rx_queues[i];
696 		if (!vq)
697 			continue;
698 		vq->vid = internal->vid;
699 		vq->internal = internal;
700 		vq->port = eth_dev->data->port_id;
701 	}
702 	for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
703 		vq = eth_dev->data->tx_queues[i];
704 		if (!vq)
705 			continue;
706 		vq->vid = internal->vid;
707 		vq->internal = internal;
708 		vq->port = eth_dev->data->port_id;
709 	}
710 }
711 
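/*
 * vhost library callback: a frontend connected to the socket.  Bind the
 * new vid to the ethdev found by interface name, set up the queues,
 * optionally install the Rx interrupt handle, mark the link up and
 * notify users through an LSC event.
 */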
712 static int
713 new_device(int vid)
714 {
715 	struct rte_eth_dev *eth_dev;
716 	struct internal_list *list;
717 	struct pmd_internal *internal;
718 	struct rte_eth_conf *dev_conf;
719 	unsigned i;
720 	char ifname[PATH_MAX];
721 #ifdef RTE_LIBRTE_VHOST_NUMA
722 	int newnode;
723 #endif
724 
725 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
726 	list = find_internal_resource(ifname);
727 	if (list == NULL) {
728 		VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
729 		return -1;
730 	}
731 
732 	eth_dev = list->eth_dev;
733 	internal = eth_dev->data->dev_private;
734 	dev_conf = &eth_dev->data->dev_conf;
735 
736 #ifdef RTE_LIBRTE_VHOST_NUMA
737 	newnode = rte_vhost_get_numa_node(vid);
738 	if (newnode >= 0)
739 		eth_dev->data->numa_node = newnode;
740 #endif
741 
742 	internal->vid = vid;
743 	if (rte_atomic32_read(&internal->started) == 1) {
744 		queue_setup(eth_dev, internal);
745 
746 		if (dev_conf->intr_conf.rxq) {
747 			if (eth_vhost_install_intr(eth_dev) < 0) {
748 				VHOST_LOG(INFO,
749 					"Failed to install interrupt handler.\n");
750 				return -1;
751 			}
752 		}
753 	} else {
754 		VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
755 	}
756 
757 	for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
758 		rte_vhost_enable_guest_notification(vid, i, 0);
759 
760 	rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
761 
762 	eth_dev->data->dev_link.link_status = ETH_LINK_UP;
763 
764 	rte_atomic32_set(&internal->dev_attached, 1);
765 	update_queuing_status(eth_dev);
766 
767 	VHOST_LOG(INFO, "Vhost device %d created\n", vid);
768 
769 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
770 
771 	return 0;
772 }
773 
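/*
 * vhost library callback: the frontend disconnected.  Stop the datapath,
 * detach the vid from all queues, mark the link down, reset the vring
 * state tracking and raise an LSC event.
 */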
774 static void
775 destroy_device(int vid)
776 {
777 	struct rte_eth_dev *eth_dev;
778 	struct pmd_internal *internal;
779 	struct vhost_queue *vq;
780 	struct internal_list *list;
781 	char ifname[PATH_MAX];
782 	unsigned i;
783 	struct rte_vhost_vring_state *state;
784 
785 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
786 	list = find_internal_resource(ifname);
787 	if (list == NULL) {
788 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
789 		return;
790 	}
791 	eth_dev = list->eth_dev;
792 	internal = eth_dev->data->dev_private;
793 
794 	rte_atomic32_set(&internal->dev_attached, 0);
795 	update_queuing_status(eth_dev);
796 
797 	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
798 
799 	if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
800 		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
801 			vq = eth_dev->data->rx_queues[i];
802 			if (!vq)
803 				continue;
804 			vq->vid = -1;
805 		}
806 		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
807 			vq = eth_dev->data->tx_queues[i];
808 			if (!vq)
809 				continue;
810 			vq->vid = -1;
811 		}
812 	}
813 
814 	state = vring_states[eth_dev->data->port_id];
815 	rte_spinlock_lock(&state->lock);
816 	for (i = 0; i <= state->max_vring; i++) {
817 		state->cur[i] = false;
818 		state->seen[i] = false;
819 	}
820 	state->max_vring = 0;
821 	rte_spinlock_unlock(&state->lock);
822 
823 	VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
824 	eth_vhost_uninstall_intr(eth_dev);
825 
826 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
827 }
828 
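/*
 * vhost library callback: a single vring was enabled or disabled by the
 * frontend.  Record the new state so it can be reported through
 * rte_eth_vhost_get_queue_event() and raise a QUEUE_STATE event.
 */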
829 static int
830 vring_state_changed(int vid, uint16_t vring, int enable)
831 {
832 	struct rte_vhost_vring_state *state;
833 	struct rte_eth_dev *eth_dev;
834 	struct internal_list *list;
835 	char ifname[PATH_MAX];
836 
837 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
838 	list = find_internal_resource(ifname);
839 	if (list == NULL) {
840 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
841 		return -1;
842 	}
843 
844 	eth_dev = list->eth_dev;
845 	/* won't be NULL */
846 	state = vring_states[eth_dev->data->port_id];
847 	rte_spinlock_lock(&state->lock);
848 	if (state->cur[vring] == enable) {
849 		rte_spinlock_unlock(&state->lock);
850 		return 0;
851 	}
852 	state->cur[vring] = enable;
853 	state->max_vring = RTE_MAX(vring, state->max_vring);
854 	rte_spinlock_unlock(&state->lock);
855 
856 	VHOST_LOG(INFO, "vring%u is %s\n",
857 			vring, enable ? "enabled" : "disabled");
858 
859 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
860 
861 	return 0;
862 }
863 
864 static struct vhost_device_ops vhost_ops = {
865 	.new_device          = new_device,
866 	.destroy_device      = destroy_device,
867 	.vring_state_changed = vring_state_changed,
868 };
869 
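/*
 * One-time vhost-user backend setup for an ethdev: register the socket
 * with the requested flags, disable the requested features, hook up the
 * vhost_ops callbacks and start the vhost driver.  Safe to call again;
 * it returns early if the interface is already on the internal list.
 */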
870 static int
871 vhost_driver_setup(struct rte_eth_dev *eth_dev)
872 {
873 	struct pmd_internal *internal = eth_dev->data->dev_private;
874 	struct internal_list *list = NULL;
875 	struct rte_vhost_vring_state *vring_state = NULL;
876 	unsigned int numa_node = eth_dev->device->numa_node;
877 	const char *name = eth_dev->device->name;
878 
879 	/* Don't try to setup again if it has already been done. */
880 	list = find_internal_resource(internal->iface_name);
881 	if (list)
882 		return 0;
883 
884 	list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
885 	if (list == NULL)
886 		return -1;
887 
888 	vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
889 					 0, numa_node);
890 	if (vring_state == NULL)
891 		goto free_list;
892 
893 	list->eth_dev = eth_dev;
894 	pthread_mutex_lock(&internal_list_lock);
895 	TAILQ_INSERT_TAIL(&internal_list, list, next);
896 	pthread_mutex_unlock(&internal_list_lock);
897 
898 	rte_spinlock_init(&vring_state->lock);
899 	vring_states[eth_dev->data->port_id] = vring_state;
900 
901 	if (rte_vhost_driver_register(internal->iface_name, internal->flags))
902 		goto list_remove;
903 
904 	if (internal->disable_flags) {
905 		if (rte_vhost_driver_disable_features(internal->iface_name,
906 						      internal->disable_flags))
907 			goto drv_unreg;
908 	}
909 
910 	if (rte_vhost_driver_callback_register(internal->iface_name,
911 					       &vhost_ops) < 0) {
912 		VHOST_LOG(ERR, "Can't register callbacks\n");
913 		goto drv_unreg;
914 	}
915 
916 	if (rte_vhost_driver_start(internal->iface_name) < 0) {
917 		VHOST_LOG(ERR, "Failed to start driver for %s\n",
918 			  internal->iface_name);
919 		goto drv_unreg;
920 	}
921 
922 	return 0;
923 
924 drv_unreg:
925 	rte_vhost_driver_unregister(internal->iface_name);
926 list_remove:
927 	vring_states[eth_dev->data->port_id] = NULL;
928 	pthread_mutex_lock(&internal_list_lock);
929 	TAILQ_REMOVE(&internal_list, list, next);
930 	pthread_mutex_unlock(&internal_list_lock);
931 	rte_free(vring_state);
932 free_list:
933 	rte_free(list);
934 
935 	return -1;
936 }
937 
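/*
 * Report the next not-yet-seen vring state change on a port.  A caller
 * would typically drain all pending events after receiving a
 * RTE_ETH_EVENT_QUEUE_STATE callback; an illustrative (hypothetical)
 * sketch:
 *
 *   struct rte_eth_vhost_queue_event ev;
 *
 *   while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *       printf("%s queue %u %s\n", ev.rx ? "rx" : "tx",
 *              ev.queue_id, ev.enable ? "enabled" : "disabled");
 */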
938 int
939 rte_eth_vhost_get_queue_event(uint16_t port_id,
940 		struct rte_eth_vhost_queue_event *event)
941 {
942 	struct rte_vhost_vring_state *state;
943 	unsigned int i;
944 	int idx;
945 
946 	if (port_id >= RTE_MAX_ETHPORTS) {
947 		VHOST_LOG(ERR, "Invalid port id\n");
948 		return -1;
949 	}
950 
951 	state = vring_states[port_id];
952 	if (!state) {
953 		VHOST_LOG(ERR, "Unused port\n");
954 		return -1;
955 	}
956 
957 	rte_spinlock_lock(&state->lock);
958 	for (i = 0; i <= state->max_vring; i++) {
959 		idx = state->index++ % (state->max_vring + 1);
960 
961 		if (state->cur[idx] != state->seen[idx]) {
962 			state->seen[idx] = state->cur[idx];
963 			event->queue_id = idx / 2;
964 			event->rx = idx & 1;
965 			event->enable = state->cur[idx];
966 			rte_spinlock_unlock(&state->lock);
967 			return 0;
968 		}
969 	}
970 	rte_spinlock_unlock(&state->lock);
971 
972 	return -1;
973 }
974 
975 int
976 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
977 {
978 	struct internal_list *list;
979 	struct rte_eth_dev *eth_dev;
980 	struct vhost_queue *vq;
981 	int vid = -1;
982 
983 	if (!rte_eth_dev_is_valid_port(port_id))
984 		return -1;
985 
986 	pthread_mutex_lock(&internal_list_lock);
987 
988 	TAILQ_FOREACH(list, &internal_list, next) {
989 		eth_dev = list->eth_dev;
990 		if (eth_dev->data->port_id == port_id) {
991 			vq = eth_dev->data->rx_queues[0];
992 			if (vq) {
993 				vid = vq->vid;
994 			}
995 			break;
996 		}
997 	}
998 
999 	pthread_mutex_unlock(&internal_list_lock);
1000 
1001 	return vid;
1002 }
1003 
1004 static int
1005 eth_dev_configure(struct rte_eth_dev *dev)
1006 {
1007 	struct pmd_internal *internal = dev->data->dev_private;
1008 	const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1009 
1010 	/* NOTE: the same process has to operate a vhost interface
1011 	 * from beginning to end (from eth_dev configure to eth_dev close).
1012 	 * It is user's responsibility at the moment.
1013 	 */
1014 	if (vhost_driver_setup(dev) < 0)
1015 		return -1;
1016 
1017 	internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1018 
1019 	return 0;
1020 }
1021 
1022 static int
1023 eth_dev_start(struct rte_eth_dev *eth_dev)
1024 {
1025 	struct pmd_internal *internal = eth_dev->data->dev_private;
1026 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1027 
1028 	queue_setup(eth_dev, internal);
1029 
1030 	if (rte_atomic32_read(&internal->dev_attached) == 1) {
1031 		if (dev_conf->intr_conf.rxq) {
1032 			if (eth_vhost_install_intr(eth_dev) < 0) {
1033 				VHOST_LOG(INFO,
1034 					"Failed to install interrupt handler.\n");
1035 				return -1;
1036 			}
1037 		}
1038 	}
1039 
1040 	rte_atomic32_set(&internal->started, 1);
1041 	update_queuing_status(eth_dev);
1042 
1043 	return 0;
1044 }
1045 
1046 static void
1047 eth_dev_stop(struct rte_eth_dev *dev)
1048 {
1049 	struct pmd_internal *internal = dev->data->dev_private;
1050 
1051 	rte_atomic32_set(&internal->started, 0);
1052 	update_queuing_status(dev);
1053 }
1054 
1055 static void
1056 eth_dev_close(struct rte_eth_dev *dev)
1057 {
1058 	struct pmd_internal *internal;
1059 	struct internal_list *list;
1060 	unsigned int i;
1061 
1062 	internal = dev->data->dev_private;
1063 	if (!internal)
1064 		return;
1065 
1066 	eth_dev_stop(dev);
1067 
1068 	rte_vhost_driver_unregister(internal->iface_name);
1069 
1070 	list = find_internal_resource(internal->iface_name);
1071 	if (!list)
1072 		return;
1073 
1074 	pthread_mutex_lock(&internal_list_lock);
1075 	TAILQ_REMOVE(&internal_list, list, next);
1076 	pthread_mutex_unlock(&internal_list_lock);
1077 	rte_free(list);
1078 
1079 	if (dev->data->rx_queues)
1080 		for (i = 0; i < dev->data->nb_rx_queues; i++)
1081 			rte_free(dev->data->rx_queues[i]);
1082 
1083 	if (dev->data->tx_queues)
1084 		for (i = 0; i < dev->data->nb_tx_queues; i++)
1085 			rte_free(dev->data->tx_queues[i]);
1086 
1087 	rte_free(internal->iface_name);
1088 	rte_free(internal);
1089 
1090 	dev->data->dev_private = NULL;
1091 
1092 	rte_free(vring_states[dev->data->port_id]);
1093 	vring_states[dev->data->port_id] = NULL;
1094 }
1095 
1096 static int
1097 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1098 		   uint16_t nb_rx_desc __rte_unused,
1099 		   unsigned int socket_id,
1100 		   const struct rte_eth_rxconf *rx_conf __rte_unused,
1101 		   struct rte_mempool *mb_pool)
1102 {
1103 	struct vhost_queue *vq;
1104 
1105 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1106 			RTE_CACHE_LINE_SIZE, socket_id);
1107 	if (vq == NULL) {
1108 		VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1109 		return -ENOMEM;
1110 	}
1111 
1112 	vq->mb_pool = mb_pool;
1113 	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1114 	dev->data->rx_queues[rx_queue_id] = vq;
1115 
1116 	return 0;
1117 }
1118 
1119 static int
1120 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1121 		   uint16_t nb_tx_desc __rte_unused,
1122 		   unsigned int socket_id,
1123 		   const struct rte_eth_txconf *tx_conf __rte_unused)
1124 {
1125 	struct vhost_queue *vq;
1126 
1127 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1128 			RTE_CACHE_LINE_SIZE, socket_id);
1129 	if (vq == NULL) {
1130 		VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1131 		return -ENOMEM;
1132 	}
1133 
1134 	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1135 	dev->data->tx_queues[tx_queue_id] = vq;
1136 
1137 	return 0;
1138 }
1139 
1140 static int
1141 eth_dev_info(struct rte_eth_dev *dev,
1142 	     struct rte_eth_dev_info *dev_info)
1143 {
1144 	struct pmd_internal *internal;
1145 
1146 	internal = dev->data->dev_private;
1147 	if (internal == NULL) {
1148 		VHOST_LOG(ERR, "Invalid device specified\n");
1149 		return -ENODEV;
1150 	}
1151 
1152 	dev_info->max_mac_addrs = 1;
1153 	dev_info->max_rx_pktlen = (uint32_t)-1;
1154 	dev_info->max_rx_queues = internal->max_queues;
1155 	dev_info->max_tx_queues = internal->max_queues;
1156 	dev_info->min_rx_bufsize = 0;
1157 
1158 	dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1159 				DEV_TX_OFFLOAD_VLAN_INSERT;
1160 	dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1161 
1162 	return 0;
1163 }
1164 
1165 static int
1166 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1167 {
1168 	unsigned i;
1169 	unsigned long rx_total = 0, tx_total = 0;
1170 	unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1171 	struct vhost_queue *vq;
1172 
1173 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1174 			i < dev->data->nb_rx_queues; i++) {
1175 		if (dev->data->rx_queues[i] == NULL)
1176 			continue;
1177 		vq = dev->data->rx_queues[i];
1178 		stats->q_ipackets[i] = vq->stats.pkts;
1179 		rx_total += stats->q_ipackets[i];
1180 
1181 		stats->q_ibytes[i] = vq->stats.bytes;
1182 		rx_total_bytes += stats->q_ibytes[i];
1183 	}
1184 
1185 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1186 			i < dev->data->nb_tx_queues; i++) {
1187 		if (dev->data->tx_queues[i] == NULL)
1188 			continue;
1189 		vq = dev->data->tx_queues[i];
1190 		stats->q_opackets[i] = vq->stats.pkts;
1191 		tx_total += stats->q_opackets[i];
1192 
1193 		stats->q_obytes[i] = vq->stats.bytes;
1194 		tx_total_bytes += stats->q_obytes[i];
1195 	}
1196 
1197 	stats->ipackets = rx_total;
1198 	stats->opackets = tx_total;
1199 	stats->ibytes = rx_total_bytes;
1200 	stats->obytes = tx_total_bytes;
1201 
1202 	return 0;
1203 }
1204 
1205 static int
1206 eth_stats_reset(struct rte_eth_dev *dev)
1207 {
1208 	struct vhost_queue *vq;
1209 	unsigned i;
1210 
1211 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
1212 		if (dev->data->rx_queues[i] == NULL)
1213 			continue;
1214 		vq = dev->data->rx_queues[i];
1215 		vq->stats.pkts = 0;
1216 		vq->stats.bytes = 0;
1217 	}
1218 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
1219 		if (dev->data->tx_queues[i] == NULL)
1220 			continue;
1221 		vq = dev->data->tx_queues[i];
1222 		vq->stats.pkts = 0;
1223 		vq->stats.bytes = 0;
1224 		vq->stats.missed_pkts = 0;
1225 	}
1226 
1227 	return 0;
1228 }
1229 
1230 static void
1231 eth_queue_release(void *q)
1232 {
1233 	rte_free(q);
1234 }
1235 
1236 static int
1237 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1238 {
1239 	/*
1240 	 * vHost does not hang onto mbufs. eth_vhost_tx() copies the packet
1241 	 * data and releases the mbufs, so there is nothing to clean up.
1242 	 */
1243 	return 0;
1244 }
1245 
1246 static int
1247 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1248 		int wait_to_complete __rte_unused)
1249 {
1250 	return 0;
1251 }
1252 
1253 static uint32_t
1254 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1255 {
1256 	struct vhost_queue *vq;
1257 
1258 	vq = dev->data->rx_queues[rx_queue_id];
1259 	if (vq == NULL)
1260 		return 0;
1261 
1262 	return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1263 }
1264 
1265 static const struct eth_dev_ops ops = {
1266 	.dev_start = eth_dev_start,
1267 	.dev_stop = eth_dev_stop,
1268 	.dev_close = eth_dev_close,
1269 	.dev_configure = eth_dev_configure,
1270 	.dev_infos_get = eth_dev_info,
1271 	.rx_queue_setup = eth_rx_queue_setup,
1272 	.tx_queue_setup = eth_tx_queue_setup,
1273 	.rx_queue_release = eth_queue_release,
1274 	.tx_queue_release = eth_queue_release,
1275 	.tx_done_cleanup = eth_tx_done_cleanup,
1276 	.rx_queue_count = eth_rx_queue_count,
1277 	.link_update = eth_link_update,
1278 	.stats_get = eth_stats_get,
1279 	.stats_reset = eth_stats_reset,
1280 	.xstats_reset = vhost_dev_xstats_reset,
1281 	.xstats_get = vhost_dev_xstats_get,
1282 	.xstats_get_names = vhost_dev_xstats_get_names,
1283 	.rx_queue_intr_enable = eth_rxq_intr_enable,
1284 	.rx_queue_intr_disable = eth_rxq_intr_disable,
1285 };
1286 
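/*
 * Allocate and initialise the ethdev for a vhost vdev: reserve the port,
 * derive a MAC address from base_eth_addr ("VHOST") and the port id,
 * copy the interface name and creation flags into the private data and
 * plug in the rx/tx burst functions and dev_ops.
 */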
1287 static int
1288 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1289 	int16_t queues, const unsigned int numa_node, uint64_t flags,
1290 	uint64_t disable_flags)
1291 {
1292 	const char *name = rte_vdev_device_name(dev);
1293 	struct rte_eth_dev_data *data;
1294 	struct pmd_internal *internal = NULL;
1295 	struct rte_eth_dev *eth_dev = NULL;
1296 	struct rte_ether_addr *eth_addr = NULL;
1297 
1298 	VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1299 		numa_node);
1300 
1301 	/* reserve an ethdev entry */
1302 	eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1303 	if (eth_dev == NULL)
1304 		goto error;
1305 	data = eth_dev->data;
1306 
1307 	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1308 	if (eth_addr == NULL)
1309 		goto error;
1310 	data->mac_addrs = eth_addr;
1311 	*eth_addr = base_eth_addr;
1312 	eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1313 
1314 	/* now put it all together:
1315 	 * - store the interface name and creation flags in the private data,
1316 	 * - fill in the eth_dev_data fields (queue counts, link, dev_flags),
1317 	 * - and hook up the device ops and rx/tx burst functions below.
1318 	 */
1319 	internal = eth_dev->data->dev_private;
1320 	internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1321 						 0, numa_node);
1322 	if (internal->iface_name == NULL)
1323 		goto error;
1324 	strcpy(internal->iface_name, iface_name);
1325 
1326 	data->nb_rx_queues = queues;
1327 	data->nb_tx_queues = queues;
1328 	internal->max_queues = queues;
1329 	internal->vid = -1;
1330 	internal->flags = flags;
1331 	internal->disable_flags = disable_flags;
1332 	data->dev_link = pmd_link;
1333 	data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;
1334 
1335 	eth_dev->dev_ops = &ops;
1336 
1337 	/* finally assign rx and tx ops */
1338 	eth_dev->rx_pkt_burst = eth_vhost_rx;
1339 	eth_dev->tx_pkt_burst = eth_vhost_tx;
1340 
1341 	rte_eth_dev_probing_finish(eth_dev);
1342 	return 0;
1343 
1344 error:
1345 	if (internal)
1346 		rte_free(internal->iface_name);
1347 	rte_eth_dev_release_port(eth_dev);
1348 
1349 	return -1;
1350 }
1351 
1352 static inline int
1353 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1354 {
1355 	const char **iface_name = extra_args;
1356 
1357 	if (value == NULL)
1358 		return -1;
1359 
1360 	*iface_name = value;
1361 
1362 	return 0;
1363 }
1364 
1365 static inline int
1366 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1367 {
1368 	uint16_t *n = extra_args;
1369 
1370 	if (value == NULL || extra_args == NULL)
1371 		return -EINVAL;
1372 
1373 	*n = (uint16_t)strtoul(value, NULL, 0);
1374 	if (*n == USHRT_MAX && errno == ERANGE)
1375 		return -1;
1376 
1377 	return 0;
1378 }
1379 
1380 static int
1381 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1382 {
1383 	struct rte_kvargs *kvlist = NULL;
1384 	int ret = 0;
1385 	char *iface_name;
1386 	uint16_t queues;
1387 	uint64_t flags = 0;
1388 	uint64_t disable_flags = 0;
1389 	int client_mode = 0;
1390 	int dequeue_zero_copy = 0;
1391 	int iommu_support = 0;
1392 	int postcopy_support = 0;
1393 	int tso = 0;
1394 	struct rte_eth_dev *eth_dev;
1395 	const char *name = rte_vdev_device_name(dev);
1396 
1397 	VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1398 
1399 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1400 		eth_dev = rte_eth_dev_attach_secondary(name);
1401 		if (!eth_dev) {
1402 			VHOST_LOG(ERR, "Failed to probe %s\n", name);
1403 			return -1;
1404 		}
1405 		eth_dev->rx_pkt_burst = eth_vhost_rx;
1406 		eth_dev->tx_pkt_burst = eth_vhost_tx;
1407 		eth_dev->dev_ops = &ops;
1408 		if (dev->device.numa_node == SOCKET_ID_ANY)
1409 			dev->device.numa_node = rte_socket_id();
1410 		eth_dev->device = &dev->device;
1411 		rte_eth_dev_probing_finish(eth_dev);
1412 		return 0;
1413 	}
1414 
1415 	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1416 	if (kvlist == NULL)
1417 		return -1;
1418 
1419 	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1420 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1421 					 &open_iface, &iface_name);
1422 		if (ret < 0)
1423 			goto out_free;
1424 	} else {
1425 		ret = -1;
1426 		goto out_free;
1427 	}
1428 
1429 	if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1430 		ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1431 					 &open_int, &queues);
1432 		if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1433 			goto out_free;
1434 
1435 	} else
1436 		queues = 1;
1437 
1438 	if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1439 		ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1440 					 &open_int, &client_mode);
1441 		if (ret < 0)
1442 			goto out_free;
1443 
1444 		if (client_mode)
1445 			flags |= RTE_VHOST_USER_CLIENT;
1446 	}
1447 
1448 	if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1449 		ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1450 					 &open_int, &dequeue_zero_copy);
1451 		if (ret < 0)
1452 			goto out_free;
1453 
1454 		if (dequeue_zero_copy)
1455 			flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1456 	}
1457 
1458 	if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1459 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1460 					 &open_int, &iommu_support);
1461 		if (ret < 0)
1462 			goto out_free;
1463 
1464 		if (iommu_support)
1465 			flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1466 	}
1467 
1468 	if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1469 		ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1470 					 &open_int, &postcopy_support);
1471 		if (ret < 0)
1472 			goto out_free;
1473 
1474 		if (postcopy_support)
1475 			flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1476 	}
1477 
1478 	if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1479 		ret = rte_kvargs_process(kvlist,
1480 				ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1481 				&open_int, &tso);
1482 		if (ret < 0)
1483 			goto out_free;
1484 
1485 		if (tso == 0) {
1486 			disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1487 			disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1488 		}
1489 	}
1490 
1491 	if (dev->device.numa_node == SOCKET_ID_ANY)
1492 		dev->device.numa_node = rte_socket_id();
1493 
1494 	ret = eth_dev_vhost_create(dev, iface_name, queues,
1495 				   dev->device.numa_node, flags, disable_flags);
1496 	if (ret == -1)
1497 		VHOST_LOG(ERR, "Failed to create %s\n", name);
1498 
1499 out_free:
1500 	rte_kvargs_free(kvlist);
1501 	return ret;
1502 }
1503 
1504 static int
1505 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1506 {
1507 	const char *name;
1508 	struct rte_eth_dev *eth_dev = NULL;
1509 
1510 	name = rte_vdev_device_name(dev);
1511 	VHOST_LOG(INFO, "Un-initializing pmd_vhost for %s\n", name);
1512 
1513 	/* find an ethdev entry */
1514 	eth_dev = rte_eth_dev_allocated(name);
1515 	if (eth_dev == NULL)
1516 		return 0;
1517 
1518 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1519 		return rte_eth_dev_release_port(eth_dev);
1520 
1521 	eth_dev_close(eth_dev);
1522 
1523 	rte_eth_dev_release_port(eth_dev);
1524 
1525 	return 0;
1526 }
1527 
1528 static struct rte_vdev_driver pmd_vhost_drv = {
1529 	.probe = rte_pmd_vhost_probe,
1530 	.remove = rte_pmd_vhost_remove,
1531 };
1532 
1533 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1534 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1535 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1536 	"iface=<ifc> "
1537 	"queues=<int> "
1538 	"client=<0|1> "
1539 	"dequeue-zero-copy=<0|1> "
1540 	"iommu-support=<0|1> "
1541 	"postcopy-support=<0|1> "
1542 	"tso=<0|1>");
1543 
1544 RTE_INIT(vhost_init_log)
1545 {
1546 	vhost_logtype = rte_log_register("pmd.net.vhost");
1547 	if (vhost_logtype >= 0)
1548 		rte_log_set_level(vhost_logtype, RTE_LOG_NOTICE);
1549 }
1550