1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8 #include <sys/epoll.h>
9 
10 #include <rte_mbuf.h>
11 #include <rte_ethdev_driver.h>
12 #include <rte_ethdev_vdev.h>
13 #include <rte_malloc.h>
14 #include <rte_memcpy.h>
15 #include <rte_bus_vdev.h>
16 #include <rte_kvargs.h>
17 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19 
20 #include "rte_eth_vhost.h"
21 
22 RTE_LOG_REGISTER(vhost_logtype, pmd.net.vhost, NOTICE);
23 
24 #define VHOST_LOG(level, ...) \
25 	rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
26 
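/*
 * Queue index mapping used throughout this driver: ethdev rx queue i is
 * backed by guest TX virtqueue i * VIRTIO_QNUM + VIRTIO_TXQ (i.e. 2i + 1),
 * and ethdev tx queue i by guest RX virtqueue i * VIRTIO_QNUM + VIRTIO_RXQ
 * (i.e. 2i); see eth_rx_queue_setup(), eth_tx_queue_setup() and the
 * "(qid << 1) + 1" expressions in the interrupt helpers.
 */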
27 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
28 
29 #define ETH_VHOST_IFACE_ARG		"iface"
30 #define ETH_VHOST_QUEUES_ARG		"queues"
31 #define ETH_VHOST_CLIENT_ARG		"client"
32 #define ETH_VHOST_DEQUEUE_ZERO_COPY	"dequeue-zero-copy"
33 #define ETH_VHOST_IOMMU_SUPPORT		"iommu-support"
34 #define ETH_VHOST_POSTCOPY_SUPPORT	"postcopy-support"
35 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
36 #define ETH_VHOST_LINEAR_BUF  "linear-buffer"
37 #define ETH_VHOST_EXT_BUF  "ext-buffer"
38 #define VHOST_MAX_PKT_BURST 32
39 
40 static const char *valid_arguments[] = {
41 	ETH_VHOST_IFACE_ARG,
42 	ETH_VHOST_QUEUES_ARG,
43 	ETH_VHOST_CLIENT_ARG,
44 	ETH_VHOST_DEQUEUE_ZERO_COPY,
45 	ETH_VHOST_IOMMU_SUPPORT,
46 	ETH_VHOST_POSTCOPY_SUPPORT,
47 	ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
48 	ETH_VHOST_LINEAR_BUF,
49 	ETH_VHOST_EXT_BUF,
50 	NULL
51 };
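
/*
 * Illustrative devargs only (not taken from this file): a vhost port is
 * created by passing the keys above to EAL, for example
 *
 *   --vdev 'net_vhost0,iface=/tmp/sock0,queues=2,client=1'
 *
 * where the socket path, queue count and port name are placeholders; see
 * rte_pmd_vhost_probe() for how each key is parsed.
 */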
52 
53 static struct rte_ether_addr base_eth_addr = {
54 	.addr_bytes = {
55 		0x56 /* V */,
56 		0x48 /* H */,
57 		0x4F /* O */,
58 		0x53 /* S */,
59 		0x54 /* T */,
60 		0x00
61 	}
62 };
63 
64 enum vhost_xstats_pkts {
65 	VHOST_UNDERSIZE_PKT = 0,
66 	VHOST_64_PKT,
67 	VHOST_65_TO_127_PKT,
68 	VHOST_128_TO_255_PKT,
69 	VHOST_256_TO_511_PKT,
70 	VHOST_512_TO_1023_PKT,
71 	VHOST_1024_TO_1522_PKT,
72 	VHOST_1523_TO_MAX_PKT,
73 	VHOST_BROADCAST_PKT,
74 	VHOST_MULTICAST_PKT,
75 	VHOST_UNICAST_PKT,
76 	VHOST_ERRORS_PKT,
77 	VHOST_ERRORS_FRAGMENTED,
78 	VHOST_ERRORS_JABBER,
79 	VHOST_UNKNOWN_PROTOCOL,
80 	VHOST_XSTATS_MAX,
81 };
82 
83 struct vhost_stats {
84 	uint64_t pkts;
85 	uint64_t bytes;
86 	uint64_t missed_pkts;
87 	uint64_t xstats[VHOST_XSTATS_MAX];
88 };
89 
90 struct vhost_queue {
91 	int vid;
92 	rte_atomic32_t allow_queuing;
93 	rte_atomic32_t while_queuing;
94 	struct pmd_internal *internal;
95 	struct rte_mempool *mb_pool;
96 	uint16_t port;
97 	uint16_t virtqueue_id;
98 	struct vhost_stats stats;
99 	int intr_enable;
100 	rte_spinlock_t intr_lock;
101 };
102 
103 struct pmd_internal {
104 	rte_atomic32_t dev_attached;
105 	char *iface_name;
106 	uint64_t flags;
107 	uint64_t disable_flags;
108 	uint16_t max_queues;
109 	int vid;
110 	rte_atomic32_t started;
111 	uint8_t vlan_strip;
112 };
113 
114 struct internal_list {
115 	TAILQ_ENTRY(internal_list) next;
116 	struct rte_eth_dev *eth_dev;
117 };
118 
119 TAILQ_HEAD(internal_list_head, internal_list);
120 static struct internal_list_head internal_list =
121 	TAILQ_HEAD_INITIALIZER(internal_list);
122 
123 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
124 
125 static struct rte_eth_link pmd_link = {
126 		.link_speed = 10000,
127 		.link_duplex = ETH_LINK_FULL_DUPLEX,
128 		.link_status = ETH_LINK_DOWN
129 };
130 
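/*
 * Per-port vring state tracking: cur[] holds the latest enable/disable state
 * reported by vring_state_changed(), seen[] what the application has already
 * consumed via rte_eth_vhost_get_queue_event(). Both arrays are indexed by
 * vring id, i.e. 2 * queue_id + direction.
 */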
131 struct rte_vhost_vring_state {
132 	rte_spinlock_t lock;
133 
134 	bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
135 	bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
136 	unsigned int index;
137 	unsigned int max_vring;
138 };
139 
140 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
141 
142 #define VHOST_XSTATS_NAME_SIZE 64
143 
144 struct vhost_xstats_name_off {
145 	char name[VHOST_XSTATS_NAME_SIZE];
146 	uint64_t offset;
147 };
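
/*
 * Each entry's offset points into struct vhost_queue (stats and
 * stats.xstats[]); vhost_dev_xstats_get() sums the value found at that
 * offset across all rx (or tx) queues to produce one per-port counter.
 */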
148 
149 /* [rx]_ is prepended to the name string here */
150 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
151 	{"good_packets",
152 	 offsetof(struct vhost_queue, stats.pkts)},
153 	{"total_bytes",
154 	 offsetof(struct vhost_queue, stats.bytes)},
155 	{"missed_pkts",
156 	 offsetof(struct vhost_queue, stats.missed_pkts)},
157 	{"broadcast_packets",
158 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
159 	{"multicast_packets",
160 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
161 	{"unicast_packets",
162 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
163 	{"undersize_packets",
164 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
165 	{"size_64_packets",
166 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
167 	{"size_65_to_127_packets",
168 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
169 	{"size_128_to_255_packets",
170 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
171 	{"size_256_to_511_packets",
172 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
173 	{"size_512_to_1023_packets",
174 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
175 	{"size_1024_to_1522_packets",
176 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
177 	{"size_1523_to_max_packets",
178 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
179 	{"errors_with_bad_CRC",
180 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
181 	{"fragmented_errors",
182 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
183 	{"jabber_errors",
184 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
185 	{"unknown_protos_packets",
186 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
187 };
188 
189 /* [tx]_ is prepended to the name string here */
190 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
191 	{"good_packets",
192 	 offsetof(struct vhost_queue, stats.pkts)},
193 	{"total_bytes",
194 	 offsetof(struct vhost_queue, stats.bytes)},
195 	{"missed_pkts",
196 	 offsetof(struct vhost_queue, stats.missed_pkts)},
197 	{"broadcast_packets",
198 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
199 	{"multicast_packets",
200 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
201 	{"unicast_packets",
202 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
203 	{"undersize_packets",
204 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
205 	{"size_64_packets",
206 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
207 	{"size_65_to_127_packets",
208 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
209 	{"size_128_to_255_packets",
210 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
211 	{"size_256_to_511_packets",
212 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
213 	{"size_512_to_1023_packets",
214 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
215 	{"size_1024_to_1522_packets",
216 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
217 	{"size_1523_to_max_packets",
218 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
219 	{"errors_with_bad_CRC",
220 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
221 };
222 
223 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
224 				sizeof(vhost_rxport_stat_strings[0]))
225 
226 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
227 				sizeof(vhost_txport_stat_strings[0]))
228 
229 static int
230 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
231 {
232 	struct vhost_queue *vq = NULL;
233 	unsigned int i = 0;
234 
235 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
236 		vq = dev->data->rx_queues[i];
237 		if (!vq)
238 			continue;
239 		memset(&vq->stats, 0, sizeof(vq->stats));
240 	}
241 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
242 		vq = dev->data->tx_queues[i];
243 		if (!vq)
244 			continue;
245 		memset(&vq->stats, 0, sizeof(vq->stats));
246 	}
247 
248 	return 0;
249 }
250 
251 static int
252 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
253 			   struct rte_eth_xstat_name *xstats_names,
254 			   unsigned int limit __rte_unused)
255 {
256 	unsigned int t = 0;
257 	int count = 0;
258 	int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
259 
260 	if (!xstats_names)
261 		return nstats;
262 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
263 		snprintf(xstats_names[count].name,
264 			 sizeof(xstats_names[count].name),
265 			 "rx_%s", vhost_rxport_stat_strings[t].name);
266 		count++;
267 	}
268 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
269 		snprintf(xstats_names[count].name,
270 			 sizeof(xstats_names[count].name),
271 			 "tx_%s", vhost_txport_stat_strings[t].name);
272 		count++;
273 	}
274 	return count;
275 }
276 
277 static int
278 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
279 		     unsigned int n)
280 {
281 	unsigned int i;
282 	unsigned int t;
283 	unsigned int count = 0;
284 	struct vhost_queue *vq = NULL;
285 	unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
286 
287 	if (n < nxstats)
288 		return nxstats;
289 
290 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
291 		vq = dev->data->rx_queues[i];
292 		if (!vq)
293 			continue;
294 		vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
295 				- (vq->stats.xstats[VHOST_BROADCAST_PKT]
296 				+ vq->stats.xstats[VHOST_MULTICAST_PKT]);
297 	}
298 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
299 		vq = dev->data->tx_queues[i];
300 		if (!vq)
301 			continue;
302 		vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
303 				+ vq->stats.missed_pkts
304 				- (vq->stats.xstats[VHOST_BROADCAST_PKT]
305 				+ vq->stats.xstats[VHOST_MULTICAST_PKT]);
306 	}
307 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
308 		xstats[count].value = 0;
309 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
310 			vq = dev->data->rx_queues[i];
311 			if (!vq)
312 				continue;
313 			xstats[count].value +=
314 				*(uint64_t *)(((char *)vq)
315 				+ vhost_rxport_stat_strings[t].offset);
316 		}
317 		xstats[count].id = count;
318 		count++;
319 	}
320 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
321 		xstats[count].value = 0;
322 		for (i = 0; i < dev->data->nb_tx_queues; i++) {
323 			vq = dev->data->tx_queues[i];
324 			if (!vq)
325 				continue;
326 			xstats[count].value +=
327 				*(uint64_t *)(((char *)vq)
328 				+ vhost_txport_stat_strings[t].offset);
329 		}
330 		xstats[count].id = count;
331 		count++;
332 	}
333 	return count;
334 }
335 
336 static inline void
337 vhost_count_multicast_broadcast(struct vhost_queue *vq,
338 				struct rte_mbuf *mbuf)
339 {
340 	struct rte_ether_addr *ea = NULL;
341 	struct vhost_stats *pstats = &vq->stats;
342 
343 	ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
344 	if (rte_is_multicast_ether_addr(ea)) {
345 		if (rte_is_broadcast_ether_addr(ea))
346 			pstats->xstats[VHOST_BROADCAST_PKT]++;
347 		else
348 			pstats->xstats[VHOST_MULTICAST_PKT]++;
349 	}
350 }
351 
352 static void
353 vhost_update_packet_xstats(struct vhost_queue *vq,
354 			   struct rte_mbuf **bufs,
355 			   uint16_t count)
356 {
357 	uint32_t pkt_len = 0;
358 	uint64_t i = 0;
359 	uint64_t index;
360 	struct vhost_stats *pstats = &vq->stats;
361 
362 	for (i = 0; i < count ; i++) {
363 		pkt_len = bufs[i]->pkt_len;
364 		if (pkt_len == 64) {
365 			pstats->xstats[VHOST_64_PKT]++;
366 		} else if (pkt_len > 64 && pkt_len < 1024) {
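			/*
			 * For 64 < pkt_len < 1024 the bucket index is derived
			 * from the position of the highest set bit:
			 * 32 - __builtin_clz(pkt_len) - 5. For example,
			 * pkt_len = 100 has its highest bit at 2^6, so
			 * clz = 25 and the index is 32 - 25 - 5 = 2, which is
			 * VHOST_65_TO_127_PKT.
			 */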
367 			index = (sizeof(pkt_len) * 8)
368 				- __builtin_clz(pkt_len) - 5;
369 			pstats->xstats[index]++;
370 		} else {
371 			if (pkt_len < 64)
372 				pstats->xstats[VHOST_UNDERSIZE_PKT]++;
373 			else if (pkt_len <= 1522)
374 				pstats->xstats[VHOST_1024_TO_1522_PKT]++;
375 			else if (pkt_len > 1522)
376 				pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
377 		}
378 		vhost_count_multicast_broadcast(vq, bufs[i]);
379 	}
380 }
381 
382 static uint16_t
383 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
384 {
385 	struct vhost_queue *r = q;
386 	uint16_t i, nb_rx = 0;
387 	uint16_t nb_receive = nb_bufs;
388 
389 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
390 		return 0;
391 
392 	rte_atomic32_set(&r->while_queuing, 1);
393 
394 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
395 		goto out;
396 
397 	/* Dequeue packets from guest TX queue */
398 	while (nb_receive) {
399 		uint16_t nb_pkts;
400 		uint16_t num = (uint16_t)RTE_MIN(nb_receive,
401 						 VHOST_MAX_PKT_BURST);
402 
403 		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
404 						  r->mb_pool, &bufs[nb_rx],
405 						  num);
406 
407 		nb_rx += nb_pkts;
408 		nb_receive -= nb_pkts;
409 		if (nb_pkts < num)
410 			break;
411 	}
412 
413 	r->stats.pkts += nb_rx;
414 
415 	for (i = 0; likely(i < nb_rx); i++) {
416 		bufs[i]->port = r->port;
417 		bufs[i]->vlan_tci = 0;
418 
419 		if (r->internal->vlan_strip)
420 			rte_vlan_strip(bufs[i]);
421 
422 		r->stats.bytes += bufs[i]->pkt_len;
423 	}
424 
425 	vhost_update_packet_xstats(r, bufs, nb_rx);
426 
427 out:
428 	rte_atomic32_set(&r->while_queuing, 0);
429 
430 	return nb_rx;
431 }
432 
433 static uint16_t
434 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
435 {
436 	struct vhost_queue *r = q;
437 	uint16_t i, nb_tx = 0;
438 	uint16_t nb_send = 0;
439 
440 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
441 		return 0;
442 
443 	rte_atomic32_set(&r->while_queuing, 1);
444 
445 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
446 		goto out;
447 
448 	for (i = 0; i < nb_bufs; i++) {
449 		struct rte_mbuf *m = bufs[i];
450 
451 		/* Do VLAN tag insertion */
452 		if (m->ol_flags & PKT_TX_VLAN_PKT) {
453 			int error = rte_vlan_insert(&m);
454 			if (unlikely(error)) {
455 				rte_pktmbuf_free(m);
456 				continue;
457 			}
458 		}
459 
460 		bufs[nb_send] = m;
461 		++nb_send;
462 	}
463 
464 	/* Enqueue packets to guest RX queue */
465 	while (nb_send) {
466 		uint16_t nb_pkts;
467 		uint16_t num = (uint16_t)RTE_MIN(nb_send,
468 						 VHOST_MAX_PKT_BURST);
469 
470 		nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
471 						  &bufs[nb_tx], num);
472 
473 		nb_tx += nb_pkts;
474 		nb_send -= nb_pkts;
475 		if (nb_pkts < num)
476 			break;
477 	}
478 
479 	r->stats.pkts += nb_tx;
480 	r->stats.missed_pkts += nb_bufs - nb_tx;
481 
482 	for (i = 0; likely(i < nb_tx); i++)
483 		r->stats.bytes += bufs[i]->pkt_len;
484 
485 	vhost_update_packet_xstats(r, bufs, nb_tx);
486 
487 	/* According to RFC 2863 (sections ifHCOutMulticastPkts and
488 	 * ifHCOutBroadcastPkts), the "multicast" and "broadcast" counters
489 	 * also include packets that were not transmitted successfully.
490 	 */
491 	for (i = nb_tx; i < nb_bufs; i++)
492 		vhost_count_multicast_broadcast(r, bufs[i]);
493 
494 	for (i = 0; likely(i < nb_tx); i++)
495 		rte_pktmbuf_free(bufs[i]);
496 out:
497 	rte_atomic32_set(&r->while_queuing, 0);
498 
499 	return nb_tx;
500 }
501 
502 static inline struct internal_list *
503 find_internal_resource(char *ifname)
504 {
505 	int found = 0;
506 	struct internal_list *list;
507 	struct pmd_internal *internal;
508 
509 	if (ifname == NULL)
510 		return NULL;
511 
512 	pthread_mutex_lock(&internal_list_lock);
513 
514 	TAILQ_FOREACH(list, &internal_list, next) {
515 		internal = list->eth_dev->data->dev_private;
516 		if (!strcmp(internal->iface_name, ifname)) {
517 			found = 1;
518 			break;
519 		}
520 	}
521 
522 	pthread_mutex_unlock(&internal_list_lock);
523 
524 	if (!found)
525 		return NULL;
526 
527 	return list;
528 }
529 
530 static int
531 eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
532 {
533 	struct rte_intr_handle *handle = eth_dev->intr_handle;
534 	struct rte_epoll_event rev;
535 	int epfd, ret;
536 
537 	if (!handle)
538 		return 0;
539 
540 	if (handle->efds[rxq_idx] == handle->elist[rxq_idx].fd)
541 		return 0;
542 
543 	VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
544 			rxq_idx);
545 
546 	if (handle->elist[rxq_idx].fd != -1)
547 		VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
548 				handle->elist[rxq_idx].fd);
549 
550 	/*
551 	 * First remove the stale epoll event, and then install
552 	 * the new one. This may be solved with a proper API in
553 	 * the future.
554 	 */
555 	epfd = handle->elist[rxq_idx].epfd;
556 	rev = handle->elist[rxq_idx];
557 	ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
558 			&handle->elist[rxq_idx]);
559 	if (ret) {
560 		VHOST_LOG(ERR, "Delete epoll event failed.\n");
561 		return ret;
562 	}
563 
564 	rev.fd = handle->efds[rxq_idx];
565 	handle->elist[rxq_idx] = rev;
566 	ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd,
567 			&handle->elist[rxq_idx]);
568 	if (ret) {
569 		VHOST_LOG(ERR, "Add epoll event failed.\n");
570 		return ret;
571 	}
572 
573 	return 0;
574 }
575 
576 static int
577 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
578 {
579 	struct rte_vhost_vring vring;
580 	struct vhost_queue *vq;
581 	int old_intr_enable, ret = 0;
582 
583 	vq = dev->data->rx_queues[qid];
584 	if (!vq) {
585 		VHOST_LOG(ERR, "rxq%d is not set up yet\n", qid);
586 		return -1;
587 	}
588 
589 	rte_spinlock_lock(&vq->intr_lock);
590 	old_intr_enable = vq->intr_enable;
591 	vq->intr_enable = 1;
592 	ret = eth_vhost_update_intr(dev, qid);
593 	rte_spinlock_unlock(&vq->intr_lock);
594 
595 	if (ret < 0) {
596 		VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
597 		vq->intr_enable = old_intr_enable;
598 		return ret;
599 	}
600 
601 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
602 	if (ret < 0) {
603 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
604 		return ret;
605 	}
606 	VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
607 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
608 	rte_wmb();
609 
610 	return ret;
611 }
612 
613 static int
614 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
615 {
616 	struct rte_vhost_vring vring;
617 	struct vhost_queue *vq;
618 	int ret = 0;
619 
620 	vq = dev->data->rx_queues[qid];
621 	if (!vq) {
622 		VHOST_LOG(ERR, "rxq%d is not set up yet\n", qid);
623 		return -1;
624 	}
625 
626 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
627 	if (ret < 0) {
628 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
629 		return ret;
630 	}
631 	VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
632 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
633 	rte_wmb();
634 
635 	vq->intr_enable = 0;
636 
637 	return 0;
638 }
639 
640 static void
641 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
642 {
643 	struct rte_intr_handle *intr_handle = dev->intr_handle;
644 
645 	if (intr_handle) {
646 		if (intr_handle->intr_vec)
647 			free(intr_handle->intr_vec);
648 		free(intr_handle);
649 	}
650 
651 	dev->intr_handle = NULL;
652 }
653 
654 static int
655 eth_vhost_install_intr(struct rte_eth_dev *dev)
656 {
657 	struct rte_vhost_vring vring;
658 	struct vhost_queue *vq;
659 	int nb_rxq = dev->data->nb_rx_queues;
660 	int i;
661 	int ret;
662 
663 	/* uninstall first if we are reconnecting */
664 	if (dev->intr_handle)
665 		eth_vhost_uninstall_intr(dev);
666 
667 	dev->intr_handle = malloc(sizeof(*dev->intr_handle));
668 	if (!dev->intr_handle) {
669 		VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
670 		return -ENOMEM;
671 	}
672 	memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
673 
674 	dev->intr_handle->efd_counter_size = sizeof(uint64_t);
675 
676 	dev->intr_handle->intr_vec =
677 		malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
678 
679 	if (!dev->intr_handle->intr_vec) {
680 		VHOST_LOG(ERR,
681 			"Failed to allocate memory for interrupt vector\n");
682 		free(dev->intr_handle);
683 		return -ENOMEM;
684 	}
685 
686 	VHOST_LOG(INFO, "Prepare intr vec\n");
687 	for (i = 0; i < nb_rxq; i++) {
688 		dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
689 		dev->intr_handle->efds[i] = -1;
690 		vq = dev->data->rx_queues[i];
691 		if (!vq) {
692 			VHOST_LOG(INFO, "rxq-%d not set up yet, skip!\n", i);
693 			continue;
694 		}
695 
696 		ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
697 		if (ret < 0) {
698 			VHOST_LOG(INFO,
699 				"Failed to get rxq-%d's vring, skip!\n", i);
700 			continue;
701 		}
702 
703 		if (vring.kickfd < 0) {
704 			VHOST_LOG(INFO,
705 				"rxq-%d's kickfd is invalid, skip!\n", i);
706 			continue;
707 		}
708 		dev->intr_handle->efds[i] = vring.kickfd;
709 		VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
710 	}
711 
712 	dev->intr_handle->nb_efd = nb_rxq;
713 	dev->intr_handle->max_intr = nb_rxq + 1;
714 	dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
715 
716 	return 0;
717 }
718 
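/*
 * Lock-free handshake with the rx/tx burst functions: allow_queuing is
 * updated first, then we spin until while_queuing drops to zero, i.e. until
 * no datapath thread is still inside eth_vhost_rx()/eth_vhost_tx() for that
 * queue. The burst functions re-check allow_queuing after raising
 * while_queuing, so no new access can start once this function returns.
 */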
719 static void
720 update_queuing_status(struct rte_eth_dev *dev)
721 {
722 	struct pmd_internal *internal = dev->data->dev_private;
723 	struct vhost_queue *vq;
724 	unsigned int i;
725 	int allow_queuing = 1;
726 
727 	if (!dev->data->rx_queues || !dev->data->tx_queues)
728 		return;
729 
730 	if (rte_atomic32_read(&internal->started) == 0 ||
731 	    rte_atomic32_read(&internal->dev_attached) == 0)
732 		allow_queuing = 0;
733 
734 	/* Wait until rx/tx_pkt_burst stops accessing vhost device */
735 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
736 		vq = dev->data->rx_queues[i];
737 		if (vq == NULL)
738 			continue;
739 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
740 		while (rte_atomic32_read(&vq->while_queuing))
741 			rte_pause();
742 	}
743 
744 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
745 		vq = dev->data->tx_queues[i];
746 		if (vq == NULL)
747 			continue;
748 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
749 		while (rte_atomic32_read(&vq->while_queuing))
750 			rte_pause();
751 	}
752 }
753 
754 static void
755 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
756 {
757 	struct vhost_queue *vq;
758 	int i;
759 
760 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
761 		vq = eth_dev->data->rx_queues[i];
762 		if (!vq)
763 			continue;
764 		vq->vid = internal->vid;
765 		vq->internal = internal;
766 		vq->port = eth_dev->data->port_id;
767 	}
768 	for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
769 		vq = eth_dev->data->tx_queues[i];
770 		if (!vq)
771 			continue;
772 		vq->vid = internal->vid;
773 		vq->internal = internal;
774 		vq->port = eth_dev->data->port_id;
775 	}
776 }
777 
778 static int
779 new_device(int vid)
780 {
781 	struct rte_eth_dev *eth_dev;
782 	struct internal_list *list;
783 	struct pmd_internal *internal;
784 	struct rte_eth_conf *dev_conf;
785 	unsigned i;
786 	char ifname[PATH_MAX];
787 #ifdef RTE_LIBRTE_VHOST_NUMA
788 	int newnode;
789 #endif
790 
791 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
792 	list = find_internal_resource(ifname);
793 	if (list == NULL) {
794 		VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
795 		return -1;
796 	}
797 
798 	eth_dev = list->eth_dev;
799 	internal = eth_dev->data->dev_private;
800 	dev_conf = &eth_dev->data->dev_conf;
801 
802 #ifdef RTE_LIBRTE_VHOST_NUMA
803 	newnode = rte_vhost_get_numa_node(vid);
804 	if (newnode >= 0)
805 		eth_dev->data->numa_node = newnode;
806 #endif
807 
808 	internal->vid = vid;
809 	if (rte_atomic32_read(&internal->started) == 1) {
810 		queue_setup(eth_dev, internal);
811 
812 		if (dev_conf->intr_conf.rxq) {
813 			if (eth_vhost_install_intr(eth_dev) < 0) {
814 				VHOST_LOG(ERR,
815 					"Failed to install interrupt handler.\n");
816 				return -1;
817 			}
818 		}
819 	} else {
820 		VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
821 	}
822 
823 	for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
824 		rte_vhost_enable_guest_notification(vid, i, 0);
825 
826 	rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
827 
828 	eth_dev->data->dev_link.link_status = ETH_LINK_UP;
829 
830 	rte_atomic32_set(&internal->dev_attached, 1);
831 	update_queuing_status(eth_dev);
832 
833 	VHOST_LOG(INFO, "Vhost device %d created\n", vid);
834 
835 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
836 
837 	return 0;
838 }
839 
840 static void
841 destroy_device(int vid)
842 {
843 	struct rte_eth_dev *eth_dev;
844 	struct pmd_internal *internal;
845 	struct vhost_queue *vq;
846 	struct internal_list *list;
847 	char ifname[PATH_MAX];
848 	unsigned i;
849 	struct rte_vhost_vring_state *state;
850 
851 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
852 	list = find_internal_resource(ifname);
853 	if (list == NULL) {
854 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
855 		return;
856 	}
857 	eth_dev = list->eth_dev;
858 	internal = eth_dev->data->dev_private;
859 
860 	rte_atomic32_set(&internal->dev_attached, 0);
861 	update_queuing_status(eth_dev);
862 
863 	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
864 
865 	if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
866 		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
867 			vq = eth_dev->data->rx_queues[i];
868 			if (!vq)
869 				continue;
870 			vq->vid = -1;
871 		}
872 		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
873 			vq = eth_dev->data->tx_queues[i];
874 			if (!vq)
875 				continue;
876 			vq->vid = -1;
877 		}
878 	}
879 
880 	state = vring_states[eth_dev->data->port_id];
881 	rte_spinlock_lock(&state->lock);
882 	for (i = 0; i <= state->max_vring; i++) {
883 		state->cur[i] = false;
884 		state->seen[i] = false;
885 	}
886 	state->max_vring = 0;
887 	rte_spinlock_unlock(&state->lock);
888 
889 	VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
890 	eth_vhost_uninstall_intr(eth_dev);
891 
892 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
893 }
894 
895 static int
896 vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
897 {
898 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
899 	struct pmd_internal *internal = eth_dev->data->dev_private;
900 	struct vhost_queue *vq;
901 	struct rte_vhost_vring vring;
902 	int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
903 	int ret = 0;
904 
905 	/*
906 	 * The vring kickfd may be changed after the new device notification.
907 	 * Update it when the vring state is updated.
908 	 */
909 	if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
910 	    rte_atomic32_read(&internal->dev_attached) &&
911 	    rte_atomic32_read(&internal->started) &&
912 	    dev_conf->intr_conf.rxq) {
913 		ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
914 		if (ret) {
915 			VHOST_LOG(ERR, "Failed to get vring %d information.\n",
916 					vring_id);
917 			return ret;
918 		}
919 		eth_dev->intr_handle->efds[rx_idx] = vring.kickfd;
920 
921 		vq = eth_dev->data->rx_queues[rx_idx];
922 		if (!vq) {
923 			VHOST_LOG(ERR, "rxq%d is not set up yet\n", rx_idx);
924 			return -1;
925 		}
926 
927 		rte_spinlock_lock(&vq->intr_lock);
928 		if (vq->intr_enable)
929 			ret = eth_vhost_update_intr(eth_dev, rx_idx);
930 		rte_spinlock_unlock(&vq->intr_lock);
931 	}
932 
933 	return ret;
934 }
935 
936 static int
937 vring_state_changed(int vid, uint16_t vring, int enable)
938 {
939 	struct rte_vhost_vring_state *state;
940 	struct rte_eth_dev *eth_dev;
941 	struct internal_list *list;
942 	char ifname[PATH_MAX];
943 
944 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
945 	list = find_internal_resource(ifname);
946 	if (list == NULL) {
947 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
948 		return -1;
949 	}
950 
951 	eth_dev = list->eth_dev;
952 	/* won't be NULL */
953 	state = vring_states[eth_dev->data->port_id];
954 
955 	if (enable && vring_conf_update(vid, eth_dev, vring))
956 		VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
957 			  (int)vring);
958 
959 	rte_spinlock_lock(&state->lock);
960 	if (state->cur[vring] == enable) {
961 		rte_spinlock_unlock(&state->lock);
962 		return 0;
963 	}
964 	state->cur[vring] = enable;
965 	state->max_vring = RTE_MAX(vring, state->max_vring);
966 	rte_spinlock_unlock(&state->lock);
967 
968 	VHOST_LOG(INFO, "vring%u is %s\n",
969 			vring, enable ? "enabled" : "disabled");
970 
971 	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
972 
973 	return 0;
974 }
975 
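/*
 * Callbacks invoked by the vhost-user library: new_device()/destroy_device()
 * when a frontend connects to or disconnects from the socket registered in
 * vhost_driver_setup(), and vring_state_changed() when the guest enables or
 * disables an individual virtqueue.
 */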
976 static struct vhost_device_ops vhost_ops = {
977 	.new_device          = new_device,
978 	.destroy_device      = destroy_device,
979 	.vring_state_changed = vring_state_changed,
980 };
981 
982 static int
983 vhost_driver_setup(struct rte_eth_dev *eth_dev)
984 {
985 	struct pmd_internal *internal = eth_dev->data->dev_private;
986 	struct internal_list *list = NULL;
987 	struct rte_vhost_vring_state *vring_state = NULL;
988 	unsigned int numa_node = eth_dev->device->numa_node;
989 	const char *name = eth_dev->device->name;
990 
991 	/* Don't try to setup again if it has already been done. */
992 	list = find_internal_resource(internal->iface_name);
993 	if (list)
994 		return 0;
995 
996 	list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
997 	if (list == NULL)
998 		return -1;
999 
1000 	vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
1001 					 0, numa_node);
1002 	if (vring_state == NULL)
1003 		goto free_list;
1004 
1005 	list->eth_dev = eth_dev;
1006 	pthread_mutex_lock(&internal_list_lock);
1007 	TAILQ_INSERT_TAIL(&internal_list, list, next);
1008 	pthread_mutex_unlock(&internal_list_lock);
1009 
1010 	rte_spinlock_init(&vring_state->lock);
1011 	vring_states[eth_dev->data->port_id] = vring_state;
1012 
1013 	if (rte_vhost_driver_register(internal->iface_name, internal->flags))
1014 		goto list_remove;
1015 
1016 	if (internal->disable_flags) {
1017 		if (rte_vhost_driver_disable_features(internal->iface_name,
1018 						      internal->disable_flags))
1019 			goto drv_unreg;
1020 	}
1021 
1022 	if (rte_vhost_driver_callback_register(internal->iface_name,
1023 					       &vhost_ops) < 0) {
1024 		VHOST_LOG(ERR, "Can't register callbacks\n");
1025 		goto drv_unreg;
1026 	}
1027 
1028 	if (rte_vhost_driver_start(internal->iface_name) < 0) {
1029 		VHOST_LOG(ERR, "Failed to start driver for %s\n",
1030 			  internal->iface_name);
1031 		goto drv_unreg;
1032 	}
1033 
1034 	return 0;
1035 
1036 drv_unreg:
1037 	rte_vhost_driver_unregister(internal->iface_name);
1038 list_remove:
1039 	vring_states[eth_dev->data->port_id] = NULL;
1040 	pthread_mutex_lock(&internal_list_lock);
1041 	TAILQ_REMOVE(&internal_list, list, next);
1042 	pthread_mutex_unlock(&internal_list_lock);
1043 	rte_free(vring_state);
1044 free_list:
1045 	rte_free(list);
1046 
1047 	return -1;
1048 }
1049 
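/*
 * Illustrative use only (not part of this file): an application typically
 * registers an RTE_ETH_EVENT_QUEUE_STATE callback and drains pending events
 * from it, e.g.
 *
 *	struct rte_eth_vhost_queue_event ev;
 *
 *	while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *		printf("queue %u (%s) %s\n", ev.queue_id,
 *		       ev.rx ? "rx" : "tx",
 *		       ev.enable ? "enabled" : "disabled");
 *
 * The printf and the callback registration are placeholders; only the
 * draining loop reflects how this function is meant to be polled.
 */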
1050 int
1051 rte_eth_vhost_get_queue_event(uint16_t port_id,
1052 		struct rte_eth_vhost_queue_event *event)
1053 {
1054 	struct rte_vhost_vring_state *state;
1055 	unsigned int i;
1056 	int idx;
1057 
1058 	if (port_id >= RTE_MAX_ETHPORTS) {
1059 		VHOST_LOG(ERR, "Invalid port id\n");
1060 		return -1;
1061 	}
1062 
1063 	state = vring_states[port_id];
1064 	if (!state) {
1065 		VHOST_LOG(ERR, "Unused port\n");
1066 		return -1;
1067 	}
1068 
1069 	rte_spinlock_lock(&state->lock);
1070 	for (i = 0; i <= state->max_vring; i++) {
1071 		idx = state->index++ % (state->max_vring + 1);
1072 
1073 		if (state->cur[idx] != state->seen[idx]) {
1074 			state->seen[idx] = state->cur[idx];
1075 			event->queue_id = idx / 2;
1076 			event->rx = idx & 1;
1077 			event->enable = state->cur[idx];
1078 			rte_spinlock_unlock(&state->lock);
1079 			return 0;
1080 		}
1081 	}
1082 	rte_spinlock_unlock(&state->lock);
1083 
1084 	return -1;
1085 }
1086 
1087 int
1088 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1089 {
1090 	struct internal_list *list;
1091 	struct rte_eth_dev *eth_dev;
1092 	struct vhost_queue *vq;
1093 	int vid = -1;
1094 
1095 	if (!rte_eth_dev_is_valid_port(port_id))
1096 		return -1;
1097 
1098 	pthread_mutex_lock(&internal_list_lock);
1099 
1100 	TAILQ_FOREACH(list, &internal_list, next) {
1101 		eth_dev = list->eth_dev;
1102 		if (eth_dev->data->port_id == port_id) {
1103 			vq = eth_dev->data->rx_queues[0];
1104 			if (vq) {
1105 				vid = vq->vid;
1106 			}
1107 			break;
1108 		}
1109 	}
1110 
1111 	pthread_mutex_unlock(&internal_list_lock);
1112 
1113 	return vid;
1114 }
1115 
1116 static int
1117 eth_dev_configure(struct rte_eth_dev *dev)
1118 {
1119 	struct pmd_internal *internal = dev->data->dev_private;
1120 	const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1121 
1122 	/* NOTE: the same process has to operate a vhost interface
1123 	 * from beginning to end (from eth_dev configure to eth_dev close).
1124 	 * It is user's responsibility at the moment.
1125 	 */
1126 	if (vhost_driver_setup(dev) < 0)
1127 		return -1;
1128 
1129 	internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1130 
1131 	return 0;
1132 }
1133 
1134 static int
1135 eth_dev_start(struct rte_eth_dev *eth_dev)
1136 {
1137 	struct pmd_internal *internal = eth_dev->data->dev_private;
1138 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1139 
1140 	queue_setup(eth_dev, internal);
1141 
1142 	if (rte_atomic32_read(&internal->dev_attached) == 1) {
1143 		if (dev_conf->intr_conf.rxq) {
1144 			if (eth_vhost_install_intr(eth_dev) < 0) {
1145 				VHOST_LOG(ERR,
1146 					"Failed to install interrupt handler.\n");
1147 				return -1;
1148 			}
1149 		}
1150 	}
1151 
1152 	rte_atomic32_set(&internal->started, 1);
1153 	update_queuing_status(eth_dev);
1154 
1155 	return 0;
1156 }
1157 
1158 static void
1159 eth_dev_stop(struct rte_eth_dev *dev)
1160 {
1161 	struct pmd_internal *internal = dev->data->dev_private;
1162 
1163 	rte_atomic32_set(&internal->started, 0);
1164 	update_queuing_status(dev);
1165 }
1166 
1167 static void
1168 eth_dev_close(struct rte_eth_dev *dev)
1169 {
1170 	struct pmd_internal *internal;
1171 	struct internal_list *list;
1172 	unsigned int i;
1173 
1174 	internal = dev->data->dev_private;
1175 	if (!internal)
1176 		return;
1177 
1178 	eth_dev_stop(dev);
1179 
1180 	list = find_internal_resource(internal->iface_name);
1181 	if (list) {
1182 		rte_vhost_driver_unregister(internal->iface_name);
1183 		pthread_mutex_lock(&internal_list_lock);
1184 		TAILQ_REMOVE(&internal_list, list, next);
1185 		pthread_mutex_unlock(&internal_list_lock);
1186 		rte_free(list);
1187 	}
1188 
1189 	if (dev->data->rx_queues)
1190 		for (i = 0; i < dev->data->nb_rx_queues; i++)
1191 			rte_free(dev->data->rx_queues[i]);
1192 
1193 	if (dev->data->tx_queues)
1194 		for (i = 0; i < dev->data->nb_tx_queues; i++)
1195 			rte_free(dev->data->tx_queues[i]);
1196 
1197 	rte_free(internal->iface_name);
1198 	rte_free(internal);
1199 
1200 	dev->data->dev_private = NULL;
1201 
1202 	rte_free(vring_states[dev->data->port_id]);
1203 	vring_states[dev->data->port_id] = NULL;
1204 }
1205 
1206 static int
1207 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1208 		   uint16_t nb_rx_desc __rte_unused,
1209 		   unsigned int socket_id,
1210 		   const struct rte_eth_rxconf *rx_conf __rte_unused,
1211 		   struct rte_mempool *mb_pool)
1212 {
1213 	struct vhost_queue *vq;
1214 
1215 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1216 			RTE_CACHE_LINE_SIZE, socket_id);
1217 	if (vq == NULL) {
1218 		VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1219 		return -ENOMEM;
1220 	}
1221 
1222 	vq->mb_pool = mb_pool;
1223 	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1224 	rte_spinlock_init(&vq->intr_lock);
1225 	dev->data->rx_queues[rx_queue_id] = vq;
1226 
1227 	return 0;
1228 }
1229 
1230 static int
1231 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1232 		   uint16_t nb_tx_desc __rte_unused,
1233 		   unsigned int socket_id,
1234 		   const struct rte_eth_txconf *tx_conf __rte_unused)
1235 {
1236 	struct vhost_queue *vq;
1237 
1238 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1239 			RTE_CACHE_LINE_SIZE, socket_id);
1240 	if (vq == NULL) {
1241 		VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1242 		return -ENOMEM;
1243 	}
1244 
1245 	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1246 	rte_spinlock_init(&vq->intr_lock);
1247 	dev->data->tx_queues[tx_queue_id] = vq;
1248 
1249 	return 0;
1250 }
1251 
1252 static int
1253 eth_dev_info(struct rte_eth_dev *dev,
1254 	     struct rte_eth_dev_info *dev_info)
1255 {
1256 	struct pmd_internal *internal;
1257 
1258 	internal = dev->data->dev_private;
1259 	if (internal == NULL) {
1260 		VHOST_LOG(ERR, "Invalid device specified\n");
1261 		return -ENODEV;
1262 	}
1263 
1264 	dev_info->max_mac_addrs = 1;
1265 	dev_info->max_rx_pktlen = (uint32_t)-1;
1266 	dev_info->max_rx_queues = internal->max_queues;
1267 	dev_info->max_tx_queues = internal->max_queues;
1268 	dev_info->min_rx_bufsize = 0;
1269 
1270 	dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1271 				DEV_TX_OFFLOAD_VLAN_INSERT;
1272 	dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1273 
1274 	return 0;
1275 }
1276 
1277 static int
1278 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1279 {
1280 	unsigned i;
1281 	unsigned long rx_total = 0, tx_total = 0;
1282 	unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1283 	struct vhost_queue *vq;
1284 
1285 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1286 			i < dev->data->nb_rx_queues; i++) {
1287 		if (dev->data->rx_queues[i] == NULL)
1288 			continue;
1289 		vq = dev->data->rx_queues[i];
1290 		stats->q_ipackets[i] = vq->stats.pkts;
1291 		rx_total += stats->q_ipackets[i];
1292 
1293 		stats->q_ibytes[i] = vq->stats.bytes;
1294 		rx_total_bytes += stats->q_ibytes[i];
1295 	}
1296 
1297 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1298 			i < dev->data->nb_tx_queues; i++) {
1299 		if (dev->data->tx_queues[i] == NULL)
1300 			continue;
1301 		vq = dev->data->tx_queues[i];
1302 		stats->q_opackets[i] = vq->stats.pkts;
1303 		tx_total += stats->q_opackets[i];
1304 
1305 		stats->q_obytes[i] = vq->stats.bytes;
1306 		tx_total_bytes += stats->q_obytes[i];
1307 	}
1308 
1309 	stats->ipackets = rx_total;
1310 	stats->opackets = tx_total;
1311 	stats->ibytes = rx_total_bytes;
1312 	stats->obytes = tx_total_bytes;
1313 
1314 	return 0;
1315 }
1316 
1317 static int
1318 eth_stats_reset(struct rte_eth_dev *dev)
1319 {
1320 	struct vhost_queue *vq;
1321 	unsigned i;
1322 
1323 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
1324 		if (dev->data->rx_queues[i] == NULL)
1325 			continue;
1326 		vq = dev->data->rx_queues[i];
1327 		vq->stats.pkts = 0;
1328 		vq->stats.bytes = 0;
1329 	}
1330 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
1331 		if (dev->data->tx_queues[i] == NULL)
1332 			continue;
1333 		vq = dev->data->tx_queues[i];
1334 		vq->stats.pkts = 0;
1335 		vq->stats.bytes = 0;
1336 		vq->stats.missed_pkts = 0;
1337 	}
1338 
1339 	return 0;
1340 }
1341 
1342 static void
1343 eth_queue_release(void *q)
1344 {
1345 	rte_free(q);
1346 }
1347 
1348 static int
1349 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1350 {
1351 	/*
1352 	 * vHost does not hang onto mbufs: eth_vhost_tx() copies packet data
1353 	 * and releases the mbufs, so there is nothing to clean up.
1354 	 */
1355 	return 0;
1356 }
1357 
1358 static int
1359 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1360 		int wait_to_complete __rte_unused)
1361 {
1362 	return 0;
1363 }
1364 
1365 static uint32_t
1366 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1367 {
1368 	struct vhost_queue *vq;
1369 
1370 	vq = dev->data->rx_queues[rx_queue_id];
1371 	if (vq == NULL)
1372 		return 0;
1373 
1374 	return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1375 }
1376 
1377 static const struct eth_dev_ops ops = {
1378 	.dev_start = eth_dev_start,
1379 	.dev_stop = eth_dev_stop,
1380 	.dev_close = eth_dev_close,
1381 	.dev_configure = eth_dev_configure,
1382 	.dev_infos_get = eth_dev_info,
1383 	.rx_queue_setup = eth_rx_queue_setup,
1384 	.tx_queue_setup = eth_tx_queue_setup,
1385 	.rx_queue_release = eth_queue_release,
1386 	.tx_queue_release = eth_queue_release,
1387 	.tx_done_cleanup = eth_tx_done_cleanup,
1388 	.rx_queue_count = eth_rx_queue_count,
1389 	.link_update = eth_link_update,
1390 	.stats_get = eth_stats_get,
1391 	.stats_reset = eth_stats_reset,
1392 	.xstats_reset = vhost_dev_xstats_reset,
1393 	.xstats_get = vhost_dev_xstats_get,
1394 	.xstats_get_names = vhost_dev_xstats_get_names,
1395 	.rx_queue_intr_enable = eth_rxq_intr_enable,
1396 	.rx_queue_intr_disable = eth_rxq_intr_disable,
1397 };
1398 
1399 static int
1400 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1401 	int16_t queues, const unsigned int numa_node, uint64_t flags,
1402 	uint64_t disable_flags)
1403 {
1404 	const char *name = rte_vdev_device_name(dev);
1405 	struct rte_eth_dev_data *data;
1406 	struct pmd_internal *internal = NULL;
1407 	struct rte_eth_dev *eth_dev = NULL;
1408 	struct rte_ether_addr *eth_addr = NULL;
1409 
1410 	VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1411 		numa_node);
1412 
1413 	/* reserve an ethdev entry */
1414 	eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1415 	if (eth_dev == NULL)
1416 		goto error;
1417 	data = eth_dev->data;
1418 
1419 	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1420 	if (eth_addr == NULL)
1421 		goto error;
1422 	data->mac_addrs = eth_addr;
1423 	*eth_addr = base_eth_addr;
1424 	eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1425 
1426 	/* now put it all together
1427 	 * - store queue data in internal,
1428 	 * - point eth_dev_data to internals
1429 	 * - and point eth_dev structure to new eth_dev_data structure
1430 	 */
1431 	internal = eth_dev->data->dev_private;
1432 	internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1433 						 0, numa_node);
1434 	if (internal->iface_name == NULL)
1435 		goto error;
1436 	strcpy(internal->iface_name, iface_name);
1437 
1438 	data->nb_rx_queues = queues;
1439 	data->nb_tx_queues = queues;
1440 	internal->max_queues = queues;
1441 	internal->vid = -1;
1442 	internal->flags = flags;
1443 	internal->disable_flags = disable_flags;
1444 	data->dev_link = pmd_link;
1445 	data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;
1446 	data->promiscuous = 1;
1447 	data->all_multicast = 1;
1448 
1449 	eth_dev->dev_ops = &ops;
1450 
1451 	/* finally assign rx and tx ops */
1452 	eth_dev->rx_pkt_burst = eth_vhost_rx;
1453 	eth_dev->tx_pkt_burst = eth_vhost_tx;
1454 
1455 	rte_eth_dev_probing_finish(eth_dev);
1456 	return 0;
1457 
1458 error:
1459 	if (internal)
1460 		rte_free(internal->iface_name);
1461 	rte_eth_dev_release_port(eth_dev);
1462 
1463 	return -1;
1464 }
1465 
1466 static inline int
1467 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1468 {
1469 	const char **iface_name = extra_args;
1470 
1471 	if (value == NULL)
1472 		return -1;
1473 
1474 	*iface_name = value;
1475 
1476 	return 0;
1477 }
1478 
1479 static inline int
1480 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1481 {
1482 	uint16_t *n = extra_args;
1483 
1484 	if (value == NULL || extra_args == NULL)
1485 		return -EINVAL;
1486 
1487 	*n = (uint16_t)strtoul(value, NULL, 0);
1488 	if (*n == USHRT_MAX && errno == ERANGE)
1489 		return -1;
1490 
1491 	return 0;
1492 }
1493 
1494 static int
1495 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1496 {
1497 	struct rte_kvargs *kvlist = NULL;
1498 	int ret = 0;
1499 	char *iface_name;
1500 	uint16_t queues;
1501 	uint64_t flags = 0;
1502 	uint64_t disable_flags = 0;
1503 	int client_mode = 0;
1504 	int dequeue_zero_copy = 0;
1505 	int iommu_support = 0;
1506 	int postcopy_support = 0;
1507 	int tso = 0;
1508 	int linear_buf = 0;
1509 	int ext_buf = 0;
1510 	struct rte_eth_dev *eth_dev;
1511 	const char *name = rte_vdev_device_name(dev);
1512 
1513 	VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1514 
1515 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1516 		eth_dev = rte_eth_dev_attach_secondary(name);
1517 		if (!eth_dev) {
1518 			VHOST_LOG(ERR, "Failed to probe %s\n", name);
1519 			return -1;
1520 		}
1521 		eth_dev->rx_pkt_burst = eth_vhost_rx;
1522 		eth_dev->tx_pkt_burst = eth_vhost_tx;
1523 		eth_dev->dev_ops = &ops;
1524 		if (dev->device.numa_node == SOCKET_ID_ANY)
1525 			dev->device.numa_node = rte_socket_id();
1526 		eth_dev->device = &dev->device;
1527 		rte_eth_dev_probing_finish(eth_dev);
1528 		return 0;
1529 	}
1530 
1531 	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1532 	if (kvlist == NULL)
1533 		return -1;
1534 
1535 	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1536 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1537 					 &open_iface, &iface_name);
1538 		if (ret < 0)
1539 			goto out_free;
1540 	} else {
1541 		ret = -1;
1542 		goto out_free;
1543 	}
1544 
1545 	if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1546 		ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1547 					 &open_int, &queues);
1548 		if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1549 			goto out_free;
1550 
1551 	} else
1552 		queues = 1;
1553 
1554 	if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1555 		ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1556 					 &open_int, &client_mode);
1557 		if (ret < 0)
1558 			goto out_free;
1559 
1560 		if (client_mode)
1561 			flags |= RTE_VHOST_USER_CLIENT;
1562 	}
1563 
1564 	if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1565 		ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1566 					 &open_int, &dequeue_zero_copy);
1567 		if (ret < 0)
1568 			goto out_free;
1569 
1570 		if (dequeue_zero_copy)
1571 			flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1572 	}
1573 
1574 	if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1575 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1576 					 &open_int, &iommu_support);
1577 		if (ret < 0)
1578 			goto out_free;
1579 
1580 		if (iommu_support)
1581 			flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1582 	}
1583 
1584 	if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1585 		ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1586 					 &open_int, &postcopy_support);
1587 		if (ret < 0)
1588 			goto out_free;
1589 
1590 		if (postcopy_support)
1591 			flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1592 	}
1593 
1594 	if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1595 		ret = rte_kvargs_process(kvlist,
1596 				ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1597 				&open_int, &tso);
1598 		if (ret < 0)
1599 			goto out_free;
1600 
1601 		if (tso == 0) {
1602 			disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1603 			disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1604 		}
1605 	}
1606 
1607 	if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1608 		ret = rte_kvargs_process(kvlist,
1609 				ETH_VHOST_LINEAR_BUF,
1610 				&open_int, &linear_buf);
1611 		if (ret < 0)
1612 			goto out_free;
1613 
1614 		if (linear_buf == 1)
1615 			flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1616 	}
1617 
1618 	if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1619 		ret = rte_kvargs_process(kvlist,
1620 				ETH_VHOST_EXT_BUF,
1621 				&open_int, &ext_buf);
1622 		if (ret < 0)
1623 			goto out_free;
1624 
1625 		if (ext_buf == 1)
1626 			flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1627 	}
1628 
1629 	if (dev->device.numa_node == SOCKET_ID_ANY)
1630 		dev->device.numa_node = rte_socket_id();
1631 
1632 	ret = eth_dev_vhost_create(dev, iface_name, queues,
1633 				   dev->device.numa_node, flags, disable_flags);
1634 	if (ret == -1)
1635 		VHOST_LOG(ERR, "Failed to create %s\n", name);
1636 
1637 out_free:
1638 	rte_kvargs_free(kvlist);
1639 	return ret;
1640 }
1641 
1642 static int
1643 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1644 {
1645 	const char *name;
1646 	struct rte_eth_dev *eth_dev = NULL;
1647 
1648 	name = rte_vdev_device_name(dev);
1649 	VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);
1650 
1651 	/* find an ethdev entry */
1652 	eth_dev = rte_eth_dev_allocated(name);
1653 	if (eth_dev == NULL)
1654 		return 0;
1655 
1656 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1657 		return rte_eth_dev_release_port(eth_dev);
1658 
1659 	eth_dev_close(eth_dev);
1660 
1661 	rte_eth_dev_release_port(eth_dev);
1662 
1663 	return 0;
1664 }
1665 
1666 static struct rte_vdev_driver pmd_vhost_drv = {
1667 	.probe = rte_pmd_vhost_probe,
1668 	.remove = rte_pmd_vhost_remove,
1669 };
1670 
1671 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1672 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1673 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1674 	"iface=<ifc> "
1675 	"queues=<int> "
1676 	"client=<0|1> "
1677 	"dequeue-zero-copy=<0|1> "
1678 	"iommu-support=<0|1> "
1679 	"postcopy-support=<0|1> "
1680 	"tso=<0|1> "
1681 	"linear-buffer=<0|1> "
1682 	"ext-buffer=<0|1>");
1683