xref: /dpdk/drivers/net/vhost/rte_eth_vhost.c (revision 68a03efeed657e6e05f281479b33b51102797e15)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8 #include <sys/epoll.h>
9 
10 #include <rte_mbuf.h>
11 #include <ethdev_driver.h>
12 #include <ethdev_vdev.h>
13 #include <rte_malloc.h>
14 #include <rte_memcpy.h>
15 #include <rte_bus_vdev.h>
16 #include <rte_kvargs.h>
17 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19 
20 #include "rte_eth_vhost.h"
21 
22 RTE_LOG_REGISTER(vhost_logtype, pmd.net.vhost, NOTICE);
23 
24 #define VHOST_LOG(level, ...) \
25 	rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
26 
27 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
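/* Virtqueue indices come in pairs per queue: even = guest RX (used by our
 * TX path), odd = guest TX (used by our RX path); VIRTIO_QNUM is the pair
 * size.
 */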
28 
29 #define ETH_VHOST_IFACE_ARG		"iface"
30 #define ETH_VHOST_QUEUES_ARG		"queues"
31 #define ETH_VHOST_CLIENT_ARG		"client"
32 #define ETH_VHOST_IOMMU_SUPPORT		"iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT	"postcopy-support"
34 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
35 #define ETH_VHOST_LINEAR_BUF  "linear-buffer"
36 #define ETH_VHOST_EXT_BUF  "ext-buffer"
37 #define VHOST_MAX_PKT_BURST 32
38 
39 static const char *valid_arguments[] = {
40 	ETH_VHOST_IFACE_ARG,
41 	ETH_VHOST_QUEUES_ARG,
42 	ETH_VHOST_CLIENT_ARG,
43 	ETH_VHOST_IOMMU_SUPPORT,
44 	ETH_VHOST_POSTCOPY_SUPPORT,
45 	ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
46 	ETH_VHOST_LINEAR_BUF,
47 	ETH_VHOST_EXT_BUF,
48 	NULL
49 };
50 
51 static struct rte_ether_addr base_eth_addr = {
52 	.addr_bytes = {
53 		0x56 /* V */,
54 		0x48 /* H */,
55 		0x4F /* O */,
56 		0x53 /* S */,
57 		0x54 /* T */,
58 		0x00
59 	}
60 };
61 
62 enum vhost_xstats_pkts {
63 	VHOST_UNDERSIZE_PKT = 0,
64 	VHOST_64_PKT,
65 	VHOST_65_TO_127_PKT,
66 	VHOST_128_TO_255_PKT,
67 	VHOST_256_TO_511_PKT,
68 	VHOST_512_TO_1023_PKT,
69 	VHOST_1024_TO_1522_PKT,
70 	VHOST_1523_TO_MAX_PKT,
71 	VHOST_BROADCAST_PKT,
72 	VHOST_MULTICAST_PKT,
73 	VHOST_UNICAST_PKT,
74 	VHOST_PKT,
75 	VHOST_BYTE,
76 	VHOST_MISSED_PKT,
77 	VHOST_ERRORS_PKT,
78 	VHOST_ERRORS_FRAGMENTED,
79 	VHOST_ERRORS_JABBER,
80 	VHOST_UNKNOWN_PROTOCOL,
81 	VHOST_XSTATS_MAX,
82 };
83 
84 struct vhost_stats {
85 	uint64_t pkts;
86 	uint64_t bytes;
87 	uint64_t missed_pkts;
88 	uint64_t xstats[VHOST_XSTATS_MAX];
89 };
90 
91 struct vhost_queue {
92 	int vid;
93 	rte_atomic32_t allow_queuing;
94 	rte_atomic32_t while_queuing;
95 	struct pmd_internal *internal;
96 	struct rte_mempool *mb_pool;
97 	uint16_t port;
98 	uint16_t virtqueue_id;
99 	struct vhost_stats stats;
100 	int intr_enable;
101 	rte_spinlock_t intr_lock;
102 };
103 
104 struct pmd_internal {
105 	rte_atomic32_t dev_attached;
106 	char *iface_name;
107 	uint64_t flags;
108 	uint64_t disable_flags;
109 	uint16_t max_queues;
110 	int vid;
111 	rte_atomic32_t started;
112 	uint8_t vlan_strip;
113 };
114 
115 struct internal_list {
116 	TAILQ_ENTRY(internal_list) next;
117 	struct rte_eth_dev *eth_dev;
118 };
119 
120 TAILQ_HEAD(internal_list_head, internal_list);
121 static struct internal_list_head internal_list =
122 	TAILQ_HEAD_INITIALIZER(internal_list);
123 
124 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
125 
126 static struct rte_eth_link pmd_link = {
127 		.link_speed = 10000,
128 		.link_duplex = ETH_LINK_FULL_DUPLEX,
129 		.link_status = ETH_LINK_DOWN
130 };
131 
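/*
 * Per-port vring state: cur[] is the latest enable/disable state reported
 * by vhost, seen[] is what the application has already consumed through
 * rte_eth_vhost_get_queue_event().
 */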
132 struct rte_vhost_vring_state {
133 	rte_spinlock_t lock;
134 
135 	bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
136 	bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
137 	unsigned int index;
138 	unsigned int max_vring;
139 };
140 
141 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
142 
143 #define VHOST_XSTATS_NAME_SIZE 64
144 
145 struct vhost_xstats_name_off {
146 	char name[VHOST_XSTATS_NAME_SIZE];
147 	uint64_t offset;
148 };
149 
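/*
 * Each entry below maps an xstat name to the byte offset of its counter
 * inside struct vhost_queue; vhost_dev_xstats_get() reads it with plain
 * pointer arithmetic, e.g.:
 *   *(uint64_t *)((char *)vq + vhost_rxport_stat_strings[t].offset)
 */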
150 /* [rx]_ is prepended to the name string here */
151 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
152 	{"good_packets",
153 	 offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
154 	{"total_bytes",
155 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
156 	{"missed_pkts",
157 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
158 	{"broadcast_packets",
159 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
160 	{"multicast_packets",
161 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
162 	{"unicast_packets",
163 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
164 	{"undersize_packets",
165 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
166 	{"size_64_packets",
167 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
168 	{"size_65_to_127_packets",
169 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
170 	{"size_128_to_255_packets",
171 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
172 	{"size_256_to_511_packets",
173 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
174 	{"size_512_to_1023_packets",
175 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
176 	{"size_1024_to_1522_packets",
177 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
178 	{"size_1523_to_max_packets",
179 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
180 	{"errors_with_bad_CRC",
181 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
182 	{"fragmented_errors",
183 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
184 	{"jabber_errors",
185 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
186 	{"unknown_protos_packets",
187 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
188 };
189 
190 /* [tx]_ is prepended to the name string here */
191 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
192 	{"good_packets",
193 	 offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
194 	{"total_bytes",
195 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
196 	{"missed_pkts",
197 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
198 	{"broadcast_packets",
199 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
200 	{"multicast_packets",
201 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
202 	{"unicast_packets",
203 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
204 	{"undersize_packets",
205 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
206 	{"size_64_packets",
207 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
208 	{"size_65_to_127_packets",
209 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
210 	{"size_128_to_255_packets",
211 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
212 	{"size_256_to_511_packets",
213 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
214 	{"size_512_to_1023_packets",
215 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
216 	{"size_1024_to_1522_packets",
217 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
218 	{"size_1523_to_max_packets",
219 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
220 	{"errors_with_bad_CRC",
221 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
222 };
223 
224 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
225 				sizeof(vhost_rxport_stat_strings[0]))
226 
227 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
228 				sizeof(vhost_txport_stat_strings[0]))
229 
230 static int
231 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
232 {
233 	struct vhost_queue *vq = NULL;
234 	unsigned int i = 0;
235 
236 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
237 		vq = dev->data->rx_queues[i];
238 		if (!vq)
239 			continue;
240 		memset(&vq->stats, 0, sizeof(vq->stats));
241 	}
242 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
243 		vq = dev->data->tx_queues[i];
244 		if (!vq)
245 			continue;
246 		memset(&vq->stats, 0, sizeof(vq->stats));
247 	}
248 
249 	return 0;
250 }
251 
252 static int
253 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
254 			   struct rte_eth_xstat_name *xstats_names,
255 			   unsigned int limit __rte_unused)
256 {
257 	unsigned int t = 0;
258 	int count = 0;
259 	int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
260 
261 	if (!xstats_names)
262 		return nstats;
263 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
264 		snprintf(xstats_names[count].name,
265 			 sizeof(xstats_names[count].name),
266 			 "rx_%s", vhost_rxport_stat_strings[t].name);
267 		count++;
268 	}
269 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
270 		snprintf(xstats_names[count].name,
271 			 sizeof(xstats_names[count].name),
272 			 "tx_%s", vhost_txport_stat_strings[t].name);
273 		count++;
274 	}
275 	return count;
276 }
277 
278 static int
279 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
280 		     unsigned int n)
281 {
282 	unsigned int i;
283 	unsigned int t;
284 	unsigned int count = 0;
285 	struct vhost_queue *vq = NULL;
286 	unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
287 
288 	if (n < nxstats)
289 		return nxstats;
290 
291 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
292 		xstats[count].value = 0;
293 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
294 			vq = dev->data->rx_queues[i];
295 			if (!vq)
296 				continue;
297 			xstats[count].value +=
298 				*(uint64_t *)(((char *)vq)
299 				+ vhost_rxport_stat_strings[t].offset);
300 		}
301 		xstats[count].id = count;
302 		count++;
303 	}
304 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
305 		xstats[count].value = 0;
306 		for (i = 0; i < dev->data->nb_tx_queues; i++) {
307 			vq = dev->data->tx_queues[i];
308 			if (!vq)
309 				continue;
310 			xstats[count].value +=
311 				*(uint64_t *)(((char *)vq)
312 				+ vhost_txport_stat_strings[t].offset);
313 		}
314 		xstats[count].id = count;
315 		count++;
316 	}
317 	return count;
318 }
319 
320 static inline void
321 vhost_count_xcast_packets(struct vhost_queue *vq,
322 				struct rte_mbuf *mbuf)
323 {
324 	struct rte_ether_addr *ea = NULL;
325 	struct vhost_stats *pstats = &vq->stats;
326 
327 	ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
328 	if (rte_is_multicast_ether_addr(ea)) {
329 		if (rte_is_broadcast_ether_addr(ea))
330 			pstats->xstats[VHOST_BROADCAST_PKT]++;
331 		else
332 			pstats->xstats[VHOST_MULTICAST_PKT]++;
333 	} else {
334 		pstats->xstats[VHOST_UNICAST_PKT]++;
335 	}
336 }
337 
338 static void
339 vhost_update_packet_xstats(struct vhost_queue *vq, struct rte_mbuf **bufs,
340 			   uint16_t count, uint64_t nb_bytes,
341 			   uint64_t nb_missed)
342 {
343 	uint32_t pkt_len = 0;
344 	uint64_t i = 0;
345 	uint64_t index;
346 	struct vhost_stats *pstats = &vq->stats;
347 
348 	pstats->xstats[VHOST_BYTE] += nb_bytes;
349 	pstats->xstats[VHOST_MISSED_PKT] += nb_missed;
350 	pstats->xstats[VHOST_UNICAST_PKT] += nb_missed;
351 
352 	for (i = 0; i < count ; i++) {
353 		pstats->xstats[VHOST_PKT]++;
354 		pkt_len = bufs[i]->pkt_len;
355 		if (pkt_len == 64) {
356 			pstats->xstats[VHOST_64_PKT]++;
357 		} else if (pkt_len > 64 && pkt_len < 1024) {
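			/*
			 * 32 - __builtin_clz(pkt_len) is the bit width of
			 * pkt_len; widths 7..10 (65..1023 bytes) map to the
			 * enum values 2..5 (VHOST_65_TO_127_PKT ..
			 * VHOST_512_TO_1023_PKT), hence the "- 5".
			 */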
358 			index = (sizeof(pkt_len) * 8)
359 				- __builtin_clz(pkt_len) - 5;
360 			pstats->xstats[index]++;
361 		} else {
362 			if (pkt_len < 64)
363 				pstats->xstats[VHOST_UNDERSIZE_PKT]++;
364 			else if (pkt_len <= 1522)
365 				pstats->xstats[VHOST_1024_TO_1522_PKT]++;
366 			else if (pkt_len > 1522)
367 				pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
368 		}
369 		vhost_count_xcast_packets(vq, bufs[i]);
370 	}
371 }
372 
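/*
 * allow_queuing/while_queuing form a simple handshake with
 * update_queuing_status(): the control path clears allow_queuing and then
 * busy-waits on while_queuing, so the burst functions re-check
 * allow_queuing after raising while_queuing before touching the device.
 */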
373 static uint16_t
374 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
375 {
376 	struct vhost_queue *r = q;
377 	uint16_t i, nb_rx = 0;
378 	uint16_t nb_receive = nb_bufs;
379 	uint64_t nb_bytes = 0;
380 
381 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
382 		return 0;
383 
384 	rte_atomic32_set(&r->while_queuing, 1);
385 
386 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
387 		goto out;
388 
389 	/* Dequeue packets from guest TX queue */
390 	while (nb_receive) {
391 		uint16_t nb_pkts;
392 		uint16_t num = (uint16_t)RTE_MIN(nb_receive,
393 						 VHOST_MAX_PKT_BURST);
394 
395 		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
396 						  r->mb_pool, &bufs[nb_rx],
397 						  num);
398 
399 		nb_rx += nb_pkts;
400 		nb_receive -= nb_pkts;
401 		if (nb_pkts < num)
402 			break;
403 	}
404 
405 	r->stats.pkts += nb_rx;
406 
407 	for (i = 0; likely(i < nb_rx); i++) {
408 		bufs[i]->port = r->port;
409 		bufs[i]->vlan_tci = 0;
410 
411 		if (r->internal->vlan_strip)
412 			rte_vlan_strip(bufs[i]);
413 
414 		nb_bytes += bufs[i]->pkt_len;
415 	}
416 
417 	r->stats.bytes += nb_bytes;
418 	vhost_update_packet_xstats(r, bufs, nb_rx, nb_bytes, 0);
419 
420 out:
421 	rte_atomic32_set(&r->while_queuing, 0);
422 
423 	return nb_rx;
424 }
425 
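/*
 * TX burst: rte_vhost_enqueue_burst() copies packet data into the guest's
 * RX ring, so successfully sent mbufs are freed here; mbufs that could not
 * be enqueued stay owned by the caller (only nb_tx is returned).
 */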
426 static uint16_t
427 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
428 {
429 	struct vhost_queue *r = q;
430 	uint16_t i, nb_tx = 0;
431 	uint16_t nb_send = 0;
432 	uint64_t nb_bytes = 0;
433 	uint64_t nb_missed = 0;
434 
435 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
436 		return 0;
437 
438 	rte_atomic32_set(&r->while_queuing, 1);
439 
440 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
441 		goto out;
442 
443 	for (i = 0; i < nb_bufs; i++) {
444 		struct rte_mbuf *m = bufs[i];
445 
446 		/* Do VLAN tag insertion */
447 		if (m->ol_flags & PKT_TX_VLAN_PKT) {
448 			int error = rte_vlan_insert(&m);
449 			if (unlikely(error)) {
450 				rte_pktmbuf_free(m);
451 				continue;
452 			}
453 		}
454 
455 		bufs[nb_send] = m;
456 		++nb_send;
457 	}
458 
459 	/* Enqueue packets to guest RX queue */
460 	while (nb_send) {
461 		uint16_t nb_pkts;
462 		uint16_t num = (uint16_t)RTE_MIN(nb_send,
463 						 VHOST_MAX_PKT_BURST);
464 
465 		nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
466 						  &bufs[nb_tx], num);
467 
468 		nb_tx += nb_pkts;
469 		nb_send -= nb_pkts;
470 		if (nb_pkts < num)
471 			break;
472 	}
473 
474 	for (i = 0; likely(i < nb_tx); i++)
475 		nb_bytes += bufs[i]->pkt_len;
476 
477 	nb_missed = nb_bufs - nb_tx;
478 
479 	r->stats.pkts += nb_tx;
480 	r->stats.bytes += nb_bytes;
481 	r->stats.missed_pkts += nb_missed;
482 
483 	vhost_update_packet_xstats(r, bufs, nb_tx, nb_bytes, nb_missed);
484 
485 	/* According to RFC2863, ifHCOutUcastPkts, ifHCOutMulticastPkts and
486 	 * ifHCOutBroadcastPkts counters are increased when packets are not
487 	 * transmitted successfully.
488 	 */
489 	for (i = nb_tx; i < nb_bufs; i++)
490 		vhost_count_xcast_packets(r, bufs[i]);
491 
492 	for (i = 0; likely(i < nb_tx); i++)
493 		rte_pktmbuf_free(bufs[i]);
494 out:
495 	rte_atomic32_set(&r->while_queuing, 0);
496 
497 	return nb_tx;
498 }
499 
500 static inline struct internal_list *
501 find_internal_resource(char *ifname)
502 {
503 	int found = 0;
504 	struct internal_list *list;
505 	struct pmd_internal *internal;
506 
507 	if (ifname == NULL)
508 		return NULL;
509 
510 	pthread_mutex_lock(&internal_list_lock);
511 
512 	TAILQ_FOREACH(list, &internal_list, next) {
513 		internal = list->eth_dev->data->dev_private;
514 		if (!strcmp(internal->iface_name, ifname)) {
515 			found = 1;
516 			break;
517 		}
518 	}
519 
520 	pthread_mutex_unlock(&internal_list_lock);
521 
522 	if (!found)
523 		return NULL;
524 
525 	return list;
526 }
527 
528 static int
529 eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
530 {
531 	struct rte_intr_handle *handle = eth_dev->intr_handle;
532 	struct rte_epoll_event rev;
533 	int epfd, ret;
534 
535 	if (!handle)
536 		return 0;
537 
538 	if (handle->efds[rxq_idx] == handle->elist[rxq_idx].fd)
539 		return 0;
540 
541 	VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
542 			rxq_idx);
543 
544 	if (handle->elist[rxq_idx].fd != -1)
545 		VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
546 				handle->elist[rxq_idx].fd);
547 
548 	/*
549 	 * First remove invalid epoll event, and then install
550 	 * the new one. May be solved with a proper API in the
551 	 * future.
552 	 */
553 	epfd = handle->elist[rxq_idx].epfd;
554 	rev = handle->elist[rxq_idx];
555 	ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
556 			&handle->elist[rxq_idx]);
557 	if (ret) {
558 		VHOST_LOG(ERR, "Delete epoll event failed.\n");
559 		return ret;
560 	}
561 
562 	rev.fd = handle->efds[rxq_idx];
563 	handle->elist[rxq_idx] = rev;
564 	ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd,
565 			&handle->elist[rxq_idx]);
566 	if (ret) {
567 		VHOST_LOG(ERR, "Add epoll event failed.\n");
568 		return ret;
569 	}
570 
571 	return 0;
572 }
573 
574 static int
575 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
576 {
577 	struct rte_vhost_vring vring;
578 	struct vhost_queue *vq;
579 	int old_intr_enable, ret = 0;
580 
581 	vq = dev->data->rx_queues[qid];
582 	if (!vq) {
583 		VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
584 		return -1;
585 	}
586 
587 	rte_spinlock_lock(&vq->intr_lock);
588 	old_intr_enable = vq->intr_enable;
589 	vq->intr_enable = 1;
590 	ret = eth_vhost_update_intr(dev, qid);
591 	rte_spinlock_unlock(&vq->intr_lock);
592 
593 	if (ret < 0) {
594 		VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
595 		vq->intr_enable = old_intr_enable;
596 		return ret;
597 	}
598 
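	/* (qid << 1) + 1 selects the guest TX virtqueue backing this rxq. */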
599 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
600 	if (ret < 0) {
601 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
602 		return ret;
603 	}
604 	VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
605 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
606 	rte_wmb();
607 
608 	return ret;
609 }
610 
611 static int
612 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
613 {
614 	struct rte_vhost_vring vring;
615 	struct vhost_queue *vq;
616 	int ret = 0;
617 
618 	vq = dev->data->rx_queues[qid];
619 	if (!vq) {
620 		VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
621 		return -1;
622 	}
623 
624 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
625 	if (ret < 0) {
626 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
627 		return ret;
628 	}
629 	VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
630 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
631 	rte_wmb();
632 
633 	vq->intr_enable = 0;
634 
635 	return 0;
636 }
637 
638 static void
639 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
640 {
641 	struct rte_intr_handle *intr_handle = dev->intr_handle;
642 
643 	if (intr_handle) {
644 		if (intr_handle->intr_vec)
645 			free(intr_handle->intr_vec);
646 		free(intr_handle);
647 	}
648 
649 	dev->intr_handle = NULL;
650 }
651 
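/*
 * Rx interrupt support reuses the vring kickfds: each rxq's kickfd is
 * stored in intr_handle->efds[] so the ethdev Rx interrupt API can wait on
 * it. Kickfds that are not known yet stay at -1 and are filled in later by
 * vring_conf_update()/eth_vhost_update_intr().
 */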
652 static int
653 eth_vhost_install_intr(struct rte_eth_dev *dev)
654 {
655 	struct rte_vhost_vring vring;
656 	struct vhost_queue *vq;
657 	int nb_rxq = dev->data->nb_rx_queues;
658 	int i;
659 	int ret;
660 
661 	/* uninstall first if we are reconnecting */
662 	if (dev->intr_handle)
663 		eth_vhost_uninstall_intr(dev);
664 
665 	dev->intr_handle = malloc(sizeof(*dev->intr_handle));
666 	if (!dev->intr_handle) {
667 		VHOST_LOG(ERR, "Fail to allocate intr_handle\n");
668 		return -ENOMEM;
669 	}
670 	memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
671 
672 	dev->intr_handle->efd_counter_size = sizeof(uint64_t);
673 
674 	dev->intr_handle->intr_vec =
675 		malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
676 
677 	if (!dev->intr_handle->intr_vec) {
678 		VHOST_LOG(ERR,
679 			"Failed to allocate memory for interrupt vector\n");
680 		free(dev->intr_handle);
681 		return -ENOMEM;
682 	}
683 
684 	VHOST_LOG(INFO, "Prepare intr vec\n");
685 	for (i = 0; i < nb_rxq; i++) {
686 		dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
687 		dev->intr_handle->efds[i] = -1;
688 		vq = dev->data->rx_queues[i];
689 		if (!vq) {
690 			VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
691 			continue;
692 		}
693 
694 		ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
695 		if (ret < 0) {
696 			VHOST_LOG(INFO,
697 				"Failed to get rxq-%d's vring, skip!\n", i);
698 			continue;
699 		}
700 
701 		if (vring.kickfd < 0) {
702 			VHOST_LOG(INFO,
703 				"rxq-%d's kickfd is invalid, skip!\n", i);
704 			continue;
705 		}
706 		dev->intr_handle->efds[i] = vring.kickfd;
707 		VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
708 	}
709 
710 	dev->intr_handle->nb_efd = nb_rxq;
711 	dev->intr_handle->max_intr = nb_rxq + 1;
712 	dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
713 
714 	return 0;
715 }
716 
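/*
 * Propagate started/attached state into each queue's allow_queuing flag and
 * wait for any in-flight rx/tx burst (while_queuing) to finish, so callers
 * can safely change device state afterwards.
 */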
717 static void
718 update_queuing_status(struct rte_eth_dev *dev)
719 {
720 	struct pmd_internal *internal = dev->data->dev_private;
721 	struct vhost_queue *vq;
722 	unsigned int i;
723 	int allow_queuing = 1;
724 
725 	if (!dev->data->rx_queues || !dev->data->tx_queues)
726 		return;
727 
728 	if (rte_atomic32_read(&internal->started) == 0 ||
729 	    rte_atomic32_read(&internal->dev_attached) == 0)
730 		allow_queuing = 0;
731 
732 	/* Wait until rx/tx_pkt_burst stops accessing vhost device */
733 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
734 		vq = dev->data->rx_queues[i];
735 		if (vq == NULL)
736 			continue;
737 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
738 		while (rte_atomic32_read(&vq->while_queuing))
739 			rte_pause();
740 	}
741 
742 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
743 		vq = dev->data->tx_queues[i];
744 		if (vq == NULL)
745 			continue;
746 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
747 		while (rte_atomic32_read(&vq->while_queuing))
748 			rte_pause();
749 	}
750 }
751 
752 static void
753 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
754 {
755 	struct vhost_queue *vq;
756 	int i;
757 
758 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
759 		vq = eth_dev->data->rx_queues[i];
760 		if (!vq)
761 			continue;
762 		vq->vid = internal->vid;
763 		vq->internal = internal;
764 		vq->port = eth_dev->data->port_id;
765 	}
766 	for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
767 		vq = eth_dev->data->tx_queues[i];
768 		if (!vq)
769 			continue;
770 		vq->vid = internal->vid;
771 		vq->internal = internal;
772 		vq->port = eth_dev->data->port_id;
773 	}
774 }
775 
776 static int
777 new_device(int vid)
778 {
779 	struct rte_eth_dev *eth_dev;
780 	struct internal_list *list;
781 	struct pmd_internal *internal;
782 	struct rte_eth_conf *dev_conf;
783 	unsigned i;
784 	char ifname[PATH_MAX];
785 #ifdef RTE_LIBRTE_VHOST_NUMA
786 	int newnode;
787 #endif
788 
789 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
790 	list = find_internal_resource(ifname);
791 	if (list == NULL) {
792 		VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
793 		return -1;
794 	}
795 
796 	eth_dev = list->eth_dev;
797 	internal = eth_dev->data->dev_private;
798 	dev_conf = &eth_dev->data->dev_conf;
799 
800 #ifdef RTE_LIBRTE_VHOST_NUMA
801 	newnode = rte_vhost_get_numa_node(vid);
802 	if (newnode >= 0)
803 		eth_dev->data->numa_node = newnode;
804 #endif
805 
806 	internal->vid = vid;
807 	if (rte_atomic32_read(&internal->started) == 1) {
808 		queue_setup(eth_dev, internal);
809 
810 		if (dev_conf->intr_conf.rxq) {
811 			if (eth_vhost_install_intr(eth_dev) < 0) {
812 				VHOST_LOG(INFO,
813 					"Failed to install interrupt handler.\n");
814 				return -1;
815 			}
816 		}
817 	} else {
818 		VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
819 	}
820 
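	/* The PMD polls the rings, so guest notifications are disabled by
	 * default; eth_rxq_intr_enable() re-enables them per rxq when Rx
	 * interrupts are used.
	 */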
821 	for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
822 		rte_vhost_enable_guest_notification(vid, i, 0);
823 
824 	rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
825 
826 	eth_dev->data->dev_link.link_status = ETH_LINK_UP;
827 
828 	rte_atomic32_set(&internal->dev_attached, 1);
829 	update_queuing_status(eth_dev);
830 
831 	VHOST_LOG(INFO, "Vhost device %d created\n", vid);
832 
833 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
834 
835 	return 0;
836 }
837 
838 static void
839 destroy_device(int vid)
840 {
841 	struct rte_eth_dev *eth_dev;
842 	struct pmd_internal *internal;
843 	struct vhost_queue *vq;
844 	struct internal_list *list;
845 	char ifname[PATH_MAX];
846 	unsigned i;
847 	struct rte_vhost_vring_state *state;
848 
849 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
850 	list = find_internal_resource(ifname);
851 	if (list == NULL) {
852 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
853 		return;
854 	}
855 	eth_dev = list->eth_dev;
856 	internal = eth_dev->data->dev_private;
857 
858 	rte_atomic32_set(&internal->dev_attached, 0);
859 	update_queuing_status(eth_dev);
860 
861 	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
862 
863 	if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
864 		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
865 			vq = eth_dev->data->rx_queues[i];
866 			if (!vq)
867 				continue;
868 			vq->vid = -1;
869 		}
870 		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
871 			vq = eth_dev->data->tx_queues[i];
872 			if (!vq)
873 				continue;
874 			vq->vid = -1;
875 		}
876 	}
877 
878 	state = vring_states[eth_dev->data->port_id];
879 	rte_spinlock_lock(&state->lock);
880 	for (i = 0; i <= state->max_vring; i++) {
881 		state->cur[i] = false;
882 		state->seen[i] = false;
883 	}
884 	state->max_vring = 0;
885 	rte_spinlock_unlock(&state->lock);
886 
887 	VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
888 	eth_vhost_uninstall_intr(eth_dev);
889 
890 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
891 }
892 
893 static int
894 vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
895 {
896 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
897 	struct pmd_internal *internal = eth_dev->data->dev_private;
898 	struct vhost_queue *vq;
899 	struct rte_vhost_vring vring;
900 	int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
901 	int ret = 0;
902 
903 	/*
904 	 * The vring kickfd may be changed after the new device notification.
905 	 * Update it when the vring state is updated.
906 	 */
907 	if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
908 	    rte_atomic32_read(&internal->dev_attached) &&
909 	    rte_atomic32_read(&internal->started) &&
910 	    dev_conf->intr_conf.rxq) {
911 		ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
912 		if (ret) {
913 			VHOST_LOG(ERR, "Failed to get vring %d information.\n",
914 					vring_id);
915 			return ret;
916 		}
917 		eth_dev->intr_handle->efds[rx_idx] = vring.kickfd;
918 
919 		vq = eth_dev->data->rx_queues[rx_idx];
920 		if (!vq) {
921 			VHOST_LOG(ERR, "rxq%d is not setup yet\n", rx_idx);
922 			return -1;
923 		}
924 
925 		rte_spinlock_lock(&vq->intr_lock);
926 		if (vq->intr_enable)
927 			ret = eth_vhost_update_intr(eth_dev, rx_idx);
928 		rte_spinlock_unlock(&vq->intr_lock);
929 	}
930 
931 	return ret;
932 }
933 
934 static int
935 vring_state_changed(int vid, uint16_t vring, int enable)
936 {
937 	struct rte_vhost_vring_state *state;
938 	struct rte_eth_dev *eth_dev;
939 	struct internal_list *list;
940 	char ifname[PATH_MAX];
941 
942 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
943 	list = find_internal_resource(ifname);
944 	if (list == NULL) {
945 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
946 		return -1;
947 	}
948 
949 	eth_dev = list->eth_dev;
950 	/* won't be NULL */
951 	state = vring_states[eth_dev->data->port_id];
952 
953 	if (enable && vring_conf_update(vid, eth_dev, vring))
954 		VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
955 			  (int)vring);
956 
957 	rte_spinlock_lock(&state->lock);
958 	if (state->cur[vring] == enable) {
959 		rte_spinlock_unlock(&state->lock);
960 		return 0;
961 	}
962 	state->cur[vring] = enable;
963 	state->max_vring = RTE_MAX(vring, state->max_vring);
964 	rte_spinlock_unlock(&state->lock);
965 
966 	VHOST_LOG(INFO, "vring%u is %s\n",
967 			vring, enable ? "enabled" : "disabled");
968 
969 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
970 
971 	return 0;
972 }
973 
974 static struct vhost_device_ops vhost_ops = {
975 	.new_device          = new_device,
976 	.destroy_device      = destroy_device,
977 	.vring_state_changed = vring_state_changed,
978 };
979 
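/*
 * Called from eth_dev_configure(): registers the vhost-user socket (server
 * or client mode depending on RTE_VHOST_USER_CLIENT), hooks up the
 * new_device/destroy_device/vring_state_changed callbacks and starts the
 * vhost-user session for this interface.
 */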
980 static int
981 vhost_driver_setup(struct rte_eth_dev *eth_dev)
982 {
983 	struct pmd_internal *internal = eth_dev->data->dev_private;
984 	struct internal_list *list = NULL;
985 	struct rte_vhost_vring_state *vring_state = NULL;
986 	unsigned int numa_node = eth_dev->device->numa_node;
987 	const char *name = eth_dev->device->name;
988 
989 	/* Don't try to setup again if it has already been done. */
990 	list = find_internal_resource(internal->iface_name);
991 	if (list)
992 		return 0;
993 
994 	list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
995 	if (list == NULL)
996 		return -1;
997 
998 	vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
999 					 0, numa_node);
1000 	if (vring_state == NULL)
1001 		goto free_list;
1002 
1003 	list->eth_dev = eth_dev;
1004 	pthread_mutex_lock(&internal_list_lock);
1005 	TAILQ_INSERT_TAIL(&internal_list, list, next);
1006 	pthread_mutex_unlock(&internal_list_lock);
1007 
1008 	rte_spinlock_init(&vring_state->lock);
1009 	vring_states[eth_dev->data->port_id] = vring_state;
1010 
1011 	if (rte_vhost_driver_register(internal->iface_name, internal->flags))
1012 		goto list_remove;
1013 
1014 	if (internal->disable_flags) {
1015 		if (rte_vhost_driver_disable_features(internal->iface_name,
1016 						      internal->disable_flags))
1017 			goto drv_unreg;
1018 	}
1019 
1020 	if (rte_vhost_driver_callback_register(internal->iface_name,
1021 					       &vhost_ops) < 0) {
1022 		VHOST_LOG(ERR, "Can't register callbacks\n");
1023 		goto drv_unreg;
1024 	}
1025 
1026 	if (rte_vhost_driver_start(internal->iface_name) < 0) {
1027 		VHOST_LOG(ERR, "Failed to start driver for %s\n",
1028 			  internal->iface_name);
1029 		goto drv_unreg;
1030 	}
1031 
1032 	return 0;
1033 
1034 drv_unreg:
1035 	rte_vhost_driver_unregister(internal->iface_name);
1036 list_remove:
1037 	vring_states[eth_dev->data->port_id] = NULL;
1038 	pthread_mutex_lock(&internal_list_lock);
1039 	TAILQ_REMOVE(&internal_list, list, next);
1040 	pthread_mutex_unlock(&internal_list_lock);
1041 	rte_free(vring_state);
1042 free_list:
1043 	rte_free(list);
1044 
1045 	return -1;
1046 }
1047 
1048 int
1049 rte_eth_vhost_get_queue_event(uint16_t port_id,
1050 		struct rte_eth_vhost_queue_event *event)
1051 {
1052 	struct rte_vhost_vring_state *state;
1053 	unsigned int i;
1054 	int idx;
1055 
1056 	if (port_id >= RTE_MAX_ETHPORTS) {
1057 		VHOST_LOG(ERR, "Invalid port id\n");
1058 		return -1;
1059 	}
1060 
1061 	state = vring_states[port_id];
1062 	if (!state) {
1063 		VHOST_LOG(ERR, "Unused port\n");
1064 		return -1;
1065 	}
1066 
1067 	rte_spinlock_lock(&state->lock);
1068 	for (i = 0; i <= state->max_vring; i++) {
1069 		idx = state->index++ % (state->max_vring + 1);
1070 
1071 		if (state->cur[idx] != state->seen[idx]) {
1072 			state->seen[idx] = state->cur[idx];
1073 			event->queue_id = idx / 2;
1074 			event->rx = idx & 1;
1075 			event->enable = state->cur[idx];
1076 			rte_spinlock_unlock(&state->lock);
1077 			return 0;
1078 		}
1079 	}
1080 	rte_spinlock_unlock(&state->lock);
1081 
1082 	return -1;
1083 }
1084 
1085 int
1086 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1087 {
1088 	struct internal_list *list;
1089 	struct rte_eth_dev *eth_dev;
1090 	struct vhost_queue *vq;
1091 	int vid = -1;
1092 
1093 	if (!rte_eth_dev_is_valid_port(port_id))
1094 		return -1;
1095 
1096 	pthread_mutex_lock(&internal_list_lock);
1097 
1098 	TAILQ_FOREACH(list, &internal_list, next) {
1099 		eth_dev = list->eth_dev;
1100 		if (eth_dev->data->port_id == port_id) {
1101 			vq = eth_dev->data->rx_queues[0];
1102 			if (vq) {
1103 				vid = vq->vid;
1104 			}
1105 			break;
1106 		}
1107 	}
1108 
1109 	pthread_mutex_unlock(&internal_list_lock);
1110 
1111 	return vid;
1112 }
1113 
1114 static int
1115 eth_dev_configure(struct rte_eth_dev *dev)
1116 {
1117 	struct pmd_internal *internal = dev->data->dev_private;
1118 	const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1119 
1120 	/* NOTE: the same process has to operate a vhost interface
1121 	 * from beginning to end (from eth_dev configure to eth_dev close).
1122 	 * It is the user's responsibility at the moment.
1123 	 */
1124 	if (vhost_driver_setup(dev) < 0)
1125 		return -1;
1126 
1127 	internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1128 
1129 	return 0;
1130 }
1131 
1132 static int
1133 eth_dev_start(struct rte_eth_dev *eth_dev)
1134 {
1135 	struct pmd_internal *internal = eth_dev->data->dev_private;
1136 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1137 
1138 	queue_setup(eth_dev, internal);
1139 
1140 	if (rte_atomic32_read(&internal->dev_attached) == 1) {
1141 		if (dev_conf->intr_conf.rxq) {
1142 			if (eth_vhost_install_intr(eth_dev) < 0) {
1143 				VHOST_LOG(INFO,
1144 					"Failed to install interrupt handler.\n");
1145 				return -1;
1146 			}
1147 		}
1148 	}
1149 
1150 	rte_atomic32_set(&internal->started, 1);
1151 	update_queuing_status(eth_dev);
1152 
1153 	return 0;
1154 }
1155 
1156 static int
1157 eth_dev_stop(struct rte_eth_dev *dev)
1158 {
1159 	struct pmd_internal *internal = dev->data->dev_private;
1160 
1161 	dev->data->dev_started = 0;
1162 	rte_atomic32_set(&internal->started, 0);
1163 	update_queuing_status(dev);
1164 
1165 	return 0;
1166 }
1167 
1168 static int
1169 eth_dev_close(struct rte_eth_dev *dev)
1170 {
1171 	struct pmd_internal *internal;
1172 	struct internal_list *list;
1173 	unsigned int i, ret;
1174 
1175 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1176 		return 0;
1177 
1178 	internal = dev->data->dev_private;
1179 	if (!internal)
1180 		return 0;
1181 
1182 	ret = eth_dev_stop(dev);
1183 
1184 	list = find_internal_resource(internal->iface_name);
1185 	if (list) {
1186 		rte_vhost_driver_unregister(internal->iface_name);
1187 		pthread_mutex_lock(&internal_list_lock);
1188 		TAILQ_REMOVE(&internal_list, list, next);
1189 		pthread_mutex_unlock(&internal_list_lock);
1190 		rte_free(list);
1191 	}
1192 
1193 	if (dev->data->rx_queues)
1194 		for (i = 0; i < dev->data->nb_rx_queues; i++)
1195 			rte_free(dev->data->rx_queues[i]);
1196 
1197 	if (dev->data->tx_queues)
1198 		for (i = 0; i < dev->data->nb_tx_queues; i++)
1199 			rte_free(dev->data->tx_queues[i]);
1200 
1201 	rte_free(internal->iface_name);
1202 	rte_free(internal);
1203 
1204 	dev->data->dev_private = NULL;
1205 
1206 	rte_free(vring_states[dev->data->port_id]);
1207 	vring_states[dev->data->port_id] = NULL;
1208 
1209 	return ret;
1210 }
1211 
1212 static int
1213 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1214 		   uint16_t nb_rx_desc __rte_unused,
1215 		   unsigned int socket_id,
1216 		   const struct rte_eth_rxconf *rx_conf __rte_unused,
1217 		   struct rte_mempool *mb_pool)
1218 {
1219 	struct vhost_queue *vq;
1220 
1221 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1222 			RTE_CACHE_LINE_SIZE, socket_id);
1223 	if (vq == NULL) {
1224 		VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1225 		return -ENOMEM;
1226 	}
1227 
1228 	vq->mb_pool = mb_pool;
1229 	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1230 	rte_spinlock_init(&vq->intr_lock);
1231 	dev->data->rx_queues[rx_queue_id] = vq;
1232 
1233 	return 0;
1234 }
1235 
1236 static int
1237 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1238 		   uint16_t nb_tx_desc __rte_unused,
1239 		   unsigned int socket_id,
1240 		   const struct rte_eth_txconf *tx_conf __rte_unused)
1241 {
1242 	struct vhost_queue *vq;
1243 
1244 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1245 			RTE_CACHE_LINE_SIZE, socket_id);
1246 	if (vq == NULL) {
1247 		VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1248 		return -ENOMEM;
1249 	}
1250 
1251 	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1252 	rte_spinlock_init(&vq->intr_lock);
1253 	dev->data->tx_queues[tx_queue_id] = vq;
1254 
1255 	return 0;
1256 }
1257 
1258 static int
1259 eth_dev_info(struct rte_eth_dev *dev,
1260 	     struct rte_eth_dev_info *dev_info)
1261 {
1262 	struct pmd_internal *internal;
1263 
1264 	internal = dev->data->dev_private;
1265 	if (internal == NULL) {
1266 		VHOST_LOG(ERR, "Invalid device specified\n");
1267 		return -ENODEV;
1268 	}
1269 
1270 	dev_info->max_mac_addrs = 1;
1271 	dev_info->max_rx_pktlen = (uint32_t)-1;
1272 	dev_info->max_rx_queues = internal->max_queues;
1273 	dev_info->max_tx_queues = internal->max_queues;
1274 	dev_info->min_rx_bufsize = 0;
1275 
1276 	dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1277 				DEV_TX_OFFLOAD_VLAN_INSERT;
1278 	dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1279 
1280 	return 0;
1281 }
1282 
1283 static int
1284 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1285 {
1286 	unsigned i;
1287 	unsigned long rx_total = 0, tx_total = 0;
1288 	unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1289 	struct vhost_queue *vq;
1290 
1291 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1292 			i < dev->data->nb_rx_queues; i++) {
1293 		if (dev->data->rx_queues[i] == NULL)
1294 			continue;
1295 		vq = dev->data->rx_queues[i];
1296 		stats->q_ipackets[i] = vq->stats.pkts;
1297 		rx_total += stats->q_ipackets[i];
1298 
1299 		stats->q_ibytes[i] = vq->stats.bytes;
1300 		rx_total_bytes += stats->q_ibytes[i];
1301 	}
1302 
1303 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1304 			i < dev->data->nb_tx_queues; i++) {
1305 		if (dev->data->tx_queues[i] == NULL)
1306 			continue;
1307 		vq = dev->data->tx_queues[i];
1308 		stats->q_opackets[i] = vq->stats.pkts;
1309 		tx_total += stats->q_opackets[i];
1310 
1311 		stats->q_obytes[i] = vq->stats.bytes;
1312 		tx_total_bytes += stats->q_obytes[i];
1313 	}
1314 
1315 	stats->ipackets = rx_total;
1316 	stats->opackets = tx_total;
1317 	stats->ibytes = rx_total_bytes;
1318 	stats->obytes = tx_total_bytes;
1319 
1320 	return 0;
1321 }
1322 
1323 static int
1324 eth_stats_reset(struct rte_eth_dev *dev)
1325 {
1326 	struct vhost_queue *vq;
1327 	unsigned i;
1328 
1329 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
1330 		if (dev->data->rx_queues[i] == NULL)
1331 			continue;
1332 		vq = dev->data->rx_queues[i];
1333 		vq->stats.pkts = 0;
1334 		vq->stats.bytes = 0;
1335 	}
1336 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
1337 		if (dev->data->tx_queues[i] == NULL)
1338 			continue;
1339 		vq = dev->data->tx_queues[i];
1340 		vq->stats.pkts = 0;
1341 		vq->stats.bytes = 0;
1342 		vq->stats.missed_pkts = 0;
1343 	}
1344 
1345 	return 0;
1346 }
1347 
1348 static void
1349 eth_queue_release(void *q)
1350 {
1351 	rte_free(q);
1352 }
1353 
1354 static int
1355 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1356 {
1357 	/*
1358 	 * vHost does not hang onto mbuf. eth_vhost_tx() copies packet data
1359 	 * and releases the mbuf, so there is nothing to clean up.
1360 	 */
1361 	return 0;
1362 }
1363 
1364 static int
1365 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1366 		int wait_to_complete __rte_unused)
1367 {
1368 	return 0;
1369 }
1370 
1371 static uint32_t
1372 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1373 {
1374 	struct vhost_queue *vq;
1375 
1376 	vq = dev->data->rx_queues[rx_queue_id];
1377 	if (vq == NULL)
1378 		return 0;
1379 
1380 	return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1381 }
1382 
1383 static const struct eth_dev_ops ops = {
1384 	.dev_start = eth_dev_start,
1385 	.dev_stop = eth_dev_stop,
1386 	.dev_close = eth_dev_close,
1387 	.dev_configure = eth_dev_configure,
1388 	.dev_infos_get = eth_dev_info,
1389 	.rx_queue_setup = eth_rx_queue_setup,
1390 	.tx_queue_setup = eth_tx_queue_setup,
1391 	.rx_queue_release = eth_queue_release,
1392 	.tx_queue_release = eth_queue_release,
1393 	.tx_done_cleanup = eth_tx_done_cleanup,
1394 	.link_update = eth_link_update,
1395 	.stats_get = eth_stats_get,
1396 	.stats_reset = eth_stats_reset,
1397 	.xstats_reset = vhost_dev_xstats_reset,
1398 	.xstats_get = vhost_dev_xstats_get,
1399 	.xstats_get_names = vhost_dev_xstats_get_names,
1400 	.rx_queue_intr_enable = eth_rxq_intr_enable,
1401 	.rx_queue_intr_disable = eth_rxq_intr_disable,
1402 };
1403 
1404 static int
1405 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1406 	int16_t queues, const unsigned int numa_node, uint64_t flags,
1407 	uint64_t disable_flags)
1408 {
1409 	const char *name = rte_vdev_device_name(dev);
1410 	struct rte_eth_dev_data *data;
1411 	struct pmd_internal *internal = NULL;
1412 	struct rte_eth_dev *eth_dev = NULL;
1413 	struct rte_ether_addr *eth_addr = NULL;
1414 
1415 	VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1416 		numa_node);
1417 
1418 	/* reserve an ethdev entry */
1419 	eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1420 	if (eth_dev == NULL)
1421 		goto error;
1422 	data = eth_dev->data;
1423 
1424 	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1425 	if (eth_addr == NULL)
1426 		goto error;
1427 	data->mac_addrs = eth_addr;
1428 	*eth_addr = base_eth_addr;
1429 	eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1430 
1431 	/* now put it all together
1432 	 * - store queue data in internal,
1433 	 * - point eth_dev_data to internals
1434 	 * - and point eth_dev structure to new eth_dev_data structure
1435 	 */
1436 	internal = eth_dev->data->dev_private;
1437 	internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1438 						 0, numa_node);
1439 	if (internal->iface_name == NULL)
1440 		goto error;
1441 	strcpy(internal->iface_name, iface_name);
1442 
1443 	data->nb_rx_queues = queues;
1444 	data->nb_tx_queues = queues;
1445 	internal->max_queues = queues;
1446 	internal->vid = -1;
1447 	internal->flags = flags;
1448 	internal->disable_flags = disable_flags;
1449 	data->dev_link = pmd_link;
1450 	data->dev_flags = RTE_ETH_DEV_INTR_LSC |
1451 				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1452 	data->promiscuous = 1;
1453 	data->all_multicast = 1;
1454 
1455 	eth_dev->dev_ops = &ops;
1456 	eth_dev->rx_queue_count = eth_rx_queue_count;
1457 
1458 	/* finally assign rx and tx ops */
1459 	eth_dev->rx_pkt_burst = eth_vhost_rx;
1460 	eth_dev->tx_pkt_burst = eth_vhost_tx;
1461 
1462 	rte_eth_dev_probing_finish(eth_dev);
1463 	return 0;
1464 
1465 error:
1466 	if (internal)
1467 		rte_free(internal->iface_name);
1468 	rte_eth_dev_release_port(eth_dev);
1469 
1470 	return -1;
1471 }
1472 
1473 static inline int
1474 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1475 {
1476 	const char **iface_name = extra_args;
1477 
1478 	if (value == NULL)
1479 		return -1;
1480 
1481 	*iface_name = value;
1482 
1483 	return 0;
1484 }
1485 
1486 static inline int
1487 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1488 {
1489 	uint16_t *n = extra_args;
1490 
1491 	if (value == NULL || extra_args == NULL)
1492 		return -EINVAL;
1493 
1494 	*n = (uint16_t)strtoul(value, NULL, 0);
1495 	if (*n == USHRT_MAX && errno == ERANGE)
1496 		return -1;
1497 
1498 	return 0;
1499 }
1500 
1501 static int
1502 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1503 {
1504 	struct rte_kvargs *kvlist = NULL;
1505 	int ret = 0;
1506 	char *iface_name;
1507 	uint16_t queues;
1508 	uint64_t flags = 0;
1509 	uint64_t disable_flags = 0;
1510 	int client_mode = 0;
1511 	int iommu_support = 0;
1512 	int postcopy_support = 0;
1513 	int tso = 0;
1514 	int linear_buf = 0;
1515 	int ext_buf = 0;
1516 	struct rte_eth_dev *eth_dev;
1517 	const char *name = rte_vdev_device_name(dev);
1518 
1519 	VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1520 
1521 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1522 		eth_dev = rte_eth_dev_attach_secondary(name);
1523 		if (!eth_dev) {
1524 			VHOST_LOG(ERR, "Failed to probe %s\n", name);
1525 			return -1;
1526 		}
1527 		eth_dev->rx_pkt_burst = eth_vhost_rx;
1528 		eth_dev->tx_pkt_burst = eth_vhost_tx;
1529 		eth_dev->dev_ops = &ops;
1530 		if (dev->device.numa_node == SOCKET_ID_ANY)
1531 			dev->device.numa_node = rte_socket_id();
1532 		eth_dev->device = &dev->device;
1533 		rte_eth_dev_probing_finish(eth_dev);
1534 		return 0;
1535 	}
1536 
1537 	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1538 	if (kvlist == NULL)
1539 		return -1;
1540 
1541 	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1542 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1543 					 &open_iface, &iface_name);
1544 		if (ret < 0)
1545 			goto out_free;
1546 	} else {
1547 		ret = -1;
1548 		goto out_free;
1549 	}
1550 
1551 	if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1552 		ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1553 					 &open_int, &queues);
1554 		if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1555 			goto out_free;
1556 
1557 	} else
1558 		queues = 1;
1559 
1560 	if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1561 		ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1562 					 &open_int, &client_mode);
1563 		if (ret < 0)
1564 			goto out_free;
1565 
1566 		if (client_mode)
1567 			flags |= RTE_VHOST_USER_CLIENT;
1568 	}
1569 
1570 	if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1571 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1572 					 &open_int, &iommu_support);
1573 		if (ret < 0)
1574 			goto out_free;
1575 
1576 		if (iommu_support)
1577 			flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1578 	}
1579 
1580 	if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1581 		ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1582 					 &open_int, &postcopy_support);
1583 		if (ret < 0)
1584 			goto out_free;
1585 
1586 		if (postcopy_support)
1587 			flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1588 	}
1589 
1590 	if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1591 		ret = rte_kvargs_process(kvlist,
1592 				ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1593 				&open_int, &tso);
1594 		if (ret < 0)
1595 			goto out_free;
1596 
1597 		if (tso == 0) {
1598 			disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1599 			disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1600 		}
1601 	}
1602 
1603 	if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1604 		ret = rte_kvargs_process(kvlist,
1605 				ETH_VHOST_LINEAR_BUF,
1606 				&open_int, &linear_buf);
1607 		if (ret < 0)
1608 			goto out_free;
1609 
1610 		if (linear_buf == 1)
1611 			flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1612 	}
1613 
1614 	if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1615 		ret = rte_kvargs_process(kvlist,
1616 				ETH_VHOST_EXT_BUF,
1617 				&open_int, &ext_buf);
1618 		if (ret < 0)
1619 			goto out_free;
1620 
1621 		if (ext_buf == 1)
1622 			flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1623 	}
1624 
1625 	if (dev->device.numa_node == SOCKET_ID_ANY)
1626 		dev->device.numa_node = rte_socket_id();
1627 
1628 	ret = eth_dev_vhost_create(dev, iface_name, queues,
1629 				   dev->device.numa_node, flags, disable_flags);
1630 	if (ret == -1)
1631 		VHOST_LOG(ERR, "Failed to create %s\n", name);
1632 
1633 out_free:
1634 	rte_kvargs_free(kvlist);
1635 	return ret;
1636 }
1637 
1638 static int
1639 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1640 {
1641 	const char *name;
1642 	struct rte_eth_dev *eth_dev = NULL;
1643 
1644 	name = rte_vdev_device_name(dev);
1645 	VHOST_LOG(INFO, "Uninitializing pmd_vhost for %s\n", name);
1646 
1647 	/* find an ethdev entry */
1648 	eth_dev = rte_eth_dev_allocated(name);
1649 	if (eth_dev == NULL)
1650 		return 0;
1651 
1652 	eth_dev_close(eth_dev);
1653 	rte_eth_dev_release_port(eth_dev);
1654 
1655 	return 0;
1656 }
1657 
1658 static struct rte_vdev_driver pmd_vhost_drv = {
1659 	.probe = rte_pmd_vhost_probe,
1660 	.remove = rte_pmd_vhost_remove,
1661 };
1662 
1663 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1664 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1665 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1666 	"iface=<ifc> "
1667 	"queues=<int> "
1668 	"client=<0|1> "
1669 	"iommu-support=<0|1> "
1670 	"postcopy-support=<0|1> "
1671 	"tso=<0|1> "
1672 	"linear-buffer=<0|1> "
1673 	"ext-buffer=<0|1>");
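/*
 * Illustrative usage (example command line only, not part of the driver):
 *   dpdk-testpmd -l 0-1 --vdev 'net_vhost0,iface=/tmp/sock0,queues=1,client=0' -- -i
 * "iface" is the vhost-user socket path; the remaining keys match the
 * parameter string registered above.
 */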
1674