xref: /dpdk/drivers/net/vhost/rte_eth_vhost.c (revision 2b843cac232eb3f2fa79e4254e21766817e2019f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <stdlib.h>
6 #include <unistd.h>
7 #include <pthread.h>
8 #include <stdbool.h>
9 #include <sys/epoll.h>
10 
11 #include <rte_mbuf.h>
12 #include <ethdev_driver.h>
13 #include <ethdev_vdev.h>
14 #include <rte_malloc.h>
15 #include <rte_memcpy.h>
16 #include <rte_net.h>
17 #include <bus_vdev_driver.h>
18 #include <rte_kvargs.h>
19 #include <rte_vhost.h>
20 #include <rte_spinlock.h>
21 
22 #include "rte_eth_vhost.h"
23 
24 RTE_LOG_REGISTER_DEFAULT(vhost_logtype, NOTICE);
25 #define RTE_LOGTYPE_VHOST vhost_logtype
26 
27 #define VHOST_LOG_LINE(level, ...) \
28 	RTE_LOG_LINE(level, VHOST, __VA_ARGS__)
29 
30 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
31 
32 #define ETH_VHOST_IFACE_ARG		"iface"
33 #define ETH_VHOST_QUEUES_ARG		"queues"
34 #define ETH_VHOST_CLIENT_ARG		"client"
35 #define ETH_VHOST_IOMMU_SUPPORT		"iommu-support"
36 #define ETH_VHOST_POSTCOPY_SUPPORT	"postcopy-support"
37 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO	"tso"
38 #define ETH_VHOST_LINEAR_BUF		"linear-buffer"
39 #define ETH_VHOST_EXT_BUF		"ext-buffer"
40 #define ETH_VHOST_LEGACY_OL_FLAGS	"legacy-ol-flags"
41 #define VHOST_MAX_PKT_BURST 32
42 
43 static const char *valid_arguments[] = {
44 	ETH_VHOST_IFACE_ARG,
45 	ETH_VHOST_QUEUES_ARG,
46 	ETH_VHOST_CLIENT_ARG,
47 	ETH_VHOST_IOMMU_SUPPORT,
48 	ETH_VHOST_POSTCOPY_SUPPORT,
49 	ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
50 	ETH_VHOST_LINEAR_BUF,
51 	ETH_VHOST_EXT_BUF,
52 	ETH_VHOST_LEGACY_OL_FLAGS,
53 	NULL
54 };
55 
56 static struct rte_ether_addr base_eth_addr = {
57 	.addr_bytes = {
58 		0x56 /* V */,
59 		0x48 /* H */,
60 		0x4F /* O */,
61 		0x53 /* S */,
62 		0x54 /* T */,
63 		0x00
64 	}
65 };
66 
67 struct vhost_stats {
68 	uint64_t pkts;
69 	uint64_t bytes;
70 	uint64_t missed_pkts;
71 };
72 
73 struct vhost_queue {
74 	int vid;
75 	rte_atomic32_t allow_queuing;
76 	rte_atomic32_t while_queuing;
77 	struct pmd_internal *internal;
78 	struct rte_mempool *mb_pool;
79 	uint16_t port;
80 	uint16_t virtqueue_id;
81 	struct vhost_stats stats;
82 	rte_spinlock_t intr_lock;
83 	struct epoll_event ev;
84 	int kickfd;
85 };
86 
87 struct pmd_internal {
88 	rte_atomic32_t dev_attached;
89 	char *iface_name;
90 	uint64_t flags;
91 	uint64_t disable_flags;
92 	uint64_t features;
93 	uint16_t max_queues;
94 	int vid;
95 	rte_atomic32_t started;
96 	bool vlan_strip;
97 	bool rx_sw_csum;
98 	bool tx_sw_csum;
99 };
100 
101 struct internal_list {
102 	TAILQ_ENTRY(internal_list) next;
103 	struct rte_eth_dev *eth_dev;
104 };
105 
106 TAILQ_HEAD(internal_list_head, internal_list);
107 static struct internal_list_head internal_list =
108 	TAILQ_HEAD_INITIALIZER(internal_list);
109 
110 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
111 
112 static struct rte_eth_link pmd_link = {
113 		.link_speed = 10000,
114 		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
115 		.link_status = RTE_ETH_LINK_DOWN
116 };
117 
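/*
 * Per-port shadow of virtqueue enable/disable state: cur[] holds the state
 * last reported by the vhost library, seen[] what has already been consumed
 * through rte_eth_vhost_get_queue_event(). Indexed by virtqueue id, i.e. two
 * entries per queue pair.
 */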
118 struct rte_vhost_vring_state {
119 	rte_spinlock_t lock;
120 
121 	bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
122 	bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
123 	unsigned int index;
124 	unsigned int max_vring;
125 };
126 
127 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
128 
129 static int
130 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
131 {
132 	struct vhost_queue *vq;
133 	int ret, i;
134 
135 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
136 		vq = dev->data->rx_queues[i];
137 		ret = rte_vhost_vring_stats_reset(vq->vid, vq->virtqueue_id);
138 		if (ret < 0)
139 			return ret;
140 	}
141 
142 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
143 		vq = dev->data->tx_queues[i];
144 		ret = rte_vhost_vring_stats_reset(vq->vid, vq->virtqueue_id);
145 		if (ret < 0)
146 			return ret;
147 	}
148 
149 	return 0;
150 }
151 
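/*
 * Report extended statistics names. The vhost library is queried twice: a
 * first pass with a NULL array counts the per-virtqueue stats of every Rx and
 * Tx queue, a second pass retrieves the names once the caller's buffer is
 * known to be large enough.
 */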
152 static int
153 vhost_dev_xstats_get_names(struct rte_eth_dev *dev,
154 			   struct rte_eth_xstat_name *xstats_names,
155 			   unsigned int limit)
156 {
157 	struct rte_vhost_stat_name *name;
158 	struct vhost_queue *vq;
159 	int ret, i, count = 0, nstats = 0;
160 
161 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
162 		vq = dev->data->rx_queues[i];
163 		ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id, NULL, 0);
164 		if (ret < 0)
165 			return ret;
166 
167 		nstats += ret;
168 	}
169 
170 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
171 		vq = dev->data->tx_queues[i];
172 		ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id, NULL, 0);
173 		if (ret < 0)
174 			return ret;
175 
176 		nstats += ret;
177 	}
178 
179 	if (!xstats_names || limit < (unsigned int)nstats)
180 		return nstats;
181 
182 	name = calloc(nstats, sizeof(*name));
183 	if (!name)
184 		return -1;
185 
186 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
187 		vq = dev->data->rx_queues[i];
188 		ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id,
189 				name + count, nstats - count);
190 		if (ret < 0) {
191 			free(name);
192 			return ret;
193 		}
194 
195 		count += ret;
196 	}
197 
198 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
199 		vq = dev->data->tx_queues[i];
200 		ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id,
201 				name + count, nstats - count);
202 		if (ret < 0) {
203 			free(name);
204 			return ret;
205 		}
206 
207 		count += ret;
208 	}
209 
210 	for (i = 0; i < count; i++)
211 		strncpy(xstats_names[i].name, name[i].name, RTE_ETH_XSTATS_NAME_SIZE);
212 
213 	free(name);
214 
215 	return count;
216 }
217 
218 static int
219 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
220 		     unsigned int n)
221 {
222 	struct rte_vhost_stat *stats;
223 	struct vhost_queue *vq;
224 	int ret, i, count = 0, nstats = 0;
225 
226 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
227 		vq = dev->data->rx_queues[i];
228 		ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id, NULL, 0);
229 		if (ret < 0)
230 			return ret;
231 
232 		nstats += ret;
233 	}
234 
235 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
236 		vq = dev->data->tx_queues[i];
237 		ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id, NULL, 0);
238 		if (ret < 0)
239 			return ret;
240 
241 		nstats += ret;
242 	}
243 
244 	if (!xstats || n < (unsigned int)nstats)
245 		return nstats;
246 
247 	stats = calloc(nstats, sizeof(*stats));
248 	if (!stats)
249 		return -1;
250 
251 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
252 		vq = dev->data->rx_queues[i];
253 		ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id,
254 				stats + count, nstats - count);
255 		if (ret < 0) {
256 			free(stats);
257 			return ret;
258 		}
259 
260 		count += ret;
261 	}
262 
263 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
264 		vq = dev->data->tx_queues[i];
265 		ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id,
266 				stats + count, nstats - count);
267 		if (ret < 0) {
268 			free(stats);
269 			return ret;
270 		}
271 
272 		count += ret;
273 	}
274 
275 	for (i = 0; i < count; i++) {
276 		xstats[i].id = stats[i].id;
277 		xstats[i].value = stats[i].value;
278 	}
279 
280 	free(stats);
281 
282 	return nstats;
283 }
284 
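/*
 * Decide whether Rx/Tx L4 checksums must be computed in software. This only
 * applies when compliant offload flags are used: Rx SW checksum is needed if
 * the guest negotiated VIRTIO_NET_F_CSUM but the application did not enable
 * Rx checksum offload, and Tx SW checksum is needed if the application
 * enabled Tx checksum offload but the guest did not negotiate
 * VIRTIO_NET_F_GUEST_CSUM.
 */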
285 static void
286 vhost_dev_csum_configure(struct rte_eth_dev *eth_dev)
287 {
288 	struct pmd_internal *internal = eth_dev->data->dev_private;
289 	const struct rte_eth_rxmode *rxmode = &eth_dev->data->dev_conf.rxmode;
290 	const struct rte_eth_txmode *txmode = &eth_dev->data->dev_conf.txmode;
291 
292 	internal->rx_sw_csum = false;
293 	internal->tx_sw_csum = false;
294 
295 	/* SW checksum fallback is only used with compliant offload flags (not legacy mode) */
296 	if (!(internal->flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS))
297 		return;
298 
299 	if (internal->features & (1ULL << VIRTIO_NET_F_CSUM)) {
300 		if (!(rxmode->offloads &
301 				(RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_TCP_CKSUM))) {
302 			VHOST_LOG_LINE(NOTICE, "Rx csum will be done in SW, may impact performance.");
303 			internal->rx_sw_csum = true;
304 		}
305 	}
306 
307 	if (!(internal->features & (1ULL << VIRTIO_NET_F_GUEST_CSUM))) {
308 		if (txmode->offloads &
309 				(RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_CKSUM)) {
310 			VHOST_LOG_LINE(NOTICE, "Tx csum will be done in SW, may impact performance.");
311 			internal->tx_sw_csum = true;
312 		}
313 	}
314 }
315 
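/*
 * Compute the TCP/UDP checksum of an outgoing mbuf in software and clear the
 * Tx offload request, so the packet can be passed to a guest that did not
 * negotiate VIRTIO_NET_F_GUEST_CSUM.
 */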
316 static void
317 vhost_dev_tx_sw_csum(struct rte_mbuf *mbuf)
318 {
319 	uint32_t hdr_len;
320 	uint16_t csum = 0, csum_offset;
321 
322 	switch (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
323 	case RTE_MBUF_F_TX_L4_NO_CKSUM:
324 		return;
325 	case RTE_MBUF_F_TX_TCP_CKSUM:
326 		csum_offset = offsetof(struct rte_tcp_hdr, cksum);
327 		break;
328 	case RTE_MBUF_F_TX_UDP_CKSUM:
329 		csum_offset = offsetof(struct rte_udp_hdr, dgram_cksum);
330 		break;
331 	default:
332 		/* Unsupported packet type. */
333 		return;
334 	}
335 
336 	hdr_len = mbuf->l2_len + mbuf->l3_len;
337 	csum_offset += hdr_len;
338 
339 	/* Prepare the pseudo-header checksum */
340 	if (rte_net_intel_cksum_prepare(mbuf) < 0)
341 		return;
342 
343 	if (rte_raw_cksum_mbuf(mbuf, hdr_len, rte_pktmbuf_pkt_len(mbuf) - hdr_len, &csum) < 0)
344 		return;
345 
346 	csum = ~csum;
347 	/* See RFC768 */
348 	if (unlikely((mbuf->packet_type & RTE_PTYPE_L4_UDP) && csum == 0))
349 		csum = 0xffff;
350 
351 	if (rte_pktmbuf_data_len(mbuf) >= csum_offset + 1)
352 		*rte_pktmbuf_mtod_offset(mbuf, uint16_t *, csum_offset) = csum;
353 
354 	mbuf->ol_flags &= ~RTE_MBUF_F_TX_L4_MASK;
355 	mbuf->ol_flags |= RTE_MBUF_F_TX_L4_NO_CKSUM;
356 }
357 
358 static void
359 vhost_dev_rx_sw_csum(struct rte_mbuf *mbuf)
360 {
361 	struct rte_net_hdr_lens hdr_lens;
362 	uint32_t ptype, hdr_len;
363 	uint16_t csum = 0, csum_offset;
364 
365 	/* Return early if the L4 checksum was not offloaded */
366 	if ((mbuf->ol_flags & RTE_MBUF_F_RX_L4_CKSUM_MASK) != RTE_MBUF_F_RX_L4_CKSUM_NONE)
367 		return;
368 
369 	ptype = rte_net_get_ptype(mbuf, &hdr_lens, RTE_PTYPE_ALL_MASK);
370 
371 	hdr_len = hdr_lens.l2_len + hdr_lens.l3_len;
372 
373 	switch (ptype & RTE_PTYPE_L4_MASK) {
374 	case RTE_PTYPE_L4_TCP:
375 		csum_offset = offsetof(struct rte_tcp_hdr, cksum) + hdr_len;
376 		break;
377 	case RTE_PTYPE_L4_UDP:
378 		csum_offset = offsetof(struct rte_udp_hdr, dgram_cksum) + hdr_len;
379 		break;
380 	default:
381 		/* Unsupported packet type */
382 		return;
383 	}
384 
385 	/* The pseudo-header checksum has already been set by the guest, as per the Virtio spec */
386 	if (rte_raw_cksum_mbuf(mbuf, hdr_len, rte_pktmbuf_pkt_len(mbuf) - hdr_len, &csum) < 0)
387 		return;
388 
389 	csum = ~csum;
390 	/* See RFC768 */
391 	if (unlikely((ptype & RTE_PTYPE_L4_UDP) && csum == 0))
392 		csum = 0xffff;
393 
394 	if (rte_pktmbuf_data_len(mbuf) >= csum_offset + 1)
395 		*rte_pktmbuf_mtod_offset(mbuf, uint16_t *, csum_offset) = csum;
396 
397 	mbuf->ol_flags &= ~RTE_MBUF_F_RX_L4_CKSUM_MASK;
398 	mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_GOOD;
399 }
400 
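/*
 * Rx burst function: dequeue packets from the guest Tx virtqueue in chunks of
 * at most VHOST_MAX_PKT_BURST, then apply optional VLAN stripping and
 * software Rx checksum before returning them to the application.
 */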
401 static uint16_t
402 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
403 {
404 	struct vhost_queue *r = q;
405 	uint16_t i, nb_rx = 0;
406 	uint16_t nb_receive = nb_bufs;
407 
408 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
409 		return 0;
410 
411 	rte_atomic32_set(&r->while_queuing, 1);
412 
413 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
414 		goto out;
415 
416 	/* Dequeue packets from guest TX queue */
417 	while (nb_receive) {
418 		uint16_t nb_pkts;
419 		uint16_t num = (uint16_t)RTE_MIN(nb_receive,
420 						 VHOST_MAX_PKT_BURST);
421 
422 		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
423 						  r->mb_pool, &bufs[nb_rx],
424 						  num);
425 
426 		nb_rx += nb_pkts;
427 		nb_receive -= nb_pkts;
428 		if (nb_pkts < num)
429 			break;
430 	}
431 
432 	r->stats.pkts += nb_rx;
433 
434 	for (i = 0; likely(i < nb_rx); i++) {
435 		bufs[i]->port = r->port;
436 		bufs[i]->vlan_tci = 0;
437 
438 		if (r->internal->vlan_strip)
439 			rte_vlan_strip(bufs[i]);
440 
441 		if (r->internal->rx_sw_csum)
442 			vhost_dev_rx_sw_csum(bufs[i]);
443 
444 		r->stats.bytes += bufs[i]->pkt_len;
445 	}
446 
447 out:
448 	rte_atomic32_set(&r->while_queuing, 0);
449 
450 	return nb_rx;
451 }
452 
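/*
 * Tx burst function: optionally insert VLAN tags and compute software Tx
 * checksums, enqueue the packets to the guest Rx virtqueue in chunks of at
 * most VHOST_MAX_PKT_BURST, then free the mbufs that were accepted.
 */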
453 static uint16_t
454 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
455 {
456 	struct vhost_queue *r = q;
457 	uint16_t i, nb_tx = 0;
458 	uint16_t nb_send = 0;
459 	uint64_t nb_bytes = 0;
460 	uint64_t nb_missed = 0;
461 
462 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
463 		return 0;
464 
465 	rte_atomic32_set(&r->while_queuing, 1);
466 
467 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
468 		goto out;
469 
470 	for (i = 0; i < nb_bufs; i++) {
471 		struct rte_mbuf *m = bufs[i];
472 
473 		/* Do VLAN tag insertion */
474 		if (m->ol_flags & RTE_MBUF_F_TX_VLAN) {
475 			int error = rte_vlan_insert(&m);
476 			if (unlikely(error)) {
477 				rte_pktmbuf_free(m);
478 				continue;
479 			}
480 		}
481 
482 		if (r->internal->tx_sw_csum)
483 			vhost_dev_tx_sw_csum(m);
484 
485 
486 		bufs[nb_send] = m;
487 		++nb_send;
488 	}
489 
490 	/* Enqueue packets to guest RX queue */
491 	while (nb_send) {
492 		uint16_t nb_pkts;
493 		uint16_t num = (uint16_t)RTE_MIN(nb_send,
494 						 VHOST_MAX_PKT_BURST);
495 
496 		nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
497 						  &bufs[nb_tx], num);
498 
499 		nb_tx += nb_pkts;
500 		nb_send -= nb_pkts;
501 		if (nb_pkts < num)
502 			break;
503 	}
504 
505 	for (i = 0; likely(i < nb_tx); i++)
506 		nb_bytes += bufs[i]->pkt_len;
507 
508 	nb_missed = nb_bufs - nb_tx;
509 
510 	r->stats.pkts += nb_tx;
511 	r->stats.bytes += nb_bytes;
512 	r->stats.missed_pkts += nb_missed;
513 
514 	for (i = 0; likely(i < nb_tx); i++)
515 		rte_pktmbuf_free(bufs[i]);
516 out:
517 	rte_atomic32_set(&r->while_queuing, 0);
518 
519 	return nb_tx;
520 }
521 
522 static inline struct internal_list *
523 find_internal_resource(char *ifname)
524 {
525 	int found = 0;
526 	struct internal_list *list;
527 	struct pmd_internal *internal;
528 
529 	if (ifname == NULL)
530 		return NULL;
531 
532 	pthread_mutex_lock(&internal_list_lock);
533 
534 	TAILQ_FOREACH(list, &internal_list, next) {
535 		internal = list->eth_dev->data->dev_private;
536 		if (!strcmp(internal->iface_name, ifname)) {
537 			found = 1;
538 			break;
539 		}
540 	}
541 
542 	pthread_mutex_unlock(&internal_list_lock);
543 
544 	if (!found)
545 		return NULL;
546 
547 	return list;
548 }
549 
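/*
 * Refresh the Rx interrupt proxy for one Rx queue: the vring kick fd
 * currently provided by the vhost library is (re)registered with the
 * per-queue epoll fd handed to the interrupt framework, replacing any
 * previously registered kick fd.
 */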
550 static void
551 eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
552 {
553 	struct rte_vhost_vring vring;
554 	struct vhost_queue *vq;
555 
556 	vq = eth_dev->data->rx_queues[rxq_idx];
557 	if (vq == NULL || vq->vid < 0)
558 		return;
559 
560 	if (rte_vhost_get_vhost_vring(vq->vid, (rxq_idx << 1) + 1, &vring) < 0) {
561 		VHOST_LOG_LINE(DEBUG, "Failed to get rxq-%d's vring, skip!", rxq_idx);
562 		return;
563 	}
564 
565 	rte_spinlock_lock(&vq->intr_lock);
566 
567 	/* Remove previous kickfd from proxy epoll */
568 	if (vq->kickfd >= 0 && vq->kickfd != vring.kickfd) {
569 		if (epoll_ctl(vq->ev.data.fd, EPOLL_CTL_DEL, vq->kickfd, &vq->ev) < 0) {
570 			VHOST_LOG_LINE(DEBUG, "Failed to unregister %d from rxq-%d epoll: %s",
571 				vq->kickfd, rxq_idx, strerror(errno));
572 		} else {
573 			VHOST_LOG_LINE(DEBUG, "Unregistered %d from rxq-%d epoll",
574 				vq->kickfd, rxq_idx);
575 		}
576 		vq->kickfd = -1;
577 	}
578 
579 	/* Add new one, if valid */
580 	if (vq->kickfd != vring.kickfd && vring.kickfd >= 0) {
581 		if (epoll_ctl(vq->ev.data.fd, EPOLL_CTL_ADD, vring.kickfd, &vq->ev) < 0) {
582 			VHOST_LOG_LINE(ERR, "Failed to register %d in rxq-%d epoll: %s",
583 				vring.kickfd, rxq_idx, strerror(errno));
584 		} else {
585 			vq->kickfd = vring.kickfd;
586 			VHOST_LOG_LINE(DEBUG, "Registered %d in rxq-%d epoll",
587 				vq->kickfd, rxq_idx);
588 		}
589 	}
590 
591 	rte_spinlock_unlock(&vq->intr_lock);
592 }
593 
594 static int
595 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
596 {
597 	struct vhost_queue *vq = dev->data->rx_queues[qid];
598 
599 	if (vq->vid >= 0)
600 		rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
601 
602 	return 0;
603 }
604 
605 static int
606 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
607 {
608 	struct vhost_queue *vq = dev->data->rx_queues[qid];
609 
610 	if (vq->vid >= 0)
611 		rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
612 
613 	return 0;
614 }
615 
616 static void
617 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
618 {
619 	struct rte_intr_handle *intr_handle = dev->intr_handle;
620 
621 	if (intr_handle != NULL) {
622 		int i;
623 
624 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
625 			int epoll_fd = rte_intr_efds_index_get(dev->intr_handle, i);
626 
627 			if (epoll_fd >= 0)
628 				close(epoll_fd);
629 		}
630 		rte_intr_vec_list_free(intr_handle);
631 		rte_intr_instance_free(intr_handle);
632 	}
633 	dev->intr_handle = NULL;
634 }
635 
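/*
 * Allocate the interrupt handle and create one proxy epoll fd per Rx queue.
 * The proxy fds are what the application waits on; the actual vring kick fds
 * are added to them later by eth_vhost_update_intr().
 */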
636 static int
637 eth_vhost_install_intr(struct rte_eth_dev *dev)
638 {
639 	int nb_rxq = dev->data->nb_rx_queues;
640 	struct vhost_queue *vq;
641 
642 	int ret;
643 	int i;
644 
645 	dev->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
646 	if (dev->intr_handle == NULL) {
647 		VHOST_LOG_LINE(ERR, "Failed to allocate intr_handle");
648 		ret = -ENOMEM;
649 		goto error;
650 	}
651 	if (rte_intr_efd_counter_size_set(dev->intr_handle, 0)) {
652 		ret = -rte_errno;
653 		goto error;
654 	}
655 
656 	if (rte_intr_vec_list_alloc(dev->intr_handle, NULL, nb_rxq)) {
657 		VHOST_LOG_LINE(ERR, "Failed to allocate memory for interrupt vector");
658 		ret = -ENOMEM;
659 		goto error;
660 	}
661 
662 	VHOST_LOG_LINE(DEBUG, "Prepare intr vec");
663 	for (i = 0; i < nb_rxq; i++) {
664 		int epoll_fd = epoll_create1(0);
665 
666 		if (epoll_fd < 0) {
667 			VHOST_LOG_LINE(ERR, "Failed to create proxy epoll fd for rxq-%d", i);
668 			ret = -errno;
669 			goto error;
670 		}
671 
672 		if (rte_intr_vec_list_index_set(dev->intr_handle, i,
673 				RTE_INTR_VEC_RXTX_OFFSET + i) ||
674 				rte_intr_efds_index_set(dev->intr_handle, i, epoll_fd)) {
675 			ret = -rte_errno;
676 			close(epoll_fd);
677 			goto error;
678 		}
679 
680 		vq = dev->data->rx_queues[i];
681 		memset(&vq->ev, 0, sizeof(vq->ev));
682 		vq->ev.events = EPOLLIN;
683 		vq->ev.data.fd = epoll_fd;
684 	}
685 
686 	if (rte_intr_nb_efd_set(dev->intr_handle, nb_rxq)) {
687 		ret = -rte_errno;
688 		goto error;
689 	}
690 	if (rte_intr_max_intr_set(dev->intr_handle, nb_rxq + 1)) {
691 		ret = -rte_errno;
692 		goto error;
693 	}
694 	if (rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_VDEV)) {
695 		ret = -rte_errno;
696 		goto error;
697 	}
698 
699 	return 0;
700 
701 error:
702 	eth_vhost_uninstall_intr(dev);
703 	return ret;
704 }
705 
706 static void
707 eth_vhost_configure_intr(struct rte_eth_dev *dev)
708 {
709 	int i;
710 
711 	VHOST_LOG_LINE(DEBUG, "Configure intr vec");
712 	for (i = 0; i < dev->data->nb_rx_queues; i++)
713 		eth_vhost_update_intr(dev, i);
714 }
715 
716 static void
717 eth_vhost_unconfigure_intr(struct rte_eth_dev *eth_dev)
718 {
719 	struct vhost_queue *vq;
720 	int i;
721 
722 	VHOST_LOG_LINE(DEBUG, "Unconfigure intr vec");
723 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
724 		vq = eth_dev->data->rx_queues[i];
725 		if (vq == NULL || vq->vid < 0)
726 			continue;
727 
728 		rte_spinlock_lock(&vq->intr_lock);
729 
730 		/* Remove previous kickfd from proxy epoll */
731 		if (vq->kickfd >= 0) {
732 			if (epoll_ctl(vq->ev.data.fd, EPOLL_CTL_DEL, vq->kickfd, &vq->ev) < 0) {
733 				VHOST_LOG_LINE(DEBUG, "Failed to unregister %d from rxq-%d epoll: %s",
734 					vq->kickfd, i, strerror(errno));
735 			} else {
736 				VHOST_LOG_LINE(DEBUG, "Unregistered %d from rxq-%d epoll",
737 					vq->kickfd, i);
738 			}
739 			vq->kickfd = -1;
740 		}
741 
742 		rte_spinlock_unlock(&vq->intr_lock);
743 	}
744 }
745 
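/*
 * Propagate the started/attached/vring-enabled state to every queue's
 * allow_queuing flag. When wait_queuing is set, also wait until any in-flight
 * rx/tx burst has left its critical section before returning.
 */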
746 static void
747 update_queuing_status(struct rte_eth_dev *dev, bool wait_queuing)
748 {
749 	struct pmd_internal *internal = dev->data->dev_private;
750 	struct vhost_queue *vq;
751 	struct rte_vhost_vring_state *state;
752 	unsigned int i;
753 	int allow_queuing = 1;
754 
755 	if (!dev->data->rx_queues || !dev->data->tx_queues)
756 		return;
757 
758 	if (rte_atomic32_read(&internal->started) == 0 ||
759 	    rte_atomic32_read(&internal->dev_attached) == 0)
760 		allow_queuing = 0;
761 
762 	state = vring_states[dev->data->port_id];
763 
764 	/* Wait until rx/tx_pkt_burst stops accessing vhost device */
765 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
766 		vq = dev->data->rx_queues[i];
767 		if (vq == NULL)
768 			continue;
769 		if (allow_queuing && state->cur[vq->virtqueue_id])
770 			rte_atomic32_set(&vq->allow_queuing, 1);
771 		else
772 			rte_atomic32_set(&vq->allow_queuing, 0);
773 		while (wait_queuing && rte_atomic32_read(&vq->while_queuing))
774 			rte_pause();
775 	}
776 
777 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
778 		vq = dev->data->tx_queues[i];
779 		if (vq == NULL)
780 			continue;
781 		if (allow_queuing && state->cur[vq->virtqueue_id])
782 			rte_atomic32_set(&vq->allow_queuing, 1);
783 		else
784 			rte_atomic32_set(&vq->allow_queuing, 0);
785 		while (wait_queuing && rte_atomic32_read(&vq->while_queuing))
786 			rte_pause();
787 	}
788 }
789 
790 static void
791 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
792 {
793 	struct vhost_queue *vq;
794 	int i;
795 
796 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
797 		vq = eth_dev->data->rx_queues[i];
798 		if (!vq)
799 			continue;
800 		vq->vid = internal->vid;
801 		vq->internal = internal;
802 		vq->port = eth_dev->data->port_id;
803 	}
804 	for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
805 		vq = eth_dev->data->tx_queues[i];
806 		if (!vq)
807 			continue;
808 		vq->vid = internal->vid;
809 		vq->internal = internal;
810 		vq->port = eth_dev->data->port_id;
811 	}
812 }
813 
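/*
 * Vhost library callback invoked when a guest connects: bind the new vhost
 * device to the matching ethdev, refresh the NUMA node, negotiated features,
 * MTU and checksum configuration, then mark the link as up.
 */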
814 static int
815 new_device(int vid)
816 {
817 	struct rte_eth_dev *eth_dev;
818 	struct internal_list *list;
819 	struct pmd_internal *internal;
820 	struct rte_eth_conf *dev_conf;
821 	unsigned i;
822 	char ifname[PATH_MAX];
823 #ifdef RTE_LIBRTE_VHOST_NUMA
824 	int newnode;
825 #endif
826 
827 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
828 	list = find_internal_resource(ifname);
829 	if (list == NULL) {
830 		VHOST_LOG_LINE(INFO, "Invalid device name: %s", ifname);
831 		return -1;
832 	}
833 
834 	eth_dev = list->eth_dev;
835 	internal = eth_dev->data->dev_private;
836 	dev_conf = &eth_dev->data->dev_conf;
837 
838 #ifdef RTE_LIBRTE_VHOST_NUMA
839 	newnode = rte_vhost_get_numa_node(vid);
840 	if (newnode >= 0)
841 		eth_dev->data->numa_node = newnode;
842 #endif
843 
844 	if (rte_vhost_get_negotiated_features(vid, &internal->features)) {
845 		VHOST_LOG_LINE(ERR, "Failed to get device features");
846 		return -1;
847 	}
848 
849 	internal->vid = vid;
850 	if (rte_atomic32_read(&internal->started) == 1) {
851 		queue_setup(eth_dev, internal);
852 		if (dev_conf->intr_conf.rxq)
853 			eth_vhost_configure_intr(eth_dev);
854 	}
855 
856 	for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
857 		rte_vhost_enable_guest_notification(vid, i, 0);
858 
859 	rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
860 
861 	eth_dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
862 
863 	vhost_dev_csum_configure(eth_dev);
864 
865 	rte_atomic32_set(&internal->dev_attached, 1);
866 	update_queuing_status(eth_dev, false);
867 
868 	VHOST_LOG_LINE(INFO, "Vhost device %d created", vid);
869 
870 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
871 
872 	return 0;
873 }
874 
875 static void
876 destroy_device(int vid)
877 {
878 	struct rte_eth_dev *eth_dev;
879 	struct pmd_internal *internal;
880 	struct vhost_queue *vq;
881 	struct internal_list *list;
882 	char ifname[PATH_MAX];
883 	unsigned i;
884 	struct rte_vhost_vring_state *state;
885 
886 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
887 	list = find_internal_resource(ifname);
888 	if (list == NULL) {
889 		VHOST_LOG_LINE(ERR, "Invalid interface name: %s", ifname);
890 		return;
891 	}
892 	eth_dev = list->eth_dev;
893 	internal = eth_dev->data->dev_private;
894 
895 	rte_atomic32_set(&internal->dev_attached, 0);
896 	update_queuing_status(eth_dev, true);
897 	eth_vhost_unconfigure_intr(eth_dev);
898 
899 	eth_dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
900 
901 	if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
902 		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
903 			vq = eth_dev->data->rx_queues[i];
904 			if (!vq)
905 				continue;
906 			vq->vid = -1;
907 		}
908 		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
909 			vq = eth_dev->data->tx_queues[i];
910 			if (!vq)
911 				continue;
912 			vq->vid = -1;
913 		}
914 	}
915 
916 	state = vring_states[eth_dev->data->port_id];
917 	rte_spinlock_lock(&state->lock);
918 	for (i = 0; i <= state->max_vring; i++) {
919 		state->cur[i] = false;
920 		state->seen[i] = false;
921 	}
922 	state->max_vring = 0;
923 	rte_spinlock_unlock(&state->lock);
924 
925 	VHOST_LOG_LINE(INFO, "Vhost device %d destroyed", vid);
926 
927 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
928 }
929 
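/*
 * Vhost library callback invoked when the guest enables or disables a
 * virtqueue: record the new state, update the Rx interrupt proxy if needed
 * and notify the application through a queue-state event.
 */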
930 static int
931 vring_state_changed(int vid, uint16_t vring, int enable)
932 {
933 	struct rte_vhost_vring_state *state;
934 	struct rte_eth_dev *eth_dev;
935 	struct internal_list *list;
936 	char ifname[PATH_MAX];
937 
938 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
939 	list = find_internal_resource(ifname);
940 	if (list == NULL) {
941 		VHOST_LOG_LINE(ERR, "Invalid interface name: %s", ifname);
942 		return -1;
943 	}
944 
945 	eth_dev = list->eth_dev;
946 	/* won't be NULL */
947 	state = vring_states[eth_dev->data->port_id];
948 
949 	if (eth_dev->data->dev_conf.intr_conf.rxq && vring % 2)
950 		eth_vhost_update_intr(eth_dev, (vring - 1) >> 1);
951 
952 	rte_spinlock_lock(&state->lock);
953 	if (state->cur[vring] == enable) {
954 		rte_spinlock_unlock(&state->lock);
955 		return 0;
956 	}
957 	state->cur[vring] = enable;
958 	state->max_vring = RTE_MAX(vring, state->max_vring);
959 	rte_spinlock_unlock(&state->lock);
960 
961 	update_queuing_status(eth_dev, false);
962 
963 	VHOST_LOG_LINE(INFO, "vring%u is %s",
964 			vring, enable ? "enabled" : "disabled");
965 
966 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
967 
968 	return 0;
969 }
970 
971 static struct rte_vhost_device_ops vhost_ops = {
972 	.new_device          = new_device,
973 	.destroy_device      = destroy_device,
974 	.vring_state_changed = vring_state_changed,
975 };
976 
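/*
 * Register the vhost-user socket for this port and start the vhost driver.
 * Called from the first eth_dev_configure(); subsequent calls are no-ops once
 * the interface is already present in the internal list.
 */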
977 static int
978 vhost_driver_setup(struct rte_eth_dev *eth_dev)
979 {
980 	struct pmd_internal *internal = eth_dev->data->dev_private;
981 	struct internal_list *list = NULL;
982 	struct rte_vhost_vring_state *vring_state = NULL;
983 	unsigned int numa_node = eth_dev->device->numa_node;
984 	const char *name = eth_dev->device->name;
985 
986 	/* Don't try to set up again if it has already been done. */
987 	list = find_internal_resource(internal->iface_name);
988 	if (list)
989 		return 0;
990 
991 	list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
992 	if (list == NULL)
993 		return -1;
994 
995 	vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
996 					 0, numa_node);
997 	if (vring_state == NULL)
998 		goto free_list;
999 
1000 	list->eth_dev = eth_dev;
1001 	pthread_mutex_lock(&internal_list_lock);
1002 	TAILQ_INSERT_TAIL(&internal_list, list, next);
1003 	pthread_mutex_unlock(&internal_list_lock);
1004 
1005 	rte_spinlock_init(&vring_state->lock);
1006 	vring_states[eth_dev->data->port_id] = vring_state;
1007 
1008 	if (rte_vhost_driver_register(internal->iface_name, internal->flags))
1009 		goto list_remove;
1010 
1011 	if (internal->disable_flags) {
1012 		if (rte_vhost_driver_disable_features(internal->iface_name,
1013 						      internal->disable_flags))
1014 			goto drv_unreg;
1015 	}
1016 
1017 	if (rte_vhost_driver_set_max_queue_num(internal->iface_name, internal->max_queues))
1018 		goto drv_unreg;
1019 
1020 	if (rte_vhost_driver_callback_register(internal->iface_name,
1021 					       &vhost_ops) < 0) {
1022 		VHOST_LOG_LINE(ERR, "Can't register callbacks");
1023 		goto drv_unreg;
1024 	}
1025 
1026 	if (rte_vhost_driver_start(internal->iface_name) < 0) {
1027 		VHOST_LOG_LINE(ERR, "Failed to start driver for %s",
1028 			  internal->iface_name);
1029 		goto drv_unreg;
1030 	}
1031 
1032 	return 0;
1033 
1034 drv_unreg:
1035 	rte_vhost_driver_unregister(internal->iface_name);
1036 list_remove:
1037 	vring_states[eth_dev->data->port_id] = NULL;
1038 	pthread_mutex_lock(&internal_list_lock);
1039 	TAILQ_REMOVE(&internal_list, list, next);
1040 	pthread_mutex_unlock(&internal_list_lock);
1041 	rte_free(vring_state);
1042 free_list:
1043 	rte_free(list);
1044 
1045 	return -1;
1046 }
1047 
1048 int
1049 rte_eth_vhost_get_queue_event(uint16_t port_id,
1050 		struct rte_eth_vhost_queue_event *event)
1051 {
1052 	struct rte_vhost_vring_state *state;
1053 	unsigned int i;
1054 	int idx;
1055 
1056 	if (port_id >= RTE_MAX_ETHPORTS) {
1057 		VHOST_LOG_LINE(ERR, "Invalid port id");
1058 		return -1;
1059 	}
1060 
1061 	state = vring_states[port_id];
1062 	if (!state) {
1063 		VHOST_LOG_LINE(ERR, "Unused port");
1064 		return -1;
1065 	}
1066 
1067 	rte_spinlock_lock(&state->lock);
1068 	for (i = 0; i <= state->max_vring; i++) {
1069 		idx = state->index++ % (state->max_vring + 1);
1070 
1071 		if (state->cur[idx] != state->seen[idx]) {
1072 			state->seen[idx] = state->cur[idx];
1073 			event->queue_id = idx / 2;
1074 			event->rx = idx & 1;
1075 			event->enable = state->cur[idx];
1076 			rte_spinlock_unlock(&state->lock);
1077 			return 0;
1078 		}
1079 	}
1080 	rte_spinlock_unlock(&state->lock);
1081 
1082 	return -1;
1083 }
1084 
1085 int
1086 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1087 {
1088 	struct internal_list *list;
1089 	struct rte_eth_dev *eth_dev;
1090 	struct vhost_queue *vq;
1091 	int vid = -1;
1092 
1093 	if (!rte_eth_dev_is_valid_port(port_id))
1094 		return -1;
1095 
1096 	pthread_mutex_lock(&internal_list_lock);
1097 
1098 	TAILQ_FOREACH(list, &internal_list, next) {
1099 		eth_dev = list->eth_dev;
1100 		if (eth_dev->data->port_id == port_id) {
1101 			vq = eth_dev->data->rx_queues[0];
1102 			if (vq) {
1103 				vid = vq->vid;
1104 			}
1105 			break;
1106 		}
1107 	}
1108 
1109 	pthread_mutex_unlock(&internal_list_lock);
1110 
1111 	return vid;
1112 }
1113 
1114 static int
1115 eth_dev_configure(struct rte_eth_dev *dev)
1116 {
1117 	struct pmd_internal *internal = dev->data->dev_private;
1118 	const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1119 
1120 	/* NOTE: the same process has to operate a vhost interface
1121 	 * from beginning to end (from eth_dev configure to eth_dev close).
1122 	 * It is the user's responsibility at the moment.
1123 	 */
1124 	if (vhost_driver_setup(dev) < 0)
1125 		return -1;
1126 
1127 	internal->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
1128 
1129 	vhost_dev_csum_configure(dev);
1130 
1131 	return 0;
1132 }
1133 
1134 static int
1135 eth_dev_start(struct rte_eth_dev *eth_dev)
1136 {
1137 	struct pmd_internal *internal = eth_dev->data->dev_private;
1138 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1139 	uint16_t i;
1140 
1141 	eth_vhost_uninstall_intr(eth_dev);
1142 	if (dev_conf->intr_conf.rxq && eth_vhost_install_intr(eth_dev) < 0) {
1143 		VHOST_LOG_LINE(ERR, "Failed to install interrupt handler.");
1144 		return -1;
1145 	}
1146 
1147 	queue_setup(eth_dev, internal);
1148 	if (rte_atomic32_read(&internal->dev_attached) == 1 &&
1149 			dev_conf->intr_conf.rxq)
1150 		eth_vhost_configure_intr(eth_dev);
1151 
1152 	rte_atomic32_set(&internal->started, 1);
1153 	update_queuing_status(eth_dev, false);
1154 
1155 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++)
1156 		eth_dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
1157 	for (i = 0; i < eth_dev->data->nb_tx_queues; i++)
1158 		eth_dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
1159 
1160 	return 0;
1161 }
1162 
1163 static int
1164 eth_dev_stop(struct rte_eth_dev *dev)
1165 {
1166 	struct pmd_internal *internal = dev->data->dev_private;
1167 	uint16_t i;
1168 
1169 	dev->data->dev_started = 0;
1170 	rte_atomic32_set(&internal->started, 0);
1171 	update_queuing_status(dev, true);
1172 
1173 	for (i = 0; i < dev->data->nb_rx_queues; i++)
1174 		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
1175 	for (i = 0; i < dev->data->nb_tx_queues; i++)
1176 		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
1177 
1178 	return 0;
1179 }
1180 
1181 static int
1182 eth_dev_close(struct rte_eth_dev *dev)
1183 {
1184 	struct pmd_internal *internal;
1185 	struct internal_list *list;
1186 	unsigned int i, ret;
1187 
1188 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1189 		return 0;
1190 
1191 	internal = dev->data->dev_private;
1192 	if (!internal)
1193 		return 0;
1194 
1195 	ret = eth_dev_stop(dev);
1196 
1197 	list = find_internal_resource(internal->iface_name);
1198 	if (list) {
1199 		rte_vhost_driver_unregister(internal->iface_name);
1200 		pthread_mutex_lock(&internal_list_lock);
1201 		TAILQ_REMOVE(&internal_list, list, next);
1202 		pthread_mutex_unlock(&internal_list_lock);
1203 		rte_free(list);
1204 	}
1205 
1206 	if (dev->data->rx_queues)
1207 		for (i = 0; i < dev->data->nb_rx_queues; i++)
1208 			rte_free(dev->data->rx_queues[i]);
1209 
1210 	if (dev->data->tx_queues)
1211 		for (i = 0; i < dev->data->nb_tx_queues; i++)
1212 			rte_free(dev->data->tx_queues[i]);
1213 
1214 	rte_free(internal->iface_name);
1215 	rte_free(internal);
1216 
1217 	eth_vhost_uninstall_intr(dev);
1218 
1219 	dev->data->dev_private = NULL;
1220 
1221 	rte_free(vring_states[dev->data->port_id]);
1222 	vring_states[dev->data->port_id] = NULL;
1223 
1224 	return ret;
1225 }
1226 
1227 static int
1228 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1229 		   uint16_t nb_rx_desc __rte_unused,
1230 		   unsigned int socket_id,
1231 		   const struct rte_eth_rxconf *rx_conf __rte_unused,
1232 		   struct rte_mempool *mb_pool)
1233 {
1234 	struct vhost_queue *vq;
1235 
1236 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1237 			RTE_CACHE_LINE_SIZE, socket_id);
1238 	if (vq == NULL) {
1239 		VHOST_LOG_LINE(ERR, "Failed to allocate memory for rx queue");
1240 		return -ENOMEM;
1241 	}
1242 
1243 	vq->mb_pool = mb_pool;
1244 	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1245 	rte_spinlock_init(&vq->intr_lock);
1246 	vq->kickfd = -1;
1247 	dev->data->rx_queues[rx_queue_id] = vq;
1248 
1249 	return 0;
1250 }
1251 
1252 static int
1253 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1254 		   uint16_t nb_tx_desc __rte_unused,
1255 		   unsigned int socket_id,
1256 		   const struct rte_eth_txconf *tx_conf __rte_unused)
1257 {
1258 	struct vhost_queue *vq;
1259 
1260 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1261 			RTE_CACHE_LINE_SIZE, socket_id);
1262 	if (vq == NULL) {
1263 		VHOST_LOG_LINE(ERR, "Failed to allocate memory for tx queue");
1264 		return -ENOMEM;
1265 	}
1266 
1267 	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1268 	rte_spinlock_init(&vq->intr_lock);
1269 	vq->kickfd = -1;
1270 	dev->data->tx_queues[tx_queue_id] = vq;
1271 
1272 	return 0;
1273 }
1274 
1275 static int
1276 eth_dev_info(struct rte_eth_dev *dev,
1277 	     struct rte_eth_dev_info *dev_info)
1278 {
1279 	struct pmd_internal *internal;
1280 
1281 	internal = dev->data->dev_private;
1282 	if (internal == NULL) {
1283 		VHOST_LOG_LINE(ERR, "Invalid device specified");
1284 		return -ENODEV;
1285 	}
1286 
1287 	dev_info->max_mac_addrs = 1;
1288 	dev_info->max_rx_pktlen = (uint32_t)-1;
1289 	dev_info->max_rx_queues = internal->max_queues;
1290 	dev_info->max_tx_queues = internal->max_queues;
1291 	dev_info->min_rx_bufsize = 0;
1292 
1293 	dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
1294 				RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
1295 	if (internal->flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS) {
1296 		dev_info->tx_offload_capa |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
1297 			RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
1298 	}
1299 
1300 	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP;
1301 	if (internal->flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS) {
1302 		dev_info->rx_offload_capa |= RTE_ETH_RX_OFFLOAD_UDP_CKSUM |
1303 			RTE_ETH_RX_OFFLOAD_TCP_CKSUM;
1304 	}
1305 
1306 	return 0;
1307 }
1308 
1309 static int
1310 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1311 {
1312 	unsigned i;
1313 	unsigned long rx_total = 0, tx_total = 0;
1314 	unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1315 	unsigned long tx_total_errors = 0;
1316 	struct vhost_queue *vq;
1317 
1318 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1319 			i < dev->data->nb_rx_queues; i++) {
1320 		if (dev->data->rx_queues[i] == NULL)
1321 			continue;
1322 		vq = dev->data->rx_queues[i];
1323 		stats->q_ipackets[i] = vq->stats.pkts;
1324 		rx_total += stats->q_ipackets[i];
1325 
1326 		stats->q_ibytes[i] = vq->stats.bytes;
1327 		rx_total_bytes += stats->q_ibytes[i];
1328 	}
1329 
1330 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1331 			i < dev->data->nb_tx_queues; i++) {
1332 		if (dev->data->tx_queues[i] == NULL)
1333 			continue;
1334 		vq = dev->data->tx_queues[i];
1335 		stats->q_opackets[i] = vq->stats.pkts;
1336 		tx_total += stats->q_opackets[i];
1337 
1338 		stats->q_obytes[i] = vq->stats.bytes;
1339 		tx_total_bytes += stats->q_obytes[i];
1340 
1341 		tx_total_errors += vq->stats.missed_pkts;
1342 	}
1343 
1344 	stats->ipackets = rx_total;
1345 	stats->opackets = tx_total;
1346 	stats->ibytes = rx_total_bytes;
1347 	stats->obytes = tx_total_bytes;
1348 	stats->oerrors = tx_total_errors;
1349 
1350 	return 0;
1351 }
1352 
1353 static int
1354 eth_stats_reset(struct rte_eth_dev *dev)
1355 {
1356 	struct vhost_queue *vq;
1357 	unsigned i;
1358 
1359 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
1360 		if (dev->data->rx_queues[i] == NULL)
1361 			continue;
1362 		vq = dev->data->rx_queues[i];
1363 		vq->stats.pkts = 0;
1364 		vq->stats.bytes = 0;
1365 	}
1366 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
1367 		if (dev->data->tx_queues[i] == NULL)
1368 			continue;
1369 		vq = dev->data->tx_queues[i];
1370 		vq->stats.pkts = 0;
1371 		vq->stats.bytes = 0;
1372 		vq->stats.missed_pkts = 0;
1373 	}
1374 
1375 	return 0;
1376 }
1377 
1378 static void
1379 eth_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1380 {
1381 	rte_free(dev->data->rx_queues[qid]);
1382 }
1383 
1384 static void
1385 eth_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1386 {
1387 	rte_free(dev->data->tx_queues[qid]);
1388 }
1389 
1390 static int
1391 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1392 {
1393 	/*
1394 	 * The vhost PMD does not hold on to mbufs: eth_vhost_tx() copies the
1395 	 * packet data and frees the mbufs itself, so there is nothing to clean up.
1396 	 */
1397 	return 0;
1398 }
1399 
1400 static int
1401 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1402 		int wait_to_complete __rte_unused)
1403 {
1404 	return 0;
1405 }
1406 
1407 static uint32_t
1408 eth_rx_queue_count(void *rx_queue)
1409 {
1410 	struct vhost_queue *vq;
1411 
1412 	vq = rx_queue;
1413 	if (vq == NULL)
1414 		return 0;
1415 
1416 	return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1417 }
1418 
1419 #define CLB_VAL_IDX 0
1420 #define CLB_MSK_IDX 1
1421 #define CLB_MATCH_IDX 2
1422 static int
1423 vhost_monitor_callback(const uint64_t value,
1424 		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
1425 {
1426 	const uint64_t m = opaque[CLB_MSK_IDX];
1427 	const uint64_t v = opaque[CLB_VAL_IDX];
1428 	const uint64_t c = opaque[CLB_MATCH_IDX];
1429 
1430 	if (c)
1431 		return (value & m) == v ? -1 : 0;
1432 	else
1433 		return (value & m) == v ? 0 : -1;
1434 }
1435 
1436 static int
1437 vhost_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
1438 {
1439 	struct vhost_queue *vq = rx_queue;
1440 	struct rte_vhost_power_monitor_cond vhost_pmc;
1441 	int ret;
1442 	if (vq == NULL)
1443 		return -EINVAL;
1444 	ret = rte_vhost_get_monitor_addr(vq->vid, vq->virtqueue_id,
1445 			&vhost_pmc);
1446 	if (ret < 0)
1447 		return -EINVAL;
1448 	pmc->addr = vhost_pmc.addr;
1449 	pmc->opaque[CLB_VAL_IDX] = vhost_pmc.val;
1450 	pmc->opaque[CLB_MSK_IDX] = vhost_pmc.mask;
1451 	pmc->opaque[CLB_MATCH_IDX] = vhost_pmc.match;
1452 	pmc->size = vhost_pmc.size;
1453 	pmc->fn = vhost_monitor_callback;
1454 
1455 	return 0;
1456 }
1457 
1458 static int
1459 vhost_dev_priv_dump(struct rte_eth_dev *dev, FILE *f)
1460 {
1461 	struct pmd_internal *internal = dev->data->dev_private;
1462 
1463 	fprintf(f, "iface_name: %s\n", internal->iface_name);
1464 	fprintf(f, "flags: 0x%" PRIx64 "\n", internal->flags);
1465 	fprintf(f, "disable_flags: 0x%" PRIx64 "\n", internal->disable_flags);
1466 	fprintf(f, "features: 0x%" PRIx64 "\n", internal->features);
1467 	fprintf(f, "max_queues: %u\n", internal->max_queues);
1468 	fprintf(f, "vid: %d\n", internal->vid);
1469 	fprintf(f, "started: %d\n", rte_atomic32_read(&internal->started));
1470 	fprintf(f, "dev_attached: %d\n", rte_atomic32_read(&internal->dev_attached));
1471 	fprintf(f, "vlan_strip: %d\n", internal->vlan_strip);
1472 	fprintf(f, "rx_sw_csum: %d\n", internal->rx_sw_csum);
1473 	fprintf(f, "tx_sw_csum: %d\n", internal->tx_sw_csum);
1474 
1475 	return 0;
1476 }
1477 
1478 static const struct eth_dev_ops ops = {
1479 	.dev_start = eth_dev_start,
1480 	.dev_stop = eth_dev_stop,
1481 	.dev_close = eth_dev_close,
1482 	.dev_configure = eth_dev_configure,
1483 	.dev_infos_get = eth_dev_info,
1484 	.rx_queue_setup = eth_rx_queue_setup,
1485 	.tx_queue_setup = eth_tx_queue_setup,
1486 	.rx_queue_release = eth_rx_queue_release,
1487 	.tx_queue_release = eth_tx_queue_release,
1488 	.tx_done_cleanup = eth_tx_done_cleanup,
1489 	.link_update = eth_link_update,
1490 	.stats_get = eth_stats_get,
1491 	.stats_reset = eth_stats_reset,
1492 	.xstats_reset = vhost_dev_xstats_reset,
1493 	.xstats_get = vhost_dev_xstats_get,
1494 	.xstats_get_names = vhost_dev_xstats_get_names,
1495 	.rx_queue_intr_enable = eth_rxq_intr_enable,
1496 	.rx_queue_intr_disable = eth_rxq_intr_disable,
1497 	.get_monitor_addr = vhost_get_monitor_addr,
1498 	.eth_dev_priv_dump = vhost_dev_priv_dump,
1499 };
1500 
1501 static int
1502 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1503 	int16_t queues, const unsigned int numa_node, uint64_t flags,
1504 	uint64_t disable_flags)
1505 {
1506 	const char *name = rte_vdev_device_name(dev);
1507 	struct rte_eth_dev_data *data;
1508 	struct pmd_internal *internal = NULL;
1509 	struct rte_eth_dev *eth_dev = NULL;
1510 	struct rte_ether_addr *eth_addr = NULL;
1511 
1512 	VHOST_LOG_LINE(INFO, "Creating VHOST-USER backend on numa socket %u",
1513 		numa_node);
1514 
1515 	/* reserve an ethdev entry */
1516 	eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1517 	if (eth_dev == NULL)
1518 		goto error;
1519 	data = eth_dev->data;
1520 
1521 	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1522 	if (eth_addr == NULL)
1523 		goto error;
1524 	data->mac_addrs = eth_addr;
1525 	*eth_addr = base_eth_addr;
1526 	eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1527 
1528 	/* now put it all together:
1529 	 * - copy the interface name and flags into the private data,
1530 	 * - fill in the default link, queue counts and device flags,
1531 	 * - and hook up the device ops and rx/tx burst functions
1532 	 */
1533 	internal = eth_dev->data->dev_private;
1534 	internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1535 						 0, numa_node);
1536 	if (internal->iface_name == NULL)
1537 		goto error;
1538 	strcpy(internal->iface_name, iface_name);
1539 
1540 	data->nb_rx_queues = queues;
1541 	data->nb_tx_queues = queues;
1542 	internal->max_queues = queues;
1543 	internal->vid = -1;
1544 	internal->flags = flags;
1545 	internal->disable_flags = disable_flags;
1546 	data->dev_link = pmd_link;
1547 	data->dev_flags = RTE_ETH_DEV_INTR_LSC |
1548 				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1549 	data->promiscuous = 1;
1550 	data->all_multicast = 1;
1551 
1552 	eth_dev->dev_ops = &ops;
1553 	eth_dev->rx_queue_count = eth_rx_queue_count;
1554 
1555 	/* finally assign rx and tx ops */
1556 	eth_dev->rx_pkt_burst = eth_vhost_rx;
1557 	eth_dev->tx_pkt_burst = eth_vhost_tx;
1558 
1559 	rte_eth_dev_probing_finish(eth_dev);
1560 	return 0;
1561 
1562 error:
1563 	if (internal)
1564 		rte_free(internal->iface_name);
1565 	rte_eth_dev_release_port(eth_dev);
1566 
1567 	return -1;
1568 }
1569 
1570 static inline int
1571 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1572 {
1573 	const char **iface_name = extra_args;
1574 
1575 	if (value == NULL)
1576 		return -1;
1577 
1578 	*iface_name = value;
1579 
1580 	return 0;
1581 }
1582 
1583 static inline int
1584 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1585 {
1586 	uint16_t *n = extra_args;
1587 
1588 	if (value == NULL || extra_args == NULL)
1589 		return -EINVAL;
1590 
1591 	*n = (uint16_t)strtoul(value, NULL, 0);
1592 	if (*n == USHRT_MAX && errno == ERANGE)
1593 		return -1;
1594 
1595 	return 0;
1596 }
1597 
1598 static int
1599 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1600 {
1601 	struct rte_kvargs *kvlist = NULL;
1602 	int ret = 0;
1603 	char *iface_name;
1604 	uint16_t queues;
1605 	uint64_t flags = RTE_VHOST_USER_NET_STATS_ENABLE;
1606 	uint64_t disable_flags = 0;
1607 	int client_mode = 0;
1608 	int iommu_support = 0;
1609 	int postcopy_support = 0;
1610 	int tso = 0;
1611 	int linear_buf = 0;
1612 	int ext_buf = 0;
1613 	int legacy_ol_flags = 0;
1614 	struct rte_eth_dev *eth_dev;
1615 	const char *name = rte_vdev_device_name(dev);
1616 
1617 	VHOST_LOG_LINE(INFO, "Initializing pmd_vhost for %s", name);
1618 
1619 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1620 		eth_dev = rte_eth_dev_attach_secondary(name);
1621 		if (!eth_dev) {
1622 			VHOST_LOG_LINE(ERR, "Failed to probe %s", name);
1623 			return -1;
1624 		}
1625 		eth_dev->rx_pkt_burst = eth_vhost_rx;
1626 		eth_dev->tx_pkt_burst = eth_vhost_tx;
1627 		eth_dev->dev_ops = &ops;
1628 		if (dev->device.numa_node == SOCKET_ID_ANY)
1629 			dev->device.numa_node = rte_socket_id();
1630 		eth_dev->device = &dev->device;
1631 		rte_eth_dev_probing_finish(eth_dev);
1632 		return 0;
1633 	}
1634 
1635 	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1636 	if (kvlist == NULL)
1637 		return -1;
1638 
1639 	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1640 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1641 					 &open_iface, &iface_name);
1642 		if (ret < 0)
1643 			goto out_free;
1644 	} else {
1645 		ret = -1;
1646 		goto out_free;
1647 	}
1648 
1649 	if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1650 		ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1651 					 &open_int, &queues);
1652 		if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1653 			goto out_free;
1654 
1655 	} else
1656 		queues = 1;
1657 
1658 	if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1659 		ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1660 					 &open_int, &client_mode);
1661 		if (ret < 0)
1662 			goto out_free;
1663 
1664 		if (client_mode)
1665 			flags |= RTE_VHOST_USER_CLIENT;
1666 	}
1667 
1668 	if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1669 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1670 					 &open_int, &iommu_support);
1671 		if (ret < 0)
1672 			goto out_free;
1673 
1674 		if (iommu_support)
1675 			flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1676 	}
1677 
1678 	if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1679 		ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1680 					 &open_int, &postcopy_support);
1681 		if (ret < 0)
1682 			goto out_free;
1683 
1684 		if (postcopy_support)
1685 			flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1686 	}
1687 
1688 	if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1689 		ret = rte_kvargs_process(kvlist,
1690 				ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1691 				&open_int, &tso);
1692 		if (ret < 0)
1693 			goto out_free;
1694 	}
1695 
1696 	if (tso == 0) {
1697 		disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1698 		disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1699 	}
1700 
1701 	if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1702 		ret = rte_kvargs_process(kvlist,
1703 				ETH_VHOST_LINEAR_BUF,
1704 				&open_int, &linear_buf);
1705 		if (ret < 0)
1706 			goto out_free;
1707 
1708 		if (linear_buf == 1)
1709 			flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1710 	}
1711 
1712 	if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1713 		ret = rte_kvargs_process(kvlist,
1714 				ETH_VHOST_EXT_BUF,
1715 				&open_int, &ext_buf);
1716 		if (ret < 0)
1717 			goto out_free;
1718 
1719 		if (ext_buf == 1)
1720 			flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1721 	}
1722 
1723 	if (rte_kvargs_count(kvlist, ETH_VHOST_LEGACY_OL_FLAGS) == 1) {
1724 		ret = rte_kvargs_process(kvlist,
1725 				ETH_VHOST_LEGACY_OL_FLAGS,
1726 				&open_int, &legacy_ol_flags);
1727 		if (ret < 0)
1728 			goto out_free;
1729 	}
1730 
1731 	if (legacy_ol_flags == 0)
1732 		flags |= RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1733 
1734 	if (dev->device.numa_node == SOCKET_ID_ANY)
1735 		dev->device.numa_node = rte_socket_id();
1736 
1737 	ret = eth_dev_vhost_create(dev, iface_name, queues,
1738 				   dev->device.numa_node, flags, disable_flags);
1739 	if (ret == -1)
1740 		VHOST_LOG_LINE(ERR, "Failed to create %s", name);
1741 
1742 out_free:
1743 	rte_kvargs_free(kvlist);
1744 	return ret;
1745 }
1746 
1747 static int
1748 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1749 {
1750 	const char *name;
1751 	struct rte_eth_dev *eth_dev = NULL;
1752 
1753 	name = rte_vdev_device_name(dev);
1754 	VHOST_LOG_LINE(INFO, "Un-Initializing pmd_vhost for %s", name);
1755 
1756 	/* find an ethdev entry */
1757 	eth_dev = rte_eth_dev_allocated(name);
1758 	if (eth_dev == NULL)
1759 		return 0;
1760 
1761 	eth_dev_close(eth_dev);
1762 	rte_eth_dev_release_port(eth_dev);
1763 
1764 	return 0;
1765 }
1766 
1767 static struct rte_vdev_driver pmd_vhost_drv = {
1768 	.probe = rte_pmd_vhost_probe,
1769 	.remove = rte_pmd_vhost_remove,
1770 };
1771 
1772 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1773 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1774 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1775 	"iface=<ifc> "
1776 	"queues=<int> "
1777 	"client=<0|1> "
1778 	"iommu-support=<0|1> "
1779 	"postcopy-support=<0|1> "
1780 	"tso=<0|1> "
1781 	"linear-buffer=<0|1> "
1782 	"ext-buffer=<0|1>");
1783
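/*
 * Illustrative usage (example invocation, not part of this file): a
 * vhost-user port backed by the socket /tmp/sock0 with two queue pairs in
 * client mode could be created at EAL start-up with an argument such as
 *   --vdev 'net_vhost0,iface=/tmp/sock0,queues=2,client=1'
 */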