xref: /dpdk/drivers/net/tap/rte_eth_tap.c (revision 68a03efeed657e6e05f281479b33b51102797e15)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016-2017 Intel Corporation
3  */
4 
5 #include <rte_atomic.h>
6 #include <rte_branch_prediction.h>
7 #include <rte_byteorder.h>
8 #include <rte_common.h>
9 #include <rte_mbuf.h>
10 #include <ethdev_driver.h>
11 #include <ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_bus_vdev.h>
14 #include <rte_kvargs.h>
15 #include <rte_net.h>
16 #include <rte_debug.h>
17 #include <rte_ip.h>
18 #include <rte_string_fns.h>
19 #include <rte_ethdev.h>
20 #include <rte_errno.h>
21 #include <rte_cycles.h>
22 
23 #include <sys/types.h>
24 #include <sys/stat.h>
25 #include <sys/socket.h>
26 #include <sys/ioctl.h>
27 #include <sys/utsname.h>
28 #include <sys/mman.h>
29 #include <errno.h>
30 #include <signal.h>
31 #include <stdbool.h>
32 #include <stdint.h>
33 #include <sys/uio.h>
34 #include <unistd.h>
35 #include <arpa/inet.h>
36 #include <net/if.h>
37 #include <linux/if_tun.h>
38 #include <linux/if_ether.h>
39 #include <fcntl.h>
40 #include <ctype.h>
41 
42 #include <tap_rss.h>
43 #include <rte_eth_tap.h>
44 #include <tap_flow.h>
45 #include <tap_netlink.h>
46 #include <tap_tcmsgs.h>
47 
48 /* Linux based path to the TUN device */
49 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
50 #define DEFAULT_TAP_NAME        "dtap"
51 #define DEFAULT_TUN_NAME        "dtun"
52 
53 #define ETH_TAP_IFACE_ARG       "iface"
54 #define ETH_TAP_REMOTE_ARG      "remote"
55 #define ETH_TAP_MAC_ARG         "mac"
56 #define ETH_TAP_MAC_FIXED       "fixed"
57 
58 #define ETH_TAP_USR_MAC_FMT     "xx:xx:xx:xx:xx:xx"
59 #define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
60 #define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
61 
62 #define TAP_GSO_MBUFS_PER_CORE	128
63 #define TAP_GSO_MBUF_SEG_SIZE	128
64 #define TAP_GSO_MBUF_CACHE_SIZE	4
65 #define TAP_GSO_MBUFS_NUM \
66 	(TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)
67 
68 /* IPC key for queue fds sync */
69 #define TAP_MP_KEY "tap_mp_sync_queues"
70 
71 #define TAP_IOV_DEFAULT_MAX 1024
72 
73 static int tap_devices_count;
74 
75 static const char *tuntap_types[ETH_TUNTAP_TYPE_MAX] = {
76 	"UNKNOWN", "TUN", "TAP"
77 };
78 
79 static const char *valid_arguments[] = {
80 	ETH_TAP_IFACE_ARG,
81 	ETH_TAP_REMOTE_ARG,
82 	ETH_TAP_MAC_ARG,
83 	NULL
84 };
85 
86 static volatile uint32_t tap_trigger;	/* Rx trigger */
87 
88 static struct rte_eth_link pmd_link = {
89 	.link_speed = ETH_SPEED_NUM_10G,
90 	.link_duplex = ETH_LINK_FULL_DUPLEX,
91 	.link_status = ETH_LINK_DOWN,
92 	.link_autoneg = ETH_LINK_FIXED,
93 };
94 
95 static void
96 tap_trigger_cb(int sig __rte_unused)
97 {
98 	/* Valid trigger values are nonzero */
99 	tap_trigger = (tap_trigger + 1) | 0x80000000;
100 }
101 
102 /* Specifies on what netdevices the ioctl should be applied */
103 enum ioctl_mode {
104 	LOCAL_AND_REMOTE,
105 	LOCAL_ONLY,
106 	REMOTE_ONLY,
107 };
108 
109 /* Message header to synchronize queues via IPC */
110 struct ipc_queues {
111 	char port_name[RTE_DEV_NAME_MAX_LEN];
112 	int rxq_count;
113 	int txq_count;
114 	/*
115 	 * The file descriptors are in the dedicated part
116 	 * of the Unix message to be translated by the kernel.
117 	 */
118 };
119 
120 static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
121 
122 /**
123  * Tun/Tap allocation routine
124  *
125  * @param[in] pmd
126  *   Pointer to private structure.
127  *
128  * @param[in] is_keepalive
129  *   Keepalive flag
130  *
131  * @return
132  *   -1 on failure, fd on success
133  */
134 static int
135 tun_alloc(struct pmd_internals *pmd, int is_keepalive)
136 {
137 	struct ifreq ifr;
138 #ifdef IFF_MULTI_QUEUE
139 	unsigned int features;
140 #endif
141 	int fd, signo, flags;
142 
143 	memset(&ifr, 0, sizeof(struct ifreq));
144 
145 	/*
146 	 * Do not set IFF_NO_PI as packet information header will be needed
147 	 * to check if a received packet has been truncated.
148 	 */
149 	ifr.ifr_flags = (pmd->type == ETH_TUNTAP_TYPE_TAP) ?
150 		IFF_TAP : IFF_TUN | IFF_POINTOPOINT;
151 	strlcpy(ifr.ifr_name, pmd->name, IFNAMSIZ);
152 
153 	fd = open(TUN_TAP_DEV_PATH, O_RDWR);
154 	if (fd < 0) {
155 		TAP_LOG(ERR, "Unable to open %s interface", TUN_TAP_DEV_PATH);
156 		goto error;
157 	}
158 
159 #ifdef IFF_MULTI_QUEUE
160 	/* Grab the TUN features to verify we can work multi-queue */
161 	if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
162 		TAP_LOG(ERR, "unable to get TUN/TAP features");
163 		goto error;
164 	}
165 	TAP_LOG(DEBUG, "%s Features %08x", TUN_TAP_DEV_PATH, features);
166 
167 	if (features & IFF_MULTI_QUEUE) {
168 		TAP_LOG(DEBUG, "  Multi-queue support for %d queues",
169 			RTE_PMD_TAP_MAX_QUEUES);
170 		ifr.ifr_flags |= IFF_MULTI_QUEUE;
171 	} else
172 #endif
173 	{
174 		ifr.ifr_flags |= IFF_ONE_QUEUE;
175 		TAP_LOG(DEBUG, "  Single queue only support");
176 	}
177 
178 	/* Set the TUN/TAP configuration and set the name if needed */
179 	if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
180 		TAP_LOG(WARNING, "Unable to set TUNSETIFF for %s: %s",
181 			ifr.ifr_name, strerror(errno));
182 		goto error;
183 	}
184 
185 	/*
186 	 * Name passed to kernel might be wildcard like dtun%d
187 	 * and need to find the resulting device.
188 	 */
189 	TAP_LOG(DEBUG, "Device name is '%s'", ifr.ifr_name);
190 	strlcpy(pmd->name, ifr.ifr_name, RTE_ETH_NAME_MAX_LEN);
191 
192 	if (is_keepalive) {
193 		/*
194 		 * Detach the TUN/TAP keep-alive queue
195 		 * to avoid traffic through it
196 		 */
197 		ifr.ifr_flags = IFF_DETACH_QUEUE;
198 		if (ioctl(fd, TUNSETQUEUE, (void *)&ifr) < 0) {
199 			TAP_LOG(WARNING,
200 				"Unable to detach keep-alive queue for %s: %s",
201 				ifr.ifr_name, strerror(errno));
202 			goto error;
203 		}
204 	}
205 
206 	flags = fcntl(fd, F_GETFL);
207 	if (flags == -1) {
208 		TAP_LOG(WARNING,
209 			"Unable to get %s current flags\n",
210 			ifr.ifr_name);
211 		goto error;
212 	}
213 
214 	/* Always set the file descriptor to non-blocking */
215 	flags |= O_NONBLOCK;
216 	if (fcntl(fd, F_SETFL, flags) < 0) {
217 		TAP_LOG(WARNING,
218 			"Unable to set %s to nonblocking: %s",
219 			ifr.ifr_name, strerror(errno));
220 		goto error;
221 	}
222 
223 	/* Find a free realtime signal */
224 	for (signo = SIGRTMIN + 1; signo < SIGRTMAX; signo++) {
225 		struct sigaction sa;
226 
227 		if (sigaction(signo, NULL, &sa) == -1) {
228 			TAP_LOG(WARNING,
229 				"Unable to get current rt-signal %d handler",
230 				signo);
231 			goto error;
232 		}
233 
234 		/* Already have the handler we want on this signal  */
235 		if (sa.sa_handler == tap_trigger_cb)
236 			break;
237 
238 		/* Is handler in use by application */
239 		if (sa.sa_handler != SIG_DFL) {
240 			TAP_LOG(DEBUG,
241 				"Skipping used rt-signal %d", signo);
242 			continue;
243 		}
244 
245 		sa = (struct sigaction) {
246 			.sa_flags = SA_RESTART,
247 			.sa_handler = tap_trigger_cb,
248 		};
249 
250 		if (sigaction(signo, &sa, NULL) == -1) {
251 			TAP_LOG(WARNING,
252 				"Unable to set rt-signal %d handler\n", signo);
253 			goto error;
254 		}
255 
256 		/* Found a good signal to use */
257 		TAP_LOG(DEBUG,
258 			"Using rt-signal %d", signo);
259 		break;
260 	}
261 
262 	if (signo == SIGRTMAX) {
263 		TAP_LOG(WARNING, "All rt-signals are in use\n");
264 
265 		/* Disable trigger globally in case of error */
266 		tap_trigger = 0;
267 		TAP_LOG(NOTICE, "No Rx trigger signal available\n");
268 	} else {
269 		/* Enable signal on file descriptor */
270 		if (fcntl(fd, F_SETSIG, signo) < 0) {
271 			TAP_LOG(WARNING, "Unable to set signo %d for fd %d: %s",
272 				signo, fd, strerror(errno));
273 			goto error;
274 		}
275 		if (fcntl(fd, F_SETFL, flags | O_ASYNC) < 0) {
276 			TAP_LOG(WARNING, "Unable to set fcntl flags: %s",
277 				strerror(errno));
278 			goto error;
279 		}
280 
281 		if (fcntl(fd, F_SETOWN, getpid()) < 0) {
282 			TAP_LOG(WARNING, "Unable to set fcntl owner: %s",
283 				strerror(errno));
284 			goto error;
285 		}
286 	}
287 	return fd;
288 
289 error:
290 	if (fd >= 0)
291 		close(fd);
292 	return -1;
293 }
294 
295 static void
296 tap_verify_csum(struct rte_mbuf *mbuf)
297 {
298 	uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
299 	uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
300 	uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
301 	unsigned int l2_len = sizeof(struct rte_ether_hdr);
302 	unsigned int l3_len;
303 	uint16_t cksum = 0;
304 	void *l3_hdr;
305 	void *l4_hdr;
306 	struct rte_udp_hdr *udp_hdr;
307 
308 	if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
309 		l2_len += 4;
310 	else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
311 		l2_len += 8;
312 	/* Don't verify checksum for packets with discontinuous L2 header */
313 	if (unlikely(l2_len + sizeof(struct rte_ipv4_hdr) >
314 		     rte_pktmbuf_data_len(mbuf)))
315 		return;
316 	l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
317 	if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
318 		struct rte_ipv4_hdr *iph = l3_hdr;
319 
320 		l3_len = rte_ipv4_hdr_len(iph);
321 		if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
322 			return;
323 		/* check that the total length reported by header is not
324 		 * greater than the total received size
325 		 */
326 		if (l2_len + rte_be_to_cpu_16(iph->total_length) >
327 				rte_pktmbuf_data_len(mbuf))
328 			return;
329 
330 		cksum = ~rte_raw_cksum(iph, l3_len);
331 		mbuf->ol_flags |= cksum ?
332 			PKT_RX_IP_CKSUM_BAD :
333 			PKT_RX_IP_CKSUM_GOOD;
334 	} else if (l3 == RTE_PTYPE_L3_IPV6) {
335 		struct rte_ipv6_hdr *iph = l3_hdr;
336 
337 		l3_len = sizeof(struct rte_ipv6_hdr);
338 		/* check that the total length reported by header is not
339 		 * greater than the total received size
340 		 */
341 		if (l2_len + l3_len + rte_be_to_cpu_16(iph->payload_len) >
342 				rte_pktmbuf_data_len(mbuf))
343 			return;
344 	} else {
345 		/* IPv6 extensions are not supported */
346 		return;
347 	}
348 	if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
349 		l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
350 		/* Don't verify checksum for multi-segment packets. */
351 		if (mbuf->nb_segs > 1)
352 			return;
353 		if (l3 == RTE_PTYPE_L3_IPV4) {
354 			if (l4 == RTE_PTYPE_L4_UDP) {
355 				udp_hdr = (struct rte_udp_hdr *)l4_hdr;
356 				if (udp_hdr->dgram_cksum == 0) {
357 					/*
358 					 * For IPv4, a zero UDP checksum
359 					 * indicates that the sender did not
360 					 * generate one [RFC 768].
361 					 */
362 					mbuf->ol_flags |= PKT_RX_L4_CKSUM_NONE;
363 					return;
364 				}
365 			}
366 			cksum = ~rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
367 		} else if (l3 == RTE_PTYPE_L3_IPV6) {
368 			cksum = ~rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
369 		}
370 		mbuf->ol_flags |= cksum ?
371 			PKT_RX_L4_CKSUM_BAD :
372 			PKT_RX_L4_CKSUM_GOOD;
373 	}
374 }
375 
376 static uint64_t
377 tap_rx_offload_get_port_capa(void)
378 {
379 	/*
380 	 * No specific port Rx offload capabilities.
381 	 */
382 	return 0;
383 }
384 
385 static uint64_t
386 tap_rx_offload_get_queue_capa(void)
387 {
388 	return DEV_RX_OFFLOAD_SCATTER |
389 	       DEV_RX_OFFLOAD_IPV4_CKSUM |
390 	       DEV_RX_OFFLOAD_UDP_CKSUM |
391 	       DEV_RX_OFFLOAD_TCP_CKSUM;
392 }
393 
394 static void
395 tap_rxq_pool_free(struct rte_mbuf *pool)
396 {
397 	struct rte_mbuf *mbuf = pool;
398 	uint16_t nb_segs = 1;
399 
400 	if (mbuf == NULL)
401 		return;
402 
403 	while (mbuf->next) {
404 		mbuf = mbuf->next;
405 		nb_segs++;
406 	}
407 	pool->nb_segs = nb_segs;
408 	rte_pktmbuf_free(pool);
409 }
410 
411 /* Callback to handle the rx burst of packets to the correct interface and
412  * file descriptor(s) in a multi-queue setup.
413  */
414 static uint16_t
415 pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
416 {
417 	struct rx_queue *rxq = queue;
418 	struct pmd_process_private *process_private;
419 	uint16_t num_rx;
420 	unsigned long num_rx_bytes = 0;
421 	uint32_t trigger = tap_trigger;
422 
423 	if (trigger == rxq->trigger_seen)
424 		return 0;
425 
426 	process_private = rte_eth_devices[rxq->in_port].process_private;
427 	for (num_rx = 0; num_rx < nb_pkts; ) {
428 		struct rte_mbuf *mbuf = rxq->pool;
429 		struct rte_mbuf *seg = NULL;
430 		struct rte_mbuf *new_tail = NULL;
431 		uint16_t data_off = rte_pktmbuf_headroom(mbuf);
432 		int len;
433 
434 		len = readv(process_private->rxq_fds[rxq->queue_id],
435 			*rxq->iovecs,
436 			1 + (rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
437 			     rxq->nb_rx_desc : 1));
438 		if (len < (int)sizeof(struct tun_pi))
439 			break;
440 
441 		/* Packet couldn't fit in the provided mbuf */
442 		if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
443 			rxq->stats.ierrors++;
444 			continue;
445 		}
446 
447 		len -= sizeof(struct tun_pi);
448 
449 		mbuf->pkt_len = len;
450 		mbuf->port = rxq->in_port;
451 		while (1) {
452 			struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
453 
454 			if (unlikely(!buf)) {
455 				rxq->stats.rx_nombuf++;
456 				/* No new buf has been allocated: do nothing */
457 				if (!new_tail || !seg)
458 					goto end;
459 
460 				seg->next = NULL;
461 				tap_rxq_pool_free(mbuf);
462 
463 				goto end;
464 			}
465 			seg = seg ? seg->next : mbuf;
466 			if (rxq->pool == mbuf)
467 				rxq->pool = buf;
468 			if (new_tail)
469 				new_tail->next = buf;
470 			new_tail = buf;
471 			new_tail->next = seg->next;
472 
473 			/* iovecs[0] is reserved for packet info (pi) */
474 			(*rxq->iovecs)[mbuf->nb_segs].iov_len =
475 				buf->buf_len - data_off;
476 			(*rxq->iovecs)[mbuf->nb_segs].iov_base =
477 				(char *)buf->buf_addr + data_off;
478 
479 			seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
480 			seg->data_off = data_off;
481 
482 			len -= seg->data_len;
483 			if (len <= 0)
484 				break;
485 			mbuf->nb_segs++;
486 			/* First segment has headroom, not the others */
487 			data_off = 0;
488 		}
489 		seg->next = NULL;
490 		mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
491 						      RTE_PTYPE_ALL_MASK);
492 		if (rxq->rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM)
493 			tap_verify_csum(mbuf);
494 
495 		/* account for the receive frame */
496 		bufs[num_rx++] = mbuf;
497 		num_rx_bytes += mbuf->pkt_len;
498 	}
499 end:
500 	rxq->stats.ipackets += num_rx;
501 	rxq->stats.ibytes += num_rx_bytes;
502 
503 	if (trigger && num_rx < nb_pkts)
504 		rxq->trigger_seen = trigger;
505 
506 	return num_rx;
507 }
508 
509 static uint64_t
510 tap_tx_offload_get_port_capa(void)
511 {
512 	/*
513 	 * No specific port Tx offload capabilities.
514 	 */
515 	return 0;
516 }
517 
518 static uint64_t
519 tap_tx_offload_get_queue_capa(void)
520 {
521 	return DEV_TX_OFFLOAD_MULTI_SEGS |
522 	       DEV_TX_OFFLOAD_IPV4_CKSUM |
523 	       DEV_TX_OFFLOAD_UDP_CKSUM |
524 	       DEV_TX_OFFLOAD_TCP_CKSUM |
525 	       DEV_TX_OFFLOAD_TCP_TSO;
526 }
527 
528 /* Finalize l4 checksum calculation */
529 static void
530 tap_tx_l4_cksum(uint16_t *l4_cksum, uint16_t l4_phdr_cksum,
531 		uint32_t l4_raw_cksum)
532 {
533 	if (l4_cksum) {
534 		uint32_t cksum;
535 
536 		cksum = __rte_raw_cksum_reduce(l4_raw_cksum);
537 		cksum += l4_phdr_cksum;
538 
539 		cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
540 		cksum = (~cksum) & 0xffff;
541 		if (cksum == 0)
542 			cksum = 0xffff;
543 		*l4_cksum = cksum;
544 	}
545 }
546 
547 /* Accumaulate L4 raw checksums */
548 static void
549 tap_tx_l4_add_rcksum(char *l4_data, unsigned int l4_len, uint16_t *l4_cksum,
550 			uint32_t *l4_raw_cksum)
551 {
552 	if (l4_cksum == NULL)
553 		return;
554 
555 	*l4_raw_cksum = __rte_raw_cksum(l4_data, l4_len, *l4_raw_cksum);
556 }
557 
558 /* L3 and L4 pseudo headers checksum offloads */
559 static void
560 tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
561 		unsigned int l3_len, unsigned int l4_len, uint16_t **l4_cksum,
562 		uint16_t *l4_phdr_cksum, uint32_t *l4_raw_cksum)
563 {
564 	void *l3_hdr = packet + l2_len;
565 
566 	if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4)) {
567 		struct rte_ipv4_hdr *iph = l3_hdr;
568 		uint16_t cksum;
569 
570 		iph->hdr_checksum = 0;
571 		cksum = rte_raw_cksum(iph, l3_len);
572 		iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
573 	}
574 	if (ol_flags & PKT_TX_L4_MASK) {
575 		void *l4_hdr;
576 
577 		l4_hdr = packet + l2_len + l3_len;
578 		if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM)
579 			*l4_cksum = &((struct rte_udp_hdr *)l4_hdr)->dgram_cksum;
580 		else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM)
581 			*l4_cksum = &((struct rte_tcp_hdr *)l4_hdr)->cksum;
582 		else
583 			return;
584 		**l4_cksum = 0;
585 		if (ol_flags & PKT_TX_IPV4)
586 			*l4_phdr_cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
587 		else
588 			*l4_phdr_cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
589 		*l4_raw_cksum = __rte_raw_cksum(l4_hdr, l4_len, 0);
590 	}
591 }
592 
593 static inline int
594 tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
595 			struct rte_mbuf **pmbufs,
596 			uint16_t *num_packets, unsigned long *num_tx_bytes)
597 {
598 	int i;
599 	uint16_t l234_hlen;
600 	struct pmd_process_private *process_private;
601 
602 	process_private = rte_eth_devices[txq->out_port].process_private;
603 
604 	for (i = 0; i < num_mbufs; i++) {
605 		struct rte_mbuf *mbuf = pmbufs[i];
606 		struct iovec iovecs[mbuf->nb_segs + 2];
607 		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
608 		struct rte_mbuf *seg = mbuf;
609 		char m_copy[mbuf->data_len];
610 		int proto;
611 		int n;
612 		int j;
613 		int k; /* current index in iovecs for copying segments */
614 		uint16_t seg_len; /* length of first segment */
615 		uint16_t nb_segs;
616 		uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
617 		uint32_t l4_raw_cksum = 0; /* TCP/UDP payload raw checksum */
618 		uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
619 		uint16_t is_cksum = 0; /* in case cksum should be offloaded */
620 
621 		l4_cksum = NULL;
622 		if (txq->type == ETH_TUNTAP_TYPE_TUN) {
623 			/*
624 			 * TUN and TAP are created with IFF_NO_PI disabled.
625 			 * For TUN PMD this mandatory as fields are used by
626 			 * Kernel tun.c to determine whether its IP or non IP
627 			 * packets.
628 			 *
629 			 * The logic fetches the first byte of data from mbuf
630 			 * then compares whether its v4 or v6. If first byte
631 			 * is 4 or 6, then protocol field is updated.
632 			 */
633 			char *buff_data = rte_pktmbuf_mtod(seg, void *);
634 			proto = (*buff_data & 0xf0);
635 			pi.proto = (proto == 0x40) ?
636 				rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4) :
637 				((proto == 0x60) ?
638 					rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6) :
639 					0x00);
640 		}
641 
642 		k = 0;
643 		iovecs[k].iov_base = &pi;
644 		iovecs[k].iov_len = sizeof(pi);
645 		k++;
646 
647 		nb_segs = mbuf->nb_segs;
648 		if (txq->csum &&
649 		    ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
650 		     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
651 		     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM))) {
652 			is_cksum = 1;
653 
654 			/* Support only packets with at least layer 4
655 			 * header included in the first segment
656 			 */
657 			seg_len = rte_pktmbuf_data_len(mbuf);
658 			l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
659 			if (seg_len < l234_hlen)
660 				return -1;
661 
662 			/* To change checksums, work on a * copy of l2, l3
663 			 * headers + l4 pseudo header
664 			 */
665 			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
666 					l234_hlen);
667 			tap_tx_l3_cksum(m_copy, mbuf->ol_flags,
668 				       mbuf->l2_len, mbuf->l3_len, mbuf->l4_len,
669 				       &l4_cksum, &l4_phdr_cksum,
670 				       &l4_raw_cksum);
671 			iovecs[k].iov_base = m_copy;
672 			iovecs[k].iov_len = l234_hlen;
673 			k++;
674 
675 			/* Update next iovecs[] beyond l2, l3, l4 headers */
676 			if (seg_len > l234_hlen) {
677 				iovecs[k].iov_len = seg_len - l234_hlen;
678 				iovecs[k].iov_base =
679 					rte_pktmbuf_mtod(seg, char *) +
680 						l234_hlen;
681 				tap_tx_l4_add_rcksum(iovecs[k].iov_base,
682 					iovecs[k].iov_len, l4_cksum,
683 					&l4_raw_cksum);
684 				k++;
685 				nb_segs++;
686 			}
687 			seg = seg->next;
688 		}
689 
690 		for (j = k; j <= nb_segs; j++) {
691 			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
692 			iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
693 			if (is_cksum)
694 				tap_tx_l4_add_rcksum(iovecs[j].iov_base,
695 					iovecs[j].iov_len, l4_cksum,
696 					&l4_raw_cksum);
697 			seg = seg->next;
698 		}
699 
700 		if (is_cksum)
701 			tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);
702 
703 		/* copy the tx frame data */
704 		n = writev(process_private->txq_fds[txq->queue_id], iovecs, j);
705 		if (n <= 0)
706 			return -1;
707 
708 		(*num_packets)++;
709 		(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
710 	}
711 	return 0;
712 }
713 
714 /* Callback to handle sending packets from the tap interface
715  */
716 static uint16_t
717 pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
718 {
719 	struct tx_queue *txq = queue;
720 	uint16_t num_tx = 0;
721 	uint16_t num_packets = 0;
722 	unsigned long num_tx_bytes = 0;
723 	uint32_t max_size;
724 	int i;
725 
726 	if (unlikely(nb_pkts == 0))
727 		return 0;
728 
729 	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
730 	max_size = *txq->mtu + (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + 4);
731 	for (i = 0; i < nb_pkts; i++) {
732 		struct rte_mbuf *mbuf_in = bufs[num_tx];
733 		struct rte_mbuf **mbuf;
734 		uint16_t num_mbufs = 0;
735 		uint16_t tso_segsz = 0;
736 		int ret;
737 		int num_tso_mbufs;
738 		uint16_t hdrs_len;
739 		uint64_t tso;
740 
741 		tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
742 		if (tso) {
743 			struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
744 
745 			/* TCP segmentation implies TCP checksum offload */
746 			mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
747 
748 			/* gso size is calculated without RTE_ETHER_CRC_LEN */
749 			hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
750 					mbuf_in->l4_len;
751 			tso_segsz = mbuf_in->tso_segsz + hdrs_len;
752 			if (unlikely(tso_segsz == hdrs_len) ||
753 				tso_segsz > *txq->mtu) {
754 				txq->stats.errs++;
755 				break;
756 			}
757 			gso_ctx->gso_size = tso_segsz;
758 			/* 'mbuf_in' packet to segment */
759 			num_tso_mbufs = rte_gso_segment(mbuf_in,
760 				gso_ctx, /* gso control block */
761 				(struct rte_mbuf **)&gso_mbufs, /* out mbufs */
762 				RTE_DIM(gso_mbufs)); /* max tso mbufs */
763 
764 			/* ret contains the number of new created mbufs */
765 			if (num_tso_mbufs < 0)
766 				break;
767 
768 			if (num_tso_mbufs >= 1) {
769 				mbuf = gso_mbufs;
770 				num_mbufs = num_tso_mbufs;
771 			} else {
772 				/* 0 means it can be transmitted directly
773 				 * without gso.
774 				 */
775 				mbuf = &mbuf_in;
776 				num_mbufs = 1;
777 			}
778 		} else {
779 			/* stats.errs will be incremented */
780 			if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
781 				break;
782 
783 			/* ret 0 indicates no new mbufs were created */
784 			num_tso_mbufs = 0;
785 			mbuf = &mbuf_in;
786 			num_mbufs = 1;
787 		}
788 
789 		ret = tap_write_mbufs(txq, num_mbufs, mbuf,
790 				&num_packets, &num_tx_bytes);
791 		if (ret == -1) {
792 			txq->stats.errs++;
793 			/* free tso mbufs */
794 			if (num_tso_mbufs > 0)
795 				rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs);
796 			break;
797 		}
798 		num_tx++;
799 		/* free original mbuf */
800 		rte_pktmbuf_free(mbuf_in);
801 		/* free tso mbufs */
802 		if (num_tso_mbufs > 0)
803 			rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs);
804 	}
805 
806 	txq->stats.opackets += num_packets;
807 	txq->stats.errs += nb_pkts - num_tx;
808 	txq->stats.obytes += num_tx_bytes;
809 
810 	return num_tx;
811 }
812 
813 static const char *
814 tap_ioctl_req2str(unsigned long request)
815 {
816 	switch (request) {
817 	case SIOCSIFFLAGS:
818 		return "SIOCSIFFLAGS";
819 	case SIOCGIFFLAGS:
820 		return "SIOCGIFFLAGS";
821 	case SIOCGIFHWADDR:
822 		return "SIOCGIFHWADDR";
823 	case SIOCSIFHWADDR:
824 		return "SIOCSIFHWADDR";
825 	case SIOCSIFMTU:
826 		return "SIOCSIFMTU";
827 	}
828 	return "UNKNOWN";
829 }
830 
831 static int
832 tap_ioctl(struct pmd_internals *pmd, unsigned long request,
833 	  struct ifreq *ifr, int set, enum ioctl_mode mode)
834 {
835 	short req_flags = ifr->ifr_flags;
836 	int remote = pmd->remote_if_index &&
837 		(mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);
838 
839 	if (!pmd->remote_if_index && mode == REMOTE_ONLY)
840 		return 0;
841 	/*
842 	 * If there is a remote netdevice, apply ioctl on it, then apply it on
843 	 * the tap netdevice.
844 	 */
845 apply:
846 	if (remote)
847 		strlcpy(ifr->ifr_name, pmd->remote_iface, IFNAMSIZ);
848 	else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
849 		strlcpy(ifr->ifr_name, pmd->name, IFNAMSIZ);
850 	switch (request) {
851 	case SIOCSIFFLAGS:
852 		/* fetch current flags to leave other flags untouched */
853 		if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
854 			goto error;
855 		if (set)
856 			ifr->ifr_flags |= req_flags;
857 		else
858 			ifr->ifr_flags &= ~req_flags;
859 		break;
860 	case SIOCGIFFLAGS:
861 	case SIOCGIFHWADDR:
862 	case SIOCSIFHWADDR:
863 	case SIOCSIFMTU:
864 		break;
865 	default:
866 		TAP_LOG(WARNING, "%s: ioctl() called with wrong arg",
867 			pmd->name);
868 		return -EINVAL;
869 	}
870 	if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
871 		goto error;
872 	if (remote-- && mode == LOCAL_AND_REMOTE)
873 		goto apply;
874 	return 0;
875 
876 error:
877 	TAP_LOG(DEBUG, "%s(%s) failed: %s(%d)", ifr->ifr_name,
878 		tap_ioctl_req2str(request), strerror(errno), errno);
879 	return -errno;
880 }
881 
882 static int
883 tap_link_set_down(struct rte_eth_dev *dev)
884 {
885 	struct pmd_internals *pmd = dev->data->dev_private;
886 	struct ifreq ifr = { .ifr_flags = IFF_UP };
887 
888 	dev->data->dev_link.link_status = ETH_LINK_DOWN;
889 	return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_ONLY);
890 }
891 
892 static int
893 tap_link_set_up(struct rte_eth_dev *dev)
894 {
895 	struct pmd_internals *pmd = dev->data->dev_private;
896 	struct ifreq ifr = { .ifr_flags = IFF_UP };
897 
898 	dev->data->dev_link.link_status = ETH_LINK_UP;
899 	return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
900 }
901 
902 static int
903 tap_dev_start(struct rte_eth_dev *dev)
904 {
905 	int err, i;
906 
907 	err = tap_intr_handle_set(dev, 1);
908 	if (err)
909 		return err;
910 
911 	err = tap_link_set_up(dev);
912 	if (err)
913 		return err;
914 
915 	for (i = 0; i < dev->data->nb_tx_queues; i++)
916 		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
917 	for (i = 0; i < dev->data->nb_rx_queues; i++)
918 		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
919 
920 	return err;
921 }
922 
923 /* This function gets called when the current port gets stopped.
924  */
925 static int
926 tap_dev_stop(struct rte_eth_dev *dev)
927 {
928 	int i;
929 
930 	for (i = 0; i < dev->data->nb_tx_queues; i++)
931 		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
932 	for (i = 0; i < dev->data->nb_rx_queues; i++)
933 		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
934 
935 	tap_intr_handle_set(dev, 0);
936 	tap_link_set_down(dev);
937 
938 	return 0;
939 }
940 
941 static int
942 tap_dev_configure(struct rte_eth_dev *dev)
943 {
944 	struct pmd_internals *pmd = dev->data->dev_private;
945 
946 	if (dev->data->nb_rx_queues > RTE_PMD_TAP_MAX_QUEUES) {
947 		TAP_LOG(ERR,
948 			"%s: number of rx queues %d exceeds max num of queues %d",
949 			dev->device->name,
950 			dev->data->nb_rx_queues,
951 			RTE_PMD_TAP_MAX_QUEUES);
952 		return -1;
953 	}
954 	if (dev->data->nb_tx_queues > RTE_PMD_TAP_MAX_QUEUES) {
955 		TAP_LOG(ERR,
956 			"%s: number of tx queues %d exceeds max num of queues %d",
957 			dev->device->name,
958 			dev->data->nb_tx_queues,
959 			RTE_PMD_TAP_MAX_QUEUES);
960 		return -1;
961 	}
962 
963 	TAP_LOG(INFO, "%s: %s: TX configured queues number: %u",
964 		dev->device->name, pmd->name, dev->data->nb_tx_queues);
965 
966 	TAP_LOG(INFO, "%s: %s: RX configured queues number: %u",
967 		dev->device->name, pmd->name, dev->data->nb_rx_queues);
968 
969 	return 0;
970 }
971 
972 static uint32_t
973 tap_dev_speed_capa(void)
974 {
975 	uint32_t speed = pmd_link.link_speed;
976 	uint32_t capa = 0;
977 
978 	if (speed >= ETH_SPEED_NUM_10M)
979 		capa |= ETH_LINK_SPEED_10M;
980 	if (speed >= ETH_SPEED_NUM_100M)
981 		capa |= ETH_LINK_SPEED_100M;
982 	if (speed >= ETH_SPEED_NUM_1G)
983 		capa |= ETH_LINK_SPEED_1G;
984 	if (speed >= ETH_SPEED_NUM_5G)
985 		capa |= ETH_LINK_SPEED_2_5G;
986 	if (speed >= ETH_SPEED_NUM_5G)
987 		capa |= ETH_LINK_SPEED_5G;
988 	if (speed >= ETH_SPEED_NUM_10G)
989 		capa |= ETH_LINK_SPEED_10G;
990 	if (speed >= ETH_SPEED_NUM_20G)
991 		capa |= ETH_LINK_SPEED_20G;
992 	if (speed >= ETH_SPEED_NUM_25G)
993 		capa |= ETH_LINK_SPEED_25G;
994 	if (speed >= ETH_SPEED_NUM_40G)
995 		capa |= ETH_LINK_SPEED_40G;
996 	if (speed >= ETH_SPEED_NUM_50G)
997 		capa |= ETH_LINK_SPEED_50G;
998 	if (speed >= ETH_SPEED_NUM_56G)
999 		capa |= ETH_LINK_SPEED_56G;
1000 	if (speed >= ETH_SPEED_NUM_100G)
1001 		capa |= ETH_LINK_SPEED_100G;
1002 
1003 	return capa;
1004 }
1005 
1006 static int
1007 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
1008 {
1009 	struct pmd_internals *internals = dev->data->dev_private;
1010 
1011 	dev_info->if_index = internals->if_index;
1012 	dev_info->max_mac_addrs = 1;
1013 	dev_info->max_rx_pktlen = (uint32_t)RTE_ETHER_MAX_VLAN_FRAME_LEN;
1014 	dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
1015 	dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
1016 	dev_info->min_rx_bufsize = 0;
1017 	dev_info->speed_capa = tap_dev_speed_capa();
1018 	dev_info->rx_queue_offload_capa = tap_rx_offload_get_queue_capa();
1019 	dev_info->rx_offload_capa = tap_rx_offload_get_port_capa() |
1020 				    dev_info->rx_queue_offload_capa;
1021 	dev_info->tx_queue_offload_capa = tap_tx_offload_get_queue_capa();
1022 	dev_info->tx_offload_capa = tap_tx_offload_get_port_capa() |
1023 				    dev_info->tx_queue_offload_capa;
1024 	dev_info->hash_key_size = TAP_RSS_HASH_KEY_SIZE;
1025 	/*
1026 	 * limitation: TAP supports all of IP, UDP and TCP hash
1027 	 * functions together and not in partial combinations
1028 	 */
1029 	dev_info->flow_type_rss_offloads = ~TAP_RSS_HF_MASK;
1030 
1031 	return 0;
1032 }
1033 
1034 static int
1035 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
1036 {
1037 	unsigned int i, imax;
1038 	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
1039 	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
1040 	unsigned long rx_nombuf = 0, ierrors = 0;
1041 	const struct pmd_internals *pmd = dev->data->dev_private;
1042 
1043 	/* rx queue statistics */
1044 	imax = (dev->data->nb_rx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
1045 		dev->data->nb_rx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
1046 	for (i = 0; i < imax; i++) {
1047 		tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
1048 		tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
1049 		rx_total += tap_stats->q_ipackets[i];
1050 		rx_bytes_total += tap_stats->q_ibytes[i];
1051 		rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
1052 		ierrors += pmd->rxq[i].stats.ierrors;
1053 	}
1054 
1055 	/* tx queue statistics */
1056 	imax = (dev->data->nb_tx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
1057 		dev->data->nb_tx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
1058 
1059 	for (i = 0; i < imax; i++) {
1060 		tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
1061 		tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
1062 		tx_total += tap_stats->q_opackets[i];
1063 		tx_err_total += pmd->txq[i].stats.errs;
1064 		tx_bytes_total += tap_stats->q_obytes[i];
1065 	}
1066 
1067 	tap_stats->ipackets = rx_total;
1068 	tap_stats->ibytes = rx_bytes_total;
1069 	tap_stats->ierrors = ierrors;
1070 	tap_stats->rx_nombuf = rx_nombuf;
1071 	tap_stats->opackets = tx_total;
1072 	tap_stats->oerrors = tx_err_total;
1073 	tap_stats->obytes = tx_bytes_total;
1074 	return 0;
1075 }
1076 
1077 static int
1078 tap_stats_reset(struct rte_eth_dev *dev)
1079 {
1080 	int i;
1081 	struct pmd_internals *pmd = dev->data->dev_private;
1082 
1083 	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1084 		pmd->rxq[i].stats.ipackets = 0;
1085 		pmd->rxq[i].stats.ibytes = 0;
1086 		pmd->rxq[i].stats.ierrors = 0;
1087 		pmd->rxq[i].stats.rx_nombuf = 0;
1088 
1089 		pmd->txq[i].stats.opackets = 0;
1090 		pmd->txq[i].stats.errs = 0;
1091 		pmd->txq[i].stats.obytes = 0;
1092 	}
1093 
1094 	return 0;
1095 }
1096 
1097 static int
1098 tap_dev_close(struct rte_eth_dev *dev)
1099 {
1100 	int i;
1101 	struct pmd_internals *internals = dev->data->dev_private;
1102 	struct pmd_process_private *process_private = dev->process_private;
1103 	struct rx_queue *rxq;
1104 
1105 	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
1106 		rte_free(dev->process_private);
1107 		return 0;
1108 	}
1109 
1110 	tap_link_set_down(dev);
1111 	if (internals->nlsk_fd != -1) {
1112 		tap_flow_flush(dev, NULL);
1113 		tap_flow_implicit_flush(internals, NULL);
1114 		tap_nl_final(internals->nlsk_fd);
1115 		internals->nlsk_fd = -1;
1116 	}
1117 
1118 	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1119 		if (process_private->rxq_fds[i] != -1) {
1120 			rxq = &internals->rxq[i];
1121 			close(process_private->rxq_fds[i]);
1122 			process_private->rxq_fds[i] = -1;
1123 			tap_rxq_pool_free(rxq->pool);
1124 			rte_free(rxq->iovecs);
1125 			rxq->pool = NULL;
1126 			rxq->iovecs = NULL;
1127 		}
1128 		if (process_private->txq_fds[i] != -1) {
1129 			close(process_private->txq_fds[i]);
1130 			process_private->txq_fds[i] = -1;
1131 		}
1132 	}
1133 
1134 	if (internals->remote_if_index) {
1135 		/* Restore initial remote state */
1136 		ioctl(internals->ioctl_sock, SIOCSIFFLAGS,
1137 				&internals->remote_initial_flags);
1138 	}
1139 
1140 	rte_mempool_free(internals->gso_ctx_mp);
1141 	internals->gso_ctx_mp = NULL;
1142 
1143 	if (internals->ka_fd != -1) {
1144 		close(internals->ka_fd);
1145 		internals->ka_fd = -1;
1146 	}
1147 
1148 	/* mac_addrs must not be freed alone because part of dev_private */
1149 	dev->data->mac_addrs = NULL;
1150 
1151 	internals = dev->data->dev_private;
1152 	TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u",
1153 		tuntap_types[internals->type], rte_socket_id());
1154 
1155 	if (internals->ioctl_sock != -1) {
1156 		close(internals->ioctl_sock);
1157 		internals->ioctl_sock = -1;
1158 	}
1159 	rte_free(dev->process_private);
1160 	if (tap_devices_count == 1)
1161 		rte_mp_action_unregister(TAP_MP_KEY);
1162 	tap_devices_count--;
1163 	/*
1164 	 * Since TUN device has no more opened file descriptors
1165 	 * it will be removed from kernel
1166 	 */
1167 
1168 	return 0;
1169 }
1170 
1171 static void
1172 tap_rx_queue_release(void *queue)
1173 {
1174 	struct rx_queue *rxq = queue;
1175 	struct pmd_process_private *process_private;
1176 
1177 	if (!rxq)
1178 		return;
1179 	process_private = rte_eth_devices[rxq->in_port].process_private;
1180 	if (process_private->rxq_fds[rxq->queue_id] != -1) {
1181 		close(process_private->rxq_fds[rxq->queue_id]);
1182 		process_private->rxq_fds[rxq->queue_id] = -1;
1183 		tap_rxq_pool_free(rxq->pool);
1184 		rte_free(rxq->iovecs);
1185 		rxq->pool = NULL;
1186 		rxq->iovecs = NULL;
1187 	}
1188 }
1189 
1190 static void
1191 tap_tx_queue_release(void *queue)
1192 {
1193 	struct tx_queue *txq = queue;
1194 	struct pmd_process_private *process_private;
1195 
1196 	if (!txq)
1197 		return;
1198 	process_private = rte_eth_devices[txq->out_port].process_private;
1199 
1200 	if (process_private->txq_fds[txq->queue_id] != -1) {
1201 		close(process_private->txq_fds[txq->queue_id]);
1202 		process_private->txq_fds[txq->queue_id] = -1;
1203 	}
1204 }
1205 
1206 static int
1207 tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
1208 {
1209 	struct rte_eth_link *dev_link = &dev->data->dev_link;
1210 	struct pmd_internals *pmd = dev->data->dev_private;
1211 	struct ifreq ifr = { .ifr_flags = 0 };
1212 
1213 	if (pmd->remote_if_index) {
1214 		tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
1215 		if (!(ifr.ifr_flags & IFF_UP) ||
1216 		    !(ifr.ifr_flags & IFF_RUNNING)) {
1217 			dev_link->link_status = ETH_LINK_DOWN;
1218 			return 0;
1219 		}
1220 	}
1221 	tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
1222 	dev_link->link_status =
1223 		((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
1224 		 ETH_LINK_UP :
1225 		 ETH_LINK_DOWN);
1226 	return 0;
1227 }
1228 
1229 static int
1230 tap_promisc_enable(struct rte_eth_dev *dev)
1231 {
1232 	struct pmd_internals *pmd = dev->data->dev_private;
1233 	struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
1234 	int ret;
1235 
1236 	ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
1237 	if (ret != 0)
1238 		return ret;
1239 
1240 	if (pmd->remote_if_index && !pmd->flow_isolate) {
1241 		dev->data->promiscuous = 1;
1242 		ret = tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
1243 		if (ret != 0) {
1244 			/* Rollback promisc flag */
1245 			tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1246 			/*
1247 			 * rte_eth_dev_promiscuous_enable() rollback
1248 			 * dev->data->promiscuous in the case of failure.
1249 			 */
1250 			return ret;
1251 		}
1252 	}
1253 
1254 	return 0;
1255 }
1256 
1257 static int
1258 tap_promisc_disable(struct rte_eth_dev *dev)
1259 {
1260 	struct pmd_internals *pmd = dev->data->dev_private;
1261 	struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
1262 	int ret;
1263 
1264 	ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1265 	if (ret != 0)
1266 		return ret;
1267 
1268 	if (pmd->remote_if_index && !pmd->flow_isolate) {
1269 		dev->data->promiscuous = 0;
1270 		ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
1271 		if (ret != 0) {
1272 			/* Rollback promisc flag */
1273 			tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
1274 			/*
1275 			 * rte_eth_dev_promiscuous_disable() rollback
1276 			 * dev->data->promiscuous in the case of failure.
1277 			 */
1278 			return ret;
1279 		}
1280 	}
1281 
1282 	return 0;
1283 }
1284 
1285 static int
1286 tap_allmulti_enable(struct rte_eth_dev *dev)
1287 {
1288 	struct pmd_internals *pmd = dev->data->dev_private;
1289 	struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
1290 	int ret;
1291 
1292 	ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
1293 	if (ret != 0)
1294 		return ret;
1295 
1296 	if (pmd->remote_if_index && !pmd->flow_isolate) {
1297 		dev->data->all_multicast = 1;
1298 		ret = tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
1299 		if (ret != 0) {
1300 			/* Rollback allmulti flag */
1301 			tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1302 			/*
1303 			 * rte_eth_dev_allmulticast_enable() rollback
1304 			 * dev->data->all_multicast in the case of failure.
1305 			 */
1306 			return ret;
1307 		}
1308 	}
1309 
1310 	return 0;
1311 }
1312 
1313 static int
1314 tap_allmulti_disable(struct rte_eth_dev *dev)
1315 {
1316 	struct pmd_internals *pmd = dev->data->dev_private;
1317 	struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
1318 	int ret;
1319 
1320 	ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1321 	if (ret != 0)
1322 		return ret;
1323 
1324 	if (pmd->remote_if_index && !pmd->flow_isolate) {
1325 		dev->data->all_multicast = 0;
1326 		ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
1327 		if (ret != 0) {
1328 			/* Rollback allmulti flag */
1329 			tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
1330 			/*
1331 			 * rte_eth_dev_allmulticast_disable() rollback
1332 			 * dev->data->all_multicast in the case of failure.
1333 			 */
1334 			return ret;
1335 		}
1336 	}
1337 
1338 	return 0;
1339 }
1340 
1341 static int
1342 tap_mac_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr)
1343 {
1344 	struct pmd_internals *pmd = dev->data->dev_private;
1345 	enum ioctl_mode mode = LOCAL_ONLY;
1346 	struct ifreq ifr;
1347 	int ret;
1348 
1349 	if (pmd->type == ETH_TUNTAP_TYPE_TUN) {
1350 		TAP_LOG(ERR, "%s: can't MAC address for TUN",
1351 			dev->device->name);
1352 		return -ENOTSUP;
1353 	}
1354 
1355 	if (rte_is_zero_ether_addr(mac_addr)) {
1356 		TAP_LOG(ERR, "%s: can't set an empty MAC address",
1357 			dev->device->name);
1358 		return -EINVAL;
1359 	}
1360 	/* Check the actual current MAC address on the tap netdevice */
1361 	ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY);
1362 	if (ret < 0)
1363 		return ret;
1364 	if (rte_is_same_ether_addr(
1365 			(struct rte_ether_addr *)&ifr.ifr_hwaddr.sa_data,
1366 			mac_addr))
1367 		return 0;
1368 	/* Check the current MAC address on the remote */
1369 	ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY);
1370 	if (ret < 0)
1371 		return ret;
1372 	if (!rte_is_same_ether_addr(
1373 			(struct rte_ether_addr *)&ifr.ifr_hwaddr.sa_data,
1374 			mac_addr))
1375 		mode = LOCAL_AND_REMOTE;
1376 	ifr.ifr_hwaddr.sa_family = AF_LOCAL;
1377 	rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, RTE_ETHER_ADDR_LEN);
1378 	ret = tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, mode);
1379 	if (ret < 0)
1380 		return ret;
1381 	rte_memcpy(&pmd->eth_addr, mac_addr, RTE_ETHER_ADDR_LEN);
1382 	if (pmd->remote_if_index && !pmd->flow_isolate) {
1383 		/* Replace MAC redirection rule after a MAC change */
1384 		ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC);
1385 		if (ret < 0) {
1386 			TAP_LOG(ERR,
1387 				"%s: Couldn't delete MAC redirection rule",
1388 				dev->device->name);
1389 			return ret;
1390 		}
1391 		ret = tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC);
1392 		if (ret < 0) {
1393 			TAP_LOG(ERR,
1394 				"%s: Couldn't add MAC redirection rule",
1395 				dev->device->name);
1396 			return ret;
1397 		}
1398 	}
1399 
1400 	return 0;
1401 }
1402 
1403 static int
1404 tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
1405 {
1406 	uint32_t gso_types;
1407 	char pool_name[64];
1408 	struct pmd_internals *pmd = dev->data->dev_private;
1409 	int ret;
1410 
1411 	/* initialize GSO context */
1412 	gso_types = DEV_TX_OFFLOAD_TCP_TSO;
1413 	if (!pmd->gso_ctx_mp) {
1414 		/*
1415 		 * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE
1416 		 * bytes size per mbuf use this pool for both direct and
1417 		 * indirect mbufs
1418 		 */
1419 		ret = snprintf(pool_name, sizeof(pool_name), "mp_%s",
1420 				dev->device->name);
1421 		if (ret < 0 || ret >= (int)sizeof(pool_name)) {
1422 			TAP_LOG(ERR,
1423 				"%s: failed to create mbuf pool name for device %s,"
1424 				"device name too long or output error, ret: %d\n",
1425 				pmd->name, dev->device->name, ret);
1426 			return -ENAMETOOLONG;
1427 		}
1428 		pmd->gso_ctx_mp = rte_pktmbuf_pool_create(pool_name,
1429 			TAP_GSO_MBUFS_NUM, TAP_GSO_MBUF_CACHE_SIZE, 0,
1430 			RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
1431 			SOCKET_ID_ANY);
1432 		if (!pmd->gso_ctx_mp) {
1433 			TAP_LOG(ERR,
1434 				"%s: failed to create mbuf pool for device %s\n",
1435 				pmd->name, dev->device->name);
1436 			return -1;
1437 		}
1438 	}
1439 
1440 	gso_ctx->direct_pool = pmd->gso_ctx_mp;
1441 	gso_ctx->indirect_pool = pmd->gso_ctx_mp;
1442 	gso_ctx->gso_types = gso_types;
1443 	gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
1444 	gso_ctx->flag = 0;
1445 
1446 	return 0;
1447 }
1448 
1449 static int
1450 tap_setup_queue(struct rte_eth_dev *dev,
1451 		struct pmd_internals *internals,
1452 		uint16_t qid,
1453 		int is_rx)
1454 {
1455 	int ret;
1456 	int *fd;
1457 	int *other_fd;
1458 	const char *dir;
1459 	struct pmd_internals *pmd = dev->data->dev_private;
1460 	struct pmd_process_private *process_private = dev->process_private;
1461 	struct rx_queue *rx = &internals->rxq[qid];
1462 	struct tx_queue *tx = &internals->txq[qid];
1463 	struct rte_gso_ctx *gso_ctx;
1464 
1465 	if (is_rx) {
1466 		fd = &process_private->rxq_fds[qid];
1467 		other_fd = &process_private->txq_fds[qid];
1468 		dir = "rx";
1469 		gso_ctx = NULL;
1470 	} else {
1471 		fd = &process_private->txq_fds[qid];
1472 		other_fd = &process_private->rxq_fds[qid];
1473 		dir = "tx";
1474 		gso_ctx = &tx->gso_ctx;
1475 	}
1476 	if (*fd != -1) {
1477 		/* fd for this queue already exists */
1478 		TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
1479 			pmd->name, *fd, dir, qid);
1480 		gso_ctx = NULL;
1481 	} else if (*other_fd != -1) {
1482 		/* Only other_fd exists. dup it */
1483 		*fd = dup(*other_fd);
1484 		if (*fd < 0) {
1485 			*fd = -1;
1486 			TAP_LOG(ERR, "%s: dup() failed.", pmd->name);
1487 			return -1;
1488 		}
1489 		TAP_LOG(DEBUG, "%s: dup fd %d for %s queue qid %d (%d)",
1490 			pmd->name, *other_fd, dir, qid, *fd);
1491 	} else {
1492 		/* Both RX and TX fds do not exist (equal -1). Create fd */
1493 		*fd = tun_alloc(pmd, 0);
1494 		if (*fd < 0) {
1495 			*fd = -1; /* restore original value */
1496 			TAP_LOG(ERR, "%s: tun_alloc() failed.", pmd->name);
1497 			return -1;
1498 		}
1499 		TAP_LOG(DEBUG, "%s: add %s queue for qid %d fd %d",
1500 			pmd->name, dir, qid, *fd);
1501 	}
1502 
1503 	tx->mtu = &dev->data->mtu;
1504 	rx->rxmode = &dev->data->dev_conf.rxmode;
1505 	if (gso_ctx) {
1506 		ret = tap_gso_ctx_setup(gso_ctx, dev);
1507 		if (ret)
1508 			return -1;
1509 	}
1510 
1511 	tx->type = pmd->type;
1512 
1513 	return *fd;
1514 }
1515 
1516 static int
1517 tap_rx_queue_setup(struct rte_eth_dev *dev,
1518 		   uint16_t rx_queue_id,
1519 		   uint16_t nb_rx_desc,
1520 		   unsigned int socket_id,
1521 		   const struct rte_eth_rxconf *rx_conf __rte_unused,
1522 		   struct rte_mempool *mp)
1523 {
1524 	struct pmd_internals *internals = dev->data->dev_private;
1525 	struct pmd_process_private *process_private = dev->process_private;
1526 	struct rx_queue *rxq = &internals->rxq[rx_queue_id];
1527 	struct rte_mbuf **tmp = &rxq->pool;
1528 	long iov_max = sysconf(_SC_IOV_MAX);
1529 
1530 	if (iov_max <= 0) {
1531 		TAP_LOG(WARNING,
1532 			"_SC_IOV_MAX is not defined. Using %d as default",
1533 			TAP_IOV_DEFAULT_MAX);
1534 		iov_max = TAP_IOV_DEFAULT_MAX;
1535 	}
1536 	uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
1537 	struct iovec (*iovecs)[nb_desc + 1];
1538 	int data_off = RTE_PKTMBUF_HEADROOM;
1539 	int ret = 0;
1540 	int fd;
1541 	int i;
1542 
1543 	if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
1544 		TAP_LOG(WARNING,
1545 			"nb_rx_queues %d too small or mempool NULL",
1546 			dev->data->nb_rx_queues);
1547 		return -1;
1548 	}
1549 
1550 	rxq->mp = mp;
1551 	rxq->trigger_seen = 1; /* force initial burst */
1552 	rxq->in_port = dev->data->port_id;
1553 	rxq->queue_id = rx_queue_id;
1554 	rxq->nb_rx_desc = nb_desc;
1555 	iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
1556 				    socket_id);
1557 	if (!iovecs) {
1558 		TAP_LOG(WARNING,
1559 			"%s: Couldn't allocate %d RX descriptors",
1560 			dev->device->name, nb_desc);
1561 		return -ENOMEM;
1562 	}
1563 	rxq->iovecs = iovecs;
1564 
1565 	dev->data->rx_queues[rx_queue_id] = rxq;
1566 	fd = tap_setup_queue(dev, internals, rx_queue_id, 1);
1567 	if (fd == -1) {
1568 		ret = fd;
1569 		goto error;
1570 	}
1571 
1572 	(*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
1573 	(*rxq->iovecs)[0].iov_base = &rxq->pi;
1574 
1575 	for (i = 1; i <= nb_desc; i++) {
1576 		*tmp = rte_pktmbuf_alloc(rxq->mp);
1577 		if (!*tmp) {
1578 			TAP_LOG(WARNING,
1579 				"%s: couldn't allocate memory for queue %d",
1580 				dev->device->name, rx_queue_id);
1581 			ret = -ENOMEM;
1582 			goto error;
1583 		}
1584 		(*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
1585 		(*rxq->iovecs)[i].iov_base =
1586 			(char *)(*tmp)->buf_addr + data_off;
1587 		data_off = 0;
1588 		tmp = &(*tmp)->next;
1589 	}
1590 
1591 	TAP_LOG(DEBUG, "  RX TUNTAP device name %s, qid %d on fd %d",
1592 		internals->name, rx_queue_id,
1593 		process_private->rxq_fds[rx_queue_id]);
1594 
1595 	return 0;
1596 
1597 error:
1598 	tap_rxq_pool_free(rxq->pool);
1599 	rxq->pool = NULL;
1600 	rte_free(rxq->iovecs);
1601 	rxq->iovecs = NULL;
1602 	return ret;
1603 }
1604 
1605 static int
1606 tap_tx_queue_setup(struct rte_eth_dev *dev,
1607 		   uint16_t tx_queue_id,
1608 		   uint16_t nb_tx_desc __rte_unused,
1609 		   unsigned int socket_id __rte_unused,
1610 		   const struct rte_eth_txconf *tx_conf)
1611 {
1612 	struct pmd_internals *internals = dev->data->dev_private;
1613 	struct pmd_process_private *process_private = dev->process_private;
1614 	struct tx_queue *txq;
1615 	int ret;
1616 	uint64_t offloads;
1617 
1618 	if (tx_queue_id >= dev->data->nb_tx_queues)
1619 		return -1;
1620 	dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
1621 	txq = dev->data->tx_queues[tx_queue_id];
1622 	txq->out_port = dev->data->port_id;
1623 	txq->queue_id = tx_queue_id;
1624 
1625 	offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
1626 	txq->csum = !!(offloads &
1627 			(DEV_TX_OFFLOAD_IPV4_CKSUM |
1628 			 DEV_TX_OFFLOAD_UDP_CKSUM |
1629 			 DEV_TX_OFFLOAD_TCP_CKSUM));
1630 
1631 	ret = tap_setup_queue(dev, internals, tx_queue_id, 0);
1632 	if (ret == -1)
1633 		return -1;
1634 	TAP_LOG(DEBUG,
1635 		"  TX TUNTAP device name %s, qid %d on fd %d csum %s",
1636 		internals->name, tx_queue_id,
1637 		process_private->txq_fds[tx_queue_id],
1638 		txq->csum ? "on" : "off");
1639 
1640 	return 0;
1641 }
1642 
1643 static int
1644 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1645 {
1646 	struct pmd_internals *pmd = dev->data->dev_private;
1647 	struct ifreq ifr = { .ifr_mtu = mtu };
1648 	int err = 0;
1649 
1650 	err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
1651 	if (!err)
1652 		dev->data->mtu = mtu;
1653 
1654 	return err;
1655 }
1656 
1657 static int
1658 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
1659 		     struct rte_ether_addr *mc_addr_set __rte_unused,
1660 		     uint32_t nb_mc_addr __rte_unused)
1661 {
1662 	/*
1663 	 * Nothing to do actually: the tap has no filtering whatsoever, every
1664 	 * packet is received.
1665 	 */
1666 	return 0;
1667 }
1668 
1669 static int
1670 tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
1671 {
1672 	struct rte_eth_dev *dev = arg;
1673 	struct pmd_internals *pmd = dev->data->dev_private;
1674 	struct ifinfomsg *info = NLMSG_DATA(nh);
1675 
1676 	if (nh->nlmsg_type != RTM_NEWLINK ||
1677 	    (info->ifi_index != pmd->if_index &&
1678 	     info->ifi_index != pmd->remote_if_index))
1679 		return 0;
1680 	return tap_link_update(dev, 0);
1681 }
1682 
1683 static void
1684 tap_dev_intr_handler(void *cb_arg)
1685 {
1686 	struct rte_eth_dev *dev = cb_arg;
1687 	struct pmd_internals *pmd = dev->data->dev_private;
1688 
1689 	tap_nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1690 }
1691 
1692 static int
1693 tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set)
1694 {
1695 	struct pmd_internals *pmd = dev->data->dev_private;
1696 	int ret;
1697 
1698 	/* In any case, disable interrupt if the conf is no longer there. */
1699 	if (!dev->data->dev_conf.intr_conf.lsc) {
1700 		if (pmd->intr_handle.fd != -1) {
1701 			goto clean;
1702 		}
1703 		return 0;
1704 	}
1705 	if (set) {
1706 		pmd->intr_handle.fd = tap_nl_init(RTMGRP_LINK);
1707 		if (unlikely(pmd->intr_handle.fd == -1))
1708 			return -EBADF;
1709 		return rte_intr_callback_register(
1710 			&pmd->intr_handle, tap_dev_intr_handler, dev);
1711 	}
1712 
1713 clean:
1714 	do {
1715 		ret = rte_intr_callback_unregister(&pmd->intr_handle,
1716 			tap_dev_intr_handler, dev);
1717 		if (ret >= 0) {
1718 			break;
1719 		} else if (ret == -EAGAIN) {
1720 			rte_delay_ms(100);
1721 		} else {
1722 			TAP_LOG(ERR, "intr callback unregister failed: %d",
1723 				     ret);
1724 			break;
1725 		}
1726 	} while (true);
1727 
1728 	tap_nl_final(pmd->intr_handle.fd);
1729 	pmd->intr_handle.fd = -1;
1730 
1731 	return 0;
1732 }
1733 
1734 static int
1735 tap_intr_handle_set(struct rte_eth_dev *dev, int set)
1736 {
1737 	int err;
1738 
1739 	err = tap_lsc_intr_handle_set(dev, set);
1740 	if (err < 0) {
1741 		if (!set)
1742 			tap_rx_intr_vec_set(dev, 0);
1743 		return err;
1744 	}
1745 	err = tap_rx_intr_vec_set(dev, set);
1746 	if (err && set)
1747 		tap_lsc_intr_handle_set(dev, 0);
1748 	return err;
1749 }
1750 
1751 static const uint32_t*
1752 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1753 {
1754 	static const uint32_t ptypes[] = {
1755 		RTE_PTYPE_INNER_L2_ETHER,
1756 		RTE_PTYPE_INNER_L2_ETHER_VLAN,
1757 		RTE_PTYPE_INNER_L2_ETHER_QINQ,
1758 		RTE_PTYPE_INNER_L3_IPV4,
1759 		RTE_PTYPE_INNER_L3_IPV4_EXT,
1760 		RTE_PTYPE_INNER_L3_IPV6,
1761 		RTE_PTYPE_INNER_L3_IPV6_EXT,
1762 		RTE_PTYPE_INNER_L4_FRAG,
1763 		RTE_PTYPE_INNER_L4_UDP,
1764 		RTE_PTYPE_INNER_L4_TCP,
1765 		RTE_PTYPE_INNER_L4_SCTP,
1766 		RTE_PTYPE_L2_ETHER,
1767 		RTE_PTYPE_L2_ETHER_VLAN,
1768 		RTE_PTYPE_L2_ETHER_QINQ,
1769 		RTE_PTYPE_L3_IPV4,
1770 		RTE_PTYPE_L3_IPV4_EXT,
1771 		RTE_PTYPE_L3_IPV6_EXT,
1772 		RTE_PTYPE_L3_IPV6,
1773 		RTE_PTYPE_L4_FRAG,
1774 		RTE_PTYPE_L4_UDP,
1775 		RTE_PTYPE_L4_TCP,
1776 		RTE_PTYPE_L4_SCTP,
1777 	};
1778 
1779 	return ptypes;
1780 }
1781 
1782 static int
1783 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1784 		  struct rte_eth_fc_conf *fc_conf)
1785 {
1786 	fc_conf->mode = RTE_FC_NONE;
1787 	return 0;
1788 }
1789 
1790 static int
1791 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1792 		  struct rte_eth_fc_conf *fc_conf)
1793 {
1794 	if (fc_conf->mode != RTE_FC_NONE)
1795 		return -ENOTSUP;
1796 	return 0;
1797 }
1798 
1799 /**
1800  * DPDK callback to update the RSS hash configuration.
1801  *
1802  * @param dev
1803  *   Pointer to Ethernet device structure.
1804  * @param[in] rss_conf
1805  *   RSS configuration data.
1806  *
1807  * @return
1808  *   0 on success, a negative errno value otherwise and rte_errno is set.
1809  */
1810 static int
1811 tap_rss_hash_update(struct rte_eth_dev *dev,
1812 		struct rte_eth_rss_conf *rss_conf)
1813 {
1814 	if (rss_conf->rss_hf & TAP_RSS_HF_MASK) {
1815 		rte_errno = EINVAL;
1816 		return -rte_errno;
1817 	}
1818 	if (rss_conf->rss_key && rss_conf->rss_key_len) {
1819 		/*
1820 		 * Currently TAP RSS key is hard coded
1821 		 * and cannot be updated
1822 		 */
1823 		TAP_LOG(ERR,
1824 			"port %u RSS key cannot be updated",
1825 			dev->data->port_id);
1826 		rte_errno = EINVAL;
1827 		return -rte_errno;
1828 	}
1829 	return 0;
1830 }
1831 
1832 static int
1833 tap_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1834 {
1835 	dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
1836 
1837 	return 0;
1838 }
1839 
1840 static int
1841 tap_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id)
1842 {
1843 	dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
1844 
1845 	return 0;
1846 }
1847 
1848 static int
1849 tap_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1850 {
1851 	dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;
1852 
1853 	return 0;
1854 }
1855 
1856 static int
1857 tap_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id)
1858 {
1859 	dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;
1860 
1861 	return 0;
1862 }
1863 static const struct eth_dev_ops ops = {
1864 	.dev_start              = tap_dev_start,
1865 	.dev_stop               = tap_dev_stop,
1866 	.dev_close              = tap_dev_close,
1867 	.dev_configure          = tap_dev_configure,
1868 	.dev_infos_get          = tap_dev_info,
1869 	.rx_queue_setup         = tap_rx_queue_setup,
1870 	.tx_queue_setup         = tap_tx_queue_setup,
1871 	.rx_queue_start         = tap_rx_queue_start,
1872 	.tx_queue_start         = tap_tx_queue_start,
1873 	.rx_queue_stop          = tap_rx_queue_stop,
1874 	.tx_queue_stop          = tap_tx_queue_stop,
1875 	.rx_queue_release       = tap_rx_queue_release,
1876 	.tx_queue_release       = tap_tx_queue_release,
1877 	.flow_ctrl_get          = tap_flow_ctrl_get,
1878 	.flow_ctrl_set          = tap_flow_ctrl_set,
1879 	.link_update            = tap_link_update,
1880 	.dev_set_link_up        = tap_link_set_up,
1881 	.dev_set_link_down      = tap_link_set_down,
1882 	.promiscuous_enable     = tap_promisc_enable,
1883 	.promiscuous_disable    = tap_promisc_disable,
1884 	.allmulticast_enable    = tap_allmulti_enable,
1885 	.allmulticast_disable   = tap_allmulti_disable,
1886 	.mac_addr_set           = tap_mac_set,
1887 	.mtu_set                = tap_mtu_set,
1888 	.set_mc_addr_list       = tap_set_mc_addr_list,
1889 	.stats_get              = tap_stats_get,
1890 	.stats_reset            = tap_stats_reset,
1891 	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
1892 	.rss_hash_update        = tap_rss_hash_update,
1893 	.flow_ops_get           = tap_dev_flow_ops_get,
1894 };
1895 
1896 static int
1897 eth_dev_tap_create(struct rte_vdev_device *vdev, const char *tap_name,
1898 		   char *remote_iface, struct rte_ether_addr *mac_addr,
1899 		   enum rte_tuntap_type type)
1900 {
1901 	int numa_node = rte_socket_id();
1902 	struct rte_eth_dev *dev;
1903 	struct pmd_internals *pmd;
1904 	struct pmd_process_private *process_private;
1905 	const char *tuntap_name = tuntap_types[type];
1906 	struct rte_eth_dev_data *data;
1907 	struct ifreq ifr;
1908 	int i;
1909 
1910 	TAP_LOG(DEBUG, "%s device on numa %u", tuntap_name, rte_socket_id());
1911 
1912 	dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
1913 	if (!dev) {
1914 		TAP_LOG(ERR, "%s Unable to allocate device struct",
1915 				tuntap_name);
1916 		goto error_exit_nodev;
1917 	}
1918 
1919 	process_private = (struct pmd_process_private *)
1920 		rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
1921 			RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1922 
1923 	if (process_private == NULL) {
1924 		TAP_LOG(ERR, "Failed to alloc memory for process private");
1925 		return -1;
1926 	}
1927 	pmd = dev->data->dev_private;
1928 	dev->process_private = process_private;
1929 	pmd->dev = dev;
1930 	strlcpy(pmd->name, tap_name, sizeof(pmd->name));
1931 	pmd->type = type;
1932 	pmd->ka_fd = -1;
1933 	pmd->nlsk_fd = -1;
1934 	pmd->gso_ctx_mp = NULL;
1935 
1936 	pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
1937 	if (pmd->ioctl_sock == -1) {
1938 		TAP_LOG(ERR,
1939 			"%s Unable to get a socket for management: %s",
1940 			tuntap_name, strerror(errno));
1941 		goto error_exit;
1942 	}
1943 
1944 	/* Setup some default values */
1945 	data = dev->data;
1946 	data->dev_private = pmd;
1947 	data->dev_flags = RTE_ETH_DEV_INTR_LSC |
1948 				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1949 	data->numa_node = numa_node;
1950 
1951 	data->dev_link = pmd_link;
1952 	data->mac_addrs = &pmd->eth_addr;
1953 	/* Set the number of RX and TX queues */
1954 	data->nb_rx_queues = 0;
1955 	data->nb_tx_queues = 0;
1956 
1957 	dev->dev_ops = &ops;
1958 	dev->rx_pkt_burst = pmd_rx_burst;
1959 	dev->tx_pkt_burst = pmd_tx_burst;
1960 
1961 	pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
1962 	pmd->intr_handle.fd = -1;
1963 	dev->intr_handle = &pmd->intr_handle;
1964 
1965 	/* Presetup the fds to -1 as being not valid */
1966 	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1967 		process_private->rxq_fds[i] = -1;
1968 		process_private->txq_fds[i] = -1;
1969 	}
1970 
1971 	if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1972 		if (rte_is_zero_ether_addr(mac_addr))
1973 			rte_eth_random_addr((uint8_t *)&pmd->eth_addr);
1974 		else
1975 			rte_memcpy(&pmd->eth_addr, mac_addr, sizeof(*mac_addr));
1976 	}
1977 
1978 	/*
1979 	 * Allocate a TUN device keep-alive file descriptor that will only be
1980 	 * closed when the TUN device itself is closed or removed.
1981 	 * This keep-alive file descriptor will guarantee that the TUN device
1982 	 * exists even when all of its queues are closed
1983 	 */
1984 	pmd->ka_fd = tun_alloc(pmd, 1);
1985 	if (pmd->ka_fd == -1) {
1986 		TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
1987 		goto error_exit;
1988 	}
1989 	TAP_LOG(DEBUG, "allocated %s", pmd->name);
1990 
1991 	ifr.ifr_mtu = dev->data->mtu;
1992 	if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE) < 0)
1993 		goto error_exit;
1994 
1995 	if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1996 		memset(&ifr, 0, sizeof(struct ifreq));
1997 		ifr.ifr_hwaddr.sa_family = AF_LOCAL;
1998 		rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
1999 				RTE_ETHER_ADDR_LEN);
2000 		if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
2001 			goto error_exit;
2002 	}
2003 
2004 	/*
2005 	 * Set up everything related to rte_flow:
2006 	 * - netlink socket
2007 	 * - tap / remote if_index
2008 	 * - mandatory QDISCs
2009 	 * - rte_flow actual/implicit lists
2010 	 * - implicit rules
2011 	 */
2012 	pmd->nlsk_fd = tap_nl_init(0);
2013 	if (pmd->nlsk_fd == -1) {
2014 		TAP_LOG(WARNING, "%s: failed to create netlink socket.",
2015 			pmd->name);
2016 		goto disable_rte_flow;
2017 	}
2018 	pmd->if_index = if_nametoindex(pmd->name);
2019 	if (!pmd->if_index) {
2020 		TAP_LOG(ERR, "%s: failed to get if_index.", pmd->name);
2021 		goto disable_rte_flow;
2022 	}
2023 	if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
2024 		TAP_LOG(ERR, "%s: failed to create multiq qdisc.",
2025 			pmd->name);
2026 		goto disable_rte_flow;
2027 	}
2028 	if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
2029 		TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
2030 			pmd->name);
2031 		goto disable_rte_flow;
2032 	}
2033 	LIST_INIT(&pmd->flows);
2034 
2035 	if (strlen(remote_iface)) {
2036 		pmd->remote_if_index = if_nametoindex(remote_iface);
2037 		if (!pmd->remote_if_index) {
2038 			TAP_LOG(ERR, "%s: failed to get %s if_index.",
2039 				pmd->name, remote_iface);
2040 			goto error_remote;
2041 		}
2042 		strlcpy(pmd->remote_iface, remote_iface, RTE_ETH_NAME_MAX_LEN);
2043 
2044 		/* Save state of remote device */
2045 		tap_ioctl(pmd, SIOCGIFFLAGS, &pmd->remote_initial_flags, 0, REMOTE_ONLY);
2046 
2047 		/* Replicate remote MAC address */
2048 		if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) {
2049 			TAP_LOG(ERR, "%s: failed to get %s MAC address.",
2050 				pmd->name, pmd->remote_iface);
2051 			goto error_remote;
2052 		}
2053 		rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
2054 			   RTE_ETHER_ADDR_LEN);
2055 		/* The desired MAC is already in ifreq after SIOCGIFHWADDR. */
2056 		if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) {
2057 			TAP_LOG(ERR, "%s: failed to get %s MAC address.",
2058 				pmd->name, remote_iface);
2059 			goto error_remote;
2060 		}
2061 
2062 		/*
2063 		 * Flush usually returns negative value because it tries to
2064 		 * delete every QDISC (and on a running device, one QDISC at
2065 		 * least is needed). Ignore negative return value.
2066 		 */
2067 		qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
2068 		if (qdisc_create_ingress(pmd->nlsk_fd,
2069 					 pmd->remote_if_index) < 0) {
2070 			TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
2071 				pmd->remote_iface);
2072 			goto error_remote;
2073 		}
2074 		LIST_INIT(&pmd->implicit_flows);
2075 		if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0 ||
2076 		    tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 ||
2077 		    tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 ||
2078 		    tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) {
2079 			TAP_LOG(ERR,
2080 				"%s: failed to create implicit rules.",
2081 				pmd->name);
2082 			goto error_remote;
2083 		}
2084 	}
2085 
2086 	rte_eth_dev_probing_finish(dev);
2087 	return 0;
2088 
2089 disable_rte_flow:
2090 	TAP_LOG(ERR, " Disabling rte flow support: %s(%d)",
2091 		strerror(errno), errno);
2092 	if (strlen(remote_iface)) {
2093 		TAP_LOG(ERR, "Remote feature requires flow support.");
2094 		goto error_exit;
2095 	}
2096 	rte_eth_dev_probing_finish(dev);
2097 	return 0;
2098 
2099 error_remote:
2100 	TAP_LOG(ERR, " Can't set up remote feature: %s(%d)",
2101 		strerror(errno), errno);
2102 	tap_flow_implicit_flush(pmd, NULL);
2103 
2104 error_exit:
2105 	if (pmd->nlsk_fd != -1)
2106 		close(pmd->nlsk_fd);
2107 	if (pmd->ka_fd != -1)
2108 		close(pmd->ka_fd);
2109 	if (pmd->ioctl_sock != -1)
2110 		close(pmd->ioctl_sock);
2111 	/* mac_addrs must not be freed alone because part of dev_private */
2112 	dev->data->mac_addrs = NULL;
2113 	rte_eth_dev_release_port(dev);
2114 
2115 error_exit_nodev:
2116 	TAP_LOG(ERR, "%s Unable to initialize %s",
2117 		tuntap_name, rte_vdev_device_name(vdev));
2118 
2119 	return -EINVAL;
2120 }
2121 
2122 /* make sure name is a possible Linux network device name */
2123 static bool
2124 is_valid_iface(const char *name)
2125 {
2126 	if (*name == '\0')
2127 		return false;
2128 
2129 	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
2130 		return false;
2131 
2132 	while (*name) {
2133 		if (*name == '/' || *name == ':' || isspace(*name))
2134 			return false;
2135 		name++;
2136 	}
2137 	return true;
2138 }
2139 
2140 static int
2141 set_interface_name(const char *key __rte_unused,
2142 		   const char *value,
2143 		   void *extra_args)
2144 {
2145 	char *name = (char *)extra_args;
2146 
2147 	if (value) {
2148 		if (!is_valid_iface(value)) {
2149 			TAP_LOG(ERR, "TAP invalid remote interface name (%s)",
2150 				value);
2151 			return -1;
2152 		}
2153 		strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
2154 	} else {
2155 		/* use tap%d which causes kernel to choose next available */
2156 		strlcpy(name, DEFAULT_TAP_NAME "%d", RTE_ETH_NAME_MAX_LEN);
2157 	}
2158 	return 0;
2159 }
2160 
2161 static int
2162 set_remote_iface(const char *key __rte_unused,
2163 		 const char *value,
2164 		 void *extra_args)
2165 {
2166 	char *name = (char *)extra_args;
2167 
2168 	if (value) {
2169 		if (!is_valid_iface(value)) {
2170 			TAP_LOG(ERR, "TAP invalid remote interface name (%s)",
2171 				value);
2172 			return -1;
2173 		}
2174 		strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
2175 	}
2176 
2177 	return 0;
2178 }
2179 
2180 static int parse_user_mac(struct rte_ether_addr *user_mac,
2181 		const char *value)
2182 {
2183 	unsigned int index = 0;
2184 	char mac_temp[strlen(ETH_TAP_USR_MAC_FMT) + 1], *mac_byte = NULL;
2185 
2186 	if (user_mac == NULL || value == NULL)
2187 		return 0;
2188 
2189 	strlcpy(mac_temp, value, sizeof(mac_temp));
2190 	mac_byte = strtok(mac_temp, ":");
2191 
2192 	while ((mac_byte != NULL) &&
2193 			(strlen(mac_byte) <= 2) &&
2194 			(strlen(mac_byte) == strspn(mac_byte,
2195 					ETH_TAP_CMP_MAC_FMT))) {
2196 		user_mac->addr_bytes[index++] = strtoul(mac_byte, NULL, 16);
2197 		mac_byte = strtok(NULL, ":");
2198 	}
2199 
2200 	return index;
2201 }
2202 
2203 static int
2204 set_mac_type(const char *key __rte_unused,
2205 	     const char *value,
2206 	     void *extra_args)
2207 {
2208 	struct rte_ether_addr *user_mac = extra_args;
2209 
2210 	if (!value)
2211 		return 0;
2212 
2213 	if (!strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED))) {
2214 		static int iface_idx;
2215 
2216 		/* fixed mac = 00:64:74:61:70:<iface_idx> */
2217 		memcpy((char *)user_mac->addr_bytes, "\0dtap",
2218 			RTE_ETHER_ADDR_LEN);
2219 		user_mac->addr_bytes[RTE_ETHER_ADDR_LEN - 1] =
2220 			iface_idx++ + '0';
2221 		goto success;
2222 	}
2223 
2224 	if (parse_user_mac(user_mac, value) != 6)
2225 		goto error;
2226 success:
2227 	TAP_LOG(DEBUG, "TAP user MAC param (%s)", value);
2228 	return 0;
2229 
2230 error:
2231 	TAP_LOG(ERR, "TAP user MAC (%s) is not in format (%s|%s)",
2232 		value, ETH_TAP_MAC_FIXED, ETH_TAP_USR_MAC_FMT);
2233 	return -1;
2234 }
2235 
2236 /*
2237  * Open a TUN interface device. TUN PMD
2238  * 1) sets tap_type as false
2239  * 2) intakes iface as argument.
2240  * 3) as interface is virtual set speed to 10G
2241  */
2242 static int
2243 rte_pmd_tun_probe(struct rte_vdev_device *dev)
2244 {
2245 	const char *name, *params;
2246 	int ret;
2247 	struct rte_kvargs *kvlist = NULL;
2248 	char tun_name[RTE_ETH_NAME_MAX_LEN];
2249 	char remote_iface[RTE_ETH_NAME_MAX_LEN];
2250 	struct rte_eth_dev *eth_dev;
2251 
2252 	name = rte_vdev_device_name(dev);
2253 	params = rte_vdev_device_args(dev);
2254 	memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
2255 
2256 	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
2257 	    strlen(params) == 0) {
2258 		eth_dev = rte_eth_dev_attach_secondary(name);
2259 		if (!eth_dev) {
2260 			TAP_LOG(ERR, "Failed to probe %s", name);
2261 			return -1;
2262 		}
2263 		eth_dev->dev_ops = &ops;
2264 		eth_dev->device = &dev->device;
2265 		rte_eth_dev_probing_finish(eth_dev);
2266 		return 0;
2267 	}
2268 
2269 	/* use tun%d which causes kernel to choose next available */
2270 	strlcpy(tun_name, DEFAULT_TUN_NAME "%d", RTE_ETH_NAME_MAX_LEN);
2271 
2272 	if (params && (params[0] != '\0')) {
2273 		TAP_LOG(DEBUG, "parameters (%s)", params);
2274 
2275 		kvlist = rte_kvargs_parse(params, valid_arguments);
2276 		if (kvlist) {
2277 			if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
2278 				ret = rte_kvargs_process(kvlist,
2279 					ETH_TAP_IFACE_ARG,
2280 					&set_interface_name,
2281 					tun_name);
2282 
2283 				if (ret == -1)
2284 					goto leave;
2285 			}
2286 		}
2287 	}
2288 	pmd_link.link_speed = ETH_SPEED_NUM_10G;
2289 
2290 	TAP_LOG(DEBUG, "Initializing pmd_tun for %s", name);
2291 
2292 	ret = eth_dev_tap_create(dev, tun_name, remote_iface, 0,
2293 				 ETH_TUNTAP_TYPE_TUN);
2294 
2295 leave:
2296 	if (ret == -1) {
2297 		TAP_LOG(ERR, "Failed to create pmd for %s as %s",
2298 			name, tun_name);
2299 	}
2300 	rte_kvargs_free(kvlist);
2301 
2302 	return ret;
2303 }
2304 
2305 /* Request queue file descriptors from secondary to primary. */
2306 static int
2307 tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
2308 {
2309 	int ret;
2310 	struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
2311 	struct rte_mp_msg request, *reply;
2312 	struct rte_mp_reply replies;
2313 	struct ipc_queues *request_param = (struct ipc_queues *)request.param;
2314 	struct ipc_queues *reply_param;
2315 	struct pmd_process_private *process_private = dev->process_private;
2316 	int queue, fd_iterator;
2317 
2318 	/* Prepare the request */
2319 	memset(&request, 0, sizeof(request));
2320 	strlcpy(request.name, TAP_MP_KEY, sizeof(request.name));
2321 	strlcpy(request_param->port_name, port_name,
2322 		sizeof(request_param->port_name));
2323 	request.len_param = sizeof(*request_param);
2324 	/* Send request and receive reply */
2325 	ret = rte_mp_request_sync(&request, &replies, &timeout);
2326 	if (ret < 0 || replies.nb_received != 1) {
2327 		TAP_LOG(ERR, "Failed to request queues from primary: %d",
2328 			rte_errno);
2329 		return -1;
2330 	}
2331 	reply = &replies.msgs[0];
2332 	reply_param = (struct ipc_queues *)reply->param;
2333 	TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
2334 
2335 	/* Attach the queues from received file descriptors */
2336 	if (reply_param->rxq_count + reply_param->txq_count != reply->num_fds) {
2337 		TAP_LOG(ERR, "Unexpected number of fds received");
2338 		return -1;
2339 	}
2340 
2341 	dev->data->nb_rx_queues = reply_param->rxq_count;
2342 	dev->data->nb_tx_queues = reply_param->txq_count;
2343 	fd_iterator = 0;
2344 	for (queue = 0; queue < reply_param->rxq_count; queue++)
2345 		process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
2346 	for (queue = 0; queue < reply_param->txq_count; queue++)
2347 		process_private->txq_fds[queue] = reply->fds[fd_iterator++];
2348 	free(reply);
2349 	return 0;
2350 }
2351 
2352 /* Send the queue file descriptors from the primary process to secondary. */
2353 static int
2354 tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
2355 {
2356 	struct rte_eth_dev *dev;
2357 	struct pmd_process_private *process_private;
2358 	struct rte_mp_msg reply;
2359 	const struct ipc_queues *request_param =
2360 		(const struct ipc_queues *)request->param;
2361 	struct ipc_queues *reply_param =
2362 		(struct ipc_queues *)reply.param;
2363 	uint16_t port_id;
2364 	int queue;
2365 	int ret;
2366 
2367 	/* Get requested port */
2368 	TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
2369 	ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
2370 	if (ret) {
2371 		TAP_LOG(ERR, "Failed to get port id for %s",
2372 			request_param->port_name);
2373 		return -1;
2374 	}
2375 	dev = &rte_eth_devices[port_id];
2376 	process_private = dev->process_private;
2377 
2378 	/* Fill file descriptors for all queues */
2379 	reply.num_fds = 0;
2380 	reply_param->rxq_count = 0;
2381 	if (dev->data->nb_rx_queues + dev->data->nb_tx_queues >
2382 			RTE_MP_MAX_FD_NUM){
2383 		TAP_LOG(ERR, "Number of rx/tx queues exceeds max number of fds");
2384 		return -1;
2385 	}
2386 
2387 	for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
2388 		reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
2389 		reply_param->rxq_count++;
2390 	}
2391 	RTE_ASSERT(reply_param->rxq_count == dev->data->nb_rx_queues);
2392 
2393 	reply_param->txq_count = 0;
2394 	for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
2395 		reply.fds[reply.num_fds++] = process_private->txq_fds[queue];
2396 		reply_param->txq_count++;
2397 	}
2398 	RTE_ASSERT(reply_param->txq_count == dev->data->nb_tx_queues);
2399 
2400 	/* Send reply */
2401 	strlcpy(reply.name, request->name, sizeof(reply.name));
2402 	strlcpy(reply_param->port_name, request_param->port_name,
2403 		sizeof(reply_param->port_name));
2404 	reply.len_param = sizeof(*reply_param);
2405 	if (rte_mp_reply(&reply, peer) < 0) {
2406 		TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
2407 		return -1;
2408 	}
2409 	return 0;
2410 }
2411 
2412 /* Open a TAP interface device.
2413  */
2414 static int
2415 rte_pmd_tap_probe(struct rte_vdev_device *dev)
2416 {
2417 	const char *name, *params;
2418 	int ret;
2419 	struct rte_kvargs *kvlist = NULL;
2420 	int speed;
2421 	char tap_name[RTE_ETH_NAME_MAX_LEN];
2422 	char remote_iface[RTE_ETH_NAME_MAX_LEN];
2423 	struct rte_ether_addr user_mac = { .addr_bytes = {0} };
2424 	struct rte_eth_dev *eth_dev;
2425 	int tap_devices_count_increased = 0;
2426 
2427 	name = rte_vdev_device_name(dev);
2428 	params = rte_vdev_device_args(dev);
2429 
2430 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
2431 		eth_dev = rte_eth_dev_attach_secondary(name);
2432 		if (!eth_dev) {
2433 			TAP_LOG(ERR, "Failed to probe %s", name);
2434 			return -1;
2435 		}
2436 		eth_dev->dev_ops = &ops;
2437 		eth_dev->device = &dev->device;
2438 		eth_dev->rx_pkt_burst = pmd_rx_burst;
2439 		eth_dev->tx_pkt_burst = pmd_tx_burst;
2440 		if (!rte_eal_primary_proc_alive(NULL)) {
2441 			TAP_LOG(ERR, "Primary process is missing");
2442 			return -1;
2443 		}
2444 		eth_dev->process_private = (struct pmd_process_private *)
2445 			rte_zmalloc_socket(name,
2446 				sizeof(struct pmd_process_private),
2447 				RTE_CACHE_LINE_SIZE,
2448 				eth_dev->device->numa_node);
2449 		if (eth_dev->process_private == NULL) {
2450 			TAP_LOG(ERR,
2451 				"Failed to alloc memory for process private");
2452 			return -1;
2453 		}
2454 
2455 		ret = tap_mp_attach_queues(name, eth_dev);
2456 		if (ret != 0)
2457 			return -1;
2458 		rte_eth_dev_probing_finish(eth_dev);
2459 		return 0;
2460 	}
2461 
2462 	speed = ETH_SPEED_NUM_10G;
2463 
2464 	/* use tap%d which causes kernel to choose next available */
2465 	strlcpy(tap_name, DEFAULT_TAP_NAME "%d", RTE_ETH_NAME_MAX_LEN);
2466 	memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
2467 
2468 	if (params && (params[0] != '\0')) {
2469 		TAP_LOG(DEBUG, "parameters (%s)", params);
2470 
2471 		kvlist = rte_kvargs_parse(params, valid_arguments);
2472 		if (kvlist) {
2473 			if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
2474 				ret = rte_kvargs_process(kvlist,
2475 							 ETH_TAP_IFACE_ARG,
2476 							 &set_interface_name,
2477 							 tap_name);
2478 				if (ret == -1)
2479 					goto leave;
2480 			}
2481 
2482 			if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
2483 				ret = rte_kvargs_process(kvlist,
2484 							 ETH_TAP_REMOTE_ARG,
2485 							 &set_remote_iface,
2486 							 remote_iface);
2487 				if (ret == -1)
2488 					goto leave;
2489 			}
2490 
2491 			if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
2492 				ret = rte_kvargs_process(kvlist,
2493 							 ETH_TAP_MAC_ARG,
2494 							 &set_mac_type,
2495 							 &user_mac);
2496 				if (ret == -1)
2497 					goto leave;
2498 			}
2499 		}
2500 	}
2501 	pmd_link.link_speed = speed;
2502 
2503 	TAP_LOG(DEBUG, "Initializing pmd_tap for %s", name);
2504 
2505 	/* Register IPC feed callback */
2506 	if (!tap_devices_count) {
2507 		ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
2508 		if (ret < 0 && rte_errno != ENOTSUP) {
2509 			TAP_LOG(ERR, "tap: Failed to register IPC callback: %s",
2510 				strerror(rte_errno));
2511 			goto leave;
2512 		}
2513 	}
2514 	tap_devices_count++;
2515 	tap_devices_count_increased = 1;
2516 	ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
2517 		ETH_TUNTAP_TYPE_TAP);
2518 
2519 leave:
2520 	if (ret == -1) {
2521 		TAP_LOG(ERR, "Failed to create pmd for %s as %s",
2522 			name, tap_name);
2523 		if (tap_devices_count_increased == 1) {
2524 			if (tap_devices_count == 1)
2525 				rte_mp_action_unregister(TAP_MP_KEY);
2526 			tap_devices_count--;
2527 		}
2528 	}
2529 	rte_kvargs_free(kvlist);
2530 
2531 	return ret;
2532 }
2533 
2534 /* detach a TUNTAP device.
2535  */
2536 static int
2537 rte_pmd_tap_remove(struct rte_vdev_device *dev)
2538 {
2539 	struct rte_eth_dev *eth_dev = NULL;
2540 
2541 	/* find the ethdev entry */
2542 	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
2543 	if (!eth_dev)
2544 		return 0;
2545 
2546 	tap_dev_close(eth_dev);
2547 	rte_eth_dev_release_port(eth_dev);
2548 
2549 	return 0;
2550 }
2551 
2552 static struct rte_vdev_driver pmd_tun_drv = {
2553 	.probe = rte_pmd_tun_probe,
2554 	.remove = rte_pmd_tap_remove,
2555 };
2556 
2557 static struct rte_vdev_driver pmd_tap_drv = {
2558 	.probe = rte_pmd_tap_probe,
2559 	.remove = rte_pmd_tap_remove,
2560 };
2561 
2562 RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
2563 RTE_PMD_REGISTER_VDEV(net_tun, pmd_tun_drv);
2564 RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
2565 RTE_PMD_REGISTER_PARAM_STRING(net_tun,
2566 			      ETH_TAP_IFACE_ARG "=<string> ");
2567 RTE_PMD_REGISTER_PARAM_STRING(net_tap,
2568 			      ETH_TAP_IFACE_ARG "=<string> "
2569 			      ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_ARG_FMT " "
2570 			      ETH_TAP_REMOTE_ARG "=<string>");
2571 RTE_LOG_REGISTER(tap_logtype, pmd.net.tap, NOTICE);
2572