xref: /dpdk/drivers/net/af_packet/rte_eth_af_packet.c (revision 6b3246245507e5257a17c1684daed27d8a82fc7a)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2014 John W. Linville <linville@tuxdriver.com>
3  * Originally based upon librte_pmd_pcap code:
4  * Copyright(c) 2010-2015 Intel Corporation.
5  * Copyright(c) 2014 6WIND S.A.
6  * All rights reserved.
7  */
8 
9 #include <rte_common.h>
10 #include <rte_string_fns.h>
11 #include <rte_mbuf.h>
12 #include <ethdev_driver.h>
13 #include <ethdev_vdev.h>
14 #include <rte_malloc.h>
15 #include <rte_kvargs.h>
16 #include <bus_vdev_driver.h>
17 
18 #include <errno.h>
19 #include <linux/if_ether.h>
20 #include <linux/if_packet.h>
21 #include <arpa/inet.h>
22 #include <net/if.h>
23 #include <net/if_arp.h>
24 #include <sys/types.h>
25 #include <sys/socket.h>
26 #include <sys/ioctl.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <sys/mman.h>
30 #include <unistd.h>
31 #include <poll.h>
32 
/* devargs keys accepted by the af_packet vdev (e.g. --vdev=eth_af_packet0,iface=eth0) */
#define ETH_AF_PACKET_IFACE_ARG		"iface"
#define ETH_AF_PACKET_NUM_Q_ARG		"qpairs"
#define ETH_AF_PACKET_BLOCKSIZE_ARG	"blocksz"
#define ETH_AF_PACKET_FRAMESIZE_ARG	"framesz"
#define ETH_AF_PACKET_FRAMECOUNT_ARG	"framecnt"
#define ETH_AF_PACKET_QDISC_BYPASS_ARG	"qdisc_bypass"
#define ETH_AF_PACKET_FANOUT_MODE_ARG	"fanout_mode"

/* defaults for the TPACKET ring geometry when not given via devargs */
#define DFLT_FRAME_SIZE		(1 << 11)
#define DFLT_FRAME_COUNT	(1 << 9)

/* mbuf dynamic flag/field for the Rx timestamp offload; registered in
 * eth_dev_start() when RTE_ETH_RX_OFFLOAD_TIMESTAMP is enabled. */
static uint64_t timestamp_dynflag;
static int timestamp_dynfield_offset = -1;
46 
/* Per-queue Rx state: one mmap'ed TPACKET_V2 Rx ring plus counters. */
struct __rte_cache_aligned pkt_rx_queue {
	int sockfd;                     /* AF_PACKET socket; shared with the Tx queue pair */

	struct iovec *rd;               /* per-frame pointers into the mapped ring */
	uint8_t *map;                   /* mmap'ed ring memory (Rx half of the mapping) */
	unsigned int framecount;        /* number of frames in the ring */
	unsigned int framenum;          /* index of the next frame to consume */

	struct rte_mempool *mb_pool;    /* pool Rx mbufs are allocated from */
	uint16_t in_port;               /* port id stamped into mbuf->port */
	uint8_t vlan_strip;             /* copy of the port-level VLAN-strip setting */
	uint8_t timestamp_offloading;   /* copy of the port-level timestamp setting */

	volatile unsigned long rx_pkts;
	volatile unsigned long rx_bytes;
	volatile unsigned long rx_nombuf;
	volatile unsigned long rx_dropped_pkts; /* accumulated kernel drop counter */
};
65 
/* Per-queue Tx state: one mmap'ed TPACKET_V2 Tx ring plus counters. */
struct __rte_cache_aligned pkt_tx_queue {
	int sockfd;                     /* AF_PACKET socket; shared with the Rx queue pair */
	unsigned int frame_data_size;   /* payload bytes available per ring frame */

	struct iovec *rd;               /* per-frame pointers into the mapped ring */
	uint8_t *map;                   /* Tx half of the queue's shared mapping */
	unsigned int framecount;        /* number of frames in the ring */
	unsigned int framenum;          /* index of the next frame to fill */

	volatile unsigned long tx_pkts;
	volatile unsigned long err_pkts;
	volatile unsigned long tx_bytes;
};
79 
/* Device-private data shared by all queues of one af_packet port. */
struct pmd_internals {
	unsigned nb_queues;             /* number of Rx/Tx queue pairs */

	int if_index;                   /* kernel ifindex of the backing interface */
	char *if_name;                  /* strdup'ed interface name (freed on close) */
	struct rte_ether_addr eth_addr; /* MAC address read from the interface */

	struct tpacket_req req;         /* ring geometry used for every queue */

	struct pkt_rx_queue *rx_queue;  /* array of nb_queues Rx queues */
	struct pkt_tx_queue *tx_queue;  /* array of nb_queues Tx queues */
	uint8_t vlan_strip;             /* RTE_ETH_RX_OFFLOAD_VLAN_STRIP requested */
	uint8_t timestamp_offloading;   /* RTE_ETH_RX_OFFLOAD_TIMESTAMP requested */
};
94 
/* NULL-terminated list of devargs keys recognized by this driver. */
static const char *valid_arguments[] = {
	ETH_AF_PACKET_IFACE_ARG,
	ETH_AF_PACKET_NUM_Q_ARG,
	ETH_AF_PACKET_BLOCKSIZE_ARG,
	ETH_AF_PACKET_FRAMESIZE_ARG,
	ETH_AF_PACKET_FRAMECOUNT_ARG,
	ETH_AF_PACKET_QDISC_BYPASS_ARG,
	ETH_AF_PACKET_FANOUT_MODE_ARG,
	NULL
};
105 
/* Initial link state template; real status comes from eth_link_update(). */
static struct rte_eth_link pmd_link = {
	.link_speed = RTE_ETH_SPEED_NUM_10G,
	.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
	.link_status = RTE_ETH_LINK_DOWN,
	.link_autoneg = RTE_ETH_LINK_FIXED,
};
112 
RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE);
#define RTE_LOGTYPE_AFPACKET af_packet_logtype

/* Log with the calling function's name as prefix. */
#define PMD_LOG(level, ...) \
	RTE_LOG_LINE_PREFIX(level, AFPACKET, "%s(): ", __func__, __VA_ARGS__)

/* Like PMD_LOG, but appends strerror(errno) to the message. */
#define PMD_LOG_ERRNO(level, fmt, ...) \
	RTE_LOG_LINE(level, AFPACKET, "%s(): " fmt ":%s", __func__, \
		## __VA_ARGS__, strerror(errno))
122 
/*
 * Rx burst callback: copy up to nb_pkts frames from the TPACKET_V2 Rx
 * ring into newly allocated mbufs.
 *
 * Stops early when the ring holds no more user-owned frames
 * (TP_STATUS_USER clear) or when mbuf allocation fails (counted in
 * rx_nombuf). Returns the number of mbufs written to bufs[].
 */
static uint16_t
eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	unsigned i;
	struct tpacket2_hdr *ppd;
	struct rte_mbuf *mbuf;
	uint8_t *pbuf;
	struct pkt_rx_queue *pkt_q = queue;
	uint16_t num_rx = 0;
	unsigned long num_rx_bytes = 0;
	unsigned int framecount, framenum;

	if (unlikely(nb_pkts == 0))
		return 0;

	/*
	 * Reads the given number of packets from the AF_PACKET socket one by
	 * one and copies the packet data into a newly allocated mbuf.
	 */
	framecount = pkt_q->framecount;
	framenum = pkt_q->framenum;
	for (i = 0; i < nb_pkts; i++) {
		/* point at the next incoming frame */
		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
		if ((ppd->tp_status & TP_STATUS_USER) == 0)
			break;

		/* allocate the next mbuf */
		mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool);
		if (unlikely(mbuf == NULL)) {
			pkt_q->rx_nombuf++;
			break;
		}

		/* packet will fit in the mbuf, go ahead and receive it
		 * (frame size was validated against the mbuf data room in
		 * eth_rx_queue_setup()) */
		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen;
		pbuf = (uint8_t *) ppd + ppd->tp_mac;
		memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf));

		/* check for vlan info; the kernel always strips the tag, so
		 * reinsert it into the data when the offload is disabled */
		if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
			mbuf->vlan_tci = ppd->tp_vlan_tci;
			mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);

			if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf))
				PMD_LOG(ERR, "Failed to reinsert VLAN tag");
		}

		/* add kernel provided timestamp when offloading is enabled */
		if (pkt_q->timestamp_offloading) {
			/* since TPACKET_V2 timestamps are provided in nanoseconds resolution */
			*RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset,
				rte_mbuf_timestamp_t *) =
					(uint64_t)ppd->tp_sec * 1000000000 + ppd->tp_nsec;

			mbuf->ol_flags |= timestamp_dynflag;
		}

		/* release incoming frame and advance ring buffer */
		ppd->tp_status = TP_STATUS_KERNEL;
		if (++framenum >= framecount)
			framenum = 0;
		mbuf->port = pkt_q->in_port;

		/* account for the receive frame */
		bufs[i] = mbuf;
		num_rx++;
		num_rx_bytes += mbuf->pkt_len;
	}
	pkt_q->framenum = framenum;
	pkt_q->rx_pkts += num_rx;
	pkt_q->rx_bytes += num_rx_bytes;
	return num_rx;
}
197 
198 /*
199  * Check if there is an available frame in the ring
200  */
/*
 * Report whether a Tx ring frame slot can be (re)used by user space.
 *
 * Timestamp status bits are masked out before the comparison: with
 * socket timestamping enabled, older kernels could leave them set on
 * completed frames.
 *
 * See the following kernel commit for reference:
 *     commit 171c3b151118a2fe0fc1e2a9d1b5a1570cfe82d2
 *     net: packetmmap: fix only tx timestamp on request
 */
static inline bool
tx_ring_status_available(uint32_t tp_status)
{
	const uint32_t ts_bits = TP_STATUS_TS_SOFTWARE | TP_STATUS_TS_RAW_HARDWARE;

	return (tp_status & ~ts_bits) == TP_STATUS_AVAILABLE;
}
217 
218 /*
219  * Callback to handle sending packets through a real NIC.
220  */
/*
 * Tx burst callback: copy up to nb_pkts mbufs into the TPACKET_V2 Tx
 * ring and kick the kernel with a non-blocking sendto().
 *
 * Oversized and failed-VLAN-insert packets are silently dropped (freed
 * but still counted in the return value / err_pkts). Returns the number
 * of mbufs consumed from bufs[], which callers treat as "transmitted".
 */
static uint16_t
eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct tpacket2_hdr *ppd;
	struct rte_mbuf *mbuf;
	uint8_t *pbuf;
	unsigned int framecount, framenum;
	struct pollfd pfd;
	struct pkt_tx_queue *pkt_q = queue;
	uint16_t num_tx = 0;
	unsigned long num_tx_bytes = 0;
	int i;

	if (unlikely(nb_pkts == 0))
		return 0;

	memset(&pfd, 0, sizeof(pfd));
	pfd.fd = pkt_q->sockfd;
	pfd.events = POLLOUT;
	pfd.revents = 0;

	framecount = pkt_q->framecount;
	framenum = pkt_q->framenum;
	ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
	for (i = 0; i < nb_pkts; i++) {
		mbuf = *bufs++;

		/* drop oversized packets */
		if (mbuf->pkt_len > pkt_q->frame_data_size) {
			rte_pktmbuf_free(mbuf);
			continue;
		}

		/* insert vlan info if necessary */
		/* NOTE(review): rte_vlan_insert() grows the packet by 4 bytes
		 * after the size check above — confirm frame_data_size leaves
		 * headroom for the tag. */
		if (mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
			if (rte_vlan_insert(&mbuf)) {
				rte_pktmbuf_free(mbuf);
				continue;
			}
		}

		/* point at the next incoming frame; block until the kernel
		 * hands the slot back to user space */
		if (!tx_ring_status_available(ppd->tp_status)) {
			if (poll(&pfd, 1, -1) < 0)
				break;

			/* poll() can return POLLERR if the interface is down */
			if (pfd.revents & POLLERR)
				break;
		}

		/*
		 * poll() will almost always return POLLOUT, even if there
		 * are no extra buffers available
		 *
		 * This happens, because packet_poll() calls datagram_poll()
		 * which checks the space left in the socket buffer and,
		 * in the case of packet_mmap, the default socket buffer length
		 * doesn't match the requested size for the tx_ring.
		 * As such, there is almost always space left in socket buffer,
		 * which doesn't seem to be correlated to the requested size
		 * for the tx_ring in packet_mmap.
		 *
		 * This results in poll() returning POLLOUT.
		 */
		if (!tx_ring_status_available(ppd->tp_status))
			break;

		/* copy the tx frame data, starting right after the header */
		pbuf = (uint8_t *) ppd + TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

		/* flatten a possibly multi-segment mbuf into the frame */
		struct rte_mbuf *tmp_mbuf = mbuf;
		while (tmp_mbuf) {
			uint16_t data_len = rte_pktmbuf_data_len(tmp_mbuf);
			memcpy(pbuf, rte_pktmbuf_mtod(tmp_mbuf, void*), data_len);
			pbuf += data_len;
			tmp_mbuf = tmp_mbuf->next;
		}

		ppd->tp_len = mbuf->pkt_len;
		ppd->tp_snaplen = mbuf->pkt_len;

		/* release incoming frame and advance ring buffer */
		ppd->tp_status = TP_STATUS_SEND_REQUEST;
		if (++framenum >= framecount)
			framenum = 0;
		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;

		num_tx++;
		num_tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	/* kick-off transmits */
	if (sendto(pkt_q->sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0) == -1 &&
			errno != ENOBUFS && errno != EAGAIN) {
		/*
		 * In case of a ENOBUFS/EAGAIN error all of the enqueued
		 * packets will be considered successful even though only some
		 * are sent.
		 */

		num_tx = 0;
		num_tx_bytes = 0;
	}

	pkt_q->framenum = framenum;
	pkt_q->tx_pkts += num_tx;
	pkt_q->err_pkts += i - num_tx;
	pkt_q->tx_bytes += num_tx_bytes;
	return i;
}
334 
/*
 * Start the port: register the Rx timestamp dynfield/dynflag if that
 * offload was configured, mark the link up and all queues started.
 * The AF_PACKET sockets themselves were already set up at probe time.
 */
static int
eth_dev_start(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;
	uint16_t i;

	if (internals->timestamp_offloading) {
		/* Register mbuf field and flag for Rx timestamp */
		int rc = rte_mbuf_dyn_rx_timestamp_register(&timestamp_dynfield_offset,
				&timestamp_dynflag);
		if (rc) {
			PMD_LOG(ERR, "Cannot register mbuf field/flag for timestamp");
			return rc;
		}
	}

	dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
	for (i = 0; i < internals->nb_queues; i++) {
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
	}
	return 0;
}
358 
359 /*
360  * This function gets called when the current port gets stopped.
361  */
362 static int
363 eth_dev_stop(struct rte_eth_dev *dev)
364 {
365 	unsigned i;
366 	int sockfd;
367 	struct pmd_internals *internals = dev->data->dev_private;
368 
369 	for (i = 0; i < internals->nb_queues; i++) {
370 		sockfd = internals->rx_queue[i].sockfd;
371 		if (sockfd != -1)
372 			close(sockfd);
373 
374 		/* Prevent use after free in case tx fd == rx fd */
375 		if (sockfd != internals->tx_queue[i].sockfd) {
376 			sockfd = internals->tx_queue[i].sockfd;
377 			if (sockfd != -1)
378 				close(sockfd);
379 		}
380 
381 		internals->rx_queue[i].sockfd = -1;
382 		internals->tx_queue[i].sockfd = -1;
383 		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
384 		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
385 	}
386 
387 	dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
388 	return 0;
389 }
390 
391 static int
392 eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
393 {
394 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
395 	const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode;
396 	struct pmd_internals *internals = dev->data->dev_private;
397 
398 	internals->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
399 	internals->timestamp_offloading = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP);
400 	return 0;
401 }
402 
/*
 * Report device capabilities: one MAC address, symmetric Rx/Tx queue
 * counts, multi-segment + VLAN-insert Tx offloads and VLAN-strip +
 * timestamp Rx offloads.
 */
static int
eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev_info->if_index = internals->if_index;
	dev_info->max_mac_addrs = 1;
	dev_info->max_rx_pktlen = RTE_ETHER_MAX_LEN;
	dev_info->max_rx_queues = (uint16_t)internals->nb_queues;
	dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
	dev_info->min_rx_bufsize = 0;
	dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
		RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP |
		RTE_ETH_RX_OFFLOAD_TIMESTAMP;

	return 0;
}
421 
422 
423 /*
424  * Query dropped packets counter from socket.
425  * Reading drop count clears the value of the socket!
426  */
/*
 * Query dropped packets counter from socket.
 * Reading drop count clears the value of the socket!
 *
 * Returns 0 for an invalid fd or when getsockopt() fails.
 */
static unsigned int
packet_drop_count(int sockfd)
{
	/* zero-init so a short read can never expose stack garbage */
	struct tpacket_stats pkt_stats = { 0 };
	socklen_t pkt_stats_len = sizeof(struct tpacket_stats);

	if (sockfd == -1)
		return 0;

	/* getsockopt() returns -1 on failure; the previous check `< -1`
	 * could never be true, so errors returned uninitialized data */
	if (getsockopt(sockfd, SOL_PACKET, PACKET_STATISTICS, &pkt_stats,
		&pkt_stats_len) < 0)
		return 0;

	return pkt_stats.tp_drops;
}
442 
/*
 * Aggregate per-queue software counters into the ethdev stats.
 *
 * The kernel drop counter is destructive to read (see
 * packet_drop_count()), so each read is accumulated into the queue's
 * rx_dropped_pkts and reported via stats->imissed. Per-queue stats are
 * filled only for the first RTE_ETHDEV_QUEUE_STAT_CNTRS queues.
 */
static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
	unsigned int i;
	unsigned long rx_total = 0, rx_dropped_total = 0, rx_nombuf_total = 0;
	unsigned long tx_total = 0, tx_err_total = 0;
	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
	const struct pmd_internals *internal = dev->data->dev_private;

	for (i = 0; i < internal->nb_queues; i++) {
		/* reading drop count clears the value, therefore keep total value */
		internal->rx_queue[i].rx_dropped_pkts +=
			packet_drop_count(internal->rx_queue[i].sockfd);

		rx_total += internal->rx_queue[i].rx_pkts;
		rx_bytes_total += internal->rx_queue[i].rx_bytes;
		rx_dropped_total += internal->rx_queue[i].rx_dropped_pkts;
		rx_nombuf_total += internal->rx_queue[i].rx_nombuf;

		tx_total += internal->tx_queue[i].tx_pkts;
		tx_err_total += internal->tx_queue[i].err_pkts;
		tx_bytes_total += internal->tx_queue[i].tx_bytes;

		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
			stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
			stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
			stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
			stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
		}
	}

	stats->ipackets = rx_total;
	stats->ibytes = rx_bytes_total;
	stats->imissed = rx_dropped_total;
	stats->rx_nombuf = rx_nombuf_total;
	stats->opackets = tx_total;
	stats->oerrors = tx_err_total;
	stats->obytes = tx_bytes_total;
	return 0;
}
483 
484 static int
485 eth_stats_reset(struct rte_eth_dev *dev)
486 {
487 	unsigned i;
488 	struct pmd_internals *internal = dev->data->dev_private;
489 
490 	for (i = 0; i < internal->nb_queues; i++) {
491 		/* clear socket counter */
492 		packet_drop_count(internal->rx_queue[i].sockfd);
493 
494 		internal->rx_queue[i].rx_pkts = 0;
495 		internal->rx_queue[i].rx_bytes = 0;
496 		internal->rx_queue[i].rx_nombuf = 0;
497 		internal->rx_queue[i].rx_dropped_pkts = 0;
498 
499 		internal->tx_queue[i].tx_pkts = 0;
500 		internal->tx_queue[i].err_pkts = 0;
501 		internal->tx_queue[i].tx_bytes = 0;
502 	}
503 
504 	return 0;
505 }
506 
/*
 * Release everything rte_pmd_init_internals() allocated: the combined
 * Rx+Tx ring mapping of each queue (2 * block_size * block_nr bytes,
 * matching the mmap() at init), the frame descriptor arrays, the queue
 * arrays and the interface name. Secondary processes do nothing.
 */
static int
eth_dev_close(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals;
	struct tpacket_req *req;
	unsigned int q;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	PMD_LOG(INFO, "Closing AF_PACKET ethdev on NUMA socket %u",
		rte_socket_id());

	internals = dev->data->dev_private;
	req = &internals->req;
	for (q = 0; q < internals->nb_queues; q++) {
		munmap(internals->rx_queue[q].map,
			2 * req->tp_block_size * req->tp_block_nr);
		rte_free(internals->rx_queue[q].rd);
		rte_free(internals->tx_queue[q].rd);
	}
	free(internals->if_name);
	rte_free(internals->rx_queue);
	rte_free(internals->tx_queue);

	/* mac_addrs must not be freed alone because part of dev_private */
	dev->data->mac_addrs = NULL;
	return 0;
}
536 
/*
 * Refresh link status from the kernel: map the interface's IFF_RUNNING
 * flag (via SIOCGIFFLAGS on the first queue's socket) to link up/down.
 * A closed socket (stopped port) leaves the status untouched.
 */
static int
eth_link_update(struct rte_eth_dev *dev,
                int wait_to_complete __rte_unused)
{
	const struct pmd_internals *internals = dev->data->dev_private;
	struct rte_eth_link *dev_link = &dev->data->dev_link;
	int sockfd = internals->rx_queue[0].sockfd;
	struct ifreq ifr = { };

	if (sockfd == -1)
		return 0;

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0)
		return -errno;
	dev_link->link_status = (ifr.ifr_flags & IFF_RUNNING) ?
		RTE_ETH_LINK_UP : RTE_ETH_LINK_DOWN;
	return 0;
}
556 
/*
 * Bind an Rx queue to its mempool. The ring itself already exists
 * (created at probe time); this only verifies that a full ring frame
 * payload fits into one mbuf from the given pool and snapshots the
 * port-level offload settings into the queue.
 */
static int
eth_rx_queue_setup(struct rte_eth_dev *dev,
                   uint16_t rx_queue_id,
                   uint16_t nb_rx_desc __rte_unused,
                   unsigned int socket_id __rte_unused,
                   const struct rte_eth_rxconf *rx_conf __rte_unused,
                   struct rte_mempool *mb_pool)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_rx_queue *pkt_q = &internals->rx_queue[rx_queue_id];
	unsigned int buf_size, data_size;

	pkt_q->mb_pool = mb_pool;

	/* Now get the space available for data in the mbuf */
	buf_size = rte_pktmbuf_data_room_size(pkt_q->mb_pool) -
		RTE_PKTMBUF_HEADROOM;
	data_size = internals->req.tp_frame_size;
	data_size -= TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

	if (data_size > buf_size) {
		PMD_LOG(ERR,
			"%s: %d bytes will not fit in mbuf (%d bytes)",
			dev->device->name, data_size, buf_size);
		return -ENOMEM;
	}

	dev->data->rx_queues[rx_queue_id] = pkt_q;
	pkt_q->in_port = dev->data->port_id;
	pkt_q->vlan_strip = internals->vlan_strip;
	pkt_q->timestamp_offloading = internals->timestamp_offloading;

	return 0;
}
591 
592 static int
593 eth_tx_queue_setup(struct rte_eth_dev *dev,
594                    uint16_t tx_queue_id,
595                    uint16_t nb_tx_desc __rte_unused,
596                    unsigned int socket_id __rte_unused,
597                    const struct rte_eth_txconf *tx_conf __rte_unused)
598 {
599 
600 	struct pmd_internals *internals = dev->data->dev_private;
601 
602 	dev->data->tx_queues[tx_queue_id] = &internals->tx_queue[tx_queue_id];
603 	return 0;
604 }
605 
/*
 * Set the backing interface's MTU via SIOCSIFMTU, after checking the
 * new MTU still fits in a ring frame.
 *
 * NOTE(review): data_size here subtracts only TPACKET2_HDRLEN, while
 * the Rx/Tx paths subtract TPACKET2_HDRLEN - sizeof(struct sockaddr_ll)
 * — confirm the stricter bound here is intentional.
 */
static int
eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { .ifr_mtu = mtu };
	int ret;
	int s;
	unsigned int data_size = internals->req.tp_frame_size -
				 TPACKET2_HDRLEN;

	if (mtu > data_size)
		return -EINVAL;

	/* a throwaway UDP socket is enough to carry the ioctl */
	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -EINVAL;

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ret = ioctl(s, SIOCSIFMTU, &ifr);
	close(s);

	if (ret < 0)
		return -EINVAL;

	return 0;
}
632 
/*
 * Program a new MAC address into the backing interface via
 * SIOCSIFHWADDR, using the first queue's socket. Fails when the port
 * is stopped (socket already closed).
 */
static int
eth_dev_macaddr_set(struct rte_eth_dev *dev, struct rte_ether_addr *addr)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { };
	int sockfd = internals->rx_queue[0].sockfd;
	int ret;

	if (sockfd == -1) {
		PMD_LOG(ERR, "receive socket not found");
		return -EINVAL;
	}

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
	memcpy(ifr.ifr_hwaddr.sa_data, addr, sizeof(*addr));
	ret = ioctl(sockfd, SIOCSIFHWADDR, &ifr);

	if (ret < 0) {
		PMD_LOG_ERRNO(ERR, "ioctl(SIOCSIFHWADDR) failed");
		return -EINVAL;
	}

	return 0;
}
658 
659 static int
660 eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
661 {
662 	struct ifreq ifr;
663 	int ret = 0;
664 	int s;
665 
666 	s = socket(PF_INET, SOCK_DGRAM, 0);
667 	if (s < 0)
668 		return -errno;
669 
670 	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
671 	if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
672 		ret = -errno;
673 		goto out;
674 	}
675 	ifr.ifr_flags &= mask;
676 	ifr.ifr_flags |= flags;
677 	if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
678 		ret = -errno;
679 		goto out;
680 	}
681 out:
682 	close(s);
683 	return ret;
684 }
685 
/* Turn promiscuous mode on by setting IFF_PROMISC on the interface. */
static int
eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
}
693 
/* Turn promiscuous mode off by clearing IFF_PROMISC on the interface. */
static int
eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
}
701 
/* af_packet PMD callbacks registered with the ethdev layer. */
static const struct eth_dev_ops ops = {
	.dev_start = eth_dev_start,
	.dev_stop = eth_dev_stop,
	.dev_close = eth_dev_close,
	.dev_configure = eth_dev_configure,
	.dev_infos_get = eth_dev_info,
	.mac_addr_set = eth_dev_macaddr_set,
	.mtu_set = eth_dev_mtu_set,
	.promiscuous_enable = eth_dev_promiscuous_enable,
	.promiscuous_disable = eth_dev_promiscuous_disable,
	.rx_queue_setup = eth_rx_queue_setup,
	.tx_queue_setup = eth_tx_queue_setup,
	.link_update = eth_link_update,
	.stats_get = eth_stats_get,
	.stats_reset = eth_stats_reset,
};
718 
719 /*
720  * Opens an AF_PACKET socket
721  */
722 static int
723 open_packet_iface(const char *key __rte_unused,
724                   const char *value __rte_unused,
725                   void *extra_args)
726 {
727 	int *sockfd = extra_args;
728 
729 	/* Open an AF_PACKET socket... */
730 	*sockfd = socket(AF_PACKET, SOCK_RAW, 0);
731 	if (*sockfd == -1) {
732 		PMD_LOG(ERR, "Could not open AF_PACKET socket");
733 		return -1;
734 	}
735 
736 	return 0;
737 }
738 
739 #define PACKET_FANOUT_INVALID -1
740 
/* Derive a 16-bit PACKET_FANOUT group id by mixing the PID with the
 * interface index, so distinct processes/interfaces get distinct groups. */
static int
get_fanout_group_id(int if_index)
{
	int mixed = getpid() ^ if_index;

	return mixed & 0xffff;
}
746 
747 static int
748 get_fanout_mode(const char *fanout_mode)
749 {
750 	int load_balance = PACKET_FANOUT_FLAG_DEFRAG |
751 			   PACKET_FANOUT_FLAG_ROLLOVER;
752 
753 	if (!fanout_mode) {
754 		/* Default */
755 		load_balance |= PACKET_FANOUT_HASH;
756 	} else if (!strcmp(fanout_mode, "hash")) {
757 		load_balance |= PACKET_FANOUT_HASH;
758 	} else if (!strcmp(fanout_mode, "lb")) {
759 		load_balance |= PACKET_FANOUT_LB;
760 	} else if (!strcmp(fanout_mode, "cpu")) {
761 		load_balance |= PACKET_FANOUT_CPU;
762 	} else if (!strcmp(fanout_mode, "rollover")) {
763 		load_balance |= PACKET_FANOUT_ROLLOVER;
764 	} else if (!strcmp(fanout_mode, "rnd")) {
765 		load_balance |= PACKET_FANOUT_RND;
766 	} else if (!strcmp(fanout_mode, "qm")) {
767 		load_balance |= PACKET_FANOUT_QM;
768 	} else {
769 		/* Invalid Fanout Mode */
770 		load_balance = PACKET_FANOUT_INVALID;
771 	}
772 
773 	return load_balance;
774 }
775 
776 static int
777 get_fanout(const char *fanout_mode, int if_index)
778 {
779 	int load_balance = get_fanout_mode(fanout_mode);
780 	if (load_balance != PACKET_FANOUT_INVALID)
781 		return get_fanout_group_id(if_index) | (load_balance << 16);
782 	else
783 		return PACKET_FANOUT_INVALID;
784 }
785 
/*
 * Create and wire up all driver state for a new af_packet ethdev:
 * allocate internals and queue arrays, resolve the interface, then for
 * each queue open a TPACKET_V2 AF_PACKET socket, create its Rx+Tx rings
 * in a single shared mmap, bind it to the interface and (for multiple
 * queues) join them into one PACKET_FANOUT group. Finally allocate the
 * ethdev entry and link everything together.
 *
 * On failure all partially acquired resources are released via the
 * error/free_internals labels and -1 is returned; 0 on success.
 */
static int
rte_pmd_init_internals(struct rte_vdev_device *dev,
                       const int sockfd,
                       const unsigned nb_queues,
                       unsigned int blocksize,
                       unsigned int blockcnt,
                       unsigned int framesize,
                       unsigned int framecnt,
		       unsigned int qdisc_bypass,
		       const char *fanout_mode,
                       struct pmd_internals **internals,
                       struct rte_eth_dev **eth_dev,
                       struct rte_kvargs *kvlist)
{
	const char *name = rte_vdev_device_name(dev);
	const unsigned int numa_node = dev->device.numa_node;
	struct rte_eth_dev_data *data = NULL;
	struct rte_kvargs_pair *pair = NULL;
	struct ifreq ifr;
	size_t ifnamelen;
	unsigned k_idx;
	struct sockaddr_ll sockaddr;
	struct tpacket_req *req;
	struct pkt_rx_queue *rx_queue;
	struct pkt_tx_queue *tx_queue;
	int rc, tpver, discard;
	int qsockfd = -1;
	unsigned int i, q, rdsize;
	int fanout_arg;

	/* locate the mandatory "iface" devarg */
	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
		pair = &kvlist->pairs[k_idx];
		if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL)
			break;
	}
	if (pair == NULL) {
		PMD_LOG(ERR,
			"%s: no interface specified for AF_PACKET ethdev",
		        name);
		return -1;
	}

	PMD_LOG(INFO,
		"%s: creating AF_PACKET-backed ethdev on numa socket %u",
		name, numa_node);

	*internals = rte_zmalloc_socket(name, sizeof(**internals),
	                                0, numa_node);
	if (*internals == NULL)
		return -1;


	(*internals)->rx_queue = rte_calloc_socket("af_packet_rx",
						nb_queues,
						sizeof(struct pkt_rx_queue),
						0, numa_node);
	(*internals)->tx_queue = rte_calloc_socket("af_packet_tx",
						nb_queues,
						sizeof(struct pkt_tx_queue),
						0, numa_node);
	if (!(*internals)->rx_queue || !(*internals)->tx_queue) {
		goto free_internals;
	}

	/* pre-mark every queue as "nothing acquired yet" so the error
	 * path can tell what needs cleaning up */
	for (q = 0; q < nb_queues; q++) {
		(*internals)->rx_queue[q].map = MAP_FAILED;
		(*internals)->tx_queue[q].map = MAP_FAILED;
		(*internals)->rx_queue[q].sockfd = -1;
		(*internals)->tx_queue[q].sockfd = -1;
	}

	req = &((*internals)->req);

	/* identical ring geometry for every queue's Rx and Tx ring */
	req->tp_block_size = blocksize;
	req->tp_block_nr = blockcnt;
	req->tp_frame_size = framesize;
	req->tp_frame_nr = framecnt;

	ifnamelen = strlen(pair->value);
	if (ifnamelen < sizeof(ifr.ifr_name)) {
		memcpy(ifr.ifr_name, pair->value, ifnamelen);
		ifr.ifr_name[ifnamelen] = '\0';
	} else {
		PMD_LOG(ERR,
			"%s: I/F name too long (%s)",
			name, pair->value);
		goto free_internals;
	}
	/* resolve ifindex and MAC of the backing interface */
	if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) {
		PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name);
		goto free_internals;
	}
	(*internals)->if_name = strdup(pair->value);
	if ((*internals)->if_name == NULL)
		goto free_internals;
	(*internals)->if_index = ifr.ifr_ifindex;

	if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == -1) {
		PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFHWADDR)", name);
		goto free_internals;
	}
	memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN);

	memset(&sockaddr, 0, sizeof(sockaddr));
	sockaddr.sll_family = AF_PACKET;
	sockaddr.sll_protocol = htons(ETH_P_ALL);
	sockaddr.sll_ifindex = (*internals)->if_index;

	fanout_arg = get_fanout(fanout_mode, (*internals)->if_index);
	if (fanout_arg == PACKET_FANOUT_INVALID) {
		PMD_LOG(ERR, "Invalid fanout mode: %s", fanout_mode);
		goto error;
	}

	for (q = 0; q < nb_queues; q++) {
		/* Open an AF_PACKET socket for this queue... */
		qsockfd = socket(AF_PACKET, SOCK_RAW, 0);
		if (qsockfd == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not open AF_PACKET socket",
				name);
			goto error;
		}

		/* the Rx/Tx paths rely on the TPACKET_V2 header layout */
		tpver = TPACKET_V2;
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_VERSION,
				&tpver, sizeof(tpver));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_VERSION on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		/* drop malformed Tx frames instead of aborting the ring */
		discard = 1;
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_LOSS,
				&discard, sizeof(discard));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_LOSS on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		if (qdisc_bypass) {
#if defined(PACKET_QDISC_BYPASS)
			rc = setsockopt(qsockfd, SOL_PACKET, PACKET_QDISC_BYPASS,
					&qdisc_bypass, sizeof(qdisc_bypass));
			if (rc == -1) {
				PMD_LOG_ERRNO(ERR,
					"%s: could not set PACKET_QDISC_BYPASS on AF_PACKET socket for %s",
					name, pair->value);
				goto error;
			}
#endif
		}

		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_RX_RING on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_TX_RING, req, sizeof(*req));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_TX_RING on AF_PACKET "
				"socket for %s", name, pair->value);
			goto error;
		}

		rx_queue = &((*internals)->rx_queue[q]);
		rx_queue->framecount = req->tp_frame_nr;

		/* one mapping covers both rings: Rx first, Tx after it */
		rx_queue->map = mmap(NULL, 2 * req->tp_block_size * req->tp_block_nr,
				    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED,
				    qsockfd, 0);
		if (rx_queue->map == MAP_FAILED) {
			PMD_LOG_ERRNO(ERR,
				"%s: call to mmap failed on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		/* rdsize is same for both Tx and Rx */
		rdsize = req->tp_frame_nr * sizeof(*(rx_queue->rd));

		rx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
		if (rx_queue->rd == NULL)
			goto error;
		for (i = 0; i < req->tp_frame_nr; ++i) {
			rx_queue->rd[i].iov_base = rx_queue->map + (i * framesize);
			rx_queue->rd[i].iov_len = req->tp_frame_size;
		}
		rx_queue->sockfd = qsockfd;

		tx_queue = &((*internals)->tx_queue[q]);
		tx_queue->framecount = req->tp_frame_nr;
		tx_queue->frame_data_size = req->tp_frame_size;
		tx_queue->frame_data_size -= TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

		/* Tx ring lives in the second half of the shared mapping */
		tx_queue->map = rx_queue->map + req->tp_block_size * req->tp_block_nr;

		tx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
		if (tx_queue->rd == NULL)
			goto error;
		for (i = 0; i < req->tp_frame_nr; ++i) {
			tx_queue->rd[i].iov_base = tx_queue->map + (i * framesize);
			tx_queue->rd[i].iov_len = req->tp_frame_size;
		}
		/* Rx and Tx of a queue pair share the same socket fd */
		tx_queue->sockfd = qsockfd;

		rc = bind(qsockfd, (const struct sockaddr*)&sockaddr, sizeof(sockaddr));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not bind AF_PACKET socket to %s",
				name, pair->value);
			goto error;
		}

		/* with several queues, let the kernel spread traffic across
		 * them via a shared fanout group */
		if (nb_queues > 1) {
			rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
					&fanout_arg, sizeof(fanout_arg));
			if (rc == -1) {
				PMD_LOG_ERRNO(ERR,
					"%s: could not set PACKET_FANOUT "
					"on AF_PACKET socket for %s",
					name, pair->value);
				goto error;
			}
		}
	}

	/* reserve an ethdev entry */
	*eth_dev = rte_eth_vdev_allocate(dev, 0);
	if (*eth_dev == NULL)
		goto error;

	/*
	 * now put it all together
	 * - store queue data in internals,
	 * - store numa_node in eth_dev
	 * - point eth_dev_data to internals
	 * - and point eth_dev structure to new eth_dev_data structure
	 */

	(*internals)->nb_queues = nb_queues;

	data = (*eth_dev)->data;
	data->dev_private = *internals;
	data->nb_rx_queues = (uint16_t)nb_queues;
	data->nb_tx_queues = (uint16_t)nb_queues;
	data->dev_link = pmd_link;
	data->mac_addrs = &(*internals)->eth_addr;
	data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;

	(*eth_dev)->dev_ops = &ops;

	return 0;

error:
	/* qsockfd belongs to the queue that failed mid-setup; fully set-up
	 * queues are closed via their stored rx_queue sockfd below */
	if (qsockfd != -1)
		close(qsockfd);
	for (q = 0; q < nb_queues; q++) {
		if ((*internals)->rx_queue[q].map != MAP_FAILED)
			munmap((*internals)->rx_queue[q].map,
			       2 * req->tp_block_size * req->tp_block_nr);

		rte_free((*internals)->rx_queue[q].rd);
		rte_free((*internals)->tx_queue[q].rd);
		if (((*internals)->rx_queue[q].sockfd >= 0) &&
			((*internals)->rx_queue[q].sockfd != qsockfd))
			close((*internals)->rx_queue[q].sockfd);
	}
free_internals:
	rte_free((*internals)->rx_queue);
	rte_free((*internals)->tx_queue);
	free((*internals)->if_name);
	rte_free(*internals);
	return -1;
}
1070 
1071 static int
1072 rte_eth_from_packet(struct rte_vdev_device *dev,
1073                     int const *sockfd,
1074                     struct rte_kvargs *kvlist)
1075 {
1076 	const char *name = rte_vdev_device_name(dev);
1077 	struct pmd_internals *internals = NULL;
1078 	struct rte_eth_dev *eth_dev = NULL;
1079 	struct rte_kvargs_pair *pair = NULL;
1080 	unsigned k_idx;
1081 	unsigned int blockcount;
1082 	unsigned int blocksize;
1083 	unsigned int framesize = DFLT_FRAME_SIZE;
1084 	unsigned int framecount = DFLT_FRAME_COUNT;
1085 	unsigned int qpairs = 1;
1086 	unsigned int qdisc_bypass = 1;
1087 	const char *fanout_mode = NULL;
1088 
1089 	/* do some parameter checking */
1090 	if (*sockfd < 0)
1091 		return -1;
1092 
1093 	blocksize = getpagesize();
1094 
1095 	/*
1096 	 * Walk arguments for configurable settings
1097 	 */
1098 	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
1099 		pair = &kvlist->pairs[k_idx];
1100 		if (strstr(pair->key, ETH_AF_PACKET_NUM_Q_ARG) != NULL) {
1101 			qpairs = atoi(pair->value);
1102 			if (qpairs < 1) {
1103 				PMD_LOG(ERR,
1104 					"%s: invalid qpairs value",
1105 				        name);
1106 				return -1;
1107 			}
1108 			continue;
1109 		}
1110 		if (strstr(pair->key, ETH_AF_PACKET_BLOCKSIZE_ARG) != NULL) {
1111 			blocksize = atoi(pair->value);
1112 			if (!blocksize) {
1113 				PMD_LOG(ERR,
1114 					"%s: invalid blocksize value",
1115 				        name);
1116 				return -1;
1117 			}
1118 			continue;
1119 		}
1120 		if (strstr(pair->key, ETH_AF_PACKET_FRAMESIZE_ARG) != NULL) {
1121 			framesize = atoi(pair->value);
1122 			if (!framesize) {
1123 				PMD_LOG(ERR,
1124 					"%s: invalid framesize value",
1125 				        name);
1126 				return -1;
1127 			}
1128 			continue;
1129 		}
1130 		if (strstr(pair->key, ETH_AF_PACKET_FRAMECOUNT_ARG) != NULL) {
1131 			framecount = atoi(pair->value);
1132 			if (!framecount) {
1133 				PMD_LOG(ERR,
1134 					"%s: invalid framecount value",
1135 				        name);
1136 				return -1;
1137 			}
1138 			continue;
1139 		}
1140 		if (strstr(pair->key, ETH_AF_PACKET_QDISC_BYPASS_ARG) != NULL) {
1141 			qdisc_bypass = atoi(pair->value);
1142 			if (qdisc_bypass > 1) {
1143 				PMD_LOG(ERR,
1144 					"%s: invalid bypass value",
1145 					name);
1146 				return -1;
1147 			}
1148 			continue;
1149 		}
1150 		if (strstr(pair->key, ETH_AF_PACKET_FANOUT_MODE_ARG) != NULL) {
1151 			fanout_mode = pair->value;
1152 			continue;
1153 		}
1154 	}
1155 
1156 	if (framesize > blocksize) {
1157 		PMD_LOG(ERR,
1158 			"%s: AF_PACKET MMAP frame size exceeds block size!",
1159 		        name);
1160 		return -1;
1161 	}
1162 
1163 	blockcount = framecount / (blocksize / framesize);
1164 	if (!blockcount) {
1165 		PMD_LOG(ERR,
1166 			"%s: invalid AF_PACKET MMAP parameters", name);
1167 		return -1;
1168 	}
1169 
1170 	PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name);
1171 	PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize);
1172 	PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount);
1173 	PMD_LOG(INFO, "%s:\tframe size %d", name, framesize);
1174 	PMD_LOG(INFO, "%s:\tframe count %d", name, framecount);
1175 
1176 	if (rte_pmd_init_internals(dev, *sockfd, qpairs,
1177 				   blocksize, blockcount,
1178 				   framesize, framecount,
1179 				   qdisc_bypass,
1180 				   fanout_mode,
1181 				   &internals, &eth_dev,
1182 				   kvlist) < 0)
1183 		return -1;
1184 
1185 	eth_dev->rx_pkt_burst = eth_af_packet_rx;
1186 	eth_dev->tx_pkt_burst = eth_af_packet_tx;
1187 
1188 	rte_eth_dev_probing_finish(eth_dev);
1189 	return 0;
1190 }
1191 
1192 static int
1193 rte_pmd_af_packet_probe(struct rte_vdev_device *dev)
1194 {
1195 	int ret = 0;
1196 	struct rte_kvargs *kvlist;
1197 	int sockfd = -1;
1198 	struct rte_eth_dev *eth_dev;
1199 	const char *name = rte_vdev_device_name(dev);
1200 
1201 	PMD_LOG(INFO, "Initializing pmd_af_packet for %s", name);
1202 
1203 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1204 		eth_dev = rte_eth_dev_attach_secondary(name);
1205 		if (!eth_dev) {
1206 			PMD_LOG(ERR, "Failed to probe %s", name);
1207 			return -1;
1208 		}
1209 		/* TODO: request info from primary to set up Rx and Tx */
1210 		eth_dev->dev_ops = &ops;
1211 		eth_dev->device = &dev->device;
1212 		rte_eth_dev_probing_finish(eth_dev);
1213 		return 0;
1214 	}
1215 
1216 	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1217 	if (kvlist == NULL) {
1218 		ret = -1;
1219 		goto exit;
1220 	}
1221 
1222 	/*
1223 	 * If iface argument is passed we open the NICs and use them for
1224 	 * reading / writing
1225 	 */
1226 	if (rte_kvargs_count(kvlist, ETH_AF_PACKET_IFACE_ARG) == 1) {
1227 
1228 		ret = rte_kvargs_process(kvlist, ETH_AF_PACKET_IFACE_ARG,
1229 		                         &open_packet_iface, &sockfd);
1230 		if (ret < 0)
1231 			goto exit;
1232 	}
1233 
1234 	if (dev->device.numa_node == SOCKET_ID_ANY)
1235 		dev->device.numa_node = rte_socket_id();
1236 
1237 	ret = rte_eth_from_packet(dev, &sockfd, kvlist);
1238 	close(sockfd); /* no longer needed */
1239 
1240 exit:
1241 	rte_kvargs_free(kvlist);
1242 	return ret;
1243 }
1244 
1245 static int
1246 rte_pmd_af_packet_remove(struct rte_vdev_device *dev)
1247 {
1248 	struct rte_eth_dev *eth_dev;
1249 
1250 	if (dev == NULL)
1251 		return -1;
1252 
1253 	/* find the ethdev entry */
1254 	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
1255 	if (eth_dev == NULL)
1256 		return 0; /* port already released */
1257 
1258 	eth_dev_close(eth_dev);
1259 	rte_eth_dev_release_port(eth_dev);
1260 
1261 	return 0;
1262 }
1263 
/* vdev driver entry points for the af_packet PMD */
static struct rte_vdev_driver pmd_af_packet_drv = {
	.probe = rte_pmd_af_packet_probe,
	.remove = rte_pmd_af_packet_remove,
};
1268 
1269 RTE_PMD_REGISTER_VDEV(net_af_packet, pmd_af_packet_drv);
1270 RTE_PMD_REGISTER_ALIAS(net_af_packet, eth_af_packet);
1271 RTE_PMD_REGISTER_PARAM_STRING(net_af_packet,
1272 	"iface=<string> "
1273 	"qpairs=<int> "
1274 	"blocksz=<int> "
1275 	"framesz=<int> "
1276 	"framecnt=<int> "
1277 	"qdisc_bypass=<0|1>");
1278