/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2014 John W. Linville <linville@tuxdriver.com>
 * Originally based upon librte_pmd_pcap code:
 * Copyright(c) 2010-2015 Intel Corporation.
 * Copyright(c) 2014 6WIND S.A.
 * All rights reserved.
 */

#include <rte_common.h>
#include <rte_string_fns.h>
#include <rte_mbuf.h>
#include <ethdev_driver.h>
#include <ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_kvargs.h>
#include <bus_vdev_driver.h>

#include <errno.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <poll.h>

#define ETH_AF_PACKET_IFACE_ARG		"iface"
#define ETH_AF_PACKET_NUM_Q_ARG		"qpairs"
#define ETH_AF_PACKET_BLOCKSIZE_ARG	"blocksz"
#define ETH_AF_PACKET_FRAMESIZE_ARG	"framesz"
#define ETH_AF_PACKET_FRAMECOUNT_ARG	"framecnt"
#define ETH_AF_PACKET_QDISC_BYPASS_ARG	"qdisc_bypass"

#define DFLT_FRAME_SIZE		(1 << 11)
#define DFLT_FRAME_COUNT	(1 << 9)

struct __rte_cache_aligned pkt_rx_queue {
	int sockfd;

	struct iovec *rd;
	uint8_t *map;
	unsigned int framecount;
	unsigned int framenum;

	struct rte_mempool *mb_pool;
	uint16_t in_port;
	uint8_t vlan_strip;

	volatile unsigned long rx_pkts;
	volatile unsigned long rx_bytes;
};

struct __rte_cache_aligned pkt_tx_queue {
	int sockfd;
	unsigned int frame_data_size;

	struct iovec *rd;
	uint8_t *map;
	unsigned int framecount;
	unsigned int framenum;

	volatile unsigned long tx_pkts;
	volatile unsigned long err_pkts;
	volatile unsigned long tx_bytes;
};

struct pmd_internals {
	unsigned nb_queues;

	int if_index;
	char *if_name;
	struct rte_ether_addr eth_addr;

	struct tpacket_req req;

	struct pkt_rx_queue *rx_queue;
	struct pkt_tx_queue *tx_queue;
	uint8_t vlan_strip;
};

static const char *valid_arguments[] = {
	ETH_AF_PACKET_IFACE_ARG,
	ETH_AF_PACKET_NUM_Q_ARG,
	ETH_AF_PACKET_BLOCKSIZE_ARG,
	ETH_AF_PACKET_FRAMESIZE_ARG,
	ETH_AF_PACKET_FRAMECOUNT_ARG,
	ETH_AF_PACKET_QDISC_BYPASS_ARG,
	NULL
};

static struct rte_eth_link pmd_link = {
	.link_speed = RTE_ETH_SPEED_NUM_10G,
	.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
	.link_status = RTE_ETH_LINK_DOWN,
	.link_autoneg = RTE_ETH_LINK_FIXED,
};

RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE);

#define PMD_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, af_packet_logtype, \
		"%s(): " fmt "\n", __func__, ##args)

#define PMD_LOG_ERRNO(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, af_packet_logtype, \
		"%s(): " fmt ":%s\n", __func__, ##args, strerror(errno))

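/*
 * RX ring semantics (TPACKET_V2): every frame in the mmap()ed ring begins
 * with a struct tpacket2_hdr. The kernel hands a frame to user space by
 * setting TP_STATUS_USER in tp_status, and user space returns the frame
 * by writing TP_STATUS_KERNEL once the data has been copied out.
 */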
static uint16_t
eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	unsigned i;
	struct tpacket2_hdr *ppd;
	struct rte_mbuf *mbuf;
	uint8_t *pbuf;
	struct pkt_rx_queue *pkt_q = queue;
	uint16_t num_rx = 0;
	unsigned long num_rx_bytes = 0;
	unsigned int framecount, framenum;

	if (unlikely(nb_pkts == 0))
		return 0;

	/*
	 * Read up to nb_pkts packets from the RX ring, one frame at a
	 * time, copying each packet's data into a newly allocated mbuf.
	 */
	framecount = pkt_q->framecount;
	framenum = pkt_q->framenum;
	for (i = 0; i < nb_pkts; i++) {
		/* point at the next incoming frame */
		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
		if ((ppd->tp_status & TP_STATUS_USER) == 0)
			break;

		/* allocate the next mbuf */
		mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool);
		if (unlikely(mbuf == NULL))
			break;

		/* packet will fit in the mbuf, go ahead and receive it */
		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen;
		pbuf = (uint8_t *) ppd + ppd->tp_mac;
		memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf));

		/* check for vlan info */
		if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
			mbuf->vlan_tci = ppd->tp_vlan_tci;
			mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);

			if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf))
				PMD_LOG(ERR, "Failed to reinsert VLAN tag");
		}

		/* release incoming frame and advance ring buffer */
		ppd->tp_status = TP_STATUS_KERNEL;
		if (++framenum >= framecount)
			framenum = 0;
		mbuf->port = pkt_q->in_port;

		/* account for the receive frame */
		bufs[i] = mbuf;
		num_rx++;
		num_rx_bytes += mbuf->pkt_len;
	}
	pkt_q->framenum = framenum;
	pkt_q->rx_pkts += num_rx;
	pkt_q->rx_bytes += num_rx_bytes;
	return num_rx;
}

/*
 * Check if there is an available frame in the ring
 */
static inline bool
tx_ring_status_available(uint32_t tp_status)
{
	/*
	 * We eliminate the timestamp status from the packet status.
	 * This should only matter if timestamping is enabled on the socket,
	 * but there is a bug in the kernel which is fixed in newer releases.
	 *
	 * See the following kernel commit for reference:
	 * commit 171c3b151118a2fe0fc1e2a9d1b5a1570cfe82d2
	 * net: packetmmap: fix only tx timestamp on request
	 */
	tp_status &= ~(TP_STATUS_TS_SOFTWARE | TP_STATUS_TS_RAW_HARDWARE);

	return tp_status == TP_STATUS_AVAILABLE;
}

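/*
 * TX ring semantics (TPACKET_V2): user space may fill a frame whose status
 * is TP_STATUS_AVAILABLE, then posts it by writing TP_STATUS_SEND_REQUEST;
 * a zero-length sendto() on the socket asks the kernel to transmit all
 * posted frames.
 */
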
/*
 * Callback to handle sending packets through a real NIC.
 */
static uint16_t
eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct tpacket2_hdr *ppd;
	struct rte_mbuf *mbuf;
	uint8_t *pbuf;
	unsigned int framecount, framenum;
	struct pollfd pfd;
	struct pkt_tx_queue *pkt_q = queue;
	uint16_t num_tx = 0;
	unsigned long num_tx_bytes = 0;
	int i;

	if (unlikely(nb_pkts == 0))
		return 0;

	memset(&pfd, 0, sizeof(pfd));
	pfd.fd = pkt_q->sockfd;
	pfd.events = POLLOUT;
	pfd.revents = 0;

	framecount = pkt_q->framecount;
	framenum = pkt_q->framenum;
	ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
	for (i = 0; i < nb_pkts; i++) {
		mbuf = *bufs++;

		/* drop oversized packets */
		if (mbuf->pkt_len > pkt_q->frame_data_size) {
			rte_pktmbuf_free(mbuf);
			continue;
		}

		/* insert vlan info if necessary */
		if (mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
			if (rte_vlan_insert(&mbuf)) {
				rte_pktmbuf_free(mbuf);
				continue;
			}
		}

		/* wait for the next frame to become available */
		if (!tx_ring_status_available(ppd->tp_status)) {
			if (poll(&pfd, 1, -1) < 0)
				break;

			/* poll() can return POLLERR if the interface is down */
			if (pfd.revents & POLLERR)
				break;
		}

		/*
		 * poll() will almost always return POLLOUT, even if there
		 * are no extra buffers available.
		 *
		 * This happens because packet_poll() calls datagram_poll(),
		 * which checks the space left in the socket buffer and, in
		 * the case of packet_mmap, the default socket buffer length
		 * doesn't match the requested size for the tx_ring. As such,
		 * there is almost always space left in the socket buffer,
		 * which doesn't seem to be correlated to the requested size
		 * for the tx_ring in packet_mmap.
		 *
		 * This results in poll() returning POLLOUT.
		 */
		if (!tx_ring_status_available(ppd->tp_status))
			break;

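		/*
		 * Note: the full packet must fit in a single ring frame, so
		 * chained mbuf segments are linearized into the frame; the
		 * oversize check above guarantees the copy below cannot
		 * overrun the frame.
		 */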
		/* copy the tx frame data */
		pbuf = (uint8_t *) ppd + TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

		struct rte_mbuf *tmp_mbuf = mbuf;
		while (tmp_mbuf) {
			uint16_t data_len = rte_pktmbuf_data_len(tmp_mbuf);
			memcpy(pbuf, rte_pktmbuf_mtod(tmp_mbuf, void*), data_len);
			pbuf += data_len;
			tmp_mbuf = tmp_mbuf->next;
		}

		ppd->tp_len = mbuf->pkt_len;
		ppd->tp_snaplen = mbuf->pkt_len;

		/* hand the frame to the kernel and advance ring buffer */
		ppd->tp_status = TP_STATUS_SEND_REQUEST;
		if (++framenum >= framecount)
			framenum = 0;
		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;

		num_tx++;
		num_tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	/* kick-off transmits */
	if (sendto(pkt_q->sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0) == -1 &&
	    errno != ENOBUFS && errno != EAGAIN) {
		/*
		 * In case of an ENOBUFS/EAGAIN error all of the enqueued
		 * packets will be considered successful even though only some
		 * are sent.
		 */
		num_tx = 0;
		num_tx_bytes = 0;
	}

	pkt_q->framenum = framenum;
	pkt_q->tx_pkts += num_tx;
	pkt_q->err_pkts += i - num_tx;
	pkt_q->tx_bytes += num_tx_bytes;
	return i;
}

static int
eth_dev_start(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;
	uint16_t i;

	dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
	for (i = 0; i < internals->nb_queues; i++) {
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
	}
	return 0;
}

/*
 * This function gets called when the current port gets stopped.
 */
static int
eth_dev_stop(struct rte_eth_dev *dev)
{
	unsigned i;
	int sockfd;
	struct pmd_internals *internals = dev->data->dev_private;

	for (i = 0; i < internals->nb_queues; i++) {
		sockfd = internals->rx_queue[i].sockfd;
		if (sockfd != -1)
			close(sockfd);

		/* Prevent use after free in case tx fd == rx fd */
		if (sockfd != internals->tx_queue[i].sockfd) {
			sockfd = internals->tx_queue[i].sockfd;
			if (sockfd != -1)
				close(sockfd);
		}

		internals->rx_queue[i].sockfd = -1;
		internals->tx_queue[i].sockfd = -1;
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
	}

	dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
	return 0;
}

static int
eth_dev_configure(struct rte_eth_dev *dev)
{
	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
	const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode;
	struct pmd_internals *internals = dev->data->dev_private;

	internals->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
	return 0;
}

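/*
 * Illustrative usage (not part of this driver): an application opts in to
 * the VLAN-strip offload handled in eth_dev_configure() above through the
 * standard ethdev configuration; port_id, nb_rxq and nb_txq below are
 * placeholders, e.g.
 *
 *	struct rte_eth_conf conf = { 0 };
 *
 *	conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_VLAN_STRIP;
 *	rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &conf);
 */
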
static int
eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev_info->if_index = internals->if_index;
	dev_info->max_mac_addrs = 1;
	dev_info->max_rx_pktlen = RTE_ETHER_MAX_LEN;
	dev_info->max_rx_queues = (uint16_t)internals->nb_queues;
	dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
	dev_info->min_rx_bufsize = 0;
	dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
		RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP;

	return 0;
}

static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
{
	unsigned i, imax;
	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
	const struct pmd_internals *internal = dev->data->dev_private;

	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
		internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
	for (i = 0; i < imax; i++) {
		igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
		igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
		rx_total += igb_stats->q_ipackets[i];
		rx_bytes_total += igb_stats->q_ibytes[i];
	}

	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
		internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
	for (i = 0; i < imax; i++) {
		igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
		igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
		tx_total += igb_stats->q_opackets[i];
		tx_err_total += internal->tx_queue[i].err_pkts;
		tx_bytes_total += igb_stats->q_obytes[i];
	}

	igb_stats->ipackets = rx_total;
	igb_stats->ibytes = rx_bytes_total;
	igb_stats->opackets = tx_total;
	igb_stats->oerrors = tx_err_total;
	igb_stats->obytes = tx_bytes_total;
	return 0;
}

static int
eth_stats_reset(struct rte_eth_dev *dev)
{
	unsigned i;
	struct pmd_internals *internal = dev->data->dev_private;

	for (i = 0; i < internal->nb_queues; i++) {
		internal->rx_queue[i].rx_pkts = 0;
		internal->rx_queue[i].rx_bytes = 0;
	}

	for (i = 0; i < internal->nb_queues; i++) {
		internal->tx_queue[i].tx_pkts = 0;
		internal->tx_queue[i].err_pkts = 0;
		internal->tx_queue[i].tx_bytes = 0;
	}

	return 0;
}

static int
eth_dev_close(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals;
	struct tpacket_req *req;
	unsigned int q;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	PMD_LOG(INFO, "Closing AF_PACKET ethdev on NUMA socket %u",
		rte_socket_id());

	internals = dev->data->dev_private;
	req = &internals->req;
	for (q = 0; q < internals->nb_queues; q++) {
		munmap(internals->rx_queue[q].map,
			2 * req->tp_block_size * req->tp_block_nr);
		rte_free(internals->rx_queue[q].rd);
		rte_free(internals->tx_queue[q].rd);
	}
	free(internals->if_name);
	rte_free(internals->rx_queue);
	rte_free(internals->tx_queue);

	/* mac_addrs must not be freed alone because it is part of dev_private */
	dev->data->mac_addrs = NULL;
	return 0;
}

static int
eth_link_update(struct rte_eth_dev *dev __rte_unused,
		int wait_to_complete __rte_unused)
{
	return 0;
}

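/*
 * Note: the RX path copies each ring frame into a single mbuf, so the
 * mempool's data room must be able to hold a full frame payload; that
 * requirement is validated below at queue-setup time.
 */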
static int
eth_rx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t rx_queue_id,
		   uint16_t nb_rx_desc __rte_unused,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mb_pool)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_rx_queue *pkt_q = &internals->rx_queue[rx_queue_id];
	unsigned int buf_size, data_size;

	pkt_q->mb_pool = mb_pool;

	/* Now get the space available for data in the mbuf */
	buf_size = rte_pktmbuf_data_room_size(pkt_q->mb_pool) -
		RTE_PKTMBUF_HEADROOM;
	data_size = internals->req.tp_frame_size;
	data_size -= TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

	if (data_size > buf_size) {
		PMD_LOG(ERR,
			"%s: %d bytes will not fit in mbuf (%d bytes)",
			dev->device->name, data_size, buf_size);
		return -ENOMEM;
	}

	dev->data->rx_queues[rx_queue_id] = pkt_q;
	pkt_q->in_port = dev->data->port_id;
	pkt_q->vlan_strip = internals->vlan_strip;

	return 0;
}

static int
eth_tx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t tx_queue_id,
		   uint16_t nb_tx_desc __rte_unused,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_txconf *tx_conf __rte_unused)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev->data->tx_queues[tx_queue_id] = &internals->tx_queue[tx_queue_id];
	return 0;
}

static int
eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { .ifr_mtu = mtu };
	int ret;
	int s;
	unsigned int data_size = internals->req.tp_frame_size -
				 TPACKET2_HDRLEN;

	if (mtu > data_size)
		return -EINVAL;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -EINVAL;

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ret = ioctl(s, SIOCSIFMTU, &ifr);
	close(s);

	if (ret < 0)
		return -EINVAL;

	return 0;
}

static int
eth_dev_macaddr_set(struct rte_eth_dev *dev, struct rte_ether_addr *addr)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { };
	int sockfd = internals->rx_queue[0].sockfd;
	int ret;

	if (sockfd == -1) {
		PMD_LOG(ERR, "receive socket not found");
		return -EINVAL;
	}

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
	memcpy(ifr.ifr_hwaddr.sa_data, addr, sizeof(*addr));
	ret = ioctl(sockfd, SIOCSIFHWADDR, &ifr);
	if (ret < 0) {
		PMD_LOG_ERRNO(ERR, "ioctl(SIOCSIFHWADDR) failed");
		return -EINVAL;
	}

	return 0;
}

static int
eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
{
	struct ifreq ifr;
	int ret = 0;
	int s;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -errno;

	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
	ifr.ifr_flags &= mask;
	ifr.ifr_flags |= flags;
	if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
out:
	close(s);
	return ret;
}

static int
eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
}

static int
eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
}

static const struct eth_dev_ops ops = {
	.dev_start = eth_dev_start,
	.dev_stop = eth_dev_stop,
	.dev_close = eth_dev_close,
	.dev_configure = eth_dev_configure,
	.dev_infos_get = eth_dev_info,
	.mac_addr_set = eth_dev_macaddr_set,
	.mtu_set = eth_dev_mtu_set,
	.promiscuous_enable = eth_dev_promiscuous_enable,
	.promiscuous_disable = eth_dev_promiscuous_disable,
	.rx_queue_setup = eth_rx_queue_setup,
	.tx_queue_setup = eth_tx_queue_setup,
	.link_update = eth_link_update,
	.stats_get = eth_stats_get,
	.stats_reset = eth_stats_reset,
};

/*
 * Opens an AF_PACKET socket
 */
static int
open_packet_iface(const char *key __rte_unused,
		  const char *value __rte_unused,
		  void *extra_args)
{
	int *sockfd = extra_args;

	/* Open an AF_PACKET socket... */
	*sockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (*sockfd == -1) {
		PMD_LOG(ERR, "Could not open AF_PACKET socket");
		return -1;
	}

	return 0;
}

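/*
 * Ring layout: for every queue pair a single mmap() of
 * 2 * tp_block_size * tp_block_nr bytes is performed on the socket;
 * the first half holds the RX ring and the second half the TX ring.
 */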
static int
rte_pmd_init_internals(struct rte_vdev_device *dev,
		       const int sockfd,
		       const unsigned nb_queues,
		       unsigned int blocksize,
		       unsigned int blockcnt,
		       unsigned int framesize,
		       unsigned int framecnt,
		       unsigned int qdisc_bypass,
		       struct pmd_internals **internals,
		       struct rte_eth_dev **eth_dev,
		       struct rte_kvargs *kvlist)
{
	const char *name = rte_vdev_device_name(dev);
	const unsigned int numa_node = dev->device.numa_node;
	struct rte_eth_dev_data *data = NULL;
	struct rte_kvargs_pair *pair = NULL;
	struct ifreq ifr;
	size_t ifnamelen;
	unsigned k_idx;
	struct sockaddr_ll sockaddr;
	struct tpacket_req *req;
	struct pkt_rx_queue *rx_queue;
	struct pkt_tx_queue *tx_queue;
	int rc, tpver, discard;
	int qsockfd = -1;
	unsigned int i, q, rdsize;
#if defined(PACKET_FANOUT)
	int fanout_arg;
#endif

	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
		pair = &kvlist->pairs[k_idx];
		if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL)
			break;
	}
	if (pair == NULL) {
		PMD_LOG(ERR,
			"%s: no interface specified for AF_PACKET ethdev",
			name);
		return -1;
	}

	PMD_LOG(INFO,
		"%s: creating AF_PACKET-backed ethdev on numa socket %u",
		name, numa_node);

	*internals = rte_zmalloc_socket(name, sizeof(**internals),
					0, numa_node);
	if (*internals == NULL)
		return -1;

	(*internals)->rx_queue = rte_calloc_socket("af_packet_rx",
						nb_queues,
						sizeof(struct pkt_rx_queue),
						0, numa_node);
	(*internals)->tx_queue = rte_calloc_socket("af_packet_tx",
						nb_queues,
						sizeof(struct pkt_tx_queue),
						0, numa_node);
	if (!(*internals)->rx_queue || !(*internals)->tx_queue)
		goto free_internals;

	for (q = 0; q < nb_queues; q++) {
		(*internals)->rx_queue[q].map = MAP_FAILED;
		(*internals)->tx_queue[q].map = MAP_FAILED;
		(*internals)->rx_queue[q].sockfd = -1;
		(*internals)->tx_queue[q].sockfd = -1;
	}

	req = &((*internals)->req);

	req->tp_block_size = blocksize;
	req->tp_block_nr = blockcnt;
	req->tp_frame_size = framesize;
	req->tp_frame_nr = framecnt;

	ifnamelen = strlen(pair->value);
	if (ifnamelen < sizeof(ifr.ifr_name)) {
		memcpy(ifr.ifr_name, pair->value, ifnamelen);
		ifr.ifr_name[ifnamelen] = '\0';
	} else {
		PMD_LOG(ERR,
			"%s: I/F name too long (%s)",
			name, pair->value);
		goto free_internals;
	}
	if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) {
		PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name);
		goto free_internals;
	}
	(*internals)->if_name = strdup(pair->value);
	if ((*internals)->if_name == NULL)
		goto free_internals;
	(*internals)->if_index = ifr.ifr_ifindex;

	if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == -1) {
		PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFHWADDR)", name);
		goto free_internals;
	}
	memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN);

	memset(&sockaddr, 0, sizeof(sockaddr));
	sockaddr.sll_family = AF_PACKET;
	sockaddr.sll_protocol = htons(ETH_P_ALL);
	sockaddr.sll_ifindex = (*internals)->if_index;

#if defined(PACKET_FANOUT)
	fanout_arg = (getpid() ^ (*internals)->if_index) & 0xffff;
	fanout_arg |= (PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16;
#if defined(PACKET_FANOUT_FLAG_ROLLOVER)
	fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
#endif
#endif

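	/*
	 * Per-queue setup: each queue pair gets its own AF_PACKET socket,
	 * switched to TPACKET_V2, configured with PACKET_LOSS so failed
	 * transmits are dropped rather than stalling the ring, given RX/TX
	 * rings, mmap()ed, and finally bound to the interface.
	 */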
	for (q = 0; q < nb_queues; q++) {
		/* Open an AF_PACKET socket for this queue... */
		qsockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
		if (qsockfd == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not open AF_PACKET socket",
				name);
			goto error;
		}

		tpver = TPACKET_V2;
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_VERSION,
				&tpver, sizeof(tpver));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_VERSION on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		discard = 1;
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_LOSS,
				&discard, sizeof(discard));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_LOSS on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		if (qdisc_bypass) {
#if defined(PACKET_QDISC_BYPASS)
			rc = setsockopt(qsockfd, SOL_PACKET, PACKET_QDISC_BYPASS,
					&qdisc_bypass, sizeof(qdisc_bypass));
			if (rc == -1) {
				PMD_LOG_ERRNO(ERR,
					"%s: could not set PACKET_QDISC_BYPASS on AF_PACKET socket for %s",
					name, pair->value);
				goto error;
			}
#endif
		}

		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_RX_RING on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_TX_RING, req, sizeof(*req));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_TX_RING on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		rx_queue = &((*internals)->rx_queue[q]);
		rx_queue->framecount = req->tp_frame_nr;

		rx_queue->map = mmap(NULL, 2 * req->tp_block_size * req->tp_block_nr,
				    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED,
				    qsockfd, 0);
		if (rx_queue->map == MAP_FAILED) {
			PMD_LOG_ERRNO(ERR,
				"%s: call to mmap failed on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		/* rdsize is same for both Tx and Rx */
		rdsize = req->tp_frame_nr * sizeof(*(rx_queue->rd));

		rx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
		if (rx_queue->rd == NULL)
			goto error;
		for (i = 0; i < req->tp_frame_nr; ++i) {
			rx_queue->rd[i].iov_base = rx_queue->map + (i * framesize);
			rx_queue->rd[i].iov_len = req->tp_frame_size;
		}
		rx_queue->sockfd = qsockfd;

		tx_queue = &((*internals)->tx_queue[q]);
		tx_queue->framecount = req->tp_frame_nr;
		tx_queue->frame_data_size = req->tp_frame_size;
		tx_queue->frame_data_size -= TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

		tx_queue->map = rx_queue->map + req->tp_block_size * req->tp_block_nr;

		tx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
		if (tx_queue->rd == NULL)
			goto error;
		for (i = 0; i < req->tp_frame_nr; ++i) {
			tx_queue->rd[i].iov_base = tx_queue->map + (i * framesize);
			tx_queue->rd[i].iov_len = req->tp_frame_size;
		}
		tx_queue->sockfd = qsockfd;

		rc = bind(qsockfd, (const struct sockaddr*)&sockaddr, sizeof(sockaddr));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not bind AF_PACKET socket to %s",
				name, pair->value);
			goto error;
		}

#if defined(PACKET_FANOUT)
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
				&fanout_arg, sizeof(fanout_arg));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_FANOUT on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}
#endif
	}

	/* reserve an ethdev entry */
	*eth_dev = rte_eth_vdev_allocate(dev, 0);
	if (*eth_dev == NULL)
		goto error;

	/*
	 * now put it all together
	 * - store queue data in internals,
	 * - store numa_node in eth_dev
	 * - point eth_dev_data to internals
	 * - and point eth_dev structure to new eth_dev_data structure
	 */

	(*internals)->nb_queues = nb_queues;

	data = (*eth_dev)->data;
	data->dev_private = *internals;
	data->nb_rx_queues = (uint16_t)nb_queues;
	data->nb_tx_queues = (uint16_t)nb_queues;
	data->dev_link = pmd_link;
	data->mac_addrs = &(*internals)->eth_addr;
	data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;

	(*eth_dev)->dev_ops = &ops;

	return 0;

error:
	if (qsockfd != -1)
		close(qsockfd);
	for (q = 0; q < nb_queues; q++) {
		if ((*internals)->rx_queue[q].map != MAP_FAILED)
			munmap((*internals)->rx_queue[q].map,
			       2 * req->tp_block_size * req->tp_block_nr);

		rte_free((*internals)->rx_queue[q].rd);
		rte_free((*internals)->tx_queue[q].rd);
		if (((*internals)->rx_queue[q].sockfd >= 0) &&
		    ((*internals)->rx_queue[q].sockfd != qsockfd))
			close((*internals)->rx_queue[q].sockfd);
	}
free_internals:
	rte_free((*internals)->rx_queue);
	rte_free((*internals)->tx_queue);
	free((*internals)->if_name);
	rte_free(*internals);
	return -1;
}

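/*
 * Defaults: the block size follows the system page size, frames default
 * to DFLT_FRAME_SIZE (2048) bytes and DFLT_FRAME_COUNT (512) frames, and
 * the block count is derived as framecount / (blocksize / framesize).
 */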
static int
rte_eth_from_packet(struct rte_vdev_device *dev,
		    int const *sockfd,
		    struct rte_kvargs *kvlist)
{
	const char *name = rte_vdev_device_name(dev);
	struct pmd_internals *internals = NULL;
	struct rte_eth_dev *eth_dev = NULL;
	struct rte_kvargs_pair *pair = NULL;
	unsigned k_idx;
	unsigned int blockcount;
	unsigned int blocksize;
	unsigned int framesize = DFLT_FRAME_SIZE;
	unsigned int framecount = DFLT_FRAME_COUNT;
	unsigned int qpairs = 1;
	unsigned int qdisc_bypass = 1;

	/* do some parameter checking */
	if (*sockfd < 0)
		return -1;

	blocksize = getpagesize();

	/*
	 * Walk arguments for configurable settings
	 */
	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
		pair = &kvlist->pairs[k_idx];
		if (strstr(pair->key, ETH_AF_PACKET_NUM_Q_ARG) != NULL) {
			qpairs = atoi(pair->value);
			if (qpairs < 1) {
				PMD_LOG(ERR,
					"%s: invalid qpairs value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_BLOCKSIZE_ARG) != NULL) {
			blocksize = atoi(pair->value);
			if (!blocksize) {
				PMD_LOG(ERR,
					"%s: invalid blocksize value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_FRAMESIZE_ARG) != NULL) {
			framesize = atoi(pair->value);
			if (!framesize) {
				PMD_LOG(ERR,
					"%s: invalid framesize value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_FRAMECOUNT_ARG) != NULL) {
			framecount = atoi(pair->value);
			if (!framecount) {
				PMD_LOG(ERR,
					"%s: invalid framecount value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_QDISC_BYPASS_ARG) != NULL) {
			qdisc_bypass = atoi(pair->value);
			if (qdisc_bypass > 1) {
				PMD_LOG(ERR,
					"%s: invalid qdisc_bypass value",
					name);
				return -1;
			}
			continue;
		}
	}

	if (framesize > blocksize) {
		PMD_LOG(ERR,
			"%s: AF_PACKET MMAP frame size exceeds block size!",
			name);
		return -1;
	}

	blockcount = framecount / (blocksize / framesize);
	if (!blockcount) {
		PMD_LOG(ERR,
			"%s: invalid AF_PACKET MMAP parameters", name);
		return -1;
	}

	PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name);
	PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize);
	PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount);
	PMD_LOG(INFO, "%s:\tframe size %d", name, framesize);
	PMD_LOG(INFO, "%s:\tframe count %d", name, framecount);

	if (rte_pmd_init_internals(dev, *sockfd, qpairs,
				   blocksize, blockcount,
				   framesize, framecount,
				   qdisc_bypass,
				   &internals, &eth_dev,
				   kvlist) < 0)
		return -1;

	eth_dev->rx_pkt_burst = eth_af_packet_rx;
	eth_dev->tx_pkt_burst = eth_af_packet_tx;

	rte_eth_dev_probing_finish(eth_dev);
	return 0;
}

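/*
 * Illustrative invocation (assuming a host interface named "eth0"):
 *
 *	dpdk-testpmd --vdev=net_af_packet0,iface=eth0,qpairs=1,qdisc_bypass=0
 *
 * The accepted parameters are declared in RTE_PMD_REGISTER_PARAM_STRING()
 * at the bottom of this file.
 */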
static int
rte_pmd_af_packet_probe(struct rte_vdev_device *dev)
{
	int ret = 0;
	struct rte_kvargs *kvlist;
	int sockfd = -1;
	struct rte_eth_dev *eth_dev;
	const char *name = rte_vdev_device_name(dev);

	PMD_LOG(INFO, "Initializing pmd_af_packet for %s", name);

	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		eth_dev = rte_eth_dev_attach_secondary(name);
		if (!eth_dev) {
			PMD_LOG(ERR, "Failed to probe %s", name);
			return -1;
		}
		/* TODO: request info from primary to set up Rx and Tx */
		eth_dev->dev_ops = &ops;
		eth_dev->device = &dev->device;
		rte_eth_dev_probing_finish(eth_dev);
		return 0;
	}

	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
	if (kvlist == NULL) {
		ret = -1;
		goto exit;
	}

	/*
	 * If the iface argument is passed, we open the NICs and use them for
	 * reading / writing
	 */
	if (rte_kvargs_count(kvlist, ETH_AF_PACKET_IFACE_ARG) == 1) {
		ret = rte_kvargs_process(kvlist, ETH_AF_PACKET_IFACE_ARG,
					 &open_packet_iface, &sockfd);
		if (ret < 0)
			goto exit;
	}

	if (dev->device.numa_node == SOCKET_ID_ANY)
		dev->device.numa_node = rte_socket_id();

	ret = rte_eth_from_packet(dev, &sockfd, kvlist);
	close(sockfd); /* no longer needed */

exit:
	rte_kvargs_free(kvlist);
	return ret;
}

static int
rte_pmd_af_packet_remove(struct rte_vdev_device *dev)
{
	struct rte_eth_dev *eth_dev;

	if (dev == NULL)
		return -1;

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
	if (eth_dev == NULL)
		return 0; /* port already released */

	eth_dev_close(eth_dev);
	rte_eth_dev_release_port(eth_dev);

	return 0;
}

static struct rte_vdev_driver pmd_af_packet_drv = {
	.probe = rte_pmd_af_packet_probe,
	.remove = rte_pmd_af_packet_remove,
};

RTE_PMD_REGISTER_VDEV(net_af_packet, pmd_af_packet_drv);
RTE_PMD_REGISTER_ALIAS(net_af_packet, eth_af_packet);
RTE_PMD_REGISTER_PARAM_STRING(net_af_packet,
	"iface=<string> "
	"qpairs=<int> "
	"blocksz=<int> "
	"framesz=<int> "
	"framecnt=<int> "
	"qdisc_bypass=<0|1>");