1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2014 John W. Linville <linville@tuxdriver.com> 3 * Originally based upon librte_pmd_pcap code: 4 * Copyright(c) 2010-2015 Intel Corporation. 5 * Copyright(c) 2014 6WIND S.A. 6 * All rights reserved. 7 */ 8 9 #include <rte_string_fns.h> 10 #include <rte_mbuf.h> 11 #include <ethdev_driver.h> 12 #include <ethdev_vdev.h> 13 #include <rte_malloc.h> 14 #include <rte_kvargs.h> 15 #include <rte_bus_vdev.h> 16 17 #include <errno.h> 18 #include <linux/if_ether.h> 19 #include <linux/if_packet.h> 20 #include <arpa/inet.h> 21 #include <net/if.h> 22 #include <net/if_arp.h> 23 #include <sys/types.h> 24 #include <sys/socket.h> 25 #include <sys/ioctl.h> 26 #include <string.h> 27 #include <sys/mman.h> 28 #include <unistd.h> 29 #include <poll.h> 30 31 #define ETH_AF_PACKET_IFACE_ARG "iface" 32 #define ETH_AF_PACKET_NUM_Q_ARG "qpairs" 33 #define ETH_AF_PACKET_BLOCKSIZE_ARG "blocksz" 34 #define ETH_AF_PACKET_FRAMESIZE_ARG "framesz" 35 #define ETH_AF_PACKET_FRAMECOUNT_ARG "framecnt" 36 #define ETH_AF_PACKET_QDISC_BYPASS_ARG "qdisc_bypass" 37 38 #define DFLT_FRAME_SIZE (1 << 11) 39 #define DFLT_FRAME_COUNT (1 << 9) 40 41 struct pkt_rx_queue { 42 int sockfd; 43 44 struct iovec *rd; 45 uint8_t *map; 46 unsigned int framecount; 47 unsigned int framenum; 48 49 struct rte_mempool *mb_pool; 50 uint16_t in_port; 51 uint8_t vlan_strip; 52 53 volatile unsigned long rx_pkts; 54 volatile unsigned long rx_bytes; 55 }; 56 57 struct pkt_tx_queue { 58 int sockfd; 59 unsigned int frame_data_size; 60 61 struct iovec *rd; 62 uint8_t *map; 63 unsigned int framecount; 64 unsigned int framenum; 65 66 volatile unsigned long tx_pkts; 67 volatile unsigned long err_pkts; 68 volatile unsigned long tx_bytes; 69 }; 70 71 struct pmd_internals { 72 unsigned nb_queues; 73 74 int if_index; 75 char *if_name; 76 struct rte_ether_addr eth_addr; 77 78 struct tpacket_req req; 79 80 struct pkt_rx_queue *rx_queue; 81 struct pkt_tx_queue *tx_queue; 82 uint8_t vlan_strip; 83 }; 84 85 static const char *valid_arguments[] = { 86 ETH_AF_PACKET_IFACE_ARG, 87 ETH_AF_PACKET_NUM_Q_ARG, 88 ETH_AF_PACKET_BLOCKSIZE_ARG, 89 ETH_AF_PACKET_FRAMESIZE_ARG, 90 ETH_AF_PACKET_FRAMECOUNT_ARG, 91 ETH_AF_PACKET_QDISC_BYPASS_ARG, 92 NULL 93 }; 94 95 static struct rte_eth_link pmd_link = { 96 .link_speed = RTE_ETH_SPEED_NUM_10G, 97 .link_duplex = RTE_ETH_LINK_FULL_DUPLEX, 98 .link_status = RTE_ETH_LINK_DOWN, 99 .link_autoneg = RTE_ETH_LINK_FIXED, 100 }; 101 102 RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE); 103 104 #define PMD_LOG(level, fmt, args...) \ 105 rte_log(RTE_LOG_ ## level, af_packet_logtype, \ 106 "%s(): " fmt "\n", __func__, ##args) 107 108 #define PMD_LOG_ERRNO(level, fmt, args...) \ 109 rte_log(RTE_LOG_ ## level, af_packet_logtype, \ 110 "%s(): " fmt ":%s\n", __func__, ##args, strerror(errno)) 111 112 static uint16_t 113 eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) 114 { 115 unsigned i; 116 struct tpacket2_hdr *ppd; 117 struct rte_mbuf *mbuf; 118 uint8_t *pbuf; 119 struct pkt_rx_queue *pkt_q = queue; 120 uint16_t num_rx = 0; 121 unsigned long num_rx_bytes = 0; 122 unsigned int framecount, framenum; 123 124 if (unlikely(nb_pkts == 0)) 125 return 0; 126 127 /* 128 * Reads the given number of packets from the AF_PACKET socket one by 129 * one and copies the packet data into a newly allocated mbuf. 130 */ 131 framecount = pkt_q->framecount; 132 framenum = pkt_q->framenum; 133 for (i = 0; i < nb_pkts; i++) { 134 /* point at the next incoming frame */ 135 ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base; 136 if ((ppd->tp_status & TP_STATUS_USER) == 0) 137 break; 138 139 /* allocate the next mbuf */ 140 mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool); 141 if (unlikely(mbuf == NULL)) 142 break; 143 144 /* packet will fit in the mbuf, go ahead and receive it */ 145 rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen; 146 pbuf = (uint8_t *) ppd + ppd->tp_mac; 147 memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf)); 148 149 /* check for vlan info */ 150 if (ppd->tp_status & TP_STATUS_VLAN_VALID) { 151 mbuf->vlan_tci = ppd->tp_vlan_tci; 152 mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED); 153 154 if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf)) 155 PMD_LOG(ERR, "Failed to reinsert VLAN tag"); 156 } 157 158 /* release incoming frame and advance ring buffer */ 159 ppd->tp_status = TP_STATUS_KERNEL; 160 if (++framenum >= framecount) 161 framenum = 0; 162 mbuf->port = pkt_q->in_port; 163 164 /* account for the receive frame */ 165 bufs[i] = mbuf; 166 num_rx++; 167 num_rx_bytes += mbuf->pkt_len; 168 } 169 pkt_q->framenum = framenum; 170 pkt_q->rx_pkts += num_rx; 171 pkt_q->rx_bytes += num_rx_bytes; 172 return num_rx; 173 } 174 175 /* 176 * Check if there is an available frame in the ring 177 */ 178 static inline bool 179 tx_ring_status_available(uint32_t tp_status) 180 { 181 /* 182 * We eliminate the timestamp status from the packet status. 183 * This should only matter if timestamping is enabled on the socket, 184 * but there is a bug in the kernel which is fixed in newer releases. 185 * 186 * See the following kernel commit for reference: 187 * commit 171c3b151118a2fe0fc1e2a9d1b5a1570cfe82d2 188 * net: packetmmap: fix only tx timestamp on request 189 */ 190 tp_status &= ~(TP_STATUS_TS_SOFTWARE | TP_STATUS_TS_RAW_HARDWARE); 191 192 return tp_status == TP_STATUS_AVAILABLE; 193 } 194 195 /* 196 * Callback to handle sending packets through a real NIC. 197 */ 198 static uint16_t 199 eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) 200 { 201 struct tpacket2_hdr *ppd; 202 struct rte_mbuf *mbuf; 203 uint8_t *pbuf; 204 unsigned int framecount, framenum; 205 struct pollfd pfd; 206 struct pkt_tx_queue *pkt_q = queue; 207 uint16_t num_tx = 0; 208 unsigned long num_tx_bytes = 0; 209 int i; 210 211 if (unlikely(nb_pkts == 0)) 212 return 0; 213 214 memset(&pfd, 0, sizeof(pfd)); 215 pfd.fd = pkt_q->sockfd; 216 pfd.events = POLLOUT; 217 pfd.revents = 0; 218 219 framecount = pkt_q->framecount; 220 framenum = pkt_q->framenum; 221 ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base; 222 for (i = 0; i < nb_pkts; i++) { 223 mbuf = *bufs++; 224 225 /* drop oversized packets */ 226 if (mbuf->pkt_len > pkt_q->frame_data_size) { 227 rte_pktmbuf_free(mbuf); 228 continue; 229 } 230 231 /* insert vlan info if necessary */ 232 if (mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) { 233 if (rte_vlan_insert(&mbuf)) { 234 rte_pktmbuf_free(mbuf); 235 continue; 236 } 237 } 238 239 /* point at the next incoming frame */ 240 if (!tx_ring_status_available(ppd->tp_status)) { 241 if (poll(&pfd, 1, -1) < 0) 242 break; 243 244 /* poll() can return POLLERR if the interface is down */ 245 if (pfd.revents & POLLERR) 246 break; 247 } 248 249 /* 250 * poll() will almost always return POLLOUT, even if there 251 * are no extra buffers available 252 * 253 * This happens, because packet_poll() calls datagram_poll() 254 * which checks the space left in the socket buffer and, 255 * in the case of packet_mmap, the default socket buffer length 256 * doesn't match the requested size for the tx_ring. 257 * As such, there is almost always space left in socket buffer, 258 * which doesn't seem to be correlated to the requested size 259 * for the tx_ring in packet_mmap. 260 * 261 * This results in poll() returning POLLOUT. 262 */ 263 if (!tx_ring_status_available(ppd->tp_status)) 264 break; 265 266 /* copy the tx frame data */ 267 pbuf = (uint8_t *) ppd + TPACKET2_HDRLEN - 268 sizeof(struct sockaddr_ll); 269 270 struct rte_mbuf *tmp_mbuf = mbuf; 271 while (tmp_mbuf) { 272 uint16_t data_len = rte_pktmbuf_data_len(tmp_mbuf); 273 memcpy(pbuf, rte_pktmbuf_mtod(tmp_mbuf, void*), data_len); 274 pbuf += data_len; 275 tmp_mbuf = tmp_mbuf->next; 276 } 277 278 ppd->tp_len = mbuf->pkt_len; 279 ppd->tp_snaplen = mbuf->pkt_len; 280 281 /* release incoming frame and advance ring buffer */ 282 ppd->tp_status = TP_STATUS_SEND_REQUEST; 283 if (++framenum >= framecount) 284 framenum = 0; 285 ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base; 286 287 num_tx++; 288 num_tx_bytes += mbuf->pkt_len; 289 rte_pktmbuf_free(mbuf); 290 } 291 292 /* kick-off transmits */ 293 if (sendto(pkt_q->sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0) == -1 && 294 errno != ENOBUFS && errno != EAGAIN) { 295 /* 296 * In case of a ENOBUFS/EAGAIN error all of the enqueued 297 * packets will be considered successful even though only some 298 * are sent. 299 */ 300 301 num_tx = 0; 302 num_tx_bytes = 0; 303 } 304 305 pkt_q->framenum = framenum; 306 pkt_q->tx_pkts += num_tx; 307 pkt_q->err_pkts += i - num_tx; 308 pkt_q->tx_bytes += num_tx_bytes; 309 return i; 310 } 311 312 static int 313 eth_dev_start(struct rte_eth_dev *dev) 314 { 315 dev->data->dev_link.link_status = RTE_ETH_LINK_UP; 316 return 0; 317 } 318 319 /* 320 * This function gets called when the current port gets stopped. 321 */ 322 static int 323 eth_dev_stop(struct rte_eth_dev *dev) 324 { 325 unsigned i; 326 int sockfd; 327 struct pmd_internals *internals = dev->data->dev_private; 328 329 for (i = 0; i < internals->nb_queues; i++) { 330 sockfd = internals->rx_queue[i].sockfd; 331 if (sockfd != -1) 332 close(sockfd); 333 334 /* Prevent use after free in case tx fd == rx fd */ 335 if (sockfd != internals->tx_queue[i].sockfd) { 336 sockfd = internals->tx_queue[i].sockfd; 337 if (sockfd != -1) 338 close(sockfd); 339 } 340 341 internals->rx_queue[i].sockfd = -1; 342 internals->tx_queue[i].sockfd = -1; 343 } 344 345 dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN; 346 return 0; 347 } 348 349 static int 350 eth_dev_configure(struct rte_eth_dev *dev __rte_unused) 351 { 352 struct rte_eth_conf *dev_conf = &dev->data->dev_conf; 353 const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode; 354 struct pmd_internals *internals = dev->data->dev_private; 355 356 internals->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP); 357 return 0; 358 } 359 360 static int 361 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) 362 { 363 struct pmd_internals *internals = dev->data->dev_private; 364 365 dev_info->if_index = internals->if_index; 366 dev_info->max_mac_addrs = 1; 367 dev_info->max_rx_pktlen = RTE_ETHER_MAX_LEN; 368 dev_info->max_rx_queues = (uint16_t)internals->nb_queues; 369 dev_info->max_tx_queues = (uint16_t)internals->nb_queues; 370 dev_info->min_rx_bufsize = 0; 371 dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS | 372 RTE_ETH_TX_OFFLOAD_VLAN_INSERT; 373 dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP; 374 375 return 0; 376 } 377 378 static int 379 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats) 380 { 381 unsigned i, imax; 382 unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0; 383 unsigned long rx_bytes_total = 0, tx_bytes_total = 0; 384 const struct pmd_internals *internal = dev->data->dev_private; 385 386 imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ? 387 internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS); 388 for (i = 0; i < imax; i++) { 389 igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts; 390 igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes; 391 rx_total += igb_stats->q_ipackets[i]; 392 rx_bytes_total += igb_stats->q_ibytes[i]; 393 } 394 395 imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ? 396 internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS); 397 for (i = 0; i < imax; i++) { 398 igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts; 399 igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes; 400 tx_total += igb_stats->q_opackets[i]; 401 tx_err_total += internal->tx_queue[i].err_pkts; 402 tx_bytes_total += igb_stats->q_obytes[i]; 403 } 404 405 igb_stats->ipackets = rx_total; 406 igb_stats->ibytes = rx_bytes_total; 407 igb_stats->opackets = tx_total; 408 igb_stats->oerrors = tx_err_total; 409 igb_stats->obytes = tx_bytes_total; 410 return 0; 411 } 412 413 static int 414 eth_stats_reset(struct rte_eth_dev *dev) 415 { 416 unsigned i; 417 struct pmd_internals *internal = dev->data->dev_private; 418 419 for (i = 0; i < internal->nb_queues; i++) { 420 internal->rx_queue[i].rx_pkts = 0; 421 internal->rx_queue[i].rx_bytes = 0; 422 } 423 424 for (i = 0; i < internal->nb_queues; i++) { 425 internal->tx_queue[i].tx_pkts = 0; 426 internal->tx_queue[i].err_pkts = 0; 427 internal->tx_queue[i].tx_bytes = 0; 428 } 429 430 return 0; 431 } 432 433 static int 434 eth_dev_close(struct rte_eth_dev *dev) 435 { 436 struct pmd_internals *internals; 437 struct tpacket_req *req; 438 unsigned int q; 439 440 if (rte_eal_process_type() != RTE_PROC_PRIMARY) 441 return 0; 442 443 PMD_LOG(INFO, "Closing AF_PACKET ethdev on NUMA socket %u", 444 rte_socket_id()); 445 446 internals = dev->data->dev_private; 447 req = &internals->req; 448 for (q = 0; q < internals->nb_queues; q++) { 449 munmap(internals->rx_queue[q].map, 450 2 * req->tp_block_size * req->tp_block_nr); 451 rte_free(internals->rx_queue[q].rd); 452 rte_free(internals->tx_queue[q].rd); 453 } 454 free(internals->if_name); 455 rte_free(internals->rx_queue); 456 rte_free(internals->tx_queue); 457 458 /* mac_addrs must not be freed alone because part of dev_private */ 459 dev->data->mac_addrs = NULL; 460 return 0; 461 } 462 463 static int 464 eth_link_update(struct rte_eth_dev *dev __rte_unused, 465 int wait_to_complete __rte_unused) 466 { 467 return 0; 468 } 469 470 static int 471 eth_rx_queue_setup(struct rte_eth_dev *dev, 472 uint16_t rx_queue_id, 473 uint16_t nb_rx_desc __rte_unused, 474 unsigned int socket_id __rte_unused, 475 const struct rte_eth_rxconf *rx_conf __rte_unused, 476 struct rte_mempool *mb_pool) 477 { 478 struct pmd_internals *internals = dev->data->dev_private; 479 struct pkt_rx_queue *pkt_q = &internals->rx_queue[rx_queue_id]; 480 unsigned int buf_size, data_size; 481 482 pkt_q->mb_pool = mb_pool; 483 484 /* Now get the space available for data in the mbuf */ 485 buf_size = rte_pktmbuf_data_room_size(pkt_q->mb_pool) - 486 RTE_PKTMBUF_HEADROOM; 487 data_size = internals->req.tp_frame_size; 488 data_size -= TPACKET2_HDRLEN - sizeof(struct sockaddr_ll); 489 490 if (data_size > buf_size) { 491 PMD_LOG(ERR, 492 "%s: %d bytes will not fit in mbuf (%d bytes)", 493 dev->device->name, data_size, buf_size); 494 return -ENOMEM; 495 } 496 497 dev->data->rx_queues[rx_queue_id] = pkt_q; 498 pkt_q->in_port = dev->data->port_id; 499 pkt_q->vlan_strip = internals->vlan_strip; 500 501 return 0; 502 } 503 504 static int 505 eth_tx_queue_setup(struct rte_eth_dev *dev, 506 uint16_t tx_queue_id, 507 uint16_t nb_tx_desc __rte_unused, 508 unsigned int socket_id __rte_unused, 509 const struct rte_eth_txconf *tx_conf __rte_unused) 510 { 511 512 struct pmd_internals *internals = dev->data->dev_private; 513 514 dev->data->tx_queues[tx_queue_id] = &internals->tx_queue[tx_queue_id]; 515 return 0; 516 } 517 518 static int 519 eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu) 520 { 521 struct pmd_internals *internals = dev->data->dev_private; 522 struct ifreq ifr = { .ifr_mtu = mtu }; 523 int ret; 524 int s; 525 unsigned int data_size = internals->req.tp_frame_size - 526 TPACKET2_HDRLEN; 527 528 if (mtu > data_size) 529 return -EINVAL; 530 531 s = socket(PF_INET, SOCK_DGRAM, 0); 532 if (s < 0) 533 return -EINVAL; 534 535 strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ); 536 ret = ioctl(s, SIOCSIFMTU, &ifr); 537 close(s); 538 539 if (ret < 0) 540 return -EINVAL; 541 542 return 0; 543 } 544 545 static int 546 eth_dev_macaddr_set(struct rte_eth_dev *dev, struct rte_ether_addr *addr) 547 { 548 struct pmd_internals *internals = dev->data->dev_private; 549 struct ifreq ifr = { }; 550 int sockfd = internals->rx_queue[0].sockfd; 551 int ret; 552 553 if (sockfd == -1) { 554 PMD_LOG(ERR, "receive socket not found"); 555 return -EINVAL; 556 } 557 558 strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ); 559 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER; 560 memcpy(ifr.ifr_hwaddr.sa_data, addr, sizeof(*addr)); 561 ret = ioctl(sockfd, SIOCSIFHWADDR, &ifr); 562 563 if (ret < 0) { 564 PMD_LOG_ERRNO(ERR, "ioctl(SIOCSIFHWADDR) failed"); 565 return -EINVAL; 566 } 567 568 return 0; 569 } 570 571 static int 572 eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask) 573 { 574 struct ifreq ifr; 575 int ret = 0; 576 int s; 577 578 s = socket(PF_INET, SOCK_DGRAM, 0); 579 if (s < 0) 580 return -errno; 581 582 strlcpy(ifr.ifr_name, if_name, IFNAMSIZ); 583 if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) { 584 ret = -errno; 585 goto out; 586 } 587 ifr.ifr_flags &= mask; 588 ifr.ifr_flags |= flags; 589 if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { 590 ret = -errno; 591 goto out; 592 } 593 out: 594 close(s); 595 return ret; 596 } 597 598 static int 599 eth_dev_promiscuous_enable(struct rte_eth_dev *dev) 600 { 601 struct pmd_internals *internals = dev->data->dev_private; 602 603 return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0); 604 } 605 606 static int 607 eth_dev_promiscuous_disable(struct rte_eth_dev *dev) 608 { 609 struct pmd_internals *internals = dev->data->dev_private; 610 611 return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC); 612 } 613 614 static const struct eth_dev_ops ops = { 615 .dev_start = eth_dev_start, 616 .dev_stop = eth_dev_stop, 617 .dev_close = eth_dev_close, 618 .dev_configure = eth_dev_configure, 619 .dev_infos_get = eth_dev_info, 620 .mac_addr_set = eth_dev_macaddr_set, 621 .mtu_set = eth_dev_mtu_set, 622 .promiscuous_enable = eth_dev_promiscuous_enable, 623 .promiscuous_disable = eth_dev_promiscuous_disable, 624 .rx_queue_setup = eth_rx_queue_setup, 625 .tx_queue_setup = eth_tx_queue_setup, 626 .link_update = eth_link_update, 627 .stats_get = eth_stats_get, 628 .stats_reset = eth_stats_reset, 629 }; 630 631 /* 632 * Opens an AF_PACKET socket 633 */ 634 static int 635 open_packet_iface(const char *key __rte_unused, 636 const char *value __rte_unused, 637 void *extra_args) 638 { 639 int *sockfd = extra_args; 640 641 /* Open an AF_PACKET socket... */ 642 *sockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); 643 if (*sockfd == -1) { 644 PMD_LOG(ERR, "Could not open AF_PACKET socket"); 645 return -1; 646 } 647 648 return 0; 649 } 650 651 static int 652 rte_pmd_init_internals(struct rte_vdev_device *dev, 653 const int sockfd, 654 const unsigned nb_queues, 655 unsigned int blocksize, 656 unsigned int blockcnt, 657 unsigned int framesize, 658 unsigned int framecnt, 659 unsigned int qdisc_bypass, 660 struct pmd_internals **internals, 661 struct rte_eth_dev **eth_dev, 662 struct rte_kvargs *kvlist) 663 { 664 const char *name = rte_vdev_device_name(dev); 665 const unsigned int numa_node = dev->device.numa_node; 666 struct rte_eth_dev_data *data = NULL; 667 struct rte_kvargs_pair *pair = NULL; 668 struct ifreq ifr; 669 size_t ifnamelen; 670 unsigned k_idx; 671 struct sockaddr_ll sockaddr; 672 struct tpacket_req *req; 673 struct pkt_rx_queue *rx_queue; 674 struct pkt_tx_queue *tx_queue; 675 int rc, tpver, discard; 676 int qsockfd = -1; 677 unsigned int i, q, rdsize; 678 #if defined(PACKET_FANOUT) 679 int fanout_arg; 680 #endif 681 682 for (k_idx = 0; k_idx < kvlist->count; k_idx++) { 683 pair = &kvlist->pairs[k_idx]; 684 if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL) 685 break; 686 } 687 if (pair == NULL) { 688 PMD_LOG(ERR, 689 "%s: no interface specified for AF_PACKET ethdev", 690 name); 691 return -1; 692 } 693 694 PMD_LOG(INFO, 695 "%s: creating AF_PACKET-backed ethdev on numa socket %u", 696 name, numa_node); 697 698 *internals = rte_zmalloc_socket(name, sizeof(**internals), 699 0, numa_node); 700 if (*internals == NULL) 701 return -1; 702 703 704 (*internals)->rx_queue = rte_calloc_socket("af_packet_rx", 705 nb_queues, 706 sizeof(struct pkt_rx_queue), 707 0, numa_node); 708 (*internals)->tx_queue = rte_calloc_socket("af_packet_tx", 709 nb_queues, 710 sizeof(struct pkt_tx_queue), 711 0, numa_node); 712 if (!(*internals)->rx_queue || !(*internals)->tx_queue) { 713 goto free_internals; 714 } 715 716 for (q = 0; q < nb_queues; q++) { 717 (*internals)->rx_queue[q].map = MAP_FAILED; 718 (*internals)->tx_queue[q].map = MAP_FAILED; 719 (*internals)->rx_queue[q].sockfd = -1; 720 (*internals)->tx_queue[q].sockfd = -1; 721 } 722 723 req = &((*internals)->req); 724 725 req->tp_block_size = blocksize; 726 req->tp_block_nr = blockcnt; 727 req->tp_frame_size = framesize; 728 req->tp_frame_nr = framecnt; 729 730 ifnamelen = strlen(pair->value); 731 if (ifnamelen < sizeof(ifr.ifr_name)) { 732 memcpy(ifr.ifr_name, pair->value, ifnamelen); 733 ifr.ifr_name[ifnamelen] = '\0'; 734 } else { 735 PMD_LOG(ERR, 736 "%s: I/F name too long (%s)", 737 name, pair->value); 738 goto free_internals; 739 } 740 if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) { 741 PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name); 742 goto free_internals; 743 } 744 (*internals)->if_name = strdup(pair->value); 745 if ((*internals)->if_name == NULL) 746 goto free_internals; 747 (*internals)->if_index = ifr.ifr_ifindex; 748 749 if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == -1) { 750 PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFHWADDR)", name); 751 goto free_internals; 752 } 753 memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN); 754 755 memset(&sockaddr, 0, sizeof(sockaddr)); 756 sockaddr.sll_family = AF_PACKET; 757 sockaddr.sll_protocol = htons(ETH_P_ALL); 758 sockaddr.sll_ifindex = (*internals)->if_index; 759 760 #if defined(PACKET_FANOUT) 761 fanout_arg = (getpid() ^ (*internals)->if_index) & 0xffff; 762 fanout_arg |= (PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16; 763 #if defined(PACKET_FANOUT_FLAG_ROLLOVER) 764 fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16; 765 #endif 766 #endif 767 768 for (q = 0; q < nb_queues; q++) { 769 /* Open an AF_PACKET socket for this queue... */ 770 qsockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); 771 if (qsockfd == -1) { 772 PMD_LOG_ERRNO(ERR, 773 "%s: could not open AF_PACKET socket", 774 name); 775 goto error; 776 } 777 778 tpver = TPACKET_V2; 779 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_VERSION, 780 &tpver, sizeof(tpver)); 781 if (rc == -1) { 782 PMD_LOG_ERRNO(ERR, 783 "%s: could not set PACKET_VERSION on AF_PACKET socket for %s", 784 name, pair->value); 785 goto error; 786 } 787 788 discard = 1; 789 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_LOSS, 790 &discard, sizeof(discard)); 791 if (rc == -1) { 792 PMD_LOG_ERRNO(ERR, 793 "%s: could not set PACKET_LOSS on AF_PACKET socket for %s", 794 name, pair->value); 795 goto error; 796 } 797 798 if (qdisc_bypass) { 799 #if defined(PACKET_QDISC_BYPASS) 800 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_QDISC_BYPASS, 801 &qdisc_bypass, sizeof(qdisc_bypass)); 802 if (rc == -1) { 803 PMD_LOG_ERRNO(ERR, 804 "%s: could not set PACKET_QDISC_BYPASS on AF_PACKET socket for %s", 805 name, pair->value); 806 goto error; 807 } 808 #endif 809 } 810 811 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)); 812 if (rc == -1) { 813 PMD_LOG_ERRNO(ERR, 814 "%s: could not set PACKET_RX_RING on AF_PACKET socket for %s", 815 name, pair->value); 816 goto error; 817 } 818 819 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_TX_RING, req, sizeof(*req)); 820 if (rc == -1) { 821 PMD_LOG_ERRNO(ERR, 822 "%s: could not set PACKET_TX_RING on AF_PACKET " 823 "socket for %s", name, pair->value); 824 goto error; 825 } 826 827 rx_queue = &((*internals)->rx_queue[q]); 828 rx_queue->framecount = req->tp_frame_nr; 829 830 rx_queue->map = mmap(NULL, 2 * req->tp_block_size * req->tp_block_nr, 831 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, 832 qsockfd, 0); 833 if (rx_queue->map == MAP_FAILED) { 834 PMD_LOG_ERRNO(ERR, 835 "%s: call to mmap failed on AF_PACKET socket for %s", 836 name, pair->value); 837 goto error; 838 } 839 840 /* rdsize is same for both Tx and Rx */ 841 rdsize = req->tp_frame_nr * sizeof(*(rx_queue->rd)); 842 843 rx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node); 844 if (rx_queue->rd == NULL) 845 goto error; 846 for (i = 0; i < req->tp_frame_nr; ++i) { 847 rx_queue->rd[i].iov_base = rx_queue->map + (i * framesize); 848 rx_queue->rd[i].iov_len = req->tp_frame_size; 849 } 850 rx_queue->sockfd = qsockfd; 851 852 tx_queue = &((*internals)->tx_queue[q]); 853 tx_queue->framecount = req->tp_frame_nr; 854 tx_queue->frame_data_size = req->tp_frame_size; 855 tx_queue->frame_data_size -= TPACKET2_HDRLEN - 856 sizeof(struct sockaddr_ll); 857 858 tx_queue->map = rx_queue->map + req->tp_block_size * req->tp_block_nr; 859 860 tx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node); 861 if (tx_queue->rd == NULL) 862 goto error; 863 for (i = 0; i < req->tp_frame_nr; ++i) { 864 tx_queue->rd[i].iov_base = tx_queue->map + (i * framesize); 865 tx_queue->rd[i].iov_len = req->tp_frame_size; 866 } 867 tx_queue->sockfd = qsockfd; 868 869 rc = bind(qsockfd, (const struct sockaddr*)&sockaddr, sizeof(sockaddr)); 870 if (rc == -1) { 871 PMD_LOG_ERRNO(ERR, 872 "%s: could not bind AF_PACKET socket to %s", 873 name, pair->value); 874 goto error; 875 } 876 877 #if defined(PACKET_FANOUT) 878 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT, 879 &fanout_arg, sizeof(fanout_arg)); 880 if (rc == -1) { 881 PMD_LOG_ERRNO(ERR, 882 "%s: could not set PACKET_FANOUT on AF_PACKET socket for %s", 883 name, pair->value); 884 goto error; 885 } 886 #endif 887 } 888 889 /* reserve an ethdev entry */ 890 *eth_dev = rte_eth_vdev_allocate(dev, 0); 891 if (*eth_dev == NULL) 892 goto error; 893 894 /* 895 * now put it all together 896 * - store queue data in internals, 897 * - store numa_node in eth_dev 898 * - point eth_dev_data to internals 899 * - and point eth_dev structure to new eth_dev_data structure 900 */ 901 902 (*internals)->nb_queues = nb_queues; 903 904 data = (*eth_dev)->data; 905 data->dev_private = *internals; 906 data->nb_rx_queues = (uint16_t)nb_queues; 907 data->nb_tx_queues = (uint16_t)nb_queues; 908 data->dev_link = pmd_link; 909 data->mac_addrs = &(*internals)->eth_addr; 910 data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS; 911 912 (*eth_dev)->dev_ops = &ops; 913 914 return 0; 915 916 error: 917 if (qsockfd != -1) 918 close(qsockfd); 919 for (q = 0; q < nb_queues; q++) { 920 if ((*internals)->rx_queue[q].map != MAP_FAILED) 921 munmap((*internals)->rx_queue[q].map, 922 2 * req->tp_block_size * req->tp_block_nr); 923 924 rte_free((*internals)->rx_queue[q].rd); 925 rte_free((*internals)->tx_queue[q].rd); 926 if (((*internals)->rx_queue[q].sockfd >= 0) && 927 ((*internals)->rx_queue[q].sockfd != qsockfd)) 928 close((*internals)->rx_queue[q].sockfd); 929 } 930 free_internals: 931 rte_free((*internals)->rx_queue); 932 rte_free((*internals)->tx_queue); 933 free((*internals)->if_name); 934 rte_free(*internals); 935 return -1; 936 } 937 938 static int 939 rte_eth_from_packet(struct rte_vdev_device *dev, 940 int const *sockfd, 941 struct rte_kvargs *kvlist) 942 { 943 const char *name = rte_vdev_device_name(dev); 944 struct pmd_internals *internals = NULL; 945 struct rte_eth_dev *eth_dev = NULL; 946 struct rte_kvargs_pair *pair = NULL; 947 unsigned k_idx; 948 unsigned int blockcount; 949 unsigned int blocksize; 950 unsigned int framesize = DFLT_FRAME_SIZE; 951 unsigned int framecount = DFLT_FRAME_COUNT; 952 unsigned int qpairs = 1; 953 unsigned int qdisc_bypass = 1; 954 955 /* do some parameter checking */ 956 if (*sockfd < 0) 957 return -1; 958 959 blocksize = getpagesize(); 960 961 /* 962 * Walk arguments for configurable settings 963 */ 964 for (k_idx = 0; k_idx < kvlist->count; k_idx++) { 965 pair = &kvlist->pairs[k_idx]; 966 if (strstr(pair->key, ETH_AF_PACKET_NUM_Q_ARG) != NULL) { 967 qpairs = atoi(pair->value); 968 if (qpairs < 1) { 969 PMD_LOG(ERR, 970 "%s: invalid qpairs value", 971 name); 972 return -1; 973 } 974 continue; 975 } 976 if (strstr(pair->key, ETH_AF_PACKET_BLOCKSIZE_ARG) != NULL) { 977 blocksize = atoi(pair->value); 978 if (!blocksize) { 979 PMD_LOG(ERR, 980 "%s: invalid blocksize value", 981 name); 982 return -1; 983 } 984 continue; 985 } 986 if (strstr(pair->key, ETH_AF_PACKET_FRAMESIZE_ARG) != NULL) { 987 framesize = atoi(pair->value); 988 if (!framesize) { 989 PMD_LOG(ERR, 990 "%s: invalid framesize value", 991 name); 992 return -1; 993 } 994 continue; 995 } 996 if (strstr(pair->key, ETH_AF_PACKET_FRAMECOUNT_ARG) != NULL) { 997 framecount = atoi(pair->value); 998 if (!framecount) { 999 PMD_LOG(ERR, 1000 "%s: invalid framecount value", 1001 name); 1002 return -1; 1003 } 1004 continue; 1005 } 1006 if (strstr(pair->key, ETH_AF_PACKET_QDISC_BYPASS_ARG) != NULL) { 1007 qdisc_bypass = atoi(pair->value); 1008 if (qdisc_bypass > 1) { 1009 PMD_LOG(ERR, 1010 "%s: invalid bypass value", 1011 name); 1012 return -1; 1013 } 1014 continue; 1015 } 1016 } 1017 1018 if (framesize > blocksize) { 1019 PMD_LOG(ERR, 1020 "%s: AF_PACKET MMAP frame size exceeds block size!", 1021 name); 1022 return -1; 1023 } 1024 1025 blockcount = framecount / (blocksize / framesize); 1026 if (!blockcount) { 1027 PMD_LOG(ERR, 1028 "%s: invalid AF_PACKET MMAP parameters", name); 1029 return -1; 1030 } 1031 1032 PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name); 1033 PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize); 1034 PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount); 1035 PMD_LOG(INFO, "%s:\tframe size %d", name, framesize); 1036 PMD_LOG(INFO, "%s:\tframe count %d", name, framecount); 1037 1038 if (rte_pmd_init_internals(dev, *sockfd, qpairs, 1039 blocksize, blockcount, 1040 framesize, framecount, 1041 qdisc_bypass, 1042 &internals, ð_dev, 1043 kvlist) < 0) 1044 return -1; 1045 1046 eth_dev->rx_pkt_burst = eth_af_packet_rx; 1047 eth_dev->tx_pkt_burst = eth_af_packet_tx; 1048 1049 rte_eth_dev_probing_finish(eth_dev); 1050 return 0; 1051 } 1052 1053 static int 1054 rte_pmd_af_packet_probe(struct rte_vdev_device *dev) 1055 { 1056 int ret = 0; 1057 struct rte_kvargs *kvlist; 1058 int sockfd = -1; 1059 struct rte_eth_dev *eth_dev; 1060 const char *name = rte_vdev_device_name(dev); 1061 1062 PMD_LOG(INFO, "Initializing pmd_af_packet for %s", name); 1063 1064 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 1065 eth_dev = rte_eth_dev_attach_secondary(name); 1066 if (!eth_dev) { 1067 PMD_LOG(ERR, "Failed to probe %s", name); 1068 return -1; 1069 } 1070 /* TODO: request info from primary to set up Rx and Tx */ 1071 eth_dev->dev_ops = &ops; 1072 eth_dev->device = &dev->device; 1073 rte_eth_dev_probing_finish(eth_dev); 1074 return 0; 1075 } 1076 1077 kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments); 1078 if (kvlist == NULL) { 1079 ret = -1; 1080 goto exit; 1081 } 1082 1083 /* 1084 * If iface argument is passed we open the NICs and use them for 1085 * reading / writing 1086 */ 1087 if (rte_kvargs_count(kvlist, ETH_AF_PACKET_IFACE_ARG) == 1) { 1088 1089 ret = rte_kvargs_process(kvlist, ETH_AF_PACKET_IFACE_ARG, 1090 &open_packet_iface, &sockfd); 1091 if (ret < 0) 1092 goto exit; 1093 } 1094 1095 if (dev->device.numa_node == SOCKET_ID_ANY) 1096 dev->device.numa_node = rte_socket_id(); 1097 1098 ret = rte_eth_from_packet(dev, &sockfd, kvlist); 1099 close(sockfd); /* no longer needed */ 1100 1101 exit: 1102 rte_kvargs_free(kvlist); 1103 return ret; 1104 } 1105 1106 static int 1107 rte_pmd_af_packet_remove(struct rte_vdev_device *dev) 1108 { 1109 struct rte_eth_dev *eth_dev; 1110 1111 if (dev == NULL) 1112 return -1; 1113 1114 /* find the ethdev entry */ 1115 eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev)); 1116 if (eth_dev == NULL) 1117 return 0; /* port already released */ 1118 1119 eth_dev_close(eth_dev); 1120 rte_eth_dev_release_port(eth_dev); 1121 1122 return 0; 1123 } 1124 1125 static struct rte_vdev_driver pmd_af_packet_drv = { 1126 .probe = rte_pmd_af_packet_probe, 1127 .remove = rte_pmd_af_packet_remove, 1128 }; 1129 1130 RTE_PMD_REGISTER_VDEV(net_af_packet, pmd_af_packet_drv); 1131 RTE_PMD_REGISTER_ALIAS(net_af_packet, eth_af_packet); 1132 RTE_PMD_REGISTER_PARAM_STRING(net_af_packet, 1133 "iface=<string> " 1134 "qpairs=<int> " 1135 "blocksz=<int> " 1136 "framesz=<int> " 1137 "framecnt=<int> " 1138 "qdisc_bypass=<0|1>"); 1139