/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2014 John W. Linville <linville@tuxdriver.com>
 * Originally based upon librte_pmd_pcap code:
 * Copyright(c) 2010-2015 Intel Corporation.
 * Copyright(c) 2014 6WIND S.A.
 * All rights reserved.
 */

#include <rte_string_fns.h>
#include <rte_mbuf.h>
#include <ethdev_driver.h>
#include <ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_kvargs.h>
#include <bus_vdev_driver.h>

#include <errno.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <poll.h>

#define ETH_AF_PACKET_IFACE_ARG		"iface"
#define ETH_AF_PACKET_NUM_Q_ARG		"qpairs"
#define ETH_AF_PACKET_BLOCKSIZE_ARG	"blocksz"
#define ETH_AF_PACKET_FRAMESIZE_ARG	"framesz"
#define ETH_AF_PACKET_FRAMECOUNT_ARG	"framecnt"
#define ETH_AF_PACKET_QDISC_BYPASS_ARG	"qdisc_bypass"

#define DFLT_FRAME_SIZE		(1 << 11)
#define DFLT_FRAME_COUNT	(1 << 9)

struct pkt_rx_queue {
	int sockfd;

	struct iovec *rd;
	uint8_t *map;
	unsigned int framecount;
	unsigned int framenum;

	struct rte_mempool *mb_pool;
	uint16_t in_port;
	uint8_t vlan_strip;

	volatile unsigned long rx_pkts;
	volatile unsigned long rx_bytes;
};

struct pkt_tx_queue {
	int sockfd;
	unsigned int frame_data_size;

	struct iovec *rd;
	uint8_t *map;
	unsigned int framecount;
	unsigned int framenum;

	volatile unsigned long tx_pkts;
	volatile unsigned long err_pkts;
	volatile unsigned long tx_bytes;
};

struct pmd_internals {
	unsigned nb_queues;

	int if_index;
	char *if_name;
	struct rte_ether_addr eth_addr;

	struct tpacket_req req;

	struct pkt_rx_queue *rx_queue;
	struct pkt_tx_queue *tx_queue;
	uint8_t vlan_strip;
};

static const char *valid_arguments[] = {
	ETH_AF_PACKET_IFACE_ARG,
	ETH_AF_PACKET_NUM_Q_ARG,
	ETH_AF_PACKET_BLOCKSIZE_ARG,
	ETH_AF_PACKET_FRAMESIZE_ARG,
	ETH_AF_PACKET_FRAMECOUNT_ARG,
	ETH_AF_PACKET_QDISC_BYPASS_ARG,
	NULL
};

static struct rte_eth_link pmd_link = {
	.link_speed = RTE_ETH_SPEED_NUM_10G,
	.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
	.link_status = RTE_ETH_LINK_DOWN,
	.link_autoneg = RTE_ETH_LINK_FIXED,
};

RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE);

#define PMD_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, af_packet_logtype, \
		"%s(): " fmt "\n", __func__, ##args)

#define PMD_LOG_ERRNO(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, af_packet_logtype, \
		"%s(): " fmt ":%s\n", __func__, ##args, strerror(errno))

static uint16_t
eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	unsigned i;
	struct tpacket2_hdr *ppd;
	struct rte_mbuf *mbuf;
	uint8_t *pbuf;
	struct pkt_rx_queue *pkt_q = queue;
	uint16_t num_rx = 0;
	unsigned long num_rx_bytes = 0;
	unsigned int framecount, framenum;

	if (unlikely(nb_pkts == 0))
		return 0;

	/*
	 * Reads the given number of packets from the AF_PACKET socket one by
	 * one and copies the packet data into a newly allocated mbuf.
	 */
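	/*
	 * TPACKET_V2 ring handoff: the kernel flips tp_status to
	 * TP_STATUS_USER when a frame is ready for user space; the slot is
	 * handed back by writing TP_STATUS_KERNEL once the payload has been
	 * copied out, and framenum advances modulo framecount.
	 */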
	framecount = pkt_q->framecount;
	framenum = pkt_q->framenum;
	for (i = 0; i < nb_pkts; i++) {
		/* point at the next incoming frame */
		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
		if ((ppd->tp_status & TP_STATUS_USER) == 0)
			break;

		/* allocate the next mbuf */
		mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool);
		if (unlikely(mbuf == NULL))
			break;

		/* packet will fit in the mbuf, go ahead and receive it */
		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen;
		pbuf = (uint8_t *) ppd + ppd->tp_mac;
		memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf));

		/* check for vlan info */
		if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
			mbuf->vlan_tci = ppd->tp_vlan_tci;
			mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);

			if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf))
				PMD_LOG(ERR, "Failed to reinsert VLAN tag");
		}

		/* release incoming frame and advance ring buffer */
		ppd->tp_status = TP_STATUS_KERNEL;
		if (++framenum >= framecount)
			framenum = 0;
		mbuf->port = pkt_q->in_port;

		/* account for the receive frame */
		bufs[i] = mbuf;
		num_rx++;
		num_rx_bytes += mbuf->pkt_len;
	}
	pkt_q->framenum = framenum;
	pkt_q->rx_pkts += num_rx;
	pkt_q->rx_bytes += num_rx_bytes;
	return num_rx;
}

/*
 * Check if there is an available frame in the ring
 */
static inline bool
tx_ring_status_available(uint32_t tp_status)
{
	/*
	 * We eliminate the timestamp status from the packet status.
	 * This should only matter if timestamping is enabled on the socket,
	 * but there is a bug in the kernel which is fixed in newer releases.
	 *
	 * See the following kernel commit for reference:
	 * commit 171c3b151118a2fe0fc1e2a9d1b5a1570cfe82d2
	 * net: packetmmap: fix only tx timestamp on request
	 */
	tp_status &= ~(TP_STATUS_TS_SOFTWARE | TP_STATUS_TS_RAW_HARDWARE);

	return tp_status == TP_STATUS_AVAILABLE;
}
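/*
 * Tx follows the packet_mmap convention: each mbuf is copied into the next
 * free ring slot, the slot is flagged TP_STATUS_SEND_REQUEST, and a final
 * zero-length sendto() asks the kernel to transmit everything staged so far.
 */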
/*
 * Callback to handle sending packets through a real NIC.
 */
static uint16_t
eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct tpacket2_hdr *ppd;
	struct rte_mbuf *mbuf;
	uint8_t *pbuf;
	unsigned int framecount, framenum;
	struct pollfd pfd;
	struct pkt_tx_queue *pkt_q = queue;
	uint16_t num_tx = 0;
	unsigned long num_tx_bytes = 0;
	int i;

	if (unlikely(nb_pkts == 0))
		return 0;

	memset(&pfd, 0, sizeof(pfd));
	pfd.fd = pkt_q->sockfd;
	pfd.events = POLLOUT;
	pfd.revents = 0;

	framecount = pkt_q->framecount;
	framenum = pkt_q->framenum;
	ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
	for (i = 0; i < nb_pkts; i++) {
		mbuf = *bufs++;

		/* drop oversized packets */
		if (mbuf->pkt_len > pkt_q->frame_data_size) {
			rte_pktmbuf_free(mbuf);
			continue;
		}

		/* insert vlan info if necessary */
		if (mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
			if (rte_vlan_insert(&mbuf)) {
				rte_pktmbuf_free(mbuf);
				continue;
			}
		}

		/* point at the next incoming frame */
		if (!tx_ring_status_available(ppd->tp_status)) {
			if (poll(&pfd, 1, -1) < 0)
				break;

			/* poll() can return POLLERR if the interface is down */
			if (pfd.revents & POLLERR)
				break;
		}

		/*
		 * poll() will almost always return POLLOUT, even if there
		 * are no extra buffers available.
		 *
		 * This happens because packet_poll() calls datagram_poll(),
		 * which checks the space left in the socket buffer and,
		 * in the case of packet_mmap, the default socket buffer length
		 * doesn't match the requested size for the tx_ring.
		 * As such, there is almost always space left in the socket
		 * buffer, which doesn't seem to be correlated to the requested
		 * size for the tx_ring in packet_mmap.
		 *
		 * This results in poll() returning POLLOUT.
		 */
		if (!tx_ring_status_available(ppd->tp_status))
			break;

		/* copy the tx frame data */
		pbuf = (uint8_t *) ppd + TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

		struct rte_mbuf *tmp_mbuf = mbuf;
		while (tmp_mbuf) {
			uint16_t data_len = rte_pktmbuf_data_len(tmp_mbuf);
			memcpy(pbuf, rte_pktmbuf_mtod(tmp_mbuf, void*), data_len);
			pbuf += data_len;
			tmp_mbuf = tmp_mbuf->next;
		}

		ppd->tp_len = mbuf->pkt_len;
		ppd->tp_snaplen = mbuf->pkt_len;

		/* release incoming frame and advance ring buffer */
		ppd->tp_status = TP_STATUS_SEND_REQUEST;
		if (++framenum >= framecount)
			framenum = 0;
		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;

		num_tx++;
		num_tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	/* kick-off transmits */
	if (sendto(pkt_q->sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0) == -1 &&
	    errno != ENOBUFS && errno != EAGAIN) {
		/*
		 * In case of an ENOBUFS/EAGAIN error all of the enqueued
		 * packets will be considered successful even though only some
		 * are sent.
		 */
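		/*
		 * Frames already flagged TP_STATUS_SEND_REQUEST remain staged
		 * in the ring; they are all reported via err_pkts below, even
		 * though the kernel may still transmit some of them on a
		 * later flush.
		 */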
		num_tx = 0;
		num_tx_bytes = 0;
	}

	pkt_q->framenum = framenum;
	pkt_q->tx_pkts += num_tx;
	pkt_q->err_pkts += i - num_tx;
	pkt_q->tx_bytes += num_tx_bytes;
	return i;
}

static int
eth_dev_start(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;
	uint16_t i;

	dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
	for (i = 0; i < internals->nb_queues; i++) {
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
	}
	return 0;
}

/*
 * This function gets called when the current port gets stopped.
 */
static int
eth_dev_stop(struct rte_eth_dev *dev)
{
	unsigned i;
	int sockfd;
	struct pmd_internals *internals = dev->data->dev_private;

	for (i = 0; i < internals->nb_queues; i++) {
		sockfd = internals->rx_queue[i].sockfd;
		if (sockfd != -1)
			close(sockfd);

		/* Prevent use after free in case tx fd == rx fd */
		if (sockfd != internals->tx_queue[i].sockfd) {
			sockfd = internals->tx_queue[i].sockfd;
			if (sockfd != -1)
				close(sockfd);
		}

		internals->rx_queue[i].sockfd = -1;
		internals->tx_queue[i].sockfd = -1;
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
	}

	dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
	return 0;
}

static int
eth_dev_configure(struct rte_eth_dev *dev)
{
	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
	const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode;
	struct pmd_internals *internals = dev->data->dev_private;

	internals->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
	return 0;
}

static int
eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev_info->if_index = internals->if_index;
	dev_info->max_mac_addrs = 1;
	dev_info->max_rx_pktlen = RTE_ETHER_MAX_LEN;
	dev_info->max_rx_queues = (uint16_t)internals->nb_queues;
	dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
	dev_info->min_rx_bufsize = 0;
	dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
		RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP;

	return 0;
}

static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
{
	unsigned i, imax;
	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
	const struct pmd_internals *internal = dev->data->dev_private;

	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
	for (i = 0; i < imax; i++) {
		igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
		igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
		rx_total += igb_stats->q_ipackets[i];
		rx_bytes_total += igb_stats->q_ibytes[i];
	}
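	/*
	 * Note: rte_eth_stats exposes only RTE_ETHDEV_QUEUE_STAT_CNTRS
	 * per-queue slots, so queues beyond that limit are not reflected
	 * in these counters.
	 */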
	imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ?
	        internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS);
	for (i = 0; i < imax; i++) {
		igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
		igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
		tx_total += igb_stats->q_opackets[i];
		tx_err_total += internal->tx_queue[i].err_pkts;
		tx_bytes_total += igb_stats->q_obytes[i];
	}

	igb_stats->ipackets = rx_total;
	igb_stats->ibytes = rx_bytes_total;
	igb_stats->opackets = tx_total;
	igb_stats->oerrors = tx_err_total;
	igb_stats->obytes = tx_bytes_total;
	return 0;
}

static int
eth_stats_reset(struct rte_eth_dev *dev)
{
	unsigned i;
	struct pmd_internals *internal = dev->data->dev_private;

	for (i = 0; i < internal->nb_queues; i++) {
		internal->rx_queue[i].rx_pkts = 0;
		internal->rx_queue[i].rx_bytes = 0;
	}

	for (i = 0; i < internal->nb_queues; i++) {
		internal->tx_queue[i].tx_pkts = 0;
		internal->tx_queue[i].err_pkts = 0;
		internal->tx_queue[i].tx_bytes = 0;
	}

	return 0;
}

static int
eth_dev_close(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals;
	struct tpacket_req *req;
	unsigned int q;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	PMD_LOG(INFO, "Closing AF_PACKET ethdev on NUMA socket %u",
		rte_socket_id());

	internals = dev->data->dev_private;
	req = &internals->req;
	for (q = 0; q < internals->nb_queues; q++) {
		munmap(internals->rx_queue[q].map,
			2 * req->tp_block_size * req->tp_block_nr);
		rte_free(internals->rx_queue[q].rd);
		rte_free(internals->tx_queue[q].rd);
	}
	free(internals->if_name);
	rte_free(internals->rx_queue);
	rte_free(internals->tx_queue);

	/* mac_addrs must not be freed alone because it is part of dev_private */
	dev->data->mac_addrs = NULL;
	return 0;
}

static int
eth_link_update(struct rte_eth_dev *dev __rte_unused,
		int wait_to_complete __rte_unused)
{
	return 0;
}

static int
eth_rx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t rx_queue_id,
		   uint16_t nb_rx_desc __rte_unused,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mb_pool)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_rx_queue *pkt_q = &internals->rx_queue[rx_queue_id];
	unsigned int buf_size, data_size;

	pkt_q->mb_pool = mb_pool;

	/* Now get the space available for data in the mbuf */
	buf_size = rte_pktmbuf_data_room_size(pkt_q->mb_pool) -
		RTE_PKTMBUF_HEADROOM;
	data_size = internals->req.tp_frame_size;
	data_size -= TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

	if (data_size > buf_size) {
		PMD_LOG(ERR,
			"%s: %d bytes will not fit in mbuf (%d bytes)",
			dev->device->name, data_size, buf_size);
		return -ENOMEM;
	}

	dev->data->rx_queues[rx_queue_id] = pkt_q;
	pkt_q->in_port = dev->data->port_id;
	pkt_q->vlan_strip = internals->vlan_strip;

	return 0;
}

static int
eth_tx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t tx_queue_id,
		   uint16_t nb_tx_desc __rte_unused,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_txconf *tx_conf __rte_unused)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev->data->tx_queues[tx_queue_id] = &internals->tx_queue[tx_queue_id];
	return 0;
}
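/*
 * Reject any MTU that could not fit in a single tp_frame_size ring slot
 * once the TPACKET2 header is accounted for; the kernel interface MTU is
 * then updated with SIOCSIFMTU.
 */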
static int
eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { .ifr_mtu = mtu };
	int ret;
	int s;
	unsigned int data_size = internals->req.tp_frame_size -
				 TPACKET2_HDRLEN;

	if (mtu > data_size)
		return -EINVAL;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -EINVAL;

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ret = ioctl(s, SIOCSIFMTU, &ifr);
	close(s);

	if (ret < 0)
		return -EINVAL;

	return 0;
}

static int
eth_dev_macaddr_set(struct rte_eth_dev *dev, struct rte_ether_addr *addr)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { };
	int sockfd = internals->rx_queue[0].sockfd;
	int ret;

	if (sockfd == -1) {
		PMD_LOG(ERR, "receive socket not found");
		return -EINVAL;
	}

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
	memcpy(ifr.ifr_hwaddr.sa_data, addr, sizeof(*addr));
	ret = ioctl(sockfd, SIOCSIFHWADDR, &ifr);

	if (ret < 0) {
		PMD_LOG_ERRNO(ERR, "ioctl(SIOCSIFHWADDR) failed");
		return -EINVAL;
	}

	return 0;
}

static int
eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
{
	struct ifreq ifr;
	int ret = 0;
	int s;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -errno;

	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
	ifr.ifr_flags &= mask;
	ifr.ifr_flags |= flags;
	if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
out:
	close(s);
	return ret;
}

static int
eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
}

static int
eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
}

static const struct eth_dev_ops ops = {
	.dev_start = eth_dev_start,
	.dev_stop = eth_dev_stop,
	.dev_close = eth_dev_close,
	.dev_configure = eth_dev_configure,
	.dev_infos_get = eth_dev_info,
	.mac_addr_set = eth_dev_macaddr_set,
	.mtu_set = eth_dev_mtu_set,
	.promiscuous_enable = eth_dev_promiscuous_enable,
	.promiscuous_disable = eth_dev_promiscuous_disable,
	.rx_queue_setup = eth_rx_queue_setup,
	.tx_queue_setup = eth_tx_queue_setup,
	.link_update = eth_link_update,
	.stats_get = eth_stats_get,
	.stats_reset = eth_stats_reset,
};

/*
 * Opens an AF_PACKET socket
 */
static int
open_packet_iface(const char *key __rte_unused,
                  const char *value __rte_unused,
                  void *extra_args)
{
	int *sockfd = extra_args;

	/* Open an AF_PACKET socket... */
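	/*
	 * ETH_P_ALL requests delivery of all protocols; this probe socket is
	 * only used for the interface ioctls at init time and is closed by
	 * the probe routine once the per-queue sockets are set up.
	 */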
	*sockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (*sockfd == -1) {
		PMD_LOG(ERR, "Could not open AF_PACKET socket");
		return -1;
	}

	return 0;
}

static int
rte_pmd_init_internals(struct rte_vdev_device *dev,
                       const int sockfd,
                       const unsigned nb_queues,
                       unsigned int blocksize,
                       unsigned int blockcnt,
                       unsigned int framesize,
                       unsigned int framecnt,
                       unsigned int qdisc_bypass,
                       struct pmd_internals **internals,
                       struct rte_eth_dev **eth_dev,
                       struct rte_kvargs *kvlist)
{
	const char *name = rte_vdev_device_name(dev);
	const unsigned int numa_node = dev->device.numa_node;
	struct rte_eth_dev_data *data = NULL;
	struct rte_kvargs_pair *pair = NULL;
	struct ifreq ifr;
	size_t ifnamelen;
	unsigned k_idx;
	struct sockaddr_ll sockaddr;
	struct tpacket_req *req;
	struct pkt_rx_queue *rx_queue;
	struct pkt_tx_queue *tx_queue;
	int rc, tpver, discard;
	int qsockfd = -1;
	unsigned int i, q, rdsize;
#if defined(PACKET_FANOUT)
	int fanout_arg;
#endif

	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
		pair = &kvlist->pairs[k_idx];
		if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL)
			break;
	}
	if (pair == NULL) {
		PMD_LOG(ERR,
			"%s: no interface specified for AF_PACKET ethdev",
			name);
		return -1;
	}

	PMD_LOG(INFO,
		"%s: creating AF_PACKET-backed ethdev on numa socket %u",
		name, numa_node);

	*internals = rte_zmalloc_socket(name, sizeof(**internals),
	                                0, numa_node);
	if (*internals == NULL)
		return -1;

	(*internals)->rx_queue = rte_calloc_socket("af_packet_rx",
						nb_queues,
						sizeof(struct pkt_rx_queue),
						0, numa_node);
	(*internals)->tx_queue = rte_calloc_socket("af_packet_tx",
						nb_queues,
						sizeof(struct pkt_tx_queue),
						0, numa_node);
	if (!(*internals)->rx_queue || !(*internals)->tx_queue)
		goto free_internals;

	for (q = 0; q < nb_queues; q++) {
		(*internals)->rx_queue[q].map = MAP_FAILED;
		(*internals)->tx_queue[q].map = MAP_FAILED;
		(*internals)->rx_queue[q].sockfd = -1;
		(*internals)->tx_queue[q].sockfd = -1;
	}

	req = &((*internals)->req);

	req->tp_block_size = blocksize;
	req->tp_block_nr = blockcnt;
	req->tp_frame_size = framesize;
	req->tp_frame_nr = framecnt;

	ifnamelen = strlen(pair->value);
	if (ifnamelen < sizeof(ifr.ifr_name)) {
		memcpy(ifr.ifr_name, pair->value, ifnamelen);
		ifr.ifr_name[ifnamelen] = '\0';
	} else {
		PMD_LOG(ERR,
			"%s: I/F name too long (%s)",
			name, pair->value);
		goto free_internals;
	}
	if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) {
		PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name);
		goto free_internals;
	}
	(*internals)->if_name = strdup(pair->value);
	if ((*internals)->if_name == NULL)
		goto free_internals;
	(*internals)->if_index = ifr.ifr_ifindex;

	if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == -1) {
		PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFHWADDR)", name);
		goto free_internals;
	}
	memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN);

	memset(&sockaddr, 0, sizeof(sockaddr));
	sockaddr.sll_family = AF_PACKET;
	sockaddr.sll_protocol = htons(ETH_P_ALL);
	sockaddr.sll_ifindex = (*internals)->if_index;
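	/*
	 * All per-queue sockets join the same PACKET_FANOUT group, so the
	 * kernel spreads incoming flows across the queues by flow hash;
	 * PACKET_FANOUT_FLAG_DEFRAG reassembles IP fragments before hashing
	 * so that fragments of one flow land on the same queue.
	 */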
#if defined(PACKET_FANOUT)
	fanout_arg = (getpid() ^ (*internals)->if_index) & 0xffff;
	fanout_arg |= (PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16;
#if defined(PACKET_FANOUT_FLAG_ROLLOVER)
	fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16;
#endif
#endif

	for (q = 0; q < nb_queues; q++) {
		/* Open an AF_PACKET socket for this queue... */
		qsockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
		if (qsockfd == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not open AF_PACKET socket",
				name);
			goto error;
		}

		tpver = TPACKET_V2;
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_VERSION,
				&tpver, sizeof(tpver));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_VERSION on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		discard = 1;
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_LOSS,
				&discard, sizeof(discard));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_LOSS on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		if (qdisc_bypass) {
#if defined(PACKET_QDISC_BYPASS)
			rc = setsockopt(qsockfd, SOL_PACKET, PACKET_QDISC_BYPASS,
					&qdisc_bypass, sizeof(qdisc_bypass));
			if (rc == -1) {
				PMD_LOG_ERRNO(ERR,
					"%s: could not set PACKET_QDISC_BYPASS on AF_PACKET socket for %s",
					name, pair->value);
				goto error;
			}
#endif
		}

		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_RX_RING on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_TX_RING, req, sizeof(*req));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_TX_RING on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		rx_queue = &((*internals)->rx_queue[q]);
		rx_queue->framecount = req->tp_frame_nr;

		rx_queue->map = mmap(NULL, 2 * req->tp_block_size * req->tp_block_nr,
				    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED,
				    qsockfd, 0);
		if (rx_queue->map == MAP_FAILED) {
			PMD_LOG_ERRNO(ERR,
				"%s: call to mmap failed on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		/* rdsize is same for both Tx and Rx */
		rdsize = req->tp_frame_nr * sizeof(*(rx_queue->rd));

		rx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
		if (rx_queue->rd == NULL)
			goto error;
		for (i = 0; i < req->tp_frame_nr; ++i) {
			rx_queue->rd[i].iov_base = rx_queue->map + (i * framesize);
			rx_queue->rd[i].iov_len = req->tp_frame_size;
		}
		rx_queue->sockfd = qsockfd;

		tx_queue = &((*internals)->tx_queue[q]);
		tx_queue->framecount = req->tp_frame_nr;
		tx_queue->frame_data_size = req->tp_frame_size;
		tx_queue->frame_data_size -= TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

		tx_queue->map = rx_queue->map + req->tp_block_size * req->tp_block_nr;

		tx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
		if (tx_queue->rd == NULL)
			goto error;
		for (i = 0; i < req->tp_frame_nr; ++i) {
			tx_queue->rd[i].iov_base = tx_queue->map + (i * framesize);
			tx_queue->rd[i].iov_len = req->tp_frame_size;
		}
		tx_queue->sockfd = qsockfd;

		rc = bind(qsockfd, (const struct sockaddr*)&sockaddr, sizeof(sockaddr));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not bind AF_PACKET socket to %s",
				name, pair->value);
			goto error;
		}
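		/*
		 * With the rings mapped and the socket bound to the
		 * interface, the queue socket finally joins the shared
		 * fanout group configured above.
		 */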
#if defined(PACKET_FANOUT)
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
				&fanout_arg, sizeof(fanout_arg));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_FANOUT on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}
#endif
	}

	/* reserve an ethdev entry */
	*eth_dev = rte_eth_vdev_allocate(dev, 0);
	if (*eth_dev == NULL)
		goto error;

	/*
	 * now put it all together
	 * - store queue data in internals,
	 * - store numa_node in eth_dev
	 * - point eth_dev_data to internals
	 * - and point eth_dev structure to new eth_dev_data structure
	 */

	(*internals)->nb_queues = nb_queues;

	data = (*eth_dev)->data;
	data->dev_private = *internals;
	data->nb_rx_queues = (uint16_t)nb_queues;
	data->nb_tx_queues = (uint16_t)nb_queues;
	data->dev_link = pmd_link;
	data->mac_addrs = &(*internals)->eth_addr;
	data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;

	(*eth_dev)->dev_ops = &ops;

	return 0;

error:
	if (qsockfd != -1)
		close(qsockfd);
	for (q = 0; q < nb_queues; q++) {
		if ((*internals)->rx_queue[q].map != MAP_FAILED)
			munmap((*internals)->rx_queue[q].map,
			       2 * req->tp_block_size * req->tp_block_nr);

		rte_free((*internals)->rx_queue[q].rd);
		rte_free((*internals)->tx_queue[q].rd);
		if (((*internals)->rx_queue[q].sockfd >= 0) &&
		    ((*internals)->rx_queue[q].sockfd != qsockfd))
			close((*internals)->rx_queue[q].sockfd);
	}
free_internals:
	rte_free((*internals)->rx_queue);
	rte_free((*internals)->tx_queue);
	free((*internals)->if_name);
	rte_free(*internals);
	return -1;
}

static int
rte_eth_from_packet(struct rte_vdev_device *dev,
                    int const *sockfd,
                    struct rte_kvargs *kvlist)
{
	const char *name = rte_vdev_device_name(dev);
	struct pmd_internals *internals = NULL;
	struct rte_eth_dev *eth_dev = NULL;
	struct rte_kvargs_pair *pair = NULL;
	unsigned k_idx;
	unsigned int blockcount;
	unsigned int blocksize;
	unsigned int framesize = DFLT_FRAME_SIZE;
	unsigned int framecount = DFLT_FRAME_COUNT;
	unsigned int qpairs = 1;
	unsigned int qdisc_bypass = 1;

	/* do some parameter checking */
	if (*sockfd < 0)
		return -1;

	blocksize = getpagesize();

	/*
	 * Walk arguments for configurable settings
	 */
	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
		pair = &kvlist->pairs[k_idx];
		if (strstr(pair->key, ETH_AF_PACKET_NUM_Q_ARG) != NULL) {
			qpairs = atoi(pair->value);
			if (qpairs < 1) {
				PMD_LOG(ERR,
					"%s: invalid qpairs value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_BLOCKSIZE_ARG) != NULL) {
			blocksize = atoi(pair->value);
			if (!blocksize) {
				PMD_LOG(ERR,
					"%s: invalid blocksize value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_FRAMESIZE_ARG) != NULL) {
			framesize = atoi(pair->value);
			if (!framesize) {
				PMD_LOG(ERR,
					"%s: invalid framesize value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_FRAMECOUNT_ARG) != NULL) {
			framecount = atoi(pair->value);
			if (!framecount) {
				PMD_LOG(ERR,
					"%s: invalid framecount value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_QDISC_BYPASS_ARG) != NULL) {
			qdisc_bypass = atoi(pair->value);
			if (qdisc_bypass > 1) {
				PMD_LOG(ERR,
					"%s: invalid bypass value",
					name);
				return -1;
			}
			continue;
		}
	}
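	/*
	 * Sanity-check the ring geometry: each block holds
	 * blocksize / framesize frames, and the kernel requires
	 * tp_frame_nr == tp_block_nr * (tp_block_size / tp_frame_size),
	 * so framecount should be a whole multiple of frames-per-block.
	 */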
	if (framesize > blocksize) {
		PMD_LOG(ERR,
			"%s: AF_PACKET MMAP frame size exceeds block size!",
			name);
		return -1;
	}

	blockcount = framecount / (blocksize / framesize);
	if (!blockcount) {
		PMD_LOG(ERR,
			"%s: invalid AF_PACKET MMAP parameters", name);
		return -1;
	}

	PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name);
	PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize);
	PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount);
	PMD_LOG(INFO, "%s:\tframe size %d", name, framesize);
	PMD_LOG(INFO, "%s:\tframe count %d", name, framecount);

	if (rte_pmd_init_internals(dev, *sockfd, qpairs,
	                           blocksize, blockcount,
	                           framesize, framecount,
	                           qdisc_bypass,
	                           &internals, &eth_dev,
	                           kvlist) < 0)
		return -1;

	eth_dev->rx_pkt_burst = eth_af_packet_rx;
	eth_dev->tx_pkt_burst = eth_af_packet_tx;

	rte_eth_dev_probing_finish(eth_dev);
	return 0;
}

static int
rte_pmd_af_packet_probe(struct rte_vdev_device *dev)
{
	int ret = 0;
	struct rte_kvargs *kvlist;
	int sockfd = -1;
	struct rte_eth_dev *eth_dev;
	const char *name = rte_vdev_device_name(dev);

	PMD_LOG(INFO, "Initializing pmd_af_packet for %s", name);

	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		eth_dev = rte_eth_dev_attach_secondary(name);
		if (!eth_dev) {
			PMD_LOG(ERR, "Failed to probe %s", name);
			return -1;
		}
		/* TODO: request info from primary to set up Rx and Tx */
		eth_dev->dev_ops = &ops;
		eth_dev->device = &dev->device;
		rte_eth_dev_probing_finish(eth_dev);
		return 0;
	}

	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
	if (kvlist == NULL) {
		ret = -1;
		goto exit;
	}

	/*
	 * If iface argument is passed we open the NICs and use them for
	 * reading / writing
	 */
	if (rte_kvargs_count(kvlist, ETH_AF_PACKET_IFACE_ARG) == 1) {
		ret = rte_kvargs_process(kvlist, ETH_AF_PACKET_IFACE_ARG,
		                         &open_packet_iface, &sockfd);
		if (ret < 0)
			goto exit;
	}

	if (dev->device.numa_node == SOCKET_ID_ANY)
		dev->device.numa_node = rte_socket_id();

	ret = rte_eth_from_packet(dev, &sockfd, kvlist);
	close(sockfd); /* no longer needed */

exit:
	rte_kvargs_free(kvlist);
	return ret;
}

static int
rte_pmd_af_packet_remove(struct rte_vdev_device *dev)
{
	struct rte_eth_dev *eth_dev;

	if (dev == NULL)
		return -1;

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
	if (eth_dev == NULL)
		return 0; /* port already released */

	eth_dev_close(eth_dev);
	rte_eth_dev_release_port(eth_dev);

	return 0;
}

static struct rte_vdev_driver pmd_af_packet_drv = {
	.probe = rte_pmd_af_packet_probe,
	.remove = rte_pmd_af_packet_remove,
};

RTE_PMD_REGISTER_VDEV(net_af_packet, pmd_af_packet_drv);
RTE_PMD_REGISTER_ALIAS(net_af_packet, eth_af_packet);
RTE_PMD_REGISTER_PARAM_STRING(net_af_packet,
	"iface=<string> "
	"qpairs=<int> "
	"blocksz=<int> "
	"framesz=<int> "
	"framecnt=<int> "
	"qdisc_bypass=<0|1>");