1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2014 John W. Linville <linville@tuxdriver.com> 3 * Originally based upon librte_pmd_pcap code: 4 * Copyright(c) 2010-2015 Intel Corporation. 5 * Copyright(c) 2014 6WIND S.A. 6 * All rights reserved. 7 */ 8 9 #include <rte_common.h> 10 #include <rte_string_fns.h> 11 #include <rte_mbuf.h> 12 #include <ethdev_driver.h> 13 #include <ethdev_vdev.h> 14 #include <rte_malloc.h> 15 #include <rte_kvargs.h> 16 #include <bus_vdev_driver.h> 17 18 #include <errno.h> 19 #include <linux/if_ether.h> 20 #include <linux/if_packet.h> 21 #include <arpa/inet.h> 22 #include <net/if.h> 23 #include <net/if_arp.h> 24 #include <sys/types.h> 25 #include <sys/socket.h> 26 #include <sys/ioctl.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <sys/mman.h> 30 #include <unistd.h> 31 #include <poll.h> 32 33 #define ETH_AF_PACKET_IFACE_ARG "iface" 34 #define ETH_AF_PACKET_NUM_Q_ARG "qpairs" 35 #define ETH_AF_PACKET_BLOCKSIZE_ARG "blocksz" 36 #define ETH_AF_PACKET_FRAMESIZE_ARG "framesz" 37 #define ETH_AF_PACKET_FRAMECOUNT_ARG "framecnt" 38 #define ETH_AF_PACKET_QDISC_BYPASS_ARG "qdisc_bypass" 39 40 #define DFLT_FRAME_SIZE (1 << 11) 41 #define DFLT_FRAME_COUNT (1 << 9) 42 43 static uint64_t timestamp_dynflag; 44 static int timestamp_dynfield_offset = -1; 45 46 struct __rte_cache_aligned pkt_rx_queue { 47 int sockfd; 48 49 struct iovec *rd; 50 uint8_t *map; 51 unsigned int framecount; 52 unsigned int framenum; 53 54 struct rte_mempool *mb_pool; 55 uint16_t in_port; 56 uint8_t vlan_strip; 57 uint8_t timestamp_offloading; 58 59 volatile unsigned long rx_pkts; 60 volatile unsigned long rx_bytes; 61 }; 62 63 struct __rte_cache_aligned pkt_tx_queue { 64 int sockfd; 65 unsigned int frame_data_size; 66 67 struct iovec *rd; 68 uint8_t *map; 69 unsigned int framecount; 70 unsigned int framenum; 71 72 volatile unsigned long tx_pkts; 73 volatile unsigned long err_pkts; 74 volatile unsigned long tx_bytes; 75 }; 76 77 struct pmd_internals { 78 unsigned nb_queues; 79 80 int if_index; 81 char *if_name; 82 struct rte_ether_addr eth_addr; 83 84 struct tpacket_req req; 85 86 struct pkt_rx_queue *rx_queue; 87 struct pkt_tx_queue *tx_queue; 88 uint8_t vlan_strip; 89 uint8_t timestamp_offloading; 90 }; 91 92 static const char *valid_arguments[] = { 93 ETH_AF_PACKET_IFACE_ARG, 94 ETH_AF_PACKET_NUM_Q_ARG, 95 ETH_AF_PACKET_BLOCKSIZE_ARG, 96 ETH_AF_PACKET_FRAMESIZE_ARG, 97 ETH_AF_PACKET_FRAMECOUNT_ARG, 98 ETH_AF_PACKET_QDISC_BYPASS_ARG, 99 NULL 100 }; 101 102 static struct rte_eth_link pmd_link = { 103 .link_speed = RTE_ETH_SPEED_NUM_10G, 104 .link_duplex = RTE_ETH_LINK_FULL_DUPLEX, 105 .link_status = RTE_ETH_LINK_DOWN, 106 .link_autoneg = RTE_ETH_LINK_FIXED, 107 }; 108 109 RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE); 110 #define RTE_LOGTYPE_AFPACKET af_packet_logtype 111 112 #define PMD_LOG(level, ...) \ 113 RTE_LOG_LINE_PREFIX(level, AFPACKET, "%s(): ", __func__, __VA_ARGS__) 114 115 #define PMD_LOG_ERRNO(level, fmt, ...) \ 116 RTE_LOG_LINE(level, AFPACKET, "%s(): " fmt ":%s", __func__, \ 117 ## __VA_ARGS__, strerror(errno)) 118 119 static uint16_t 120 eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) 121 { 122 unsigned i; 123 struct tpacket2_hdr *ppd; 124 struct rte_mbuf *mbuf; 125 uint8_t *pbuf; 126 struct pkt_rx_queue *pkt_q = queue; 127 uint16_t num_rx = 0; 128 unsigned long num_rx_bytes = 0; 129 unsigned int framecount, framenum; 130 131 if (unlikely(nb_pkts == 0)) 132 return 0; 133 134 /* 135 * Reads the given number of packets from the AF_PACKET socket one by 136 * one and copies the packet data into a newly allocated mbuf. 137 */ 138 framecount = pkt_q->framecount; 139 framenum = pkt_q->framenum; 140 for (i = 0; i < nb_pkts; i++) { 141 /* point at the next incoming frame */ 142 ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base; 143 if ((ppd->tp_status & TP_STATUS_USER) == 0) 144 break; 145 146 /* allocate the next mbuf */ 147 mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool); 148 if (unlikely(mbuf == NULL)) 149 break; 150 151 /* packet will fit in the mbuf, go ahead and receive it */ 152 rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen; 153 pbuf = (uint8_t *) ppd + ppd->tp_mac; 154 memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf)); 155 156 /* check for vlan info */ 157 if (ppd->tp_status & TP_STATUS_VLAN_VALID) { 158 mbuf->vlan_tci = ppd->tp_vlan_tci; 159 mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED); 160 161 if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf)) 162 PMD_LOG(ERR, "Failed to reinsert VLAN tag"); 163 } 164 165 /* add kernel provided timestamp when offloading is enabled */ 166 if (pkt_q->timestamp_offloading) { 167 /* since TPACKET_V2 timestamps are provided in nanoseconds resolution */ 168 *RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset, 169 rte_mbuf_timestamp_t *) = 170 (uint64_t)ppd->tp_sec * 1000000000 + ppd->tp_nsec; 171 172 mbuf->ol_flags |= timestamp_dynflag; 173 } 174 175 /* release incoming frame and advance ring buffer */ 176 ppd->tp_status = TP_STATUS_KERNEL; 177 if (++framenum >= framecount) 178 framenum = 0; 179 mbuf->port = pkt_q->in_port; 180 181 /* account for the receive frame */ 182 bufs[i] = mbuf; 183 num_rx++; 184 num_rx_bytes += mbuf->pkt_len; 185 } 186 pkt_q->framenum = framenum; 187 pkt_q->rx_pkts += num_rx; 188 pkt_q->rx_bytes += num_rx_bytes; 189 return num_rx; 190 } 191 192 /* 193 * Check if there is an available frame in the ring 194 */ 195 static inline bool 196 tx_ring_status_available(uint32_t tp_status) 197 { 198 /* 199 * We eliminate the timestamp status from the packet status. 200 * This should only matter if timestamping is enabled on the socket, 201 * but there is a bug in the kernel which is fixed in newer releases. 202 * 203 * See the following kernel commit for reference: 204 * commit 171c3b151118a2fe0fc1e2a9d1b5a1570cfe82d2 205 * net: packetmmap: fix only tx timestamp on request 206 */ 207 tp_status &= ~(TP_STATUS_TS_SOFTWARE | TP_STATUS_TS_RAW_HARDWARE); 208 209 return tp_status == TP_STATUS_AVAILABLE; 210 } 211 212 /* 213 * Callback to handle sending packets through a real NIC. 214 */ 215 static uint16_t 216 eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) 217 { 218 struct tpacket2_hdr *ppd; 219 struct rte_mbuf *mbuf; 220 uint8_t *pbuf; 221 unsigned int framecount, framenum; 222 struct pollfd pfd; 223 struct pkt_tx_queue *pkt_q = queue; 224 uint16_t num_tx = 0; 225 unsigned long num_tx_bytes = 0; 226 int i; 227 228 if (unlikely(nb_pkts == 0)) 229 return 0; 230 231 memset(&pfd, 0, sizeof(pfd)); 232 pfd.fd = pkt_q->sockfd; 233 pfd.events = POLLOUT; 234 pfd.revents = 0; 235 236 framecount = pkt_q->framecount; 237 framenum = pkt_q->framenum; 238 ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base; 239 for (i = 0; i < nb_pkts; i++) { 240 mbuf = *bufs++; 241 242 /* drop oversized packets */ 243 if (mbuf->pkt_len > pkt_q->frame_data_size) { 244 rte_pktmbuf_free(mbuf); 245 continue; 246 } 247 248 /* insert vlan info if necessary */ 249 if (mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) { 250 if (rte_vlan_insert(&mbuf)) { 251 rte_pktmbuf_free(mbuf); 252 continue; 253 } 254 } 255 256 /* point at the next incoming frame */ 257 if (!tx_ring_status_available(ppd->tp_status)) { 258 if (poll(&pfd, 1, -1) < 0) 259 break; 260 261 /* poll() can return POLLERR if the interface is down */ 262 if (pfd.revents & POLLERR) 263 break; 264 } 265 266 /* 267 * poll() will almost always return POLLOUT, even if there 268 * are no extra buffers available 269 * 270 * This happens, because packet_poll() calls datagram_poll() 271 * which checks the space left in the socket buffer and, 272 * in the case of packet_mmap, the default socket buffer length 273 * doesn't match the requested size for the tx_ring. 274 * As such, there is almost always space left in socket buffer, 275 * which doesn't seem to be correlated to the requested size 276 * for the tx_ring in packet_mmap. 277 * 278 * This results in poll() returning POLLOUT. 279 */ 280 if (!tx_ring_status_available(ppd->tp_status)) 281 break; 282 283 /* copy the tx frame data */ 284 pbuf = (uint8_t *) ppd + TPACKET2_HDRLEN - 285 sizeof(struct sockaddr_ll); 286 287 struct rte_mbuf *tmp_mbuf = mbuf; 288 while (tmp_mbuf) { 289 uint16_t data_len = rte_pktmbuf_data_len(tmp_mbuf); 290 memcpy(pbuf, rte_pktmbuf_mtod(tmp_mbuf, void*), data_len); 291 pbuf += data_len; 292 tmp_mbuf = tmp_mbuf->next; 293 } 294 295 ppd->tp_len = mbuf->pkt_len; 296 ppd->tp_snaplen = mbuf->pkt_len; 297 298 /* release incoming frame and advance ring buffer */ 299 ppd->tp_status = TP_STATUS_SEND_REQUEST; 300 if (++framenum >= framecount) 301 framenum = 0; 302 ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base; 303 304 num_tx++; 305 num_tx_bytes += mbuf->pkt_len; 306 rte_pktmbuf_free(mbuf); 307 } 308 309 /* kick-off transmits */ 310 if (sendto(pkt_q->sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0) == -1 && 311 errno != ENOBUFS && errno != EAGAIN) { 312 /* 313 * In case of a ENOBUFS/EAGAIN error all of the enqueued 314 * packets will be considered successful even though only some 315 * are sent. 316 */ 317 318 num_tx = 0; 319 num_tx_bytes = 0; 320 } 321 322 pkt_q->framenum = framenum; 323 pkt_q->tx_pkts += num_tx; 324 pkt_q->err_pkts += i - num_tx; 325 pkt_q->tx_bytes += num_tx_bytes; 326 return i; 327 } 328 329 static int 330 eth_dev_start(struct rte_eth_dev *dev) 331 { 332 struct pmd_internals *internals = dev->data->dev_private; 333 uint16_t i; 334 335 if (internals->timestamp_offloading) { 336 /* Register mbuf field and flag for Rx timestamp */ 337 int rc = rte_mbuf_dyn_rx_timestamp_register(×tamp_dynfield_offset, 338 ×tamp_dynflag); 339 if (rc) { 340 PMD_LOG(ERR, "Cannot register mbuf field/flag for timestamp"); 341 return rc; 342 } 343 } 344 345 dev->data->dev_link.link_status = RTE_ETH_LINK_UP; 346 for (i = 0; i < internals->nb_queues; i++) { 347 dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED; 348 dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED; 349 } 350 return 0; 351 } 352 353 /* 354 * This function gets called when the current port gets stopped. 355 */ 356 static int 357 eth_dev_stop(struct rte_eth_dev *dev) 358 { 359 unsigned i; 360 int sockfd; 361 struct pmd_internals *internals = dev->data->dev_private; 362 363 for (i = 0; i < internals->nb_queues; i++) { 364 sockfd = internals->rx_queue[i].sockfd; 365 if (sockfd != -1) 366 close(sockfd); 367 368 /* Prevent use after free in case tx fd == rx fd */ 369 if (sockfd != internals->tx_queue[i].sockfd) { 370 sockfd = internals->tx_queue[i].sockfd; 371 if (sockfd != -1) 372 close(sockfd); 373 } 374 375 internals->rx_queue[i].sockfd = -1; 376 internals->tx_queue[i].sockfd = -1; 377 dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED; 378 dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED; 379 } 380 381 dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN; 382 return 0; 383 } 384 385 static int 386 eth_dev_configure(struct rte_eth_dev *dev __rte_unused) 387 { 388 struct rte_eth_conf *dev_conf = &dev->data->dev_conf; 389 const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode; 390 struct pmd_internals *internals = dev->data->dev_private; 391 392 internals->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP); 393 internals->timestamp_offloading = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP); 394 return 0; 395 } 396 397 static int 398 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) 399 { 400 struct pmd_internals *internals = dev->data->dev_private; 401 402 dev_info->if_index = internals->if_index; 403 dev_info->max_mac_addrs = 1; 404 dev_info->max_rx_pktlen = RTE_ETHER_MAX_LEN; 405 dev_info->max_rx_queues = (uint16_t)internals->nb_queues; 406 dev_info->max_tx_queues = (uint16_t)internals->nb_queues; 407 dev_info->min_rx_bufsize = 0; 408 dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS | 409 RTE_ETH_TX_OFFLOAD_VLAN_INSERT; 410 dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP | 411 RTE_ETH_RX_OFFLOAD_TIMESTAMP; 412 413 return 0; 414 } 415 416 static int 417 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats) 418 { 419 unsigned i, imax; 420 unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0; 421 unsigned long rx_bytes_total = 0, tx_bytes_total = 0; 422 const struct pmd_internals *internal = dev->data->dev_private; 423 424 imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ? 425 internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS); 426 for (i = 0; i < imax; i++) { 427 igb_stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts; 428 igb_stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes; 429 rx_total += igb_stats->q_ipackets[i]; 430 rx_bytes_total += igb_stats->q_ibytes[i]; 431 } 432 433 imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS ? 434 internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS); 435 for (i = 0; i < imax; i++) { 436 igb_stats->q_opackets[i] = internal->tx_queue[i].tx_pkts; 437 igb_stats->q_obytes[i] = internal->tx_queue[i].tx_bytes; 438 tx_total += igb_stats->q_opackets[i]; 439 tx_err_total += internal->tx_queue[i].err_pkts; 440 tx_bytes_total += igb_stats->q_obytes[i]; 441 } 442 443 igb_stats->ipackets = rx_total; 444 igb_stats->ibytes = rx_bytes_total; 445 igb_stats->opackets = tx_total; 446 igb_stats->oerrors = tx_err_total; 447 igb_stats->obytes = tx_bytes_total; 448 return 0; 449 } 450 451 static int 452 eth_stats_reset(struct rte_eth_dev *dev) 453 { 454 unsigned i; 455 struct pmd_internals *internal = dev->data->dev_private; 456 457 for (i = 0; i < internal->nb_queues; i++) { 458 internal->rx_queue[i].rx_pkts = 0; 459 internal->rx_queue[i].rx_bytes = 0; 460 } 461 462 for (i = 0; i < internal->nb_queues; i++) { 463 internal->tx_queue[i].tx_pkts = 0; 464 internal->tx_queue[i].err_pkts = 0; 465 internal->tx_queue[i].tx_bytes = 0; 466 } 467 468 return 0; 469 } 470 471 static int 472 eth_dev_close(struct rte_eth_dev *dev) 473 { 474 struct pmd_internals *internals; 475 struct tpacket_req *req; 476 unsigned int q; 477 478 if (rte_eal_process_type() != RTE_PROC_PRIMARY) 479 return 0; 480 481 PMD_LOG(INFO, "Closing AF_PACKET ethdev on NUMA socket %u", 482 rte_socket_id()); 483 484 internals = dev->data->dev_private; 485 req = &internals->req; 486 for (q = 0; q < internals->nb_queues; q++) { 487 munmap(internals->rx_queue[q].map, 488 2 * req->tp_block_size * req->tp_block_nr); 489 rte_free(internals->rx_queue[q].rd); 490 rte_free(internals->tx_queue[q].rd); 491 } 492 free(internals->if_name); 493 rte_free(internals->rx_queue); 494 rte_free(internals->tx_queue); 495 496 /* mac_addrs must not be freed alone because part of dev_private */ 497 dev->data->mac_addrs = NULL; 498 return 0; 499 } 500 501 static int 502 eth_link_update(struct rte_eth_dev *dev __rte_unused, 503 int wait_to_complete __rte_unused) 504 { 505 return 0; 506 } 507 508 static int 509 eth_rx_queue_setup(struct rte_eth_dev *dev, 510 uint16_t rx_queue_id, 511 uint16_t nb_rx_desc __rte_unused, 512 unsigned int socket_id __rte_unused, 513 const struct rte_eth_rxconf *rx_conf __rte_unused, 514 struct rte_mempool *mb_pool) 515 { 516 struct pmd_internals *internals = dev->data->dev_private; 517 struct pkt_rx_queue *pkt_q = &internals->rx_queue[rx_queue_id]; 518 unsigned int buf_size, data_size; 519 520 pkt_q->mb_pool = mb_pool; 521 522 /* Now get the space available for data in the mbuf */ 523 buf_size = rte_pktmbuf_data_room_size(pkt_q->mb_pool) - 524 RTE_PKTMBUF_HEADROOM; 525 data_size = internals->req.tp_frame_size; 526 data_size -= TPACKET2_HDRLEN - sizeof(struct sockaddr_ll); 527 528 if (data_size > buf_size) { 529 PMD_LOG(ERR, 530 "%s: %d bytes will not fit in mbuf (%d bytes)", 531 dev->device->name, data_size, buf_size); 532 return -ENOMEM; 533 } 534 535 dev->data->rx_queues[rx_queue_id] = pkt_q; 536 pkt_q->in_port = dev->data->port_id; 537 pkt_q->vlan_strip = internals->vlan_strip; 538 pkt_q->timestamp_offloading = internals->timestamp_offloading; 539 540 return 0; 541 } 542 543 static int 544 eth_tx_queue_setup(struct rte_eth_dev *dev, 545 uint16_t tx_queue_id, 546 uint16_t nb_tx_desc __rte_unused, 547 unsigned int socket_id __rte_unused, 548 const struct rte_eth_txconf *tx_conf __rte_unused) 549 { 550 551 struct pmd_internals *internals = dev->data->dev_private; 552 553 dev->data->tx_queues[tx_queue_id] = &internals->tx_queue[tx_queue_id]; 554 return 0; 555 } 556 557 static int 558 eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu) 559 { 560 struct pmd_internals *internals = dev->data->dev_private; 561 struct ifreq ifr = { .ifr_mtu = mtu }; 562 int ret; 563 int s; 564 unsigned int data_size = internals->req.tp_frame_size - 565 TPACKET2_HDRLEN; 566 567 if (mtu > data_size) 568 return -EINVAL; 569 570 s = socket(PF_INET, SOCK_DGRAM, 0); 571 if (s < 0) 572 return -EINVAL; 573 574 strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ); 575 ret = ioctl(s, SIOCSIFMTU, &ifr); 576 close(s); 577 578 if (ret < 0) 579 return -EINVAL; 580 581 return 0; 582 } 583 584 static int 585 eth_dev_macaddr_set(struct rte_eth_dev *dev, struct rte_ether_addr *addr) 586 { 587 struct pmd_internals *internals = dev->data->dev_private; 588 struct ifreq ifr = { }; 589 int sockfd = internals->rx_queue[0].sockfd; 590 int ret; 591 592 if (sockfd == -1) { 593 PMD_LOG(ERR, "receive socket not found"); 594 return -EINVAL; 595 } 596 597 strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ); 598 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER; 599 memcpy(ifr.ifr_hwaddr.sa_data, addr, sizeof(*addr)); 600 ret = ioctl(sockfd, SIOCSIFHWADDR, &ifr); 601 602 if (ret < 0) { 603 PMD_LOG_ERRNO(ERR, "ioctl(SIOCSIFHWADDR) failed"); 604 return -EINVAL; 605 } 606 607 return 0; 608 } 609 610 static int 611 eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask) 612 { 613 struct ifreq ifr; 614 int ret = 0; 615 int s; 616 617 s = socket(PF_INET, SOCK_DGRAM, 0); 618 if (s < 0) 619 return -errno; 620 621 strlcpy(ifr.ifr_name, if_name, IFNAMSIZ); 622 if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) { 623 ret = -errno; 624 goto out; 625 } 626 ifr.ifr_flags &= mask; 627 ifr.ifr_flags |= flags; 628 if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { 629 ret = -errno; 630 goto out; 631 } 632 out: 633 close(s); 634 return ret; 635 } 636 637 static int 638 eth_dev_promiscuous_enable(struct rte_eth_dev *dev) 639 { 640 struct pmd_internals *internals = dev->data->dev_private; 641 642 return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0); 643 } 644 645 static int 646 eth_dev_promiscuous_disable(struct rte_eth_dev *dev) 647 { 648 struct pmd_internals *internals = dev->data->dev_private; 649 650 return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC); 651 } 652 653 static const struct eth_dev_ops ops = { 654 .dev_start = eth_dev_start, 655 .dev_stop = eth_dev_stop, 656 .dev_close = eth_dev_close, 657 .dev_configure = eth_dev_configure, 658 .dev_infos_get = eth_dev_info, 659 .mac_addr_set = eth_dev_macaddr_set, 660 .mtu_set = eth_dev_mtu_set, 661 .promiscuous_enable = eth_dev_promiscuous_enable, 662 .promiscuous_disable = eth_dev_promiscuous_disable, 663 .rx_queue_setup = eth_rx_queue_setup, 664 .tx_queue_setup = eth_tx_queue_setup, 665 .link_update = eth_link_update, 666 .stats_get = eth_stats_get, 667 .stats_reset = eth_stats_reset, 668 }; 669 670 /* 671 * Opens an AF_PACKET socket 672 */ 673 static int 674 open_packet_iface(const char *key __rte_unused, 675 const char *value __rte_unused, 676 void *extra_args) 677 { 678 int *sockfd = extra_args; 679 680 /* Open an AF_PACKET socket... */ 681 *sockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); 682 if (*sockfd == -1) { 683 PMD_LOG(ERR, "Could not open AF_PACKET socket"); 684 return -1; 685 } 686 687 return 0; 688 } 689 690 static int 691 rte_pmd_init_internals(struct rte_vdev_device *dev, 692 const int sockfd, 693 const unsigned nb_queues, 694 unsigned int blocksize, 695 unsigned int blockcnt, 696 unsigned int framesize, 697 unsigned int framecnt, 698 unsigned int qdisc_bypass, 699 struct pmd_internals **internals, 700 struct rte_eth_dev **eth_dev, 701 struct rte_kvargs *kvlist) 702 { 703 const char *name = rte_vdev_device_name(dev); 704 const unsigned int numa_node = dev->device.numa_node; 705 struct rte_eth_dev_data *data = NULL; 706 struct rte_kvargs_pair *pair = NULL; 707 struct ifreq ifr; 708 size_t ifnamelen; 709 unsigned k_idx; 710 struct sockaddr_ll sockaddr; 711 struct tpacket_req *req; 712 struct pkt_rx_queue *rx_queue; 713 struct pkt_tx_queue *tx_queue; 714 int rc, tpver, discard; 715 int qsockfd = -1; 716 unsigned int i, q, rdsize; 717 #if defined(PACKET_FANOUT) 718 int fanout_arg; 719 #endif 720 721 for (k_idx = 0; k_idx < kvlist->count; k_idx++) { 722 pair = &kvlist->pairs[k_idx]; 723 if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL) 724 break; 725 } 726 if (pair == NULL) { 727 PMD_LOG(ERR, 728 "%s: no interface specified for AF_PACKET ethdev", 729 name); 730 return -1; 731 } 732 733 PMD_LOG(INFO, 734 "%s: creating AF_PACKET-backed ethdev on numa socket %u", 735 name, numa_node); 736 737 *internals = rte_zmalloc_socket(name, sizeof(**internals), 738 0, numa_node); 739 if (*internals == NULL) 740 return -1; 741 742 743 (*internals)->rx_queue = rte_calloc_socket("af_packet_rx", 744 nb_queues, 745 sizeof(struct pkt_rx_queue), 746 0, numa_node); 747 (*internals)->tx_queue = rte_calloc_socket("af_packet_tx", 748 nb_queues, 749 sizeof(struct pkt_tx_queue), 750 0, numa_node); 751 if (!(*internals)->rx_queue || !(*internals)->tx_queue) { 752 goto free_internals; 753 } 754 755 for (q = 0; q < nb_queues; q++) { 756 (*internals)->rx_queue[q].map = MAP_FAILED; 757 (*internals)->tx_queue[q].map = MAP_FAILED; 758 (*internals)->rx_queue[q].sockfd = -1; 759 (*internals)->tx_queue[q].sockfd = -1; 760 } 761 762 req = &((*internals)->req); 763 764 req->tp_block_size = blocksize; 765 req->tp_block_nr = blockcnt; 766 req->tp_frame_size = framesize; 767 req->tp_frame_nr = framecnt; 768 769 ifnamelen = strlen(pair->value); 770 if (ifnamelen < sizeof(ifr.ifr_name)) { 771 memcpy(ifr.ifr_name, pair->value, ifnamelen); 772 ifr.ifr_name[ifnamelen] = '\0'; 773 } else { 774 PMD_LOG(ERR, 775 "%s: I/F name too long (%s)", 776 name, pair->value); 777 goto free_internals; 778 } 779 if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) { 780 PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name); 781 goto free_internals; 782 } 783 (*internals)->if_name = strdup(pair->value); 784 if ((*internals)->if_name == NULL) 785 goto free_internals; 786 (*internals)->if_index = ifr.ifr_ifindex; 787 788 if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == -1) { 789 PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFHWADDR)", name); 790 goto free_internals; 791 } 792 memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN); 793 794 memset(&sockaddr, 0, sizeof(sockaddr)); 795 sockaddr.sll_family = AF_PACKET; 796 sockaddr.sll_protocol = htons(ETH_P_ALL); 797 sockaddr.sll_ifindex = (*internals)->if_index; 798 799 #if defined(PACKET_FANOUT) 800 fanout_arg = (getpid() ^ (*internals)->if_index) & 0xffff; 801 fanout_arg |= (PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG) << 16; 802 #if defined(PACKET_FANOUT_FLAG_ROLLOVER) 803 fanout_arg |= PACKET_FANOUT_FLAG_ROLLOVER << 16; 804 #endif 805 #endif 806 807 for (q = 0; q < nb_queues; q++) { 808 /* Open an AF_PACKET socket for this queue... */ 809 qsockfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); 810 if (qsockfd == -1) { 811 PMD_LOG_ERRNO(ERR, 812 "%s: could not open AF_PACKET socket", 813 name); 814 goto error; 815 } 816 817 tpver = TPACKET_V2; 818 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_VERSION, 819 &tpver, sizeof(tpver)); 820 if (rc == -1) { 821 PMD_LOG_ERRNO(ERR, 822 "%s: could not set PACKET_VERSION on AF_PACKET socket for %s", 823 name, pair->value); 824 goto error; 825 } 826 827 discard = 1; 828 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_LOSS, 829 &discard, sizeof(discard)); 830 if (rc == -1) { 831 PMD_LOG_ERRNO(ERR, 832 "%s: could not set PACKET_LOSS on AF_PACKET socket for %s", 833 name, pair->value); 834 goto error; 835 } 836 837 if (qdisc_bypass) { 838 #if defined(PACKET_QDISC_BYPASS) 839 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_QDISC_BYPASS, 840 &qdisc_bypass, sizeof(qdisc_bypass)); 841 if (rc == -1) { 842 PMD_LOG_ERRNO(ERR, 843 "%s: could not set PACKET_QDISC_BYPASS on AF_PACKET socket for %s", 844 name, pair->value); 845 goto error; 846 } 847 #endif 848 } 849 850 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)); 851 if (rc == -1) { 852 PMD_LOG_ERRNO(ERR, 853 "%s: could not set PACKET_RX_RING on AF_PACKET socket for %s", 854 name, pair->value); 855 goto error; 856 } 857 858 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_TX_RING, req, sizeof(*req)); 859 if (rc == -1) { 860 PMD_LOG_ERRNO(ERR, 861 "%s: could not set PACKET_TX_RING on AF_PACKET " 862 "socket for %s", name, pair->value); 863 goto error; 864 } 865 866 rx_queue = &((*internals)->rx_queue[q]); 867 rx_queue->framecount = req->tp_frame_nr; 868 869 rx_queue->map = mmap(NULL, 2 * req->tp_block_size * req->tp_block_nr, 870 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, 871 qsockfd, 0); 872 if (rx_queue->map == MAP_FAILED) { 873 PMD_LOG_ERRNO(ERR, 874 "%s: call to mmap failed on AF_PACKET socket for %s", 875 name, pair->value); 876 goto error; 877 } 878 879 /* rdsize is same for both Tx and Rx */ 880 rdsize = req->tp_frame_nr * sizeof(*(rx_queue->rd)); 881 882 rx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node); 883 if (rx_queue->rd == NULL) 884 goto error; 885 for (i = 0; i < req->tp_frame_nr; ++i) { 886 rx_queue->rd[i].iov_base = rx_queue->map + (i * framesize); 887 rx_queue->rd[i].iov_len = req->tp_frame_size; 888 } 889 rx_queue->sockfd = qsockfd; 890 891 tx_queue = &((*internals)->tx_queue[q]); 892 tx_queue->framecount = req->tp_frame_nr; 893 tx_queue->frame_data_size = req->tp_frame_size; 894 tx_queue->frame_data_size -= TPACKET2_HDRLEN - 895 sizeof(struct sockaddr_ll); 896 897 tx_queue->map = rx_queue->map + req->tp_block_size * req->tp_block_nr; 898 899 tx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node); 900 if (tx_queue->rd == NULL) 901 goto error; 902 for (i = 0; i < req->tp_frame_nr; ++i) { 903 tx_queue->rd[i].iov_base = tx_queue->map + (i * framesize); 904 tx_queue->rd[i].iov_len = req->tp_frame_size; 905 } 906 tx_queue->sockfd = qsockfd; 907 908 rc = bind(qsockfd, (const struct sockaddr*)&sockaddr, sizeof(sockaddr)); 909 if (rc == -1) { 910 PMD_LOG_ERRNO(ERR, 911 "%s: could not bind AF_PACKET socket to %s", 912 name, pair->value); 913 goto error; 914 } 915 916 #if defined(PACKET_FANOUT) 917 rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT, 918 &fanout_arg, sizeof(fanout_arg)); 919 if (rc == -1) { 920 PMD_LOG_ERRNO(ERR, 921 "%s: could not set PACKET_FANOUT on AF_PACKET socket for %s", 922 name, pair->value); 923 goto error; 924 } 925 #endif 926 } 927 928 /* reserve an ethdev entry */ 929 *eth_dev = rte_eth_vdev_allocate(dev, 0); 930 if (*eth_dev == NULL) 931 goto error; 932 933 /* 934 * now put it all together 935 * - store queue data in internals, 936 * - store numa_node in eth_dev 937 * - point eth_dev_data to internals 938 * - and point eth_dev structure to new eth_dev_data structure 939 */ 940 941 (*internals)->nb_queues = nb_queues; 942 943 data = (*eth_dev)->data; 944 data->dev_private = *internals; 945 data->nb_rx_queues = (uint16_t)nb_queues; 946 data->nb_tx_queues = (uint16_t)nb_queues; 947 data->dev_link = pmd_link; 948 data->mac_addrs = &(*internals)->eth_addr; 949 data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS; 950 951 (*eth_dev)->dev_ops = &ops; 952 953 return 0; 954 955 error: 956 if (qsockfd != -1) 957 close(qsockfd); 958 for (q = 0; q < nb_queues; q++) { 959 if ((*internals)->rx_queue[q].map != MAP_FAILED) 960 munmap((*internals)->rx_queue[q].map, 961 2 * req->tp_block_size * req->tp_block_nr); 962 963 rte_free((*internals)->rx_queue[q].rd); 964 rte_free((*internals)->tx_queue[q].rd); 965 if (((*internals)->rx_queue[q].sockfd >= 0) && 966 ((*internals)->rx_queue[q].sockfd != qsockfd)) 967 close((*internals)->rx_queue[q].sockfd); 968 } 969 free_internals: 970 rte_free((*internals)->rx_queue); 971 rte_free((*internals)->tx_queue); 972 free((*internals)->if_name); 973 rte_free(*internals); 974 return -1; 975 } 976 977 static int 978 rte_eth_from_packet(struct rte_vdev_device *dev, 979 int const *sockfd, 980 struct rte_kvargs *kvlist) 981 { 982 const char *name = rte_vdev_device_name(dev); 983 struct pmd_internals *internals = NULL; 984 struct rte_eth_dev *eth_dev = NULL; 985 struct rte_kvargs_pair *pair = NULL; 986 unsigned k_idx; 987 unsigned int blockcount; 988 unsigned int blocksize; 989 unsigned int framesize = DFLT_FRAME_SIZE; 990 unsigned int framecount = DFLT_FRAME_COUNT; 991 unsigned int qpairs = 1; 992 unsigned int qdisc_bypass = 1; 993 994 /* do some parameter checking */ 995 if (*sockfd < 0) 996 return -1; 997 998 blocksize = getpagesize(); 999 1000 /* 1001 * Walk arguments for configurable settings 1002 */ 1003 for (k_idx = 0; k_idx < kvlist->count; k_idx++) { 1004 pair = &kvlist->pairs[k_idx]; 1005 if (strstr(pair->key, ETH_AF_PACKET_NUM_Q_ARG) != NULL) { 1006 qpairs = atoi(pair->value); 1007 if (qpairs < 1) { 1008 PMD_LOG(ERR, 1009 "%s: invalid qpairs value", 1010 name); 1011 return -1; 1012 } 1013 continue; 1014 } 1015 if (strstr(pair->key, ETH_AF_PACKET_BLOCKSIZE_ARG) != NULL) { 1016 blocksize = atoi(pair->value); 1017 if (!blocksize) { 1018 PMD_LOG(ERR, 1019 "%s: invalid blocksize value", 1020 name); 1021 return -1; 1022 } 1023 continue; 1024 } 1025 if (strstr(pair->key, ETH_AF_PACKET_FRAMESIZE_ARG) != NULL) { 1026 framesize = atoi(pair->value); 1027 if (!framesize) { 1028 PMD_LOG(ERR, 1029 "%s: invalid framesize value", 1030 name); 1031 return -1; 1032 } 1033 continue; 1034 } 1035 if (strstr(pair->key, ETH_AF_PACKET_FRAMECOUNT_ARG) != NULL) { 1036 framecount = atoi(pair->value); 1037 if (!framecount) { 1038 PMD_LOG(ERR, 1039 "%s: invalid framecount value", 1040 name); 1041 return -1; 1042 } 1043 continue; 1044 } 1045 if (strstr(pair->key, ETH_AF_PACKET_QDISC_BYPASS_ARG) != NULL) { 1046 qdisc_bypass = atoi(pair->value); 1047 if (qdisc_bypass > 1) { 1048 PMD_LOG(ERR, 1049 "%s: invalid bypass value", 1050 name); 1051 return -1; 1052 } 1053 continue; 1054 } 1055 } 1056 1057 if (framesize > blocksize) { 1058 PMD_LOG(ERR, 1059 "%s: AF_PACKET MMAP frame size exceeds block size!", 1060 name); 1061 return -1; 1062 } 1063 1064 blockcount = framecount / (blocksize / framesize); 1065 if (!blockcount) { 1066 PMD_LOG(ERR, 1067 "%s: invalid AF_PACKET MMAP parameters", name); 1068 return -1; 1069 } 1070 1071 PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name); 1072 PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize); 1073 PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount); 1074 PMD_LOG(INFO, "%s:\tframe size %d", name, framesize); 1075 PMD_LOG(INFO, "%s:\tframe count %d", name, framecount); 1076 1077 if (rte_pmd_init_internals(dev, *sockfd, qpairs, 1078 blocksize, blockcount, 1079 framesize, framecount, 1080 qdisc_bypass, 1081 &internals, ð_dev, 1082 kvlist) < 0) 1083 return -1; 1084 1085 eth_dev->rx_pkt_burst = eth_af_packet_rx; 1086 eth_dev->tx_pkt_burst = eth_af_packet_tx; 1087 1088 rte_eth_dev_probing_finish(eth_dev); 1089 return 0; 1090 } 1091 1092 static int 1093 rte_pmd_af_packet_probe(struct rte_vdev_device *dev) 1094 { 1095 int ret = 0; 1096 struct rte_kvargs *kvlist; 1097 int sockfd = -1; 1098 struct rte_eth_dev *eth_dev; 1099 const char *name = rte_vdev_device_name(dev); 1100 1101 PMD_LOG(INFO, "Initializing pmd_af_packet for %s", name); 1102 1103 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 1104 eth_dev = rte_eth_dev_attach_secondary(name); 1105 if (!eth_dev) { 1106 PMD_LOG(ERR, "Failed to probe %s", name); 1107 return -1; 1108 } 1109 /* TODO: request info from primary to set up Rx and Tx */ 1110 eth_dev->dev_ops = &ops; 1111 eth_dev->device = &dev->device; 1112 rte_eth_dev_probing_finish(eth_dev); 1113 return 0; 1114 } 1115 1116 kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments); 1117 if (kvlist == NULL) { 1118 ret = -1; 1119 goto exit; 1120 } 1121 1122 /* 1123 * If iface argument is passed we open the NICs and use them for 1124 * reading / writing 1125 */ 1126 if (rte_kvargs_count(kvlist, ETH_AF_PACKET_IFACE_ARG) == 1) { 1127 1128 ret = rte_kvargs_process(kvlist, ETH_AF_PACKET_IFACE_ARG, 1129 &open_packet_iface, &sockfd); 1130 if (ret < 0) 1131 goto exit; 1132 } 1133 1134 if (dev->device.numa_node == SOCKET_ID_ANY) 1135 dev->device.numa_node = rte_socket_id(); 1136 1137 ret = rte_eth_from_packet(dev, &sockfd, kvlist); 1138 close(sockfd); /* no longer needed */ 1139 1140 exit: 1141 rte_kvargs_free(kvlist); 1142 return ret; 1143 } 1144 1145 static int 1146 rte_pmd_af_packet_remove(struct rte_vdev_device *dev) 1147 { 1148 struct rte_eth_dev *eth_dev; 1149 1150 if (dev == NULL) 1151 return -1; 1152 1153 /* find the ethdev entry */ 1154 eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev)); 1155 if (eth_dev == NULL) 1156 return 0; /* port already released */ 1157 1158 eth_dev_close(eth_dev); 1159 rte_eth_dev_release_port(eth_dev); 1160 1161 return 0; 1162 } 1163 1164 static struct rte_vdev_driver pmd_af_packet_drv = { 1165 .probe = rte_pmd_af_packet_probe, 1166 .remove = rte_pmd_af_packet_remove, 1167 }; 1168 1169 RTE_PMD_REGISTER_VDEV(net_af_packet, pmd_af_packet_drv); 1170 RTE_PMD_REGISTER_ALIAS(net_af_packet, eth_af_packet); 1171 RTE_PMD_REGISTER_PARAM_STRING(net_af_packet, 1172 "iface=<string> " 1173 "qpairs=<int> " 1174 "blocksz=<int> " 1175 "framesz=<int> " 1176 "framecnt=<int> " 1177 "qdisc_bypass=<0|1>"); 1178