/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2014 John W. Linville <linville@tuxdriver.com>
 * Originally based upon librte_pmd_pcap code:
 * Copyright(c) 2010-2015 Intel Corporation.
 * Copyright(c) 2014 6WIND S.A.
 * All rights reserved.
 */

#include <rte_common.h>
#include <rte_string_fns.h>
#include <rte_mbuf.h>
#include <ethdev_driver.h>
#include <ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_kvargs.h>
#include <bus_vdev_driver.h>

#include <errno.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <poll.h>

#define ETH_AF_PACKET_IFACE_ARG		"iface"
#define ETH_AF_PACKET_NUM_Q_ARG		"qpairs"
#define ETH_AF_PACKET_BLOCKSIZE_ARG	"blocksz"
#define ETH_AF_PACKET_FRAMESIZE_ARG	"framesz"
#define ETH_AF_PACKET_FRAMECOUNT_ARG	"framecnt"
#define ETH_AF_PACKET_QDISC_BYPASS_ARG	"qdisc_bypass"
#define ETH_AF_PACKET_FANOUT_MODE_ARG	"fanout_mode"

#define DFLT_FRAME_SIZE		(1 << 11)
#define DFLT_FRAME_COUNT	(1 << 9)

static uint64_t timestamp_dynflag;
static int timestamp_dynfield_offset = -1;

struct __rte_cache_aligned pkt_rx_queue {
	int sockfd;

	struct iovec *rd;
	uint8_t *map;
	unsigned int framecount;
	unsigned int framenum;

	struct rte_mempool *mb_pool;
	uint16_t in_port;
	uint8_t vlan_strip;
	uint8_t timestamp_offloading;

	volatile unsigned long rx_pkts;
	volatile unsigned long rx_bytes;
	volatile unsigned long rx_nombuf;
	volatile unsigned long rx_dropped_pkts;
};

struct __rte_cache_aligned pkt_tx_queue {
	int sockfd;
	unsigned int frame_data_size;

	struct iovec *rd;
	uint8_t *map;
	unsigned int framecount;
	unsigned int framenum;

	volatile unsigned long tx_pkts;
	volatile unsigned long err_pkts;
	volatile unsigned long tx_bytes;
};

struct pmd_internals {
	unsigned nb_queues;

	int if_index;
	char *if_name;
	struct rte_ether_addr eth_addr;

	struct tpacket_req req;

	struct pkt_rx_queue *rx_queue;
	struct pkt_tx_queue *tx_queue;
	uint8_t vlan_strip;
	uint8_t timestamp_offloading;
};

static const char *valid_arguments[] = {
	ETH_AF_PACKET_IFACE_ARG,
	ETH_AF_PACKET_NUM_Q_ARG,
	ETH_AF_PACKET_BLOCKSIZE_ARG,
	ETH_AF_PACKET_FRAMESIZE_ARG,
	ETH_AF_PACKET_FRAMECOUNT_ARG,
	ETH_AF_PACKET_QDISC_BYPASS_ARG,
	ETH_AF_PACKET_FANOUT_MODE_ARG,
	NULL
};

static struct rte_eth_link pmd_link = {
	.link_speed = RTE_ETH_SPEED_NUM_10G,
	.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
	.link_status = RTE_ETH_LINK_DOWN,
	.link_autoneg = RTE_ETH_LINK_FIXED,
};

RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE);
#define RTE_LOGTYPE_AFPACKET af_packet_logtype

#define PMD_LOG(level, ...) \
	RTE_LOG_LINE_PREFIX(level, AFPACKET, "%s(): ", __func__, __VA_ARGS__)

#define PMD_LOG_ERRNO(level, fmt, ...) \
	RTE_LOG_LINE(level, AFPACKET, "%s(): " fmt ":%s", __func__, \
		## __VA_ARGS__, strerror(errno))

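/*
 * Ownership sketch for the TPACKET_V2 rings used below (see the kernel's
 * packet_mmap documentation): each ring slot starts with a struct
 * tpacket2_hdr whose tp_status field says who owns the slot.  RX slots flip
 * from TP_STATUS_KERNEL to TP_STATUS_USER when the kernel fills them, and
 * writing TP_STATUS_KERNEL back hands the slot to the kernel again.  TX
 * slots are filled while TP_STATUS_AVAILABLE and handed over with
 * TP_STATUS_SEND_REQUEST.
 */
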
static uint16_t
eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	unsigned i;
	struct tpacket2_hdr *ppd;
	struct rte_mbuf *mbuf;
	uint8_t *pbuf;
	struct pkt_rx_queue *pkt_q = queue;
	uint16_t num_rx = 0;
	unsigned long num_rx_bytes = 0;
	unsigned int framecount, framenum;

	if (unlikely(nb_pkts == 0))
		return 0;

	/*
	 * Reads the given number of packets from the AF_PACKET socket one by
	 * one and copies the packet data into a newly allocated mbuf.
	 */
	framecount = pkt_q->framecount;
	framenum = pkt_q->framenum;
	for (i = 0; i < nb_pkts; i++) {
		/* point at the next incoming frame */
		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
		if ((ppd->tp_status & TP_STATUS_USER) == 0)
			break;

		/* allocate the next mbuf */
		mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool);
		if (unlikely(mbuf == NULL)) {
			pkt_q->rx_nombuf++;
			break;
		}

		/* packet will fit in the mbuf, go ahead and receive it */
		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen;
		pbuf = (uint8_t *) ppd + ppd->tp_mac;
		memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf));

		/* check for vlan info */
		if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
			mbuf->vlan_tci = ppd->tp_vlan_tci;
			mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);

			if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf))
				PMD_LOG(ERR, "Failed to reinsert VLAN tag");
		}

		/* add kernel provided timestamp when offloading is enabled */
		if (pkt_q->timestamp_offloading) {
			/* TPACKET_V2 timestamps are provided in nanosecond resolution */
			*RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset,
				rte_mbuf_timestamp_t *) =
					(uint64_t)ppd->tp_sec * 1000000000 + ppd->tp_nsec;

			mbuf->ol_flags |= timestamp_dynflag;
		}

		/* release incoming frame and advance ring buffer */
		ppd->tp_status = TP_STATUS_KERNEL;
		if (++framenum >= framecount)
			framenum = 0;
		mbuf->port = pkt_q->in_port;

		/* account for the receive frame */
		bufs[i] = mbuf;
		num_rx++;
		num_rx_bytes += mbuf->pkt_len;
	}
	pkt_q->framenum = framenum;
	pkt_q->rx_pkts += num_rx;
	pkt_q->rx_bytes += num_rx_bytes;
	return num_rx;
}

/*
 * Check if there is an available frame in the ring
 */
static inline bool
tx_ring_status_available(uint32_t tp_status)
{
	/*
	 * We eliminate the timestamp status from the packet status.
	 * This should only matter if timestamping is enabled on the socket,
	 * but there is a bug in the kernel which is fixed in newer releases.
	 *
	 * See the following kernel commit for reference:
	 *     commit 171c3b151118a2fe0fc1e2a9d1b5a1570cfe82d2
	 *     net: packetmmap: fix only tx timestamp on request
	 */
	tp_status &= ~(TP_STATUS_TS_SOFTWARE | TP_STATUS_TS_RAW_HARDWARE);

	return tp_status == TP_STATUS_AVAILABLE;
}

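/*
 * Example of the masking above (a sketch; TP_STATUS_AVAILABLE is 0 in
 * <linux/if_packet.h>): on an affected kernel, a slot that has been sent
 * and timestamped can read back as tp_status == TP_STATUS_TS_SOFTWARE
 * instead of TP_STATUS_AVAILABLE.  Clearing the TS bits turns that value
 * into 0 == TP_STATUS_AVAILABLE, so the slot is correctly seen as free
 * again rather than stuck busy forever.
 */
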
/*
 * Callback to handle sending packets through a real NIC.
 */
static uint16_t
eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct tpacket2_hdr *ppd;
	struct rte_mbuf *mbuf;
	uint8_t *pbuf;
	unsigned int framecount, framenum;
	struct pollfd pfd;
	struct pkt_tx_queue *pkt_q = queue;
	uint16_t num_tx = 0;
	unsigned long num_tx_bytes = 0;
	int i;

	if (unlikely(nb_pkts == 0))
		return 0;

	memset(&pfd, 0, sizeof(pfd));
	pfd.fd = pkt_q->sockfd;
	pfd.events = POLLOUT;
	pfd.revents = 0;

	framecount = pkt_q->framecount;
	framenum = pkt_q->framenum;
	ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
	for (i = 0; i < nb_pkts; i++) {
		mbuf = *bufs++;

		/* drop oversized packets */
		if (mbuf->pkt_len > pkt_q->frame_data_size) {
			rte_pktmbuf_free(mbuf);
			continue;
		}

		/* insert vlan info if necessary */
		if (mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
			if (rte_vlan_insert(&mbuf)) {
				rte_pktmbuf_free(mbuf);
				continue;
			}
		}

		/* wait for the next available frame */
		if (!tx_ring_status_available(ppd->tp_status)) {
			if (poll(&pfd, 1, -1) < 0)
				break;

			/* poll() can return POLLERR if the interface is down */
			if (pfd.revents & POLLERR)
				break;
		}

		/*
		 * poll() will almost always return POLLOUT, even if there
		 * are no extra buffers available.
		 *
		 * This happens because packet_poll() calls datagram_poll(),
		 * which checks the space left in the socket buffer and,
		 * in the case of packet_mmap, the default socket buffer length
		 * doesn't match the requested size for the tx_ring.
		 * As such, there is almost always space left in the socket
		 * buffer, which doesn't seem to be correlated to the requested
		 * size for the tx_ring in packet_mmap.
		 *
		 * This results in poll() returning POLLOUT.
		 */
		if (!tx_ring_status_available(ppd->tp_status))
			break;

		/* copy the tx frame data */
		pbuf = (uint8_t *) ppd + TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

		struct rte_mbuf *tmp_mbuf = mbuf;
		while (tmp_mbuf) {
			uint16_t data_len = rte_pktmbuf_data_len(tmp_mbuf);
			memcpy(pbuf, rte_pktmbuf_mtod(tmp_mbuf, void*), data_len);
			pbuf += data_len;
			tmp_mbuf = tmp_mbuf->next;
		}

		ppd->tp_len = mbuf->pkt_len;
		ppd->tp_snaplen = mbuf->pkt_len;

		/* hand the frame to the kernel and advance the ring buffer */
		ppd->tp_status = TP_STATUS_SEND_REQUEST;
		if (++framenum >= framecount)
			framenum = 0;
		ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;

		num_tx++;
		num_tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	/* kick-off transmits */
	if (sendto(pkt_q->sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0) == -1 &&
	    errno != ENOBUFS && errno != EAGAIN) {
		/*
		 * In case of an ENOBUFS/EAGAIN error all of the enqueued
		 * packets will be considered successful even though only some
		 * are sent.
		 */

		num_tx = 0;
		num_tx_bytes = 0;
	}

	pkt_q->framenum = framenum;
	pkt_q->tx_pkts += num_tx;
	pkt_q->err_pkts += i - num_tx;
	pkt_q->tx_bytes += num_tx_bytes;
	return i;
}

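/*
 * Usage sketch (application side, not part of this driver): once the port
 * is started, the two burst callbacks above are reached through the
 * standard ethdev API, e.g.:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t nb_rx = rte_eth_rx_burst(port_id, 0, pkts, 32);
 *	uint16_t nb_tx = rte_eth_tx_burst(port_id, 0, pkts, nb_rx);
 *	while (nb_tx < nb_rx)
 *		rte_pktmbuf_free(pkts[nb_tx++]);
 */
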
static int
eth_dev_start(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;
	uint16_t i;

	if (internals->timestamp_offloading) {
		/* Register mbuf field and flag for Rx timestamp */
		int rc = rte_mbuf_dyn_rx_timestamp_register(&timestamp_dynfield_offset,
				&timestamp_dynflag);
		if (rc) {
			PMD_LOG(ERR, "Cannot register mbuf field/flag for timestamp");
			return rc;
		}
	}

	dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
	for (i = 0; i < internals->nb_queues; i++) {
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
	}
	return 0;
}

/*
 * This function gets called when the current port gets stopped.
 */
static int
eth_dev_stop(struct rte_eth_dev *dev)
{
	unsigned i;
	int sockfd;
	struct pmd_internals *internals = dev->data->dev_private;

	for (i = 0; i < internals->nb_queues; i++) {
		sockfd = internals->rx_queue[i].sockfd;
		if (sockfd != -1)
			close(sockfd);

		/* Prevent use after free in case tx fd == rx fd */
		if (sockfd != internals->tx_queue[i].sockfd) {
			sockfd = internals->tx_queue[i].sockfd;
			if (sockfd != -1)
				close(sockfd);
		}

		internals->rx_queue[i].sockfd = -1;
		internals->tx_queue[i].sockfd = -1;
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
	}

	dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
	return 0;
}

static int
eth_dev_configure(struct rte_eth_dev *dev)
{
	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
	const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode;
	struct pmd_internals *internals = dev->data->dev_private;

	internals->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
	internals->timestamp_offloading = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP);
	return 0;
}

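/*
 * Configuration sketch (application side): the two offloads consumed above
 * are requested through rte_eth_conf before configuring the port, e.g.:
 *
 *	struct rte_eth_conf conf = {0};
 *	conf.rxmode.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP |
 *			       RTE_ETH_RX_OFFLOAD_TIMESTAMP;
 *	rte_eth_dev_configure(port_id, nb_queues, nb_queues, &conf);
 */
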
static int
eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev_info->if_index = internals->if_index;
	dev_info->max_mac_addrs = 1;
	dev_info->max_rx_pktlen = RTE_ETHER_MAX_LEN;
	dev_info->max_rx_queues = (uint16_t)internals->nb_queues;
	dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
	dev_info->min_rx_bufsize = 0;
	dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
		RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP |
		RTE_ETH_RX_OFFLOAD_TIMESTAMP;

	return 0;
}

/*
 * Query the dropped-packet counter from the socket.
 * Reading the drop count also clears the socket's counter!
 */
static unsigned int
packet_drop_count(int sockfd)
{
	struct tpacket_stats pkt_stats;
	socklen_t pkt_stats_len = sizeof(struct tpacket_stats);

	if (sockfd == -1)
		return 0;

	if (getsockopt(sockfd, SOL_PACKET, PACKET_STATISTICS, &pkt_stats,
			&pkt_stats_len) < 0)
		return 0;

	return pkt_stats.tp_drops;
}

static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
	unsigned int i;
	unsigned long rx_total = 0, rx_dropped_total = 0, rx_nombuf_total = 0;
	unsigned long tx_total = 0, tx_err_total = 0;
	unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
	const struct pmd_internals *internal = dev->data->dev_private;

	for (i = 0; i < internal->nb_queues; i++) {
		/* reading the drop count clears it, so keep a running total */
		internal->rx_queue[i].rx_dropped_pkts +=
			packet_drop_count(internal->rx_queue[i].sockfd);

		rx_total += internal->rx_queue[i].rx_pkts;
		rx_bytes_total += internal->rx_queue[i].rx_bytes;
		rx_dropped_total += internal->rx_queue[i].rx_dropped_pkts;
		rx_nombuf_total += internal->rx_queue[i].rx_nombuf;

		tx_total += internal->tx_queue[i].tx_pkts;
		tx_err_total += internal->tx_queue[i].err_pkts;
		tx_bytes_total += internal->tx_queue[i].tx_bytes;

		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
			stats->q_ipackets[i] = internal->rx_queue[i].rx_pkts;
			stats->q_ibytes[i] = internal->rx_queue[i].rx_bytes;
			stats->q_opackets[i] = internal->tx_queue[i].tx_pkts;
			stats->q_obytes[i] = internal->tx_queue[i].tx_bytes;
		}
	}

	stats->ipackets = rx_total;
	stats->ibytes = rx_bytes_total;
	stats->imissed = rx_dropped_total;
	stats->rx_nombuf = rx_nombuf_total;
	stats->opackets = tx_total;
	stats->oerrors = tx_err_total;
	stats->obytes = tx_bytes_total;
	return 0;
}

static int
eth_stats_reset(struct rte_eth_dev *dev)
{
	unsigned i;
	struct pmd_internals *internal = dev->data->dev_private;

	for (i = 0; i < internal->nb_queues; i++) {
		/* clear socket counter */
		packet_drop_count(internal->rx_queue[i].sockfd);

		internal->rx_queue[i].rx_pkts = 0;
		internal->rx_queue[i].rx_bytes = 0;
		internal->rx_queue[i].rx_nombuf = 0;
		internal->rx_queue[i].rx_dropped_pkts = 0;

		internal->tx_queue[i].tx_pkts = 0;
		internal->tx_queue[i].err_pkts = 0;
		internal->tx_queue[i].tx_bytes = 0;
	}

	return 0;
}

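/*
 * Usage sketch (application side): kernel drops accumulated above surface
 * as imissed through the standard stats API, e.g.:
 *
 *	struct rte_eth_stats st;
 *	if (rte_eth_stats_get(port_id, &st) == 0)
 *		printf("rx %" PRIu64 " missed %" PRIu64 "\n",
 *		       st.ipackets, st.imissed);
 */
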
static int
eth_dev_close(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals;
	struct tpacket_req *req;
	unsigned int q;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	PMD_LOG(INFO, "Closing AF_PACKET ethdev on NUMA socket %u",
		rte_socket_id());

	internals = dev->data->dev_private;
	req = &internals->req;
	for (q = 0; q < internals->nb_queues; q++) {
		munmap(internals->rx_queue[q].map,
			2 * req->tp_block_size * req->tp_block_nr);
		rte_free(internals->rx_queue[q].rd);
		rte_free(internals->tx_queue[q].rd);
	}
	free(internals->if_name);
	rte_free(internals->rx_queue);
	rte_free(internals->tx_queue);

	/* mac_addrs must not be freed alone because part of dev_private */
	dev->data->mac_addrs = NULL;
	return 0;
}

static int
eth_link_update(struct rte_eth_dev *dev,
		int wait_to_complete __rte_unused)
{
	const struct pmd_internals *internals = dev->data->dev_private;
	struct rte_eth_link *dev_link = &dev->data->dev_link;
	int sockfd = internals->rx_queue[0].sockfd;
	struct ifreq ifr = { };

	if (sockfd == -1)
		return 0;

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0)
		return -errno;
	dev_link->link_status = (ifr.ifr_flags & IFF_RUNNING) ?
		RTE_ETH_LINK_UP : RTE_ETH_LINK_DOWN;
	return 0;
}

static int
eth_rx_queue_setup(struct rte_eth_dev *dev,
		uint16_t rx_queue_id,
		uint16_t nb_rx_desc __rte_unused,
		unsigned int socket_id __rte_unused,
		const struct rte_eth_rxconf *rx_conf __rte_unused,
		struct rte_mempool *mb_pool)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_rx_queue *pkt_q = &internals->rx_queue[rx_queue_id];
	unsigned int buf_size, data_size;

	pkt_q->mb_pool = mb_pool;

	/* Now get the space available for data in the mbuf */
	buf_size = rte_pktmbuf_data_room_size(pkt_q->mb_pool) -
		RTE_PKTMBUF_HEADROOM;
	data_size = internals->req.tp_frame_size;
	data_size -= TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

	if (data_size > buf_size) {
		PMD_LOG(ERR,
			"%s: %d bytes will not fit in mbuf (%d bytes)",
			dev->device->name, data_size, buf_size);
		return -ENOMEM;
	}

	dev->data->rx_queues[rx_queue_id] = pkt_q;
	pkt_q->in_port = dev->data->port_id;
	pkt_q->vlan_strip = internals->vlan_strip;
	pkt_q->timestamp_offloading = internals->timestamp_offloading;

	return 0;
}

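/*
 * Sizing sketch for the check above (assuming the usual 32-byte aligned
 * struct tpacket2_hdr): with the default framesz of 2048, data_size is
 * 2048 - (TPACKET2_HDRLEN - sizeof(struct sockaddr_ll)) = 2016 bytes, so a
 * mempool created with RTE_MBUF_DEFAULT_BUF_SIZE (2048 bytes of data room
 * after RTE_PKTMBUF_HEADROOM) passes.
 */
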
static int
eth_tx_queue_setup(struct rte_eth_dev *dev,
		uint16_t tx_queue_id,
		uint16_t nb_tx_desc __rte_unused,
		unsigned int socket_id __rte_unused,
		const struct rte_eth_txconf *tx_conf __rte_unused)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev->data->tx_queues[tx_queue_id] = &internals->tx_queue[tx_queue_id];
	return 0;
}

static int
eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { .ifr_mtu = mtu };
	int ret;
	int s;
	unsigned int data_size = internals->req.tp_frame_size -
		TPACKET2_HDRLEN;

	if (mtu > data_size)
		return -EINVAL;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -EINVAL;

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ret = ioctl(s, SIOCSIFMTU, &ifr);
	close(s);

	if (ret < 0)
		return -EINVAL;

	return 0;
}

static int
eth_dev_macaddr_set(struct rte_eth_dev *dev, struct rte_ether_addr *addr)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { };
	int sockfd = internals->rx_queue[0].sockfd;
	int ret;

	if (sockfd == -1) {
		PMD_LOG(ERR, "receive socket not found");
		return -EINVAL;
	}

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
	memcpy(ifr.ifr_hwaddr.sa_data, addr, sizeof(*addr));
	ret = ioctl(sockfd, SIOCSIFHWADDR, &ifr);

	if (ret < 0) {
		PMD_LOG_ERRNO(ERR, "ioctl(SIOCSIFHWADDR) failed");
		return -EINVAL;
	}

	return 0;
}

static int
eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
{
	struct ifreq ifr;
	int ret = 0;
	int s;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -errno;

	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
	ifr.ifr_flags &= mask;
	ifr.ifr_flags |= flags;
	if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
out:
	close(s);
	return ret;
}

static int
eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
}

static int
eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
}

static const struct eth_dev_ops ops = {
	.dev_start = eth_dev_start,
	.dev_stop = eth_dev_stop,
	.dev_close = eth_dev_close,
	.dev_configure = eth_dev_configure,
	.dev_infos_get = eth_dev_info,
	.mac_addr_set = eth_dev_macaddr_set,
	.mtu_set = eth_dev_mtu_set,
	.promiscuous_enable = eth_dev_promiscuous_enable,
	.promiscuous_disable = eth_dev_promiscuous_disable,
	.rx_queue_setup = eth_rx_queue_setup,
	.tx_queue_setup = eth_tx_queue_setup,
	.link_update = eth_link_update,
	.stats_get = eth_stats_get,
	.stats_reset = eth_stats_reset,
};

/*
 * Opens an AF_PACKET socket
 */
static int
open_packet_iface(const char *key __rte_unused,
		const char *value __rte_unused,
		void *extra_args)
{
	int *sockfd = extra_args;

	/* Open an AF_PACKET socket... */
	*sockfd = socket(AF_PACKET, SOCK_RAW, 0);
	if (*sockfd == -1) {
		PMD_LOG(ERR, "Could not open AF_PACKET socket");
		return -1;
	}

	return 0;
}

#define PACKET_FANOUT_INVALID -1

static int
get_fanout_group_id(int if_index)
{
	return (getpid() ^ if_index) & 0xffff;
}

static int
get_fanout_mode(const char *fanout_mode)
{
	int load_balance = PACKET_FANOUT_FLAG_DEFRAG |
		PACKET_FANOUT_FLAG_ROLLOVER;

	if (!fanout_mode) {
		/* Default */
		load_balance |= PACKET_FANOUT_HASH;
	} else if (!strcmp(fanout_mode, "hash")) {
		load_balance |= PACKET_FANOUT_HASH;
	} else if (!strcmp(fanout_mode, "lb")) {
		load_balance |= PACKET_FANOUT_LB;
	} else if (!strcmp(fanout_mode, "cpu")) {
		load_balance |= PACKET_FANOUT_CPU;
	} else if (!strcmp(fanout_mode, "rollover")) {
		load_balance |= PACKET_FANOUT_ROLLOVER;
	} else if (!strcmp(fanout_mode, "rnd")) {
		load_balance |= PACKET_FANOUT_RND;
	} else if (!strcmp(fanout_mode, "qm")) {
		load_balance |= PACKET_FANOUT_QM;
	} else {
		/* Invalid Fanout Mode */
		load_balance = PACKET_FANOUT_INVALID;
	}

	return load_balance;
}

static int
get_fanout(const char *fanout_mode, int if_index)
{
	int load_balance = get_fanout_mode(fanout_mode);

	if (load_balance != PACKET_FANOUT_INVALID)
		return get_fanout_group_id(if_index) | (load_balance << 16);
	else
		return PACKET_FANOUT_INVALID;
}

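/*
 * Example of the encoding above (per packet(7): fanout group id in the low
 * 16 bits, mode and flags in the high 16 bits): for pid 0x1234 and ifindex 3
 * in the default mode, get_fanout() returns
 * ((0x1234 ^ 3) & 0xffff) | ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG |
 *  PACKET_FANOUT_FLAG_ROLLOVER) << 16).
 */
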
static int
rte_pmd_init_internals(struct rte_vdev_device *dev,
		const int sockfd,
		const unsigned nb_queues,
		unsigned int blocksize,
		unsigned int blockcnt,
		unsigned int framesize,
		unsigned int framecnt,
		unsigned int qdisc_bypass,
		const char *fanout_mode,
		struct pmd_internals **internals,
		struct rte_eth_dev **eth_dev,
		struct rte_kvargs *kvlist)
{
	const char *name = rte_vdev_device_name(dev);
	const unsigned int numa_node = dev->device.numa_node;
	struct rte_eth_dev_data *data = NULL;
	struct rte_kvargs_pair *pair = NULL;
	struct ifreq ifr;
	size_t ifnamelen;
	unsigned k_idx;
	struct sockaddr_ll sockaddr;
	struct tpacket_req *req;
	struct pkt_rx_queue *rx_queue;
	struct pkt_tx_queue *tx_queue;
	int rc, tpver, discard;
	int qsockfd = -1;
	unsigned int i, q, rdsize;
	int fanout_arg;

	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
		pair = &kvlist->pairs[k_idx];
		if (strstr(pair->key, ETH_AF_PACKET_IFACE_ARG) != NULL)
			break;
	}
	if (pair == NULL) {
		PMD_LOG(ERR,
			"%s: no interface specified for AF_PACKET ethdev",
			name);
		return -1;
	}

	PMD_LOG(INFO,
		"%s: creating AF_PACKET-backed ethdev on numa socket %u",
		name, numa_node);

	*internals = rte_zmalloc_socket(name, sizeof(**internals),
					0, numa_node);
	if (*internals == NULL)
		return -1;

	(*internals)->rx_queue = rte_calloc_socket("af_packet_rx",
						nb_queues,
						sizeof(struct pkt_rx_queue),
						0, numa_node);
	(*internals)->tx_queue = rte_calloc_socket("af_packet_tx",
						nb_queues,
						sizeof(struct pkt_tx_queue),
						0, numa_node);
	if (!(*internals)->rx_queue || !(*internals)->tx_queue)
		goto free_internals;

	for (q = 0; q < nb_queues; q++) {
		(*internals)->rx_queue[q].map = MAP_FAILED;
		(*internals)->tx_queue[q].map = MAP_FAILED;
		(*internals)->rx_queue[q].sockfd = -1;
		(*internals)->tx_queue[q].sockfd = -1;
	}

	req = &((*internals)->req);

	req->tp_block_size = blocksize;
	req->tp_block_nr = blockcnt;
	req->tp_frame_size = framesize;
	req->tp_frame_nr = framecnt;

	ifnamelen = strlen(pair->value);
	if (ifnamelen < sizeof(ifr.ifr_name)) {
		memcpy(ifr.ifr_name, pair->value, ifnamelen);
		ifr.ifr_name[ifnamelen] = '\0';
	} else {
		PMD_LOG(ERR,
			"%s: I/F name too long (%s)",
			name, pair->value);
		goto free_internals;
	}
	if (ioctl(sockfd, SIOCGIFINDEX, &ifr) == -1) {
		PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFINDEX)", name);
		goto free_internals;
	}
	(*internals)->if_name = strdup(pair->value);
	if ((*internals)->if_name == NULL)
		goto free_internals;
	(*internals)->if_index = ifr.ifr_ifindex;

	if (ioctl(sockfd, SIOCGIFHWADDR, &ifr) == -1) {
		PMD_LOG_ERRNO(ERR, "%s: ioctl failed (SIOCGIFHWADDR)", name);
		goto free_internals;
	}
	memcpy(&(*internals)->eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN);

	memset(&sockaddr, 0, sizeof(sockaddr));
	sockaddr.sll_family = AF_PACKET;
	sockaddr.sll_protocol = htons(ETH_P_ALL);
	sockaddr.sll_ifindex = (*internals)->if_index;

	fanout_arg = get_fanout(fanout_mode, (*internals)->if_index);
	if (fanout_arg == PACKET_FANOUT_INVALID) {
		PMD_LOG(ERR, "Invalid fanout mode: %s", fanout_mode);
		goto error;
	}

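	/*
	 * Layout note for each queue set up below: a single mmap() of
	 * 2 * tp_block_size * tp_block_nr bytes covers the RX ring followed
	 * by the TX ring, so tx_queue->map is simply
	 * rx_queue->map + tp_block_size * tp_block_nr.
	 */
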
	for (q = 0; q < nb_queues; q++) {
		/* Open an AF_PACKET socket for this queue... */
		qsockfd = socket(AF_PACKET, SOCK_RAW, 0);
		if (qsockfd == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not open AF_PACKET socket",
				name);
			goto error;
		}

		tpver = TPACKET_V2;
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_VERSION,
				&tpver, sizeof(tpver));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_VERSION on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		discard = 1;
		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_LOSS,
				&discard, sizeof(discard));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_LOSS on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		if (qdisc_bypass) {
#if defined(PACKET_QDISC_BYPASS)
			rc = setsockopt(qsockfd, SOL_PACKET, PACKET_QDISC_BYPASS,
					&qdisc_bypass, sizeof(qdisc_bypass));
			if (rc == -1) {
				PMD_LOG_ERRNO(ERR,
					"%s: could not set PACKET_QDISC_BYPASS on AF_PACKET socket for %s",
					name, pair->value);
				goto error;
			}
#endif
		}

		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_RX_RING on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_TX_RING, req, sizeof(*req));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not set PACKET_TX_RING on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		rx_queue = &((*internals)->rx_queue[q]);
		rx_queue->framecount = req->tp_frame_nr;

		rx_queue->map = mmap(NULL, 2 * req->tp_block_size * req->tp_block_nr,
				PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED,
				qsockfd, 0);
		if (rx_queue->map == MAP_FAILED) {
			PMD_LOG_ERRNO(ERR,
				"%s: call to mmap failed on AF_PACKET socket for %s",
				name, pair->value);
			goto error;
		}

		/* rdsize is same for both Tx and Rx */
		rdsize = req->tp_frame_nr * sizeof(*(rx_queue->rd));

		rx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
		if (rx_queue->rd == NULL)
			goto error;
		for (i = 0; i < req->tp_frame_nr; ++i) {
			rx_queue->rd[i].iov_base = rx_queue->map + (i * framesize);
			rx_queue->rd[i].iov_len = req->tp_frame_size;
		}
		rx_queue->sockfd = qsockfd;

		tx_queue = &((*internals)->tx_queue[q]);
		tx_queue->framecount = req->tp_frame_nr;
		tx_queue->frame_data_size = req->tp_frame_size;
		tx_queue->frame_data_size -= TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

		tx_queue->map = rx_queue->map + req->tp_block_size * req->tp_block_nr;

		tx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
		if (tx_queue->rd == NULL)
			goto error;
		for (i = 0; i < req->tp_frame_nr; ++i) {
			tx_queue->rd[i].iov_base = tx_queue->map + (i * framesize);
			tx_queue->rd[i].iov_len = req->tp_frame_size;
		}
		tx_queue->sockfd = qsockfd;

		rc = bind(qsockfd, (const struct sockaddr*)&sockaddr, sizeof(sockaddr));
		if (rc == -1) {
			PMD_LOG_ERRNO(ERR,
				"%s: could not bind AF_PACKET socket to %s",
				name, pair->value);
			goto error;
		}

		if (nb_queues > 1) {
			rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
					&fanout_arg, sizeof(fanout_arg));
			if (rc == -1) {
				PMD_LOG_ERRNO(ERR,
					"%s: could not set PACKET_FANOUT on AF_PACKET socket for %s",
					name, pair->value);
				goto error;
			}
		}
	}

	/* reserve an ethdev entry */
	*eth_dev = rte_eth_vdev_allocate(dev, 0);
	if (*eth_dev == NULL)
		goto error;

	/*
	 * now put it all together
	 * - store queue data in internals,
	 * - store numa_node in eth_dev
	 * - point eth_dev_data to internals
	 * - and point eth_dev structure to new eth_dev_data structure
	 */

	(*internals)->nb_queues = nb_queues;

	data = (*eth_dev)->data;
	data->dev_private = *internals;
	data->nb_rx_queues = (uint16_t)nb_queues;
	data->nb_tx_queues = (uint16_t)nb_queues;
	data->dev_link = pmd_link;
	data->mac_addrs = &(*internals)->eth_addr;
	data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;

	(*eth_dev)->dev_ops = &ops;

	return 0;

error:
	if (qsockfd != -1)
		close(qsockfd);
	for (q = 0; q < nb_queues; q++) {
		if ((*internals)->rx_queue[q].map != MAP_FAILED)
			munmap((*internals)->rx_queue[q].map,
				2 * req->tp_block_size * req->tp_block_nr);

		rte_free((*internals)->rx_queue[q].rd);
		rte_free((*internals)->tx_queue[q].rd);
		if (((*internals)->rx_queue[q].sockfd >= 0) &&
		    ((*internals)->rx_queue[q].sockfd != qsockfd))
			close((*internals)->rx_queue[q].sockfd);
	}
free_internals:
	rte_free((*internals)->rx_queue);
	rte_free((*internals)->tx_queue);
	free((*internals)->if_name);
	rte_free(*internals);
	return -1;
}

static int
rte_eth_from_packet(struct rte_vdev_device *dev,
		int const *sockfd,
		struct rte_kvargs *kvlist)
{
	const char *name = rte_vdev_device_name(dev);
	struct pmd_internals *internals = NULL;
	struct rte_eth_dev *eth_dev = NULL;
	struct rte_kvargs_pair *pair = NULL;
	unsigned k_idx;
	unsigned int blockcount;
	unsigned int blocksize;
	unsigned int framesize = DFLT_FRAME_SIZE;
	unsigned int framecount = DFLT_FRAME_COUNT;
	unsigned int qpairs = 1;
	unsigned int qdisc_bypass = 1;
	const char *fanout_mode = NULL;

	/* do some parameter checking */
	if (*sockfd < 0)
		return -1;

	blocksize = getpagesize();

	/*
	 * Walk arguments for configurable settings
	 */
	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
		pair = &kvlist->pairs[k_idx];
		if (strstr(pair->key, ETH_AF_PACKET_NUM_Q_ARG) != NULL) {
			qpairs = atoi(pair->value);
			if (qpairs < 1) {
				PMD_LOG(ERR,
					"%s: invalid qpairs value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_BLOCKSIZE_ARG) != NULL) {
			blocksize = atoi(pair->value);
			if (!blocksize) {
				PMD_LOG(ERR,
					"%s: invalid blocksize value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_FRAMESIZE_ARG) != NULL) {
			framesize = atoi(pair->value);
			if (!framesize) {
				PMD_LOG(ERR,
					"%s: invalid framesize value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_FRAMECOUNT_ARG) != NULL) {
			framecount = atoi(pair->value);
			if (!framecount) {
				PMD_LOG(ERR,
					"%s: invalid framecount value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_QDISC_BYPASS_ARG) != NULL) {
			qdisc_bypass = atoi(pair->value);
			if (qdisc_bypass > 1) {
				PMD_LOG(ERR,
					"%s: invalid bypass value",
					name);
				return -1;
			}
			continue;
		}
		if (strstr(pair->key, ETH_AF_PACKET_FANOUT_MODE_ARG) != NULL) {
			fanout_mode = pair->value;
			continue;
		}
	}

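	/*
	 * Worked example for the sizing below (assuming a 4 KiB page size):
	 * the defaults framesz = 2048 and framecnt = 512 give
	 * blocksize = 4096 and blockcount = 512 / (4096 / 2048) = 256,
	 * i.e. 256 blocks of two frames each per ring.
	 */
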
	if (framesize > blocksize) {
		PMD_LOG(ERR,
			"%s: AF_PACKET MMAP frame size exceeds block size!",
			name);
		return -1;
	}

	blockcount = framecount / (blocksize / framesize);
	if (!blockcount) {
		PMD_LOG(ERR,
			"%s: invalid AF_PACKET MMAP parameters", name);
		return -1;
	}

	PMD_LOG(INFO, "%s: AF_PACKET MMAP parameters:", name);
	PMD_LOG(INFO, "%s:\tblock size %d", name, blocksize);
	PMD_LOG(INFO, "%s:\tblock count %d", name, blockcount);
	PMD_LOG(INFO, "%s:\tframe size %d", name, framesize);
	PMD_LOG(INFO, "%s:\tframe count %d", name, framecount);

	if (rte_pmd_init_internals(dev, *sockfd, qpairs,
				   blocksize, blockcount,
				   framesize, framecount,
				   qdisc_bypass,
				   fanout_mode,
				   &internals, &eth_dev,
				   kvlist) < 0)
		return -1;

	eth_dev->rx_pkt_burst = eth_af_packet_rx;
	eth_dev->tx_pkt_burst = eth_af_packet_tx;

	rte_eth_dev_probing_finish(eth_dev);
	return 0;
}

static int
rte_pmd_af_packet_probe(struct rte_vdev_device *dev)
{
	int ret = 0;
	struct rte_kvargs *kvlist;
	int sockfd = -1;
	struct rte_eth_dev *eth_dev;
	const char *name = rte_vdev_device_name(dev);

	PMD_LOG(INFO, "Initializing pmd_af_packet for %s", name);

	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
		eth_dev = rte_eth_dev_attach_secondary(name);
		if (!eth_dev) {
			PMD_LOG(ERR, "Failed to probe %s", name);
			return -1;
		}
		/* TODO: request info from primary to set up Rx and Tx */
		eth_dev->dev_ops = &ops;
		eth_dev->device = &dev->device;
		rte_eth_dev_probing_finish(eth_dev);
		return 0;
	}

	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
	if (kvlist == NULL) {
		ret = -1;
		goto exit;
	}

	/*
	 * If the iface argument is passed, we open the NIC and use it for
	 * reading / writing
	 */
	if (rte_kvargs_count(kvlist, ETH_AF_PACKET_IFACE_ARG) == 1) {
		ret = rte_kvargs_process(kvlist, ETH_AF_PACKET_IFACE_ARG,
					 &open_packet_iface, &sockfd);
		if (ret < 0)
			goto exit;
	}

	if (dev->device.numa_node == SOCKET_ID_ANY)
		dev->device.numa_node = rte_socket_id();

	ret = rte_eth_from_packet(dev, &sockfd, kvlist);
	close(sockfd); /* no longer needed */

exit:
	rte_kvargs_free(kvlist);
	return ret;
}

static int
rte_pmd_af_packet_remove(struct rte_vdev_device *dev)
{
	struct rte_eth_dev *eth_dev;

	if (dev == NULL)
		return -1;

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
	if (eth_dev == NULL)
		return 0; /* port already released */

	eth_dev_close(eth_dev);
	rte_eth_dev_release_port(eth_dev);

	return 0;
}

static struct rte_vdev_driver pmd_af_packet_drv = {
	.probe = rte_pmd_af_packet_probe,
	.remove = rte_pmd_af_packet_remove,
};

RTE_PMD_REGISTER_VDEV(net_af_packet, pmd_af_packet_drv);
RTE_PMD_REGISTER_ALIAS(net_af_packet, eth_af_packet);
RTE_PMD_REGISTER_PARAM_STRING(net_af_packet,
	"iface=<string> "
	"qpairs=<int> "
	"blocksz=<int> "
	"framesz=<int> "
	"framecnt=<int> "
	"qdisc_bypass=<0|1> "
	"fanout_mode=<hash|lb|cpu|rollover|rnd|qm>");

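/*
 * Usage sketch (EAL command line; the interface name and values are
 * examples):
 *
 *   dpdk-testpmd --vdev=net_af_packet0,iface=eth0,qpairs=2,framesz=2048,\
 *	framecnt=512,qdisc_bypass=0,fanout_mode=hash -- -i
 */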