/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/types.h>
#include <string.h>
#include <sys/queue.h>
#include <stdarg.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>
#include <signal.h>
#include <math.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_eal.h>
#include <rte_launch.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_lcore.h>
#include <rte_per_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_interrupts.h>
#include <rte_random.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_string_fns.h>
#include <rte_timer.h>
#include <rte_power.h>
#include <rte_spinlock.h>
#include <rte_metrics.h>
#include <rte_telemetry.h>
#include <rte_power_pmd_mgmt.h>
#include <rte_power_uncore.h>

#include "perf_core.h"
#include "main.h"

RTE_LOG_REGISTER(l3fwd_power_logtype, l3fwd.power, INFO);
#define RTE_LOGTYPE_L3FWD_POWER l3fwd_power_logtype

#define MAX_PKT_BURST 32

#define MIN_ZERO_POLL_COUNT 10

/* 100 ms interval */
#define TIMER_NUMBER_PER_SECOND 10
/* (10ms) */
#define INTERVALS_PER_SECOND 100
/* 100000 us */
#define SCALING_PERIOD (1000000/TIMER_NUMBER_PER_SECOND)
#define SCALING_DOWN_TIME_RATIO_THRESHOLD 0.25

#define APP_LOOKUP_EXACT_MATCH 0
#define APP_LOOKUP_LPM 1
#define DO_RFC_1812_CHECKS

#ifndef APP_LOOKUP_METHOD
#define APP_LOOKUP_METHOD APP_LOOKUP_LPM
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
#include <rte_hash.h>
#elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
#include <rte_lpm.h>
#else
#error "APP_LOOKUP_METHOD set to incorrect value"
#endif

#ifndef IPv6_BYTES
#define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\
		"%02x%02x:%02x%02x:%02x%02x:%02x%02x"
#define IPv6_BYTES(addr) \
	addr[0], addr[1], addr[2], addr[3], \
	addr[4], addr[5], addr[6], addr[7], \
	addr[8], addr[9], addr[10], addr[11], \
	addr[12], addr[13], addr[14], addr[15]
#endif

#define MAX_JUMBO_PKT_LEN 9600

#define IPV6_ADDR_LEN 16

#define MEMPOOL_CACHE_SIZE 256

/*
 * This expression is used to calculate the number of mbufs needed depending on
 * user input, taking into account memory for rx and tx hardware rings, cache
 * per lcore and mtable per port per lcore. RTE_MAX is used to ensure that
 * NB_MBUF never goes below a minimum value of 8192.
 */
#define NB_MBUF RTE_MAX( \
	(nb_ports*nb_rx_queue*nb_rxd + \
	nb_ports*nb_lcores*MAX_PKT_BURST + \
	nb_ports*n_tx_queue*nb_txd + \
	nb_lcores*MEMPOOL_CACHE_SIZE), \
	(unsigned)8192)
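
/*
 * Worked example for NB_MBUF (hypothetical values, for illustration only):
 * with nb_ports = 2, nb_rx_queue = 2, nb_rxd = nb_txd = 1024, n_tx_queue = 2
 * and nb_lcores = 4, the expression evaluates to
 *   2*2*1024 + 2*4*32 + 2*2*1024 + 4*256 = 4096 + 256 + 4096 + 1024 = 9472,
 * which exceeds the 8192 floor, so 9472 mbufs are allocated.
 */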

#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_DESC_DEFAULT 1024
#define TX_DESC_DEFAULT 1024

#define NUM_TELSTATS RTE_DIM(telstats_strings)

static uint16_t nb_rxd = RX_DESC_DEFAULT;
static uint16_t nb_txd = TX_DESC_DEFAULT;

/* ethernet addresses of ports */
static struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];

/* per-port locks used when enabling/disabling Rx interrupts */
static rte_spinlock_t locks[RTE_MAX_ETHPORTS];

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;
/* Ports set in promiscuous mode off by default. */
static int promiscuous_on = 0;
/* NUMA is enabled by default. */
static int numa_on = 1;
volatile bool quit_signal;
/* timer to update telemetry every 500ms */
static struct rte_timer telemetry_timer;

/* stats index returned by metrics lib */
int telstats_index;

/* flag to check if uncore option enabled */
int enabled_uncore = -1;

struct telstats_name {
	char name[RTE_ETH_XSTATS_NAME_SIZE];
};

/* telemetry stats to be reported */
const struct telstats_name telstats_strings[] = {
	{"empty_poll"},
	{"full_poll"},
	{"busy_percent"}
};

/* core busyness in percentage */
enum busy_rate {
	ZERO = 0,
	PARTIAL = 50,
	FULL = 100
};

enum uncore_choice {
	UNCORE_MIN = 0,
	UNCORE_MAX = 1,
	UNCORE_IDX = 2
};

/* reference poll count to measure core busyness */
#define DEFAULT_COUNT 10000
/*
 * reference CYCLES to be used to
 * measure core busyness based on poll count
 */
#define MIN_CYCLES 1500000ULL
#define MAX_CYCLES 22000000ULL

/* (500ms) */
#define TELEMETRY_INTERVALS_PER_SEC 2
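
/*
 * Illustrative reading of the reference values above (the author's
 * assumption about intent, based on the telemetry loop below): DEFAULT_COUNT
 * polls completing in fewer than MIN_CYCLES TSC cycles indicate an
 * essentially idle core (busy percent ~ 0), while taking MAX_CYCLES or more
 * indicates a fully busy core (busy percent = 100). In between, e.g.
 * 11000000 cycles for 10000 polls gives (11000000 * 100) / 22000000 = 50
 * percent busy.
 */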

static int parse_ptype; /**< Parse packet type using rx callback; disabled by default. */

enum appmode {
	APP_MODE_DEFAULT = 0,
	APP_MODE_LEGACY,
	APP_MODE_TELEMETRY,
	APP_MODE_INTERRUPT,
	APP_MODE_PMD_MGMT
};

enum appmode app_mode;

static enum rte_power_pmd_mgmt_type pmgmt_type;
bool baseline_enabled;

enum freq_scale_hint_t {
	FREQ_LOWER = -1,
	FREQ_CURRENT = 0,
	FREQ_HIGHER = 1,
	FREQ_HIGHEST = 2
};

struct __rte_cache_aligned lcore_rx_queue {
	uint16_t port_id;
	uint16_t queue_id;
	enum freq_scale_hint_t freq_up_hint;
	uint32_t zero_rx_packet_count;
	uint32_t idle_hint;
};

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16

struct lcore_params lcore_params_array[MAX_LCORE_PARAMS];
static struct lcore_params lcore_params_array_default[] = {
	{0, 0, 2},
	{0, 1, 2},
	{0, 2, 2},
	{1, 0, 2},
	{1, 1, 2},
	{1, 2, 2},
	{2, 0, 2},
	{3, 0, 3},
	{3, 1, 3},
};

struct lcore_params *lcore_params = lcore_params_array_default;
uint16_t nb_lcore_params = RTE_DIM(lcore_params_array_default);

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode = RTE_ETH_MQ_RX_RSS,
		.offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM,
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = NULL,
			.rss_hf = RTE_ETH_RSS_UDP,
		},
	},
	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
	}
};

static uint32_t max_pkt_len;
static uint32_t max_empty_polls = 512;
static uint32_t pause_duration = 1;
static uint32_t scale_freq_min;
static uint32_t scale_freq_max;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)

#ifdef RTE_ARCH_X86
#include <rte_hash_crc.h>
#define DEFAULT_HASH_FUNC rte_hash_crc
#else
#include <rte_jhash.h>
#define DEFAULT_HASH_FUNC rte_jhash
#endif

struct ipv4_5tuple {
	uint32_t ip_dst;
	uint32_t ip_src;
	uint16_t port_dst;
	uint16_t port_src;
	uint8_t proto;
} __rte_packed;

struct ipv6_5tuple {
	uint8_t ip_dst[IPV6_ADDR_LEN];
	uint8_t ip_src[IPV6_ADDR_LEN];
	uint16_t port_dst;
	uint16_t port_src;
	uint8_t proto;
} __rte_packed;

struct ipv4_l3fwd_route {
	struct ipv4_5tuple key;
	uint8_t if_out;
};

struct ipv6_l3fwd_route {
	struct ipv6_5tuple key;
	uint8_t if_out;
};

static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = {
	{{RTE_IPV4(100,10,0,1), RTE_IPV4(200,10,0,1), 101, 11, IPPROTO_TCP}, 0},
	{{RTE_IPV4(100,20,0,2), RTE_IPV4(200,20,0,2), 102, 12, IPPROTO_TCP}, 1},
	{{RTE_IPV4(100,30,0,3), RTE_IPV4(200,30,0,3), 103, 13, IPPROTO_TCP}, 2},
	{{RTE_IPV4(100,40,0,4), RTE_IPV4(200,40,0,4), 104, 14, IPPROTO_TCP}, 3},
};

static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = {
	{
		{
			{0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05},
			{0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			 0x02, 0x1e, 0x67, 0xff, 0xfe, 0x0d, 0xb6, 0x0a},
			1, 10, IPPROTO_UDP
		}, 4
	},
};

typedef struct rte_hash lookup_struct_t;
static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];
static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS];

#define L3FWD_HASH_ENTRIES 1024

static alignas(RTE_CACHE_LINE_SIZE) uint16_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES];
static alignas(RTE_CACHE_LINE_SIZE) uint16_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES];
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
struct ipv4_l3fwd_route {
	uint32_t ip;
	uint8_t depth;
	uint8_t if_out;
};

static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = {
	{RTE_IPV4(1,1,1,0), 24, 0},
	{RTE_IPV4(2,1,1,0), 24, 1},
	{RTE_IPV4(3,1,1,0), 24, 2},
	{RTE_IPV4(4,1,1,0), 24, 3},
	{RTE_IPV4(5,1,1,0), 24, 4},
	{RTE_IPV4(6,1,1,0), 24, 5},
	{RTE_IPV4(7,1,1,0), 24, 6},
	{RTE_IPV4(8,1,1,0), 24, 7},
};

#define IPV4_L3FWD_LPM_MAX_RULES 1024

typedef struct rte_lpm lookup_struct_t;
static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];
#endif

struct __rte_cache_aligned lcore_conf {
	uint16_t n_rx_queue;
	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
	uint16_t n_tx_port;
	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
	struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_ETHPORTS];
	lookup_struct_t *ipv4_lookup_struct;
	lookup_struct_t *ipv6_lookup_struct;
};

struct __rte_cache_aligned lcore_stats {
	/* total sleep time in ms since last frequency scaling down */
	uint32_t sleep_time;
	/* number of long sleeps recently */
	uint32_t nb_long_sleep;
	/* freq. scaling up trend */
	uint32_t trend;
	/* total packets processed recently */
	uint64_t nb_rx_processed;
	/* total iterations looped recently */
	uint64_t nb_iteration_looped;
	/*
	 * Represents empty and non-empty polls
	 * of rte_eth_rx_burst():
	 * ep_nep[0] holds non-empty polls,
	 * i.e. 0 < nb_rx <= MAX_PKT_BURST;
	 * ep_nep[1] holds empty polls,
	 * i.e. nb_rx == 0.
	 */
	uint64_t ep_nep[2];
	/*
	 * Represents full and empty+partial
	 * polls of rte_eth_rx_burst():
	 * fp_nfp[0] holds empty+partial polls,
	 * i.e. 0 <= nb_rx < MAX_PKT_BURST;
	 * fp_nfp[1] holds full polls,
	 * i.e. nb_rx == MAX_PKT_BURST.
	 */
	uint64_t fp_nfp[2];
	enum busy_rate br;
	rte_spinlock_t telemetry_lock;
};

static alignas(RTE_CACHE_LINE_SIZE) struct lcore_conf lcore_conf[RTE_MAX_LCORE];
static alignas(RTE_CACHE_LINE_SIZE) struct lcore_stats stats[RTE_MAX_LCORE];
static struct rte_timer power_timers[RTE_MAX_LCORE];

static inline uint32_t power_idle_heuristic(uint32_t zero_rx_packet_count);
static inline enum freq_scale_hint_t power_freq_scaleup_heuristic(
	unsigned int lcore_id, uint16_t port_id, uint16_t queue_id);

static int is_done(void)
{
	return quit_signal;
}

/* exit signal handler */
static void
signal_exit_now(int sigtype)
{
	if (sigtype == SIGINT)
		quit_signal = true;
}

/* Frequency scale down timer callback */
static void
power_timer_cb(__rte_unused struct rte_timer *tim,
		__rte_unused void *arg)
{
	uint64_t hz;
	float sleep_time_ratio;
	unsigned lcore_id = rte_lcore_id();

	/* accumulate total execution time in us when callback is invoked */
	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
			(float)SCALING_PERIOD;
	/*
	 * Scale down the frequency one step if the lcore slept for a large
	 * fraction of the last period.
	 */
	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
		if (rte_power_freq_down)
			rte_power_freq_down(lcore_id);
	} else if ((unsigned)(stats[lcore_id].nb_rx_processed /
			stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
		/*
		 * Scale down one step if the average number of packets per
		 * iteration is below expectation.
		 */
		if (rte_power_freq_down)
			rte_power_freq_down(lcore_id);
	}

	/*
	 * Initialize another timer according to current frequency to ensure
	 * the timer interval is relatively fixed.
	 */
	hz = rte_get_timer_hz();
	rte_timer_reset(&power_timers[lcore_id], hz/TIMER_NUMBER_PER_SECOND,
				SINGLE, lcore_id, power_timer_cb, NULL);

	stats[lcore_id].nb_rx_processed = 0;
	stats[lcore_id].nb_iteration_looped = 0;

	stats[lcore_id].sleep_time = 0;
}
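
/*
 * Worked example for the callback above (hypothetical numbers): if an lcore
 * accumulated sleep_time = 30000 us during the 100000 us SCALING_PERIOD,
 * sleep_time_ratio = 0.3 >= SCALING_DOWN_TIME_RATIO_THRESHOLD (0.25), so the
 * core frequency is stepped down for the next period.
 */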

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint16_t port)
{
	uint32_t lcore_id;
	struct lcore_conf *qconf;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	rte_eth_tx_buffer(port, qconf->tx_queue_id[port],
			qconf->tx_buffer[port], m);

	return 0;
}

#ifdef DO_RFC_1812_CHECKS
static inline int
is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len)
{
	/* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */
	/*
	 * 1. The packet length reported by the Link Layer must be large
	 * enough to hold the minimum length legal IP datagram (20 bytes).
	 */
	if (link_len < sizeof(struct rte_ipv4_hdr))
		return -1;

	/* 2. The IP checksum must be correct. */
	/* if this is not checked in H/W, check it. */
	if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) {
		uint16_t actual_cksum, expected_cksum;
		actual_cksum = pkt->hdr_checksum;
		pkt->hdr_checksum = 0;
		expected_cksum = rte_ipv4_cksum(pkt);
		if (actual_cksum != expected_cksum)
			return -2;
	}

	/*
	 * 3. The IP version number must be 4. If the version number is not 4
	 * then the packet may be another version of IP, such as IPng or
	 * ST-II.
	 */
	if (((pkt->version_ihl) >> 4) != 4)
		return -3;
	/*
	 * 4. The IP header length field must be large enough to hold the
	 * minimum length legal IP datagram (20 bytes = 5 words).
	 */
	if ((pkt->version_ihl & 0xf) < 5)
		return -4;

	/*
	 * 5. The IP total length field must be large enough to hold the IP
	 * datagram header, whose length is specified in the IP header length
	 * field.
	 */
	if (rte_be_to_cpu_16(pkt->total_length) < sizeof(struct rte_ipv4_hdr))
		return -5;

	return 0;
}
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
static void
print_ipv4_key(struct ipv4_5tuple key)
{
	printf("IP dst = %08x, IP src = %08x, port dst = %d, port src = %d, "
		"proto = %d\n", (unsigned)key.ip_dst, (unsigned)key.ip_src,
		key.port_dst, key.port_src, key.proto);
}

static void
print_ipv6_key(struct ipv6_5tuple key)
{
	printf("IP dst = " IPv6_BYTES_FMT ", IP src = " IPv6_BYTES_FMT ", "
		"port dst = %d, port src = %d, proto = %d\n",
		IPv6_BYTES(key.ip_dst), IPv6_BYTES(key.ip_src),
		key.port_dst, key.port_src, key.proto);
}

static inline uint16_t
get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid,
		lookup_struct_t *ipv4_l3fwd_lookup_struct)
{
	struct ipv4_5tuple key;
	struct rte_tcp_hdr *tcp;
	struct rte_udp_hdr *udp;
	int ret = 0;

	key.ip_dst = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
	key.ip_src = rte_be_to_cpu_32(ipv4_hdr->src_addr);
	key.proto = ipv4_hdr->next_proto_id;

	switch (ipv4_hdr->next_proto_id) {
	case IPPROTO_TCP:
		tcp = (struct rte_tcp_hdr *)((unsigned char *)ipv4_hdr +
					sizeof(struct rte_ipv4_hdr));
		key.port_dst = rte_be_to_cpu_16(tcp->dst_port);
		key.port_src = rte_be_to_cpu_16(tcp->src_port);
		break;

	case IPPROTO_UDP:
		udp = (struct rte_udp_hdr *)((unsigned char *)ipv4_hdr +
					sizeof(struct rte_ipv4_hdr));
		key.port_dst = rte_be_to_cpu_16(udp->dst_port);
		key.port_src = rte_be_to_cpu_16(udp->src_port);
		break;

	default:
		key.port_dst = 0;
		key.port_src = 0;
		break;
	}

	/* Find destination port */
	ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key);
	return ((ret < 0) ? portid : ipv4_l3fwd_out_if[ret]);
}
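
/*
 * Example lookup against the route table above (values taken from
 * ipv4_l3fwd_route_array[0]): a TCP packet with dst 100.10.0.1,
 * src 200.10.0.1, dst port 101 and src port 11 builds a key that matches
 * route 0, so the function returns if_out 0; any packet that misses the
 * hash is sent back out on the port it arrived on (portid).
 */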

static inline uint16_t
get_ipv6_dst_port(struct rte_ipv6_hdr *ipv6_hdr, uint16_t portid,
		lookup_struct_t *ipv6_l3fwd_lookup_struct)
{
	struct ipv6_5tuple key;
	struct rte_tcp_hdr *tcp;
	struct rte_udp_hdr *udp;
	int ret = 0;

	memcpy(key.ip_dst, ipv6_hdr->dst_addr, IPV6_ADDR_LEN);
	memcpy(key.ip_src, ipv6_hdr->src_addr, IPV6_ADDR_LEN);

	key.proto = ipv6_hdr->proto;

	switch (ipv6_hdr->proto) {
	case IPPROTO_TCP:
		tcp = (struct rte_tcp_hdr *)((unsigned char *)ipv6_hdr +
					sizeof(struct rte_ipv6_hdr));
		key.port_dst = rte_be_to_cpu_16(tcp->dst_port);
		key.port_src = rte_be_to_cpu_16(tcp->src_port);
		break;

	case IPPROTO_UDP:
		udp = (struct rte_udp_hdr *)((unsigned char *)ipv6_hdr +
					sizeof(struct rte_ipv6_hdr));
		key.port_dst = rte_be_to_cpu_16(udp->dst_port);
		key.port_src = rte_be_to_cpu_16(udp->src_port);
		break;

	default:
		key.port_dst = 0;
		key.port_src = 0;
		break;
	}

	/* Find destination port */
	ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key);
	return ((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
}
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
static inline uint16_t
get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid,
		lookup_struct_t *ipv4_l3fwd_lookup_struct)
{
	uint32_t next_hop;

	return ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
		rte_be_to_cpu_32(ipv4_hdr->dst_addr), &next_hop) == 0) ?
		next_hop : portid);
}
#endif
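
/*
 * Example for the LPM path (routes from ipv4_l3fwd_route_array above): a
 * packet with destination 1.1.1.37 matches the 1.1.1.0/24 rule, so
 * rte_lpm_lookup() returns 0 with next_hop = 0 and the packet egresses on
 * port 0; a destination such as 9.9.9.9 matches no rule and the packet is
 * returned to its input port.
 */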

static inline void
parse_ptype_one(struct rte_mbuf *m)
{
	struct rte_ether_hdr *eth_hdr;
	uint32_t packet_type = RTE_PTYPE_UNKNOWN;
	uint16_t ether_type;

	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	ether_type = eth_hdr->ether_type;
	if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4))
		packet_type |= RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
	else if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6))
		packet_type |= RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;

	m->packet_type = packet_type;
}

static uint16_t
cb_parse_ptype(uint16_t port __rte_unused, uint16_t queue __rte_unused,
		struct rte_mbuf *pkts[], uint16_t nb_pkts,
		uint16_t max_pkts __rte_unused,
		void *user_param __rte_unused)
{
	unsigned int i;

	for (i = 0; i < nb_pkts; ++i)
		parse_ptype_one(pkts[i]);

	return nb_pkts;
}

static int
add_cb_parse_ptype(uint16_t portid, uint16_t queueid)
{
	printf("Port %d: parsing packet type info in software\n", portid);
	if (rte_eth_add_rx_callback(portid, queueid, cb_parse_ptype, NULL))
		return 0;

	printf("Failed to add rx callback: port=%d\n", portid);
	return -1;
}

static inline void
l3fwd_simple_forward(struct rte_mbuf *m, uint16_t portid,
		struct lcore_conf *qconf)
{
	struct rte_ether_hdr *eth_hdr;
	struct rte_ipv4_hdr *ipv4_hdr;
	void *d_addr_bytes;
	uint16_t dst_port;

	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) {
		/* Handle IPv4 headers.*/
		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
						sizeof(struct rte_ether_hdr));

#ifdef DO_RFC_1812_CHECKS
		/* Check to make sure the packet is valid (RFC1812) */
		if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) {
			rte_pktmbuf_free(m);
			return;
		}
#endif

		dst_port = get_ipv4_dst_port(ipv4_hdr, portid,
					qconf->ipv4_lookup_struct);
		if (dst_port >= RTE_MAX_ETHPORTS ||
				(enabled_port_mask & 1 << dst_port) == 0)
			dst_port = portid;

		/* 02:00:00:00:00:xx */
		d_addr_bytes = &eth_hdr->dst_addr.addr_bytes[0];
		*((uint64_t *)d_addr_bytes) =
			0x000000000002 + ((uint64_t)dst_port << 40);

#ifdef DO_RFC_1812_CHECKS
		/* Update time to live and header checksum */
		--(ipv4_hdr->time_to_live);
		++(ipv4_hdr->hdr_checksum);
#endif

		/* src addr */
		rte_ether_addr_copy(&ports_eth_addr[dst_port],
				&eth_hdr->src_addr);

		send_single_packet(m, dst_port);
	} else if (RTE_ETH_IS_IPV6_HDR(m->packet_type)) {
		/* Handle IPv6 headers.*/
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
		struct rte_ipv6_hdr *ipv6_hdr;

		ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
						sizeof(struct rte_ether_hdr));

		dst_port = get_ipv6_dst_port(ipv6_hdr, portid,
					qconf->ipv6_lookup_struct);

		if (dst_port >= RTE_MAX_ETHPORTS ||
				(enabled_port_mask & 1 << dst_port) == 0)
			dst_port = portid;

		/* 02:00:00:00:00:xx */
		d_addr_bytes = &eth_hdr->dst_addr.addr_bytes[0];
		*((uint64_t *)d_addr_bytes) =
			0x000000000002 + ((uint64_t)dst_port << 40);

		/* src addr */
		rte_ether_addr_copy(&ports_eth_addr[dst_port],
				&eth_hdr->src_addr);

		send_single_packet(m, dst_port);
#else
		/* We don't currently handle IPv6 packets in LPM mode. */
		rte_pktmbuf_free(m);
#endif
	} else
		rte_pktmbuf_free(m);
}
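
/*
 * The destination MAC rewrite used above, worked through for dst_port = 3:
 * the 64-bit store writes 0x0000030000000002 over the 6-byte destination MAC
 * (the two bytes that spill into the source MAC are overwritten immediately
 * afterwards by rte_ether_addr_copy()), yielding 02:00:00:00:00:03 in wire
 * order on a little-endian CPU.
 */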

#define MINIMUM_SLEEP_TIME 1
#define SUSPEND_THRESHOLD 300

static inline uint32_t
power_idle_heuristic(uint32_t zero_rx_packet_count)
{
	/* If the zero-poll count is below the suspend threshold, sleep 1 us */
	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
		return MINIMUM_SLEEP_TIME;
	/*
	 * Otherwise request a longer sleep of SUSPEND_THRESHOLD us, enough
	 * to cover the latency of switching from C3/C6 back to C0.
	 */
	else
		return SUSPEND_THRESHOLD;
}

static inline enum freq_scale_hint_t
power_freq_scaleup_heuristic(unsigned lcore_id,
		uint16_t port_id,
		uint16_t queue_id)
{
	uint32_t rxq_count = rte_eth_rx_queue_count(port_id, queue_id);
	/*
	 * The HW Rx queue size is 128 by default, and an Rx burst reads at
	 * most 32 entries per iteration.
	 */
#define FREQ_GEAR1_RX_PACKET_THRESHOLD	MAX_PKT_BURST
#define FREQ_GEAR2_RX_PACKET_THRESHOLD	(MAX_PKT_BURST*2)
#define FREQ_GEAR3_RX_PACKET_THRESHOLD	(MAX_PKT_BURST*3)
#define FREQ_UP_TREND1_ACC	1
#define FREQ_UP_TREND2_ACC	100
#define FREQ_UP_THRESHOLD	10000

	if (likely(rxq_count > FREQ_GEAR3_RX_PACKET_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHEST;
	} else if (likely(rxq_count > FREQ_GEAR2_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND2_ACC;
	else if (likely(rxq_count > FREQ_GEAR1_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND1_ACC;

	if (likely(stats[lcore_id].trend > FREQ_UP_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHER;
	}

	return FREQ_CURRENT;
}
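
/*
 * Worked example for the scale-up heuristic (hypothetical load): with ~40
 * packets pending per poll (between GEAR1 = 32 and GEAR2 = 64), each poll
 * adds FREQ_UP_TREND1_ACC = 1 to the trend counter, so FREQ_UP_THRESHOLD
 * (10000) is crossed after roughly 10000 such polls and FREQ_HIGHER is
 * returned; with ~70 packets pending (above GEAR2 = 64) each poll adds 100,
 * crossing the threshold after roughly 100 polls; anything above GEAR3 = 96
 * requests FREQ_HIGHEST immediately.
 */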

/**
 * Force the polling thread to sleep until a one-shot Rx interrupt triggers.
 * @param num
 *   Number of Rx queues to wait on.
 * @param lcore
 *   Lcore index used to track wakeup state.
 * @return
 *   0 on success
 */
static int
sleep_until_rx_interrupt(int num, int lcore)
{
	/*
	 * we want to track when we are woken up by traffic so that we can go
	 * back to sleep again without log spamming. Avoid cache line sharing
	 * to prevent threads stepping on each others' toes.
	 */
	static alignas(RTE_CACHE_LINE_SIZE) struct {
		bool wakeup;
	} status[RTE_MAX_LCORE];
	struct rte_epoll_event event[num];
	int n, i;
	uint16_t port_id;
	uint16_t queue_id;
	void *data;

	if (status[lcore].wakeup) {
		RTE_LOG(INFO, L3FWD_POWER,
			"lcore %u sleeps until interrupt triggers\n",
			rte_lcore_id());
	}

	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, 10);
	for (i = 0; i < n; i++) {
		data = event[i].epdata.data;
		port_id = ((uintptr_t)data) >> (sizeof(uint16_t) * CHAR_BIT);
		queue_id = ((uintptr_t)data) &
			RTE_LEN2MASK((sizeof(uint16_t) * CHAR_BIT), uint16_t);
		RTE_LOG(INFO, L3FWD_POWER,
			"lcore %u is woken up from rx interrupt on"
			" port %d queue %d\n",
			rte_lcore_id(), port_id, queue_id);
	}
	status[lcore].wakeup = n != 0;

	return 0;
}

static void turn_on_off_intr(struct lcore_conf *qconf, bool on)
{
	int i;
	struct lcore_rx_queue *rx_queue;
	uint16_t queue_id;
	uint16_t port_id;

	for (i = 0; i < qconf->n_rx_queue; ++i) {
		rx_queue = &(qconf->rx_queue_list[i]);
		port_id = rx_queue->port_id;
		queue_id = rx_queue->queue_id;

		rte_spinlock_lock(&(locks[port_id]));
		if (on)
			rte_eth_dev_rx_intr_enable(port_id, queue_id);
		else
			rte_eth_dev_rx_intr_disable(port_id, queue_id);
		rte_spinlock_unlock(&(locks[port_id]));
	}
}

static int event_register(struct lcore_conf *qconf)
{
	struct lcore_rx_queue *rx_queue;
	uint16_t queueid;
	uint16_t portid;
	uint32_t data;
	int ret;
	int i;

	for (i = 0; i < qconf->n_rx_queue; ++i) {
		rx_queue = &(qconf->rx_queue_list[i]);
		portid = rx_queue->port_id;
		queueid = rx_queue->queue_id;
		data = portid << (sizeof(uint16_t) * CHAR_BIT) | queueid;

		ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
						RTE_EPOLL_PER_THREAD,
						RTE_INTR_EVENT_ADD,
						(void *)((uintptr_t)data));
		if (ret)
			return ret;
	}

	return 0;
}
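
/*
 * The event user data registered above packs (port, queue) into one word:
 * data = portid << 16 | queueid. For example, port 1 / queue 3 is stored as
 * 0x00010003, and sleep_until_rx_interrupt() recovers the pair with a 16-bit
 * shift and an RTE_LEN2MASK(16, uint16_t) mask.
 */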

/* Main processing loop. 8< */
static int main_intr_loop(__rte_unused void *dummy)
{
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	unsigned int lcore_id;
	uint64_t prev_tsc, diff_tsc, cur_tsc;
	int i, j, nb_rx;
	uint16_t portid, queueid;
	struct lcore_conf *qconf;
	struct lcore_rx_queue *rx_queue;
	uint32_t lcore_rx_idle_count = 0;
	uint32_t lcore_idle_hint = 0;
	int intr_en = 0;

	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
				   US_PER_S * BURST_TX_DRAIN_US;

	prev_tsc = 0;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	if (qconf->n_rx_queue == 0) {
		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n",
			lcore_id);
		return 0;
	}

	RTE_LOG(INFO, L3FWD_POWER, "entering main interrupt loop on lcore %u\n",
		lcore_id);

	for (i = 0; i < qconf->n_rx_queue; i++) {
		portid = qconf->rx_queue_list[i].port_id;
		queueid = qconf->rx_queue_list[i].queue_id;
		RTE_LOG(INFO, L3FWD_POWER,
			" -- lcoreid=%u portid=%u rxqueueid=%" PRIu16 "\n",
			lcore_id, portid, queueid);
	}

	/* add into event wait list */
	if (event_register(qconf) == 0)
		intr_en = 1;
	else
		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't be enabled.\n");

	while (!is_done()) {
		stats[lcore_id].nb_iteration_looped++;

		cur_tsc = rte_rdtsc();

		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			for (i = 0; i < qconf->n_tx_port; ++i) {
				portid = qconf->tx_port_id[i];
				rte_eth_tx_buffer_flush(portid,
						qconf->tx_queue_id[portid],
						qconf->tx_buffer[portid]);
			}
			prev_tsc = cur_tsc;
		}

start_rx:
		/*
		 * Read packet from RX queues
		 */
		lcore_rx_idle_count = 0;
		for (i = 0; i < qconf->n_rx_queue; ++i) {
			rx_queue = &(qconf->rx_queue_list[i]);
			rx_queue->idle_hint = 0;
			portid = rx_queue->port_id;
			queueid = rx_queue->queue_id;

			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
					MAX_PKT_BURST);

			stats[lcore_id].nb_rx_processed += nb_rx;
			if (unlikely(nb_rx == 0)) {
				/*
				 * No packets received from this Rx queue;
				 * try to sleep for a while, forcing the CPU
				 * to enter deeper C-states.
				 */
				rx_queue->zero_rx_packet_count++;

				if (rx_queue->zero_rx_packet_count <=
						MIN_ZERO_POLL_COUNT)
					continue;

				rx_queue->idle_hint = power_idle_heuristic(
						rx_queue->zero_rx_packet_count);
				lcore_rx_idle_count++;
			} else {
				rx_queue->zero_rx_packet_count = 0;
			}

			/* Prefetch first packets */
			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j], void *));
			}

			/* Prefetch and forward already prefetched packets */
			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j + PREFETCH_OFFSET],
						void *));
				l3fwd_simple_forward(
						pkts_burst[j], portid, qconf);
			}

			/* Forward remaining prefetched packets */
			for (; j < nb_rx; j++) {
				l3fwd_simple_forward(
						pkts_burst[j], portid, qconf);
			}
		}

		if (unlikely(lcore_rx_idle_count == qconf->n_rx_queue)) {
			/*
			 * All Rx queues were empty in recent consecutive
			 * polls; sleep conservatively, i.e. as little as
			 * possible.
			 */
			for (i = 1,
			    lcore_idle_hint = qconf->rx_queue_list[0].idle_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->idle_hint < lcore_idle_hint)
					lcore_idle_hint = rx_queue->idle_hint;
			}

			if (lcore_idle_hint < SUSPEND_THRESHOLD)
				/*
				 * Execute the "pause" instruction instead of
				 * taking a context switch, which for a short
				 * sleep would generally cost hundreds of
				 * microseconds.
				 */
				rte_delay_us(lcore_idle_hint);
			else {
				/* suspend until rx interrupt triggers */
				if (intr_en) {
					turn_on_off_intr(qconf, 1);
					sleep_until_rx_interrupt(
							qconf->n_rx_queue,
							lcore_id);
					turn_on_off_intr(qconf, 0);
					/*
					 * start receiving packets immediately
					 */
					if (likely(!is_done()))
						goto start_rx;
				}
			}
			stats[lcore_id].sleep_time += lcore_idle_hint;
		}
	}

	return 0;
}
/* >8 End of main processing loop. */
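
/*
 * The idle path above, summarized (no new behaviour, just the flow): poll ->
 * all queues idle -> short rte_delay_us() while the idle hint stays below
 * SUSPEND_THRESHOLD -> once the hint reaches the threshold, enable Rx
 * interrupts, block in rte_epoll_wait() via sleep_until_rx_interrupt(),
 * disable interrupts on wakeup, and jump straight back to polling.
 */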

/* main processing loop */
static int
main_telemetry_loop(__rte_unused void *dummy)
{
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	unsigned int lcore_id;
	uint64_t prev_tsc, diff_tsc, cur_tsc, prev_tel_tsc;
	int i, j, nb_rx;
	uint16_t portid, queueid;
	struct lcore_conf *qconf;
	struct lcore_rx_queue *rx_queue;
	uint64_t ep_nep[2] = {0}, fp_nfp[2] = {0};
	uint64_t poll_count;
	enum busy_rate br;

	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
				   US_PER_S * BURST_TX_DRAIN_US;

	poll_count = 0;
	prev_tsc = 0;
	prev_tel_tsc = 0;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	if (qconf->n_rx_queue == 0) {
		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n",
			lcore_id);
		return 0;
	}

	RTE_LOG(INFO, L3FWD_POWER, "entering main telemetry loop on lcore %u\n",
		lcore_id);

	for (i = 0; i < qconf->n_rx_queue; i++) {
		portid = qconf->rx_queue_list[i].port_id;
		queueid = qconf->rx_queue_list[i].queue_id;
		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u "
			"rxqueueid=%" PRIu16 "\n", lcore_id, portid, queueid);
	}

	while (!is_done()) {

		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			for (i = 0; i < qconf->n_tx_port; ++i) {
				portid = qconf->tx_port_id[i];
				rte_eth_tx_buffer_flush(portid,
					qconf->tx_queue_id[portid],
					qconf->tx_buffer[portid]);
			}
			prev_tsc = cur_tsc;
		}

		/*
		 * Read packet from RX queues
		 */
		for (i = 0; i < qconf->n_rx_queue; ++i) {
			rx_queue = &(qconf->rx_queue_list[i]);
			portid = rx_queue->port_id;
			queueid = rx_queue->queue_id;

			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
					MAX_PKT_BURST);
			ep_nep[nb_rx == 0]++;
			fp_nfp[nb_rx == MAX_PKT_BURST]++;
			poll_count++;
			if (unlikely(nb_rx == 0))
				continue;

			/* Prefetch first packets */
			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j], void *));
			}

			/* Prefetch and forward already prefetched packets */
			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
						j + PREFETCH_OFFSET], void *));
				l3fwd_simple_forward(pkts_burst[j], portid,
						qconf);
			}

			/* Forward remaining prefetched packets */
			for (; j < nb_rx; j++) {
				l3fwd_simple_forward(pkts_burst[j], portid,
						qconf);
			}
		}
		if (unlikely(poll_count >= DEFAULT_COUNT)) {
			diff_tsc = cur_tsc - prev_tel_tsc;
			if (diff_tsc >= MAX_CYCLES) {
				br = FULL;
			} else if (diff_tsc > MIN_CYCLES &&
					diff_tsc < MAX_CYCLES) {
				br = (diff_tsc * 100) / MAX_CYCLES;
			} else {
				br = ZERO;
			}
			poll_count = 0;
			prev_tel_tsc = cur_tsc;
			/* update stats for telemetry */
			rte_spinlock_lock(&stats[lcore_id].telemetry_lock);
			stats[lcore_id].ep_nep[0] = ep_nep[0];
			stats[lcore_id].ep_nep[1] = ep_nep[1];
			stats[lcore_id].fp_nfp[0] = fp_nfp[0];
			stats[lcore_id].fp_nfp[1] = fp_nfp[1];
			stats[lcore_id].br = br;
			rte_spinlock_unlock(&stats[lcore_id].telemetry_lock);
		}
	}

	return 0;
}
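
/*
 * Illustration of the snapshot above (hypothetical numbers): after a window
 * of 10000 polls in which 9000 bursts returned no packets and 200 returned a
 * full MAX_PKT_BURST, ep_nep = {1000, 9000} and fp_nfp = {9800, 200}; these
 * counters feed the "empty_poll", "full_poll" and "busy_percent" metrics
 * declared in telstats_strings.
 */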

/* main processing loop */
static int
main_legacy_loop(__rte_unused void *dummy)
{
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	unsigned lcore_id;
	uint64_t prev_tsc, diff_tsc, cur_tsc, tim_res_tsc, hz;
	uint64_t prev_tsc_power = 0, cur_tsc_power, diff_tsc_power;
	int i, j, nb_rx;
	uint16_t portid, queueid;
	struct lcore_conf *qconf;
	struct lcore_rx_queue *rx_queue;
	enum freq_scale_hint_t lcore_scaleup_hint;
	uint32_t lcore_rx_idle_count = 0;
	uint32_t lcore_idle_hint = 0;
	int intr_en = 0;

	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
				   US_PER_S * BURST_TX_DRAIN_US;

	prev_tsc = 0;
	hz = rte_get_timer_hz();
	tim_res_tsc = hz/TIMER_NUMBER_PER_SECOND;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	if (qconf->n_rx_queue == 0) {
		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n",
			lcore_id);
		return 0;
	}

	RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n",
		lcore_id);

	for (i = 0; i < qconf->n_rx_queue; i++) {
		portid = qconf->rx_queue_list[i].port_id;
		queueid = qconf->rx_queue_list[i].queue_id;
		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u "
			"rxqueueid=%" PRIu16 "\n", lcore_id, portid, queueid);
	}

	/* add into event wait list */
	if (event_register(qconf) == 0)
		intr_en = 1;
	else
		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't be enabled.\n");

	while (!is_done()) {
		stats[lcore_id].nb_iteration_looped++;

		cur_tsc = rte_rdtsc();
		cur_tsc_power = cur_tsc;

		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			for (i = 0; i < qconf->n_tx_port; ++i) {
				portid = qconf->tx_port_id[i];
				rte_eth_tx_buffer_flush(portid,
						qconf->tx_queue_id[portid],
						qconf->tx_buffer[portid]);
			}
			prev_tsc = cur_tsc;
		}

		diff_tsc_power = cur_tsc_power - prev_tsc_power;
		if (diff_tsc_power > tim_res_tsc) {
			rte_timer_manage();
			prev_tsc_power = cur_tsc_power;
		}

start_rx:
		/*
		 * Read packet from RX queues
		 */
		lcore_scaleup_hint = FREQ_CURRENT;
		lcore_rx_idle_count = 0;
		for (i = 0; i < qconf->n_rx_queue; ++i) {
			rx_queue = &(qconf->rx_queue_list[i]);
			rx_queue->idle_hint = 0;
			portid = rx_queue->port_id;
			queueid = rx_queue->queue_id;

			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
					MAX_PKT_BURST);

			stats[lcore_id].nb_rx_processed += nb_rx;
			if (unlikely(nb_rx == 0)) {
				/*
				 * No packets received from this Rx queue;
				 * try to sleep for a while, forcing the CPU
				 * to enter deeper C-states.
				 */
				rx_queue->zero_rx_packet_count++;

				if (rx_queue->zero_rx_packet_count <=
						MIN_ZERO_POLL_COUNT)
					continue;

				rx_queue->idle_hint = power_idle_heuristic(
						rx_queue->zero_rx_packet_count);
				lcore_rx_idle_count++;
			} else {
				rx_queue->zero_rx_packet_count = 0;

				/*
				 * Do not scale up the frequency immediately:
				 * user-to-kernel-space communication is
				 * costly and might impact packet I/O for the
				 * received packets.
				 */
				rx_queue->freq_up_hint =
					power_freq_scaleup_heuristic(lcore_id,
							portid, queueid);
			}

			/* Prefetch first packets */
			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j], void *));
			}

			/* Prefetch and forward already prefetched packets */
			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
						j + PREFETCH_OFFSET], void *));
				l3fwd_simple_forward(pkts_burst[j], portid,
						qconf);
			}

			/* Forward remaining prefetched packets */
			for (; j < nb_rx; j++) {
				l3fwd_simple_forward(pkts_burst[j], portid,
						qconf);
			}
		}

		if (likely(lcore_rx_idle_count != qconf->n_rx_queue)) {
			for (i = 1, lcore_scaleup_hint =
					qconf->rx_queue_list[0].freq_up_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->freq_up_hint >
						lcore_scaleup_hint)
					lcore_scaleup_hint =
						rx_queue->freq_up_hint;
			}

			if (lcore_scaleup_hint == FREQ_HIGHEST) {
				if (rte_power_freq_max)
					rte_power_freq_max(lcore_id);
			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
				if (rte_power_freq_up)
					rte_power_freq_up(lcore_id);
			}
		} else {
			/*
			 * All Rx queues were empty in recent consecutive
			 * polls; sleep conservatively, i.e. as little as
			 * possible.
			 */
			for (i = 1, lcore_idle_hint =
					qconf->rx_queue_list[0].idle_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->idle_hint < lcore_idle_hint)
					lcore_idle_hint = rx_queue->idle_hint;
			}

			if (lcore_idle_hint < SUSPEND_THRESHOLD)
				/*
				 * Execute the "pause" instruction instead of
				 * taking a context switch, which for a short
				 * sleep would generally cost hundreds of
				 * microseconds.
				 */
				rte_delay_us(lcore_idle_hint);
			else {
				/* suspend until rx interrupt triggers */
				if (intr_en) {
					turn_on_off_intr(qconf, 1);
					sleep_until_rx_interrupt(
							qconf->n_rx_queue,
							lcore_id);
					turn_on_off_intr(qconf, 0);
					/*
					 * start receiving packets immediately
					 */
					if (likely(!is_done()))
						goto start_rx;
				}
			}
			stats[lcore_id].sleep_time += lcore_idle_hint;
		}
	}

	return 0;
}

static int
check_lcore_params(void)
{
	uint16_t queue, i;
	uint32_t lcore;
	int socketid;

	for (i = 0; i < nb_lcore_params; ++i) {
		queue = lcore_params[i].queue_id;
		if (queue >= MAX_RX_QUEUE_PER_PORT) {
			printf("invalid queue number: %" PRIu16 "\n", queue);
			return -1;
		}
		lcore = lcore_params[i].lcore_id;
		if (!rte_lcore_is_enabled(lcore)) {
			printf("error: lcore %u is not enabled in lcore "
				"mask\n", lcore);
			return -1;
		}
		socketid = rte_lcore_to_socket_id(lcore);
		if (socketid != 0 && numa_on == 0) {
			printf("warning: lcore %u is on socket %d with numa "
				"off\n", lcore, socketid);
		}
		if (app_mode == APP_MODE_TELEMETRY && lcore == rte_lcore_id()) {
			printf("cannot enable main core %d in config for telemetry mode\n",
				rte_lcore_id());
			return -1;
		}
	}
	return 0;
}

static int
check_port_config(void)
{
	unsigned portid;
	uint16_t i;

	for (i = 0; i < nb_lcore_params; ++i) {
		portid = lcore_params[i].port_id;
		if ((enabled_port_mask & (1 << portid)) == 0) {
			printf("port %u is not enabled in port mask\n",
				portid);
			return -1;
		}
		if (!rte_eth_dev_is_valid_port(portid)) {
			printf("port %u is not present on the board\n",
				portid);
			return -1;
		}
	}
	return 0;
}

static uint16_t
get_port_n_rx_queues(const uint16_t port)
{
	int queue = -1;
	uint16_t i;

	for (i = 0; i < nb_lcore_params; ++i) {
		if (lcore_params[i].port_id == port &&
				lcore_params[i].queue_id > queue)
			queue = lcore_params[i].queue_id;
	}
	return (uint16_t)(++queue);
}

static int
init_lcore_rx_queues(void)
{
	uint16_t i, nb_rx_queue;
	uint32_t lcore;

	for (i = 0; i < nb_lcore_params; ++i) {
		lcore = lcore_params[i].lcore_id;
		nb_rx_queue = lcore_conf[lcore].n_rx_queue;
		if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) {
			printf("error: too many queues (%u) for lcore: %u\n",
				(unsigned int)nb_rx_queue + 1, lcore);
			return -1;
		} else {
			lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id =
				lcore_params[i].port_id;
			lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id =
				lcore_params[i].queue_id;
			lcore_conf[lcore].n_rx_queue++;
		}
	}
	return 0;
}
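
/*
 * Example of the default mapping (lcore_params_array_default above) after
 * init_lcore_rx_queues(): lcore 2 polls queues 0-2 on ports 0 and 1 plus
 * queue 0 on port 2 (seven Rx queues in total), while lcore 3 polls queues
 * 0-1 on port 3.
 */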

/* display usage */
static void
print_usage(const char *prgname)
{
	printf("%s [EAL options] -- -p PORTMASK -P"
		" [--config (port,queue,lcore)[,(port,queue,lcore)]]"
		" [--high-perf-cores CORELIST"
		" [--perf-config (port,queue,hi_perf,lcore_index)[,(port,queue,hi_perf,lcore_index)]]"
		" [--max-pkt-len PKTLEN]\n"
		" -p PORTMASK: hexadecimal bitmask of ports to configure\n"
		" -P: enable promiscuous mode\n"
		" -u: set min/max frequency for uncore to minimum value\n"
		" -U: set min/max frequency for uncore to maximum value\n"
		" -i (frequency index): set min/max frequency for uncore to specified frequency index\n"
		" --config (port,queue,lcore): rx queues configuration\n"
		" --high-perf-cores CORELIST: list of high performance cores\n"
		" --perf-config: similar to --config, but cores are specified as indices"
		" for bins containing high or regular performance cores\n"
		" --no-numa: optional, disable numa awareness\n"
		" --max-pkt-len PKTLEN: maximum packet length in decimal (64-9600)\n"
		" --parse-ptype: parse packet type by software\n"
		" --legacy: use legacy interrupt-based scaling\n"
		" --telemetry: enable telemetry mode, to update"
		" empty polls, full polls, and core busyness to telemetry\n"
		" --interrupt-only: enable interrupt-only mode\n"
		" --pmd-mgmt MODE: enable PMD power management mode. "
		"Currently supported modes: baseline, monitor, pause, scale\n"
		" --max-empty-polls MAX_EMPTY_POLLS: number of empty polls to"
		" wait before entering sleep state\n"
		" --pause-duration DURATION: set the duration, in microseconds,"
		" of the pause callback\n"
		" --scale-freq-min FREQ_MIN: set minimum frequency for scaling mode for"
		" all application lcores (FREQ_MIN must be in kHz, in increments of 100MHz)\n"
		" --scale-freq-max FREQ_MAX: set maximum frequency for scaling mode for"
		" all application lcores (FREQ_MAX must be in kHz, in increments of 100MHz)\n",
		prgname);
}
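
/*
 * Hypothetical invocation matching the usage text above (the binary name,
 * EAL arguments and port layout are illustrative only):
 *
 *   ./dpdk-l3fwd-power -l 1-3 -n 4 -- -p 0x3 -P \
 *       --config="(0,0,2),(1,0,3)" --telemetry
 *
 * This enables ports 0 and 1, turns on promiscuous mode, assigns port 0 /
 * queue 0 to lcore 2 and port 1 / queue 0 to lcore 3, and runs the
 * telemetry mode.
 */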

static int
parse_int(const char *opt)
{
	char *end = NULL;
	unsigned long val;

	/* parse integer string */
	val = strtoul(opt, &end, 10);
	if ((opt[0] == '\0') || (end == NULL) || (*end != '\0'))
		return -1;

	return val;
}

static int
parse_max_pkt_len(const char *pktlen)
{
	char *end = NULL;
	unsigned long len;

	/* parse decimal string */
	len = strtoul(pktlen, &end, 10);
	if ((pktlen[0] == '\0') || (end == NULL) || (*end != '\0'))
		return -1;

	if (len == 0)
		return -1;

	return len;
}

static int
parse_uncore_options(enum uncore_choice choice, const char *argument)
{
	unsigned int die, pkg, max_pkg, max_die;
	int ret = 0;

	ret = rte_power_set_uncore_env(RTE_UNCORE_PM_ENV_AUTO_DETECT);
	if (ret < 0) {
		RTE_LOG(INFO, L3FWD_POWER, "Failed to set uncore env\n");
		return ret;
	}

	max_pkg = rte_power_uncore_get_num_pkgs();
	if (max_pkg == 0)
		return -1;

	for (pkg = 0; pkg < max_pkg; pkg++) {
		max_die = rte_power_uncore_get_num_dies(pkg);
		if (max_die == 0)
			return -1;
		for (die = 0; die < max_die; die++) {
			ret = rte_power_uncore_init(pkg, die);
			if (ret == -1) {
				RTE_LOG(INFO, L3FWD_POWER,
					"Unable to initialize uncore for pkg %02u die %02u\n",
					pkg, die);
				return ret;
			}
			if (choice == UNCORE_MIN) {
				ret = rte_power_uncore_freq_min(pkg, die);
				if (ret == -1) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Unable to set the uncore min/max to minimum uncore frequency value for pkg %02u die %02u\n",
						pkg, die);
					return ret;
				}
			} else if (choice == UNCORE_MAX) {
				ret = rte_power_uncore_freq_max(pkg, die);
				if (ret == -1) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Unable to set uncore min/max to maximum uncore frequency value for pkg %02u die %02u\n",
						pkg, die);
					return ret;
				}
			} else if (choice == UNCORE_IDX) {
				char *ptr = NULL;
				int frequency_index = strtol(argument, &ptr, 10);
				if (argument == ptr) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Index given is not a valid number\n");
					return -1;
				}
				int freq_array_len = rte_power_uncore_get_num_freqs(pkg, die);
				if (frequency_index > freq_array_len - 1) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Frequency index given out of range, please choose a value from 0 to %d.\n",
						freq_array_len - 1);
					return -1;
				}
				ret = rte_power_set_uncore_freq(pkg, die, frequency_index);
				if (ret == -1) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Unable to set min/max uncore index value for pkg %02u die %02u\n",
						pkg, die);
					return ret;
				}
			} else {
				RTE_LOG(INFO, L3FWD_POWER,
					"Invalid uncore choice provided\n");
				return -1;
			}
		}
	}

	RTE_LOG(INFO, L3FWD_POWER, "Successfully set max/min/index uncore frequency.\n");
	return ret;
}

static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0'))
		return 0;

	return pm;
}

static int
parse_config(const char *q_arg)
{
	char s[256];
	const char *p, *p0 = q_arg;
	char *end;
	enum fieldnames {
		FLD_PORT = 0,
		FLD_QUEUE,
		FLD_LCORE,
		_NUM_FLD
	};
	unsigned long int_fld[_NUM_FLD];
	char *str_fld[_NUM_FLD];
	int i;
	unsigned size;
	unsigned int max_fld[_NUM_FLD] = {
		RTE_MAX_ETHPORTS,
		RTE_MAX_QUEUES_PER_PORT,
		RTE_MAX_LCORE
	};

	nb_lcore_params = 0;

	while ((p = strchr(p0, '(')) != NULL) {
		++p;
		if ((p0 = strchr(p, ')')) == NULL)
			return -1;

		size = p0 - p;
		if (size >= sizeof(s))
			return -1;

		snprintf(s, sizeof(s), "%.*s", size, p);
		if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') !=
				_NUM_FLD)
			return -1;
		for (i = 0; i < _NUM_FLD; i++) {
			errno = 0;
			int_fld[i] = strtoul(str_fld[i], &end, 0);
			if (errno != 0 || end == str_fld[i] ||
					int_fld[i] > max_fld[i])
				return -1;
		}
		if (nb_lcore_params >= MAX_LCORE_PARAMS) {
			printf("exceeded max number of lcore params: %hu\n",
				nb_lcore_params);
			return -1;
		}
		lcore_params_array[nb_lcore_params].port_id =
			(uint16_t)int_fld[FLD_PORT];
		lcore_params_array[nb_lcore_params].queue_id =
			(uint16_t)int_fld[FLD_QUEUE];
		lcore_params_array[nb_lcore_params].lcore_id =
			(uint32_t)int_fld[FLD_LCORE];
		++nb_lcore_params;
	}
	lcore_params = lcore_params_array;

	return 0;
}
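
/*
 * Example: parse_config("(0,0,2),(0,1,2)") fills lcore_params_array with two
 * entries, {port 0, queue 0, lcore 2} and {port 0, queue 1, lcore 2}, and
 * sets nb_lcore_params = 2; malformed input such as a missing ')' or a field
 * above its maximum makes it return -1.
 */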
"parse-ptype" 1737 #define CMD_LINE_OPT_LEGACY "legacy" 1738 #define CMD_LINE_OPT_INTERRUPT_ONLY "interrupt-only" 1739 #define CMD_LINE_OPT_TELEMETRY "telemetry" 1740 #define CMD_LINE_OPT_PMD_MGMT "pmd-mgmt" 1741 #define CMD_LINE_OPT_MAX_PKT_LEN "max-pkt-len" 1742 #define CMD_LINE_OPT_MAX_EMPTY_POLLS "max-empty-polls" 1743 #define CMD_LINE_OPT_PAUSE_DURATION "pause-duration" 1744 #define CMD_LINE_OPT_SCALE_FREQ_MIN "scale-freq-min" 1745 #define CMD_LINE_OPT_SCALE_FREQ_MAX "scale-freq-max" 1746 1747 /* Parse the argument given in the command line of the application */ 1748 static int 1749 parse_args(int argc, char **argv) 1750 { 1751 int opt, ret; 1752 char **argvopt; 1753 int option_index; 1754 char *prgname = argv[0]; 1755 static struct option lgopts[] = { 1756 {"config", 1, 0, 0}, 1757 {"perf-config", 1, 0, 0}, 1758 {"high-perf-cores", 1, 0, 0}, 1759 {"no-numa", 0, 0, 0}, 1760 {CMD_LINE_OPT_MAX_PKT_LEN, 1, 0, 0}, 1761 {CMD_LINE_OPT_PARSE_PTYPE, 0, 0, 0}, 1762 {CMD_LINE_OPT_LEGACY, 0, 0, 0}, 1763 {CMD_LINE_OPT_TELEMETRY, 0, 0, 0}, 1764 {CMD_LINE_OPT_INTERRUPT_ONLY, 0, 0, 0}, 1765 {CMD_LINE_OPT_PMD_MGMT, 1, 0, 0}, 1766 {CMD_LINE_OPT_MAX_EMPTY_POLLS, 1, 0, 0}, 1767 {CMD_LINE_OPT_PAUSE_DURATION, 1, 0, 0}, 1768 {CMD_LINE_OPT_SCALE_FREQ_MIN, 1, 0, 0}, 1769 {CMD_LINE_OPT_SCALE_FREQ_MAX, 1, 0, 0}, 1770 {NULL, 0, 0, 0} 1771 }; 1772 1773 argvopt = argv; 1774 1775 while ((opt = getopt_long(argc, argvopt, "p:PuUi:", 1776 lgopts, &option_index)) != EOF) { 1777 1778 switch (opt) { 1779 /* portmask */ 1780 case 'p': 1781 enabled_port_mask = parse_portmask(optarg); 1782 if (enabled_port_mask == 0) { 1783 printf("invalid portmask\n"); 1784 print_usage(prgname); 1785 return -1; 1786 } 1787 break; 1788 case 'P': 1789 printf("Promiscuous mode selected\n"); 1790 promiscuous_on = 1; 1791 break; 1792 case 'u': 1793 enabled_uncore = parse_uncore_options(UNCORE_MIN, NULL); 1794 if (enabled_uncore < 0) { 1795 print_usage(prgname); 1796 return -1; 1797 } 1798 break; 1799 case 'U': 1800 enabled_uncore = parse_uncore_options(UNCORE_MAX, NULL); 1801 if (enabled_uncore < 0) { 1802 print_usage(prgname); 1803 return -1; 1804 } 1805 break; 1806 case 'i': 1807 enabled_uncore = parse_uncore_options(UNCORE_IDX, optarg); 1808 if (enabled_uncore < 0) { 1809 print_usage(prgname); 1810 return -1; 1811 } 1812 break; 1813 /* long options */ 1814 case 0: 1815 if (!strncmp(lgopts[option_index].name, "config", 6)) { 1816 ret = parse_config(optarg); 1817 if (ret) { 1818 printf("invalid config\n"); 1819 print_usage(prgname); 1820 return -1; 1821 } 1822 } 1823 1824 if (!strncmp(lgopts[option_index].name, 1825 "perf-config", 11)) { 1826 ret = parse_perf_config(optarg); 1827 if (ret) { 1828 printf("invalid perf-config\n"); 1829 print_usage(prgname); 1830 return -1; 1831 } 1832 } 1833 1834 if (!strncmp(lgopts[option_index].name, 1835 "high-perf-cores", 15)) { 1836 ret = parse_perf_core_list(optarg); 1837 if (ret) { 1838 printf("invalid high-perf-cores\n"); 1839 print_usage(prgname); 1840 return -1; 1841 } 1842 } 1843 1844 if (!strncmp(lgopts[option_index].name, 1845 "no-numa", 7)) { 1846 printf("numa is disabled \n"); 1847 numa_on = 0; 1848 } 1849 1850 if (!strncmp(lgopts[option_index].name, 1851 CMD_LINE_OPT_LEGACY, 1852 sizeof(CMD_LINE_OPT_LEGACY))) { 1853 if (app_mode != APP_MODE_DEFAULT) { 1854 printf(" legacy mode is mutually exclusive with other modes\n"); 1855 return -1; 1856 } 1857 app_mode = APP_MODE_LEGACY; 1858 printf("legacy mode is enabled\n"); 1859 } 1860 1861 if (!strncmp(lgopts[option_index].name, 1862 

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_TELEMETRY,
					sizeof(CMD_LINE_OPT_TELEMETRY))) {
				if (app_mode != APP_MODE_DEFAULT) {
					printf(" telemetry mode is mutually exclusive with other modes\n");
					return -1;
				}
				app_mode = APP_MODE_TELEMETRY;
				printf("telemetry mode is enabled\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_PMD_MGMT,
					sizeof(CMD_LINE_OPT_PMD_MGMT))) {
				if (app_mode != APP_MODE_DEFAULT) {
					printf(" power mgmt mode is mutually exclusive with other modes\n");
					return -1;
				}
				if (parse_pmd_mgmt_config(optarg) < 0) {
					printf(" Invalid PMD power management mode: %s\n",
						optarg);
					return -1;
				}
				app_mode = APP_MODE_PMD_MGMT;
				printf("PMD power mgmt mode is enabled\n");
			}
			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_INTERRUPT_ONLY,
					sizeof(CMD_LINE_OPT_INTERRUPT_ONLY))) {
				if (app_mode != APP_MODE_DEFAULT) {
					printf(" interrupt-only mode is mutually exclusive with other modes\n");
					return -1;
				}
				app_mode = APP_MODE_INTERRUPT;
				printf("interrupt-only mode is enabled\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_MAX_PKT_LEN,
					sizeof(CMD_LINE_OPT_MAX_PKT_LEN))) {
				printf("Custom frame size is configured\n");
				max_pkt_len = parse_max_pkt_len(optarg);
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_PARSE_PTYPE,
					sizeof(CMD_LINE_OPT_PARSE_PTYPE))) {
				printf("soft parse-ptype is enabled\n");
				parse_ptype = 1;
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_MAX_EMPTY_POLLS,
					sizeof(CMD_LINE_OPT_MAX_EMPTY_POLLS))) {
				printf("Maximum empty polls configured\n");
				max_empty_polls = parse_int(optarg);
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_PAUSE_DURATION,
					sizeof(CMD_LINE_OPT_PAUSE_DURATION))) {
				printf("Pause duration configured\n");
				pause_duration = parse_int(optarg);
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_SCALE_FREQ_MIN,
					sizeof(CMD_LINE_OPT_SCALE_FREQ_MIN))) {
				printf("Scaling frequency minimum configured\n");
				scale_freq_min = parse_int(optarg);
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_SCALE_FREQ_MAX,
					sizeof(CMD_LINE_OPT_SCALE_FREQ_MAX))) {
				printf("Scaling frequency maximum configured\n");
				scale_freq_max = parse_int(optarg);
			}

			break;

		default:
			print_usage(prgname);
			return -1;
		}
	}

	if (optind >= 0)
		argv[optind-1] = prgname;

	ret = optind-1;
	optind = 1; /* reset getopt lib */
	return ret;
}

static void
print_ethaddr(const char *name, const struct rte_ether_addr *eth_addr)
{
	char buf[RTE_ETHER_ADDR_FMT_SIZE];
	rte_ether_format_addr(buf, RTE_ETHER_ADDR_FMT_SIZE, eth_addr);
	printf("%s%s", name, buf);
}

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
static void
setup_hash(int socketid)
{
	struct rte_hash_parameters ipv4_l3fwd_hash_params = {
		.name = NULL,
		.entries = L3FWD_HASH_ENTRIES,
		.key_len = sizeof(struct ipv4_5tuple),
		.hash_func = DEFAULT_HASH_FUNC,
		.hash_func_init_val = 0,
	};

	struct rte_hash_parameters ipv6_l3fwd_hash_params = {
		.name = NULL,
		.entries = L3FWD_HASH_ENTRIES,
		.key_len = sizeof(struct ipv6_5tuple),
		.hash_func = DEFAULT_HASH_FUNC,
		.hash_func_init_val = 0,
	};

	unsigned i;
	int ret;
	char s[64];

	/* create ipv4 hash */
	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
	ipv4_l3fwd_hash_params.name = s;
	ipv4_l3fwd_hash_params.socket_id = socketid;
	ipv4_l3fwd_lookup_struct[socketid] =
		rte_hash_create(&ipv4_l3fwd_hash_params);
	if (ipv4_l3fwd_lookup_struct[socketid] == NULL)
		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on "
				"socket %d\n", socketid);

	/* create ipv6 hash */
	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
	ipv6_l3fwd_hash_params.name = s;
	ipv6_l3fwd_hash_params.socket_id = socketid;
	ipv6_l3fwd_lookup_struct[socketid] =
		rte_hash_create(&ipv6_l3fwd_hash_params);
	if (ipv6_l3fwd_lookup_struct[socketid] == NULL)
		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on "
				"socket %d\n", socketid);

	/* populate the ipv4 hash */
	for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) {
		ret = rte_hash_add_key(ipv4_l3fwd_lookup_struct[socketid],
				(void *)&ipv4_l3fwd_route_array[i].key);
		if (ret < 0) {
			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
				"l3fwd hash on socket %d\n", i, socketid);
		}
		ipv4_l3fwd_out_if[ret] = ipv4_l3fwd_route_array[i].if_out;
		printf("Hash: Adding key\n");
		print_ipv4_key(ipv4_l3fwd_route_array[i].key);
	}

	/* populate the ipv6 hash */
	for (i = 0; i < RTE_DIM(ipv6_l3fwd_route_array); i++) {
		ret = rte_hash_add_key(ipv6_l3fwd_lookup_struct[socketid],
				(void *)&ipv6_l3fwd_route_array[i].key);
		if (ret < 0) {
			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
				"l3fwd hash on socket %d\n", i, socketid);
		}
		ipv6_l3fwd_out_if[ret] = ipv6_l3fwd_route_array[i].if_out;
		printf("Hash: Adding key\n");
		print_ipv6_key(ipv6_l3fwd_route_array[i].key);
	}
}
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
static void
setup_lpm(int socketid)
{
	unsigned i;
	int ret;
	char s[64];

	/* create the LPM table */
	struct rte_lpm_config lpm_ipv4_config;

	lpm_ipv4_config.max_rules = IPV4_L3FWD_LPM_MAX_RULES;
	lpm_ipv4_config.number_tbl8s = 256;
	lpm_ipv4_config.flags = 0;

	snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid);
	ipv4_l3fwd_lookup_struct[socketid] =
		rte_lpm_create(s, socketid, &lpm_ipv4_config);
	if (ipv4_l3fwd_lookup_struct[socketid] == NULL)
		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table"
				" on socket %d\n", socketid);

	/* populate the LPM table */
	for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) {
		ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid],
			ipv4_l3fwd_route_array[i].ip,
			ipv4_l3fwd_route_array[i].depth,
			ipv4_l3fwd_route_array[i].if_out);

		if (ret < 0) {
			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
				"l3fwd LPM table on socket %d\n",
				i, socketid);
		}

		printf("LPM: Adding route 0x%08x / %d (%d)\n",
			(unsigned)ipv4_l3fwd_route_array[i].ip,
			ipv4_l3fwd_route_array[i].depth,
			ipv4_l3fwd_route_array[i].if_out);
	}
}
#endif
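
/*
 * Example of the output produced above for the first route:
 *   LPM: Adding route 0x01010100 / 24 (0)
 * since RTE_IPV4(1,1,1,0) == 0x01010100.
 */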
#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
static void
setup_lpm(int socketid)
{
	unsigned i;
	int ret;
	char s[64];

	/* create the LPM table */
	struct rte_lpm_config lpm_ipv4_config;

	lpm_ipv4_config.max_rules = IPV4_L3FWD_LPM_MAX_RULES;
	lpm_ipv4_config.number_tbl8s = 256;
	lpm_ipv4_config.flags = 0;

	snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid);
	ipv4_l3fwd_lookup_struct[socketid] =
			rte_lpm_create(s, socketid, &lpm_ipv4_config);
	if (ipv4_l3fwd_lookup_struct[socketid] == NULL)
		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table"
				" on socket %d\n", socketid);

	/* populate the LPM table */
	for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) {
		ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid],
			ipv4_l3fwd_route_array[i].ip,
			ipv4_l3fwd_route_array[i].depth,
			ipv4_l3fwd_route_array[i].if_out);

		if (ret < 0) {
			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
				"l3fwd LPM table on socket %d\n",
				i, socketid);
		}

		printf("LPM: Adding route 0x%08x / %d (%d)\n",
			(unsigned)ipv4_l3fwd_route_array[i].ip,
			ipv4_l3fwd_route_array[i].depth,
			ipv4_l3fwd_route_array[i].if_out);
	}
}
#endif

static int
init_mem(unsigned nb_mbuf)
{
	struct lcore_conf *qconf;
	int socketid;
	unsigned lcore_id;
	char s[64];

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		if (rte_lcore_is_enabled(lcore_id) == 0)
			continue;

		if (numa_on)
			socketid = rte_lcore_to_socket_id(lcore_id);
		else
			socketid = 0;

		if (socketid >= NB_SOCKETS) {
			rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is "
					"out of range %d\n", socketid,
					lcore_id, NB_SOCKETS);
		}
		if (pktmbuf_pool[socketid] == NULL) {
			snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
			pktmbuf_pool[socketid] =
				rte_pktmbuf_pool_create(s, nb_mbuf,
					MEMPOOL_CACHE_SIZE, 0,
					RTE_MBUF_DEFAULT_BUF_SIZE,
					socketid);
			if (pktmbuf_pool[socketid] == NULL)
				rte_exit(EXIT_FAILURE,
					"Cannot init mbuf pool on socket %d\n",
					socketid);
			else
				printf("Allocated mbuf pool on socket %d\n",
					socketid);

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
			setup_lpm(socketid);
#else
			setup_hash(socketid);
#endif
		}
		qconf = &lcore_conf[lcore_id];
		qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid];
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
		qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid];
#endif
	}
	return 0;
}

/* Check the link status of all ports for up to 9s, then print the final status */
static void
check_all_ports_link_status(uint32_t port_mask)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */
	uint8_t count, all_ports_up, print_flag = 0;
	uint16_t portid;
	struct rte_eth_link link;
	int ret;
	char link_status_text[RTE_ETH_LINK_MAX_STR_LEN];

	printf("\nChecking link status");
	fflush(stdout);
	for (count = 0; count <= MAX_CHECK_TIME; count++) {
		all_ports_up = 1;
		RTE_ETH_FOREACH_DEV(portid) {
			if ((port_mask & (1 << portid)) == 0)
				continue;
			memset(&link, 0, sizeof(link));
			ret = rte_eth_link_get_nowait(portid, &link);
			if (ret < 0) {
				all_ports_up = 0;
				if (print_flag == 1)
					printf("Port %u link get failed: %s\n",
						portid, rte_strerror(-ret));
				continue;
			}
			/* print link status if flag set */
			if (print_flag == 1) {
				rte_eth_link_to_str(link_status_text,
					sizeof(link_status_text), &link);
				printf("Port %d %s\n", portid,
				       link_status_text);
				continue;
			}
			/* clear all_ports_up flag if any link down */
			if (link.link_status == RTE_ETH_LINK_DOWN) {
				all_ports_up = 0;
				break;
			}
		}
		/* after finally printing all link status, get out */
		if (print_flag == 1)
			break;

		if (all_ports_up == 0) {
			printf(".");
			fflush(stdout);
			rte_delay_ms(CHECK_INTERVAL);
		}

		/* set the print_flag if all ports up or timeout */
		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
			print_flag = 1;
			printf("done\n");
		}
	}
}
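/*
 * check_ptype() below verifies that the PMD can classify L3 packet types
 * in hardware. When it cannot, the --parse-ptype option installs an RX
 * callback (add_cb_parse_ptype(), defined elsewhere in this file) that
 * fills mbuf->packet_type in software. A minimal sketch of such a
 * callback, assuming IPv4/IPv6 over plain Ethernet (the real callback may
 * differ):
 *
 *   static uint16_t
 *   cb_parse_ptype(uint16_t port, uint16_t queue,
 *                  struct rte_mbuf *pkts[], uint16_t nb_pkts,
 *                  uint16_t max_pkts, void *user_param)
 *   {
 *       uint16_t i;
 *
 *       for (i = 0; i < nb_pkts; i++) {
 *           struct rte_ether_hdr *eth =
 *               rte_pktmbuf_mtod(pkts[i], struct rte_ether_hdr *);
 *           if (eth->ether_type ==
 *                   rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4))
 *               pkts[i]->packet_type = RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
 *           else if (eth->ether_type ==
 *                   rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6))
 *               pkts[i]->packet_type = RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
 *       }
 *       return nb_pkts;
 *   }
 */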
static int check_ptype(uint16_t portid)
{
	int i, ret;
	int ptype_l3_ipv4 = 0;
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
	int ptype_l3_ipv6 = 0;
#endif
	uint32_t ptype_mask = RTE_PTYPE_L3_MASK;

	ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, NULL, 0);
	if (ret <= 0)
		return 0;

	uint32_t ptypes[ret];

	ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, ptypes, ret);
	for (i = 0; i < ret; ++i) {
		if (ptypes[i] & RTE_PTYPE_L3_IPV4)
			ptype_l3_ipv4 = 1;
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
		if (ptypes[i] & RTE_PTYPE_L3_IPV6)
			ptype_l3_ipv6 = 1;
#endif
	}

	if (ptype_l3_ipv4 == 0)
		printf("port %d cannot parse RTE_PTYPE_L3_IPV4\n", portid);

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
	if (ptype_l3_ipv6 == 0)
		printf("port %d cannot parse RTE_PTYPE_L3_IPV6\n", portid);
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
	if (ptype_l3_ipv4)
#else /* APP_LOOKUP_EXACT_MATCH */
	if (ptype_l3_ipv4 && ptype_l3_ipv6)
#endif
		return 1;

	return 0;
}

static int
init_power_library(void)
{
	enum power_management_env env;
	unsigned int lcore_id;
	int ret = 0;

	RTE_LCORE_FOREACH(lcore_id) {
		/* init power management library */
		ret = rte_power_init(lcore_id);
		if (ret) {
			RTE_LOG(ERR, L3FWD_POWER,
				"Library initialization failed on core %u\n",
				lcore_id);
			return ret;
		}
		/* we're not supporting the VM channel mode */
		env = rte_power_get_env();
		if (env != PM_ENV_ACPI_CPUFREQ &&
				env != PM_ENV_PSTATE_CPUFREQ &&
				env != PM_ENV_AMD_PSTATE_CPUFREQ &&
				env != PM_ENV_CPPC_CPUFREQ) {
			RTE_LOG(ERR, L3FWD_POWER,
				"Only ACPI, PSTATE, AMD-PSTATE and CPPC modes are supported\n");
			return -1;
		}
	}
	return ret;
}

static int
deinit_power_library(void)
{
	unsigned int lcore_id, max_pkg, max_die, die, pkg;
	int ret = 0;

	RTE_LCORE_FOREACH(lcore_id) {
		/* deinit power management library */
		ret = rte_power_exit(lcore_id);
		if (ret) {
			RTE_LOG(ERR, L3FWD_POWER,
				"Library deinitialization failed on core %u\n",
				lcore_id);
			return ret;
		}
	}

	/* if uncore option was set */
	if (enabled_uncore == 0) {
		max_pkg = rte_power_uncore_get_num_pkgs();
		if (max_pkg == 0)
			return -1;
		for (pkg = 0; pkg < max_pkg; pkg++) {
			max_die = rte_power_uncore_get_num_dies(pkg);
			if (max_die == 0)
				return -1;
			for (die = 0; die < max_die; die++) {
				ret = rte_power_uncore_exit(pkg, die);
				if (ret < 0) {
					RTE_LOG(ERR, L3FWD_POWER,
						"Failed to exit uncore for pkg %02u die %02u\n",
						pkg, die);
					return -1;
				}
			}
		}
	}
	return ret;
}

static void
get_current_stat_values(uint64_t *values)
{
	unsigned int lcore_id = rte_lcore_id();
	struct lcore_conf *qconf;
	uint64_t app_eps = 0, app_fps = 0, app_br = 0;
	uint64_t count = 0;

	RTE_LCORE_FOREACH_WORKER(lcore_id) {
		qconf = &lcore_conf[lcore_id];
		if (qconf->n_rx_queue == 0)
			continue;
		count++;
		rte_spinlock_lock(&stats[lcore_id].telemetry_lock);
		app_eps += stats[lcore_id].ep_nep[1];
		app_fps += stats[lcore_id].fp_nfp[1];
		app_br += stats[lcore_id].br;
		rte_spinlock_unlock(&stats[lcore_id].telemetry_lock);
	}

	if (count > 0) {
		values[0] = app_eps/count;
		values[1] = app_fps/count;
		values[2] = app_br/count;
	} else
		memset(values, 0, sizeof(uint64_t) * NUM_TELSTATS);
}

static void
update_telemetry(__rte_unused struct rte_timer *tim,
		__rte_unused void *arg)
{
	int ret;
	uint64_t values[NUM_TELSTATS] = {0};

	get_current_stat_values(values);
	ret = rte_metrics_update_values(RTE_METRICS_GLOBAL, telstats_index,
					values, RTE_DIM(values));
	if (ret < 0)
		RTE_LOG(WARNING, L3FWD_POWER, "failed to update metrics\n");
}
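/*
 * handle_app_stats() below backs the "/l3fwd-power/stats" telemetry
 * command registered in main(). An illustrative session with the standard
 * dpdk-telemetry.py client (field values are hypothetical):
 *
 *   --> /l3fwd-power/stats
 *   {"/l3fwd-power/stats": {"empty_poll": 1234, "full_poll": 5678,
 *       "busy_percent": 50}}
 */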
static int
handle_app_stats(const char *cmd __rte_unused,
		const char *params __rte_unused,
		struct rte_tel_data *d)
{
	uint64_t values[NUM_TELSTATS] = {0};
	uint32_t i;

	rte_tel_data_start_dict(d);
	get_current_stat_values(values);
	for (i = 0; i < NUM_TELSTATS; i++)
		rte_tel_data_add_dict_uint(d, telstats_strings[i].name,
				values[i]);
	return 0;
}

static void
telemetry_setup_timer(void)
{
	int lcore_id = rte_lcore_id();
	uint64_t hz = rte_get_timer_hz();
	uint64_t ticks;

	ticks = hz / TELEMETRY_INTERVALS_PER_SEC;
	rte_timer_reset_sync(&telemetry_timer,
			ticks,
			PERIODICAL,
			lcore_id,
			update_telemetry,
			NULL);
}

static int
launch_timer(unsigned int lcore_id)
{
	int64_t prev_tsc = 0, cur_tsc, diff_tsc, cycles_10ms;

	RTE_SET_USED(lcore_id);

	if (rte_get_main_lcore() != lcore_id) {
		rte_panic("timer on lcore:%d which is not main core:%d\n",
				lcore_id,
				rte_get_main_lcore());
	}

	RTE_LOG(INFO, L3FWD_POWER, "Bring up the Timer\n");

	telemetry_setup_timer();

	cycles_10ms = rte_get_timer_hz() / 100;

	while (!is_done()) {
		cur_tsc = rte_rdtsc();
		diff_tsc = cur_tsc - prev_tsc;
		if (diff_tsc > cycles_10ms) {
			rte_timer_manage();
			prev_tsc = cur_tsc;
			cycles_10ms = rte_get_timer_hz() / 100;
		}
	}

	RTE_LOG(INFO, L3FWD_POWER, "Timer_subsystem is done\n");

	return 0;
}
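/*
 * Worked example of the timer arithmetic above, assuming a 2.0 GHz TSC:
 * telemetry_setup_timer() arms the periodic timer with
 * hz / TELEMETRY_INTERVALS_PER_SEC = 2e9 / 2 = 1e9 ticks (500 ms), while
 * launch_timer() calls rte_timer_manage() at most once every
 * hz / 100 = 2e7 ticks (10 ms), which comfortably bounds the callback
 * jitter relative to the 500 ms period.
 */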
static int
autodetect_mode(void)
{
	RTE_LOG(NOTICE, L3FWD_POWER, "Operating mode not specified, probing frequency scaling support...\n");

	/*
	 * Telemetry and PMD power management modes have to be specifically
	 * requested to be enabled, but we can auto-detect between interrupt
	 * mode with or without frequency scaling. Any of the ACPI, pstate,
	 * AMD pstate and CPPC environments can be used.
	 */
	if (rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ))
		return APP_MODE_LEGACY;
	if (rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ))
		return APP_MODE_LEGACY;
	if (rte_power_check_env_supported(PM_ENV_AMD_PSTATE_CPUFREQ))
		return APP_MODE_LEGACY;
	if (rte_power_check_env_supported(PM_ENV_CPPC_CPUFREQ))
		return APP_MODE_LEGACY;

	RTE_LOG(NOTICE, L3FWD_POWER, "Frequency scaling not supported, selecting interrupt-only mode\n");

	return APP_MODE_INTERRUPT;
}

static const char *
mode_to_str(enum appmode mode)
{
	switch (mode) {
	case APP_MODE_LEGACY:
		return "legacy";
	case APP_MODE_TELEMETRY:
		return "telemetry";
	case APP_MODE_INTERRUPT:
		return "interrupt-only";
	case APP_MODE_PMD_MGMT:
		return "pmd mgmt";
	default:
		return "invalid";
	}
}

static uint32_t
eth_dev_get_overhead_len(uint32_t max_rx_pktlen, uint16_t max_mtu)
{
	uint32_t overhead_len;

	if (max_mtu != UINT16_MAX && max_rx_pktlen > max_mtu)
		overhead_len = max_rx_pktlen - max_mtu;
	else
		overhead_len = RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;

	return overhead_len;
}

static int
config_port_max_pkt_len(struct rte_eth_conf *conf,
		struct rte_eth_dev_info *dev_info)
{
	uint32_t overhead_len;

	if (max_pkt_len == 0)
		return 0;

	if (max_pkt_len < RTE_ETHER_MIN_LEN || max_pkt_len > MAX_JUMBO_PKT_LEN)
		return -1;

	overhead_len = eth_dev_get_overhead_len(dev_info->max_rx_pktlen,
			dev_info->max_mtu);
	conf->rxmode.mtu = max_pkt_len - overhead_len;

	if (conf->rxmode.mtu > RTE_ETHER_MTU)
		conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS;

	return 0;
}
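/*
 * Worked example for config_port_max_pkt_len() above, assuming a PMD that
 * does not report its L2 overhead (max_mtu == UINT16_MAX): for
 * --max-pkt-len=9000 the overhead defaults to RTE_ETHER_HDR_LEN +
 * RTE_ETHER_CRC_LEN = 14 + 4 = 18 bytes, so rxmode.mtu becomes
 * 9000 - 18 = 8982. Since 8982 > RTE_ETHER_MTU (1500), the multi-segment
 * TX offload is enabled so such frames can span several mbufs.
 */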
/* Power library initialized in the main routine. 8< */
int
main(int argc, char **argv)
{
	struct lcore_conf *qconf;
	struct rte_eth_dev_info dev_info;
	struct rte_eth_txconf *txconf;
	int ret;
	uint16_t nb_ports;
	uint16_t queueid;
	unsigned lcore_id;
	uint64_t hz;
	uint32_t n_tx_queue, nb_lcores;
	uint32_t dev_rxq_num, dev_txq_num;
	uint8_t socketid;
	uint16_t portid, nb_rx_queue, queue;
	const char *ptr_strings[NUM_TELSTATS];

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n");
	argc -= ret;
	argv += ret;

	/* catch SIGINT and restore cpufreq governor to ondemand */
	signal(SIGINT, signal_exit_now);

	/* init RTE timer library to be used later */
	rte_timer_subsystem_init();

	/* if we're running pmd-mgmt mode, don't default to baseline mode */
	baseline_enabled = false;

	/* parse application arguments (after the EAL ones) */
	ret = parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n");

	if (app_mode == APP_MODE_DEFAULT)
		app_mode = autodetect_mode();

	RTE_LOG(INFO, L3FWD_POWER, "Selected operation mode: %s\n",
			mode_to_str(app_mode));

	/* only legacy mode relies on power library */
	if ((app_mode == APP_MODE_LEGACY) && init_power_library())
		rte_exit(EXIT_FAILURE, "init_power_library failed\n");

	if (update_lcore_params() < 0)
		rte_exit(EXIT_FAILURE, "update_lcore_params failed\n");

	if (check_lcore_params() < 0)
		rte_exit(EXIT_FAILURE, "check_lcore_params failed\n");

	ret = init_lcore_rx_queues();
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n");

	nb_ports = rte_eth_dev_count_avail();

	if (check_port_config() < 0)
		rte_exit(EXIT_FAILURE, "check_port_config failed\n");

	nb_lcores = rte_lcore_count();

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		struct rte_eth_conf local_port_conf = port_conf;
		/* not all app modes need interrupts */
		bool need_intr = app_mode == APP_MODE_LEGACY ||
				app_mode == APP_MODE_INTERRUPT;

		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			printf("\nSkipping disabled port %d\n", portid);
			continue;
		}

		/* init port */
		printf("Initializing port %d ... ", portid);
		fflush(stdout);

		ret = rte_eth_dev_info_get(portid, &dev_info);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Error during getting device (port %u) info: %s\n",
				portid, strerror(-ret));

		dev_rxq_num = dev_info.max_rx_queues;
		dev_txq_num = dev_info.max_tx_queues;

		nb_rx_queue = get_port_n_rx_queues(portid);
		if (nb_rx_queue > dev_rxq_num)
			rte_exit(EXIT_FAILURE,
				"Number of requested RX queues exceeds the device maximum: "
				"port=%d\n", portid);

		n_tx_queue = nb_lcores;
		if (n_tx_queue > dev_txq_num)
			n_tx_queue = dev_txq_num;
		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
			nb_rx_queue, (unsigned)n_tx_queue);
		/* If the number of Rx queues is 0, no need to enable Rx interrupt */
		if (nb_rx_queue == 0)
			need_intr = false;

		if (need_intr)
			local_port_conf.intr_conf.rxq = 1;

		ret = rte_eth_dev_info_get(portid, &dev_info);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Error during getting device (port %u) info: %s\n",
				portid, strerror(-ret));

		ret = config_port_max_pkt_len(&local_port_conf, &dev_info);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Invalid max packet length: %u (port %u)\n",
				max_pkt_len, portid);

		if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
			local_port_conf.txmode.offloads |=
				RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;

		local_port_conf.rx_adv_conf.rss_conf.rss_hf &=
			dev_info.flow_type_rss_offloads;
		if (local_port_conf.rx_adv_conf.rss_conf.rss_hf !=
				port_conf.rx_adv_conf.rss_conf.rss_hf) {
			printf("Port %u modified RSS hash function based on hardware support, "
				"requested:%#"PRIx64" configured:%#"PRIx64"\n",
				portid,
				port_conf.rx_adv_conf.rss_conf.rss_hf,
				local_port_conf.rx_adv_conf.rss_conf.rss_hf);
		}

		if (local_port_conf.rx_adv_conf.rss_conf.rss_hf == 0)
			local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE;
		local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa;
		port_conf.rxmode.offloads = local_port_conf.rxmode.offloads;

		ret = rte_eth_dev_configure(portid, nb_rx_queue,
					(uint16_t)n_tx_queue, &local_port_conf);
		if (ret < 0)
			rte_exit(EXIT_FAILURE, "Cannot configure device: "
					"err=%d, port=%d\n", ret, portid);

		ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd,
						       &nb_txd);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				 "Cannot adjust number of descriptors: err=%d, port=%d\n",
				 ret, portid);

		ret = rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				 "Cannot get MAC address: err=%d, port=%d\n",
				 ret, portid);

		print_ethaddr(" Address:", &ports_eth_addr[portid]);
		printf(", ");

		/* init memory */
		ret = init_mem(NB_MBUF);
		if (ret < 0)
			rte_exit(EXIT_FAILURE, "init_mem failed\n");

		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
			if (rte_lcore_is_enabled(lcore_id) == 0)
				continue;

			/* Initialize TX buffers */
			qconf = &lcore_conf[lcore_id];
			qconf->tx_buffer[portid] = rte_zmalloc_socket("tx_buffer",
				RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
				rte_eth_dev_socket_id(portid));
			if (qconf->tx_buffer[portid] == NULL)
				rte_exit(EXIT_FAILURE, "Can't allocate tx buffer for port %u\n",
						portid);

			rte_eth_tx_buffer_init(qconf->tx_buffer[portid], MAX_PKT_BURST);
		}

		/* init one TX queue per couple (lcore,port) */
		queueid = 0;
		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
			if (rte_lcore_is_enabled(lcore_id) == 0)
				continue;

			if (queueid >= dev_txq_num)
				continue;

			if (numa_on)
				socketid = (uint8_t)rte_lcore_to_socket_id(lcore_id);
			else
				socketid = 0;

			printf("txq=%u,%d,%d ", lcore_id, queueid, socketid);
			fflush(stdout);

			txconf = &dev_info.default_txconf;
			txconf->offloads = local_port_conf.txmode.offloads;
			ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd,
					socketid, txconf);
			if (ret < 0)
				rte_exit(EXIT_FAILURE,
					"rte_eth_tx_queue_setup: err=%d, "
						"port=%d\n", ret, portid);

			qconf = &lcore_conf[lcore_id];
			qconf->tx_queue_id[portid] = queueid;
			queueid++;

			qconf->tx_port_id[qconf->n_tx_port] = portid;
			qconf->n_tx_port++;
		}
		printf("\n");
	}

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		if (rte_lcore_is_enabled(lcore_id) == 0)
			continue;

		if (app_mode == APP_MODE_LEGACY) {
			/* init timer structures for each enabled lcore */
			rte_timer_init(&power_timers[lcore_id]);
			hz = rte_get_timer_hz();
			rte_timer_reset(&power_timers[lcore_id],
					hz/TIMER_NUMBER_PER_SECOND,
					SINGLE, lcore_id,
					power_timer_cb, NULL);
		}
		qconf = &lcore_conf[lcore_id];
		printf("\nInitializing rx queues on lcore %u ... ", lcore_id);
		fflush(stdout);

		/* init RX queues */
		for (queue = 0; queue < qconf->n_rx_queue; ++queue) {
			struct rte_eth_rxconf rxq_conf;

			portid = qconf->rx_queue_list[queue].port_id;
			queueid = qconf->rx_queue_list[queue].queue_id;

			if (numa_on)
				socketid = (uint8_t)rte_lcore_to_socket_id(lcore_id);
			else
				socketid = 0;

			printf("rxq=%d,%d,%d ", portid, queueid, socketid);
			fflush(stdout);

			ret = rte_eth_dev_info_get(portid, &dev_info);
			if (ret != 0)
				rte_exit(EXIT_FAILURE,
					"Error during getting device (port %u) info: %s\n",
					portid, strerror(-ret));

			rxq_conf = dev_info.default_rxconf;
			rxq_conf.offloads = port_conf.rxmode.offloads;
			ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd,
					socketid, &rxq_conf,
					pktmbuf_pool[socketid]);
			if (ret < 0)
				rte_exit(EXIT_FAILURE,
					"rte_eth_rx_queue_setup: err=%d, "
						"port=%d\n", ret, portid);

			if (parse_ptype) {
				if (add_cb_parse_ptype(portid, queueid) < 0)
					rte_exit(EXIT_FAILURE,
						 "Fail to add ptype cb\n");
			}

			if (app_mode == APP_MODE_PMD_MGMT && !baseline_enabled) {
				/* Set power_pmd_mgmt configs passed by user */
				rte_power_pmd_mgmt_set_emptypoll_max(max_empty_polls);
				ret = rte_power_pmd_mgmt_set_pause_duration(pause_duration);
				if (ret < 0)
					rte_exit(EXIT_FAILURE,
						"Error setting pause_duration: err=%d, lcore=%d\n",
							ret, lcore_id);

				ret = rte_power_pmd_mgmt_set_scaling_freq_min(lcore_id,
						scale_freq_min);
				if (ret < 0)
					rte_exit(EXIT_FAILURE,
						"Error setting scaling freq min: err=%d, lcore=%d\n",
							ret, lcore_id);

				ret = rte_power_pmd_mgmt_set_scaling_freq_max(lcore_id,
						scale_freq_max);
				if (ret < 0)
					rte_exit(EXIT_FAILURE,
						"Error setting scaling freq max: err=%d, lcore=%d\n",
							ret, lcore_id);

				ret = rte_power_ethdev_pmgmt_queue_enable(
						lcore_id, portid, queueid,
						pmgmt_type);
				if (ret < 0)
					rte_exit(EXIT_FAILURE,
						"rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n",
							ret, portid);
			}
		}
	}
	/* >8 End of power library initialization. */

	printf("\n");

	/* start ports */
	RTE_ETH_FOREACH_DEV(portid) {
		if ((enabled_port_mask & (1 << portid)) == 0)
			continue;

		/* Start device */
		ret = rte_eth_dev_start(portid);
		if (ret < 0)
			rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, "
						"port=%d\n", ret, portid);
		/*
		 * If enabled, put device in promiscuous mode.
		 * This allows IO forwarding mode to forward packets
		 * to itself through 2 cross-connected ports of the
		 * target machine.
		 */
		if (promiscuous_on) {
			ret = rte_eth_promiscuous_enable(portid);
			if (ret != 0)
				rte_exit(EXIT_FAILURE,
					"rte_eth_promiscuous_enable: err=%s, port=%u\n",
					rte_strerror(-ret), portid);
		}
		/* initialize spinlock for each port */
		rte_spinlock_init(&(locks[portid]));

		if (!parse_ptype)
			if (!check_ptype(portid))
				rte_exit(EXIT_FAILURE,
					"PMD can not provide needed ptypes\n");
	}

	check_all_ports_link_status(enabled_port_mask);
	/* launch per-lcore init on every lcore */
	if (app_mode == APP_MODE_LEGACY) {
		rte_eal_mp_remote_launch(main_legacy_loop, NULL, CALL_MAIN);
	} else if (app_mode == APP_MODE_TELEMETRY) {
		unsigned int i;

		/* Init metrics library */
		rte_metrics_init(rte_socket_id());
		/* Register stats with metrics library */
		for (i = 0; i < NUM_TELSTATS; i++)
			ptr_strings[i] = telstats_strings[i].name;

		ret = rte_metrics_reg_names(ptr_strings, NUM_TELSTATS);
		if (ret >= 0)
			telstats_index = ret;
		else
			rte_exit(EXIT_FAILURE, "failed to register metrics names\n");

		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			rte_spinlock_init(&stats[lcore_id].telemetry_lock);
		}
		rte_timer_init(&telemetry_timer);
		rte_telemetry_register_cmd("/l3fwd-power/stats",
				handle_app_stats,
				"Returns global power stats. Parameters: None");
		rte_eal_mp_remote_launch(main_telemetry_loop, NULL,
						SKIP_MAIN);
	} else if (app_mode == APP_MODE_INTERRUPT) {
		rte_eal_mp_remote_launch(main_intr_loop, NULL, CALL_MAIN);
	} else if (app_mode == APP_MODE_PMD_MGMT) {
		/* reuse telemetry loop for PMD power management mode */
		rte_eal_mp_remote_launch(main_telemetry_loop, NULL, CALL_MAIN);
	}

	if (app_mode == APP_MODE_TELEMETRY)
		launch_timer(rte_lcore_id());

	RTE_LCORE_FOREACH_WORKER(lcore_id) {
		if (rte_eal_wait_lcore(lcore_id) < 0)
			return -1;
	}

	if (app_mode == APP_MODE_PMD_MGMT) {
		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
			if (rte_lcore_is_enabled(lcore_id) == 0)
				continue;
			qconf = &lcore_conf[lcore_id];
			for (queue = 0; queue < qconf->n_rx_queue; ++queue) {
				portid = qconf->rx_queue_list[queue].port_id;
				queueid = qconf->rx_queue_list[queue].queue_id;

				rte_power_ethdev_pmgmt_queue_disable(lcore_id,
						portid, queueid);
			}
		}
	}

	RTE_ETH_FOREACH_DEV(portid) {
		if ((enabled_port_mask & (1 << portid)) == 0)
			continue;

		ret = rte_eth_dev_stop(portid);
		if (ret != 0)
			RTE_LOG(ERR, L3FWD_POWER, "rte_eth_dev_stop: err=%d, port=%u\n",
				ret, portid);

		rte_eth_dev_close(portid);
	}

	if ((app_mode == APP_MODE_LEGACY) && deinit_power_library())
		rte_exit(EXIT_FAILURE, "deinit_power_library failed\n");

	if (rte_eal_cleanup() < 0)
		RTE_LOG(ERR, L3FWD_POWER, "EAL cleanup failed\n");

	return 0;
}
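/*
 * Build/run sketch (paths assume the standard DPDK meson layout; adjust
 * to the local installation, and see the l3fwd-power sample guide for
 * the full option list):
 *
 *   meson setup build -Dexamples=l3fwd-power
 *   ninja -C build
 *   sudo ./build/examples/dpdk-l3fwd-power -l 1-3 -n 4 -- -p 0x3 \
 *           --config="(0,0,2),(1,0,3)"
 */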