/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2018 Intel Corporation
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/types.h>
#include <string.h>
#include <sys/queue.h>
#include <stdarg.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>
#include <signal.h>
#include <math.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_eal.h>
#include <rte_launch.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_lcore.h>
#include <rte_per_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_interrupts.h>
#include <rte_random.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_string_fns.h>
#include <rte_timer.h>
#include <rte_power_cpufreq.h>
#include <rte_spinlock.h>
#include <rte_metrics.h>
#include <rte_telemetry.h>
#include <rte_power_pmd_mgmt.h>
#include <rte_power_uncore.h>
#include <rte_power_qos.h>

#include "perf_core.h"
#include "main.h"

RTE_LOG_REGISTER(l3fwd_power_logtype, l3fwd.power, INFO);
#define RTE_LOGTYPE_L3FWD_POWER l3fwd_power_logtype

#define MAX_PKT_BURST 32

#define MIN_ZERO_POLL_COUNT 10

/* 100 ms interval */
#define TIMER_NUMBER_PER_SECOND 10
/* (10ms) */
#define INTERVALS_PER_SECOND 100
/* 100000 us */
#define SCALING_PERIOD (1000000/TIMER_NUMBER_PER_SECOND)
#define SCALING_DOWN_TIME_RATIO_THRESHOLD 0.25

#define APP_LOOKUP_EXACT_MATCH 0
#define APP_LOOKUP_LPM 1
#define DO_RFC_1812_CHECKS

#ifndef APP_LOOKUP_METHOD
#define APP_LOOKUP_METHOD APP_LOOKUP_LPM
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
#include <rte_hash.h>
#elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
#include <rte_lpm.h>
#else
#error "APP_LOOKUP_METHOD set to incorrect value"
#endif

#ifndef IPv6_BYTES
#define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\
		       "%02x%02x:%02x%02x:%02x%02x:%02x%02x"
#define IPv6_BYTES(addr) \
	addr[0], addr[1], addr[2], addr[3], \
	addr[4], addr[5], addr[6], addr[7], \
	addr[8], addr[9], addr[10], addr[11],\
	addr[12], addr[13], addr[14], addr[15]
#endif

#define MAX_JUMBO_PKT_LEN 9600

#define IPV6_ADDR_LEN 16

#define MEMPOOL_CACHE_SIZE 256

/*
 * This expression is used to calculate the number of mbufs needed depending on
 * user input, taking into account memory for rx and tx hardware rings, cache
 * per lcore and mtable per port per lcore. RTE_MAX is used to ensure that
 * NB_MBUF never goes below a minimum value of 8192.
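 *
 * As a rough, hypothetical illustration (values not taken from this file):
 * with nb_ports = 2, nb_rx_queue = 2, nb_rxd = nb_txd = 1024, n_tx_queue = 2
 * and nb_lcores = 4, the expression yields
 * 2*2*1024 + 2*4*32 + 2*2*1024 + 4*256 = 9472 mbufs, so the 8192 floor does
 * not apply; for a single port/queue/lcore setup it would.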
 */

#define NB_MBUF RTE_MAX ( \
	(nb_ports*nb_rx_queue*nb_rxd + \
	nb_ports*nb_lcores*MAX_PKT_BURST + \
	nb_ports*n_tx_queue*nb_txd + \
	nb_lcores*MEMPOOL_CACHE_SIZE), \
	(unsigned)8192)

#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_DESC_DEFAULT 1024
#define TX_DESC_DEFAULT 1024

#define NUM_TELSTATS RTE_DIM(telstats_strings)

static uint16_t nb_rxd = RX_DESC_DEFAULT;
static uint16_t nb_txd = TX_DESC_DEFAULT;

/* ethernet addresses of ports */
static struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];

/* per-port spinlocks used when toggling Rx interrupts */
static rte_spinlock_t locks[RTE_MAX_ETHPORTS];

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;
/* Ports set in promiscuous mode off by default. */
static int promiscuous_on = 0;
/* NUMA is enabled by default. */
static int numa_on = 1;
volatile bool quit_signal;
/* timer to update telemetry every 500ms */
static struct rte_timer telemetry_timer;

/* stats index returned by metrics lib */
int telstats_index;

/* flag to check if uncore option enabled */
int enabled_uncore = -1;

struct telstats_name {
	char name[RTE_ETH_XSTATS_NAME_SIZE];
};

/* telemetry stats to be reported */
const struct telstats_name telstats_strings[] = {
	{"empty_poll"},
	{"full_poll"},
	{"busy_percent"}
};

/* core busyness in percentage */
enum busy_rate {
	ZERO = 0,
	PARTIAL = 50,
	FULL = 100
};

enum uncore_choice {
	UNCORE_MIN = 0,
	UNCORE_MAX = 1,
	UNCORE_IDX = 2
};

/* reference poll count to measure core busyness */
#define DEFAULT_COUNT 10000
/*
 * reference CYCLES to be used to
 * measure core busyness based on poll count
 */
#define MIN_CYCLES 1500000ULL
#define MAX_CYCLES 22000000ULL

/* (500ms) */
#define TELEMETRY_INTERVALS_PER_SEC 2

/* Parse packet type using an Rx callback; disabled by default. */
static int parse_ptype;

enum appmode {
	APP_MODE_DEFAULT = 0,
	APP_MODE_LEGACY,
	APP_MODE_TELEMETRY,
	APP_MODE_INTERRUPT,
	APP_MODE_PMD_MGMT
};

enum appmode app_mode;

static enum rte_power_pmd_mgmt_type pmgmt_type;
bool baseline_enabled;

enum freq_scale_hint_t
{
	FREQ_LOWER = -1,
	FREQ_CURRENT = 0,
	FREQ_HIGHER = 1,
	FREQ_HIGHEST = 2
};

struct __rte_cache_aligned lcore_rx_queue {
	uint16_t port_id;
	uint16_t queue_id;
	enum freq_scale_hint_t freq_up_hint;
	uint32_t zero_rx_packet_count;
	uint32_t idle_hint;
};

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16


struct lcore_params lcore_params_array[MAX_LCORE_PARAMS];
static struct lcore_params lcore_params_array_default[] = {
	{0, 0, 2},
	{0, 1, 2},
	{0, 2, 2},
	{1, 0, 2},
	{1, 1, 2},
	{1, 2, 2},
	{2, 0, 2},
	{3, 0, 3},
	{3, 1, 3},
};

struct lcore_params *lcore_params = lcore_params_array_default;
uint16_t nb_lcore_params = RTE_DIM(lcore_params_array_default);

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode = RTE_ETH_MQ_RX_RSS,
		.offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM,
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = NULL,
			.rss_hf = RTE_ETH_RSS_UDP,
		},
	},
	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
	}
};

static uint32_t max_pkt_len;
static uint32_t max_empty_polls = 512;
static uint32_t pause_duration = 1;
static uint32_t scale_freq_min;
static uint32_t scale_freq_max;

static int cpu_resume_latency = -1;
static int resume_latency_bk[RTE_MAX_LCORE];

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];


#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)

#ifdef RTE_ARCH_X86
#include <rte_hash_crc.h>
#define DEFAULT_HASH_FUNC rte_hash_crc
#else
#include <rte_jhash.h>
#define DEFAULT_HASH_FUNC rte_jhash
#endif

struct __rte_packed_begin ipv4_5tuple {
	uint32_t ip_dst;
	uint32_t ip_src;
	uint16_t port_dst;
	uint16_t port_src;
	uint8_t proto;
} __rte_packed_end;

struct __rte_packed_begin ipv6_5tuple {
	uint8_t ip_dst[IPV6_ADDR_LEN];
	uint8_t ip_src[IPV6_ADDR_LEN];
	uint16_t port_dst;
	uint16_t port_src;
	uint8_t proto;
} __rte_packed_end;

struct ipv4_l3fwd_route {
	struct ipv4_5tuple key;
	uint8_t if_out;
};

struct ipv6_l3fwd_route {
	struct ipv6_5tuple key;
	uint8_t if_out;
};

static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = {
	{{RTE_IPV4(100,10,0,1), RTE_IPV4(200,10,0,1), 101, 11, IPPROTO_TCP}, 0},
	{{RTE_IPV4(100,20,0,2), RTE_IPV4(200,20,0,2), 102, 12, IPPROTO_TCP}, 1},
	{{RTE_IPV4(100,30,0,3), RTE_IPV4(200,30,0,3), 103, 13, IPPROTO_TCP}, 2},
	{{RTE_IPV4(100,40,0,4), RTE_IPV4(200,40,0,4), 104, 14, IPPROTO_TCP}, 3},
};

static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = {
	{
		{
			{0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05},
			{0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			0x02, 0x1e, 0x67, 0xff, 0xfe, 0x0d, 0xb6, 0x0a},
			1, 10, IPPROTO_UDP
		}, 4
	},
};

typedef struct rte_hash lookup_struct_t;
static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];
static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS];

#define L3FWD_HASH_ENTRIES 1024

static alignas(RTE_CACHE_LINE_SIZE) uint16_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES];
static alignas(RTE_CACHE_LINE_SIZE) uint16_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES];
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
struct ipv4_l3fwd_route {
	uint32_t ip;
	uint8_t depth;
	uint8_t if_out;
};

static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = {
	{RTE_IPV4(1,1,1,0), 24, 0},
	{RTE_IPV4(2,1,1,0), 24, 1},
	{RTE_IPV4(3,1,1,0), 24, 2},
	{RTE_IPV4(4,1,1,0), 24, 3},
	{RTE_IPV4(5,1,1,0), 24, 4},
	{RTE_IPV4(6,1,1,0), 24, 5},
	{RTE_IPV4(7,1,1,0), 24, 6},
	{RTE_IPV4(8,1,1,0), 24, 7},
};

#define IPV4_L3FWD_LPM_MAX_RULES 1024

typedef struct rte_lpm lookup_struct_t;
static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];
#endif

struct __rte_cache_aligned lcore_conf {
	uint16_t n_rx_queue;
	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
	uint16_t n_tx_port;
	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
	struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_ETHPORTS];
	lookup_struct_t *ipv4_lookup_struct;
	lookup_struct_t *ipv6_lookup_struct;
};

struct __rte_cache_aligned lcore_stats {
	/* total sleep time in ms since last frequency scaling down */
	uint32_t sleep_time;
	/* number of long sleep recently */
	uint32_t nb_long_sleep;
	/* freq. scaling up trend */
	uint32_t trend;
	/* total packet processed recently */
	uint64_t nb_rx_processed;
	/* total iterations looped recently */
	uint64_t nb_iteration_looped;
	/*
	 * Represents empty and non empty polls
	 * of rte_eth_rx_burst();
	 * ep_nep[0] holds non empty polls
	 * i.e. 0 < nb_rx <= MAX_PKT_BURST
	 * ep_nep[1] holds empty polls.
	 * i.e. nb_rx == 0
	 */
	uint64_t ep_nep[2];
	/*
	 * Represents full and empty+partial
	 * polls of rte_eth_rx_burst();
	 * fp_nfp[0] holds empty+partial polls.
	 * i.e. 0 <= nb_rx < MAX_PKT_BURST
	 * fp_nfp[1] holds full polls
	 * i.e. nb_rx == MAX_PKT_BURST
	 */
	uint64_t fp_nfp[2];
	enum busy_rate br;
	rte_spinlock_t telemetry_lock;
};

static alignas(RTE_CACHE_LINE_SIZE) struct lcore_conf lcore_conf[RTE_MAX_LCORE];
static alignas(RTE_CACHE_LINE_SIZE) struct lcore_stats stats[RTE_MAX_LCORE];
static struct rte_timer power_timers[RTE_MAX_LCORE];

static inline uint32_t power_idle_heuristic(uint32_t zero_rx_packet_count);
static inline enum freq_scale_hint_t power_freq_scaleup_heuristic(
		unsigned int lcore_id, uint16_t port_id, uint16_t queue_id);

static int is_done(void)
{
	return quit_signal;
}

/* exit signal handler */
static void
signal_exit_now(int sigtype)
{

	if (sigtype == SIGINT)
		quit_signal = true;

}

/* Frequency scale down timer callback */
static void
power_timer_cb(__rte_unused struct rte_timer *tim,
		__rte_unused void *arg)
{
	uint64_t hz;
	float sleep_time_ratio;
	unsigned lcore_id = rte_lcore_id();

	/* accumulate total execution time in us when callback is invoked */
	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
					(float)SCALING_PERIOD;
	/**
	 * check whether the frequency needs to be scaled down a step
	 * because the lcore slept a lot in the last period.
	 */
	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
		rte_power_freq_down(lcore_id);
	}
	else if ((unsigned)(stats[lcore_id].nb_rx_processed /
		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
		/**
		 * scale down a step if the average number of packets per
		 * iteration is lower than expected.
		 */
		rte_power_freq_down(lcore_id);
	}

	/**
	 * initialize another timer according to current frequency to ensure
	 * timer interval is relatively fixed.
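	 * The timer is re-armed below in SINGLE mode with a period of
	 * hz/TIMER_NUMBER_PER_SECOND ticks (roughly 100 ms), so each callback
	 * schedules the next one itself.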
	 */
	hz = rte_get_timer_hz();
	rte_timer_reset(&power_timers[lcore_id], hz/TIMER_NUMBER_PER_SECOND,
				SINGLE, lcore_id, power_timer_cb, NULL);

	stats[lcore_id].nb_rx_processed = 0;
	stats[lcore_id].nb_iteration_looped = 0;

	stats[lcore_id].sleep_time = 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint16_t port)
{
	uint32_t lcore_id;
	struct lcore_conf *qconf;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	rte_eth_tx_buffer(port, qconf->tx_queue_id[port],
			qconf->tx_buffer[port], m);

	return 0;
}

#ifdef DO_RFC_1812_CHECKS
static inline int
is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len)
{
	/* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */
	/*
	 * 1. The packet length reported by the Link Layer must be large
	 * enough to hold the minimum length legal IP datagram (20 bytes).
	 */
	if (link_len < sizeof(struct rte_ipv4_hdr))
		return -1;

	/* 2. The IP checksum must be correct. */
	/* if this is not checked in H/W, check it. */
	if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) {
		uint16_t actual_cksum, expected_cksum;
		actual_cksum = pkt->hdr_checksum;
		pkt->hdr_checksum = 0;
		expected_cksum = rte_ipv4_cksum(pkt);
		if (actual_cksum != expected_cksum)
			return -2;
	}

	/*
	 * 3. The IP version number must be 4. If the version number is not 4
	 * then the packet may be another version of IP, such as IPng or
	 * ST-II.
	 */
	if (((pkt->version_ihl) >> 4) != 4)
		return -3;
	/*
	 * 4. The IP header length field must be large enough to hold the
	 * minimum length legal IP datagram (20 bytes = 5 words).
	 */
	if ((pkt->version_ihl & 0xf) < 5)
		return -4;

	/*
	 * 5. The IP total length field must be large enough to hold the IP
	 * datagram header, whose length is specified in the IP header length
	 * field.
	 */
	if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct rte_ipv4_hdr))
		return -5;

	return 0;
}
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
static void
print_ipv4_key(struct ipv4_5tuple key)
{
	printf("IP dst = %08x, IP src = %08x, port dst = %d, port src = %d, "
		"proto = %d\n", (unsigned)key.ip_dst, (unsigned)key.ip_src,
				key.port_dst, key.port_src, key.proto);
}
static void
print_ipv6_key(struct ipv6_5tuple key)
{
	printf( "IP dst = " IPv6_BYTES_FMT ", IP src = " IPv6_BYTES_FMT ", "
		"port dst = %d, port src = %d, proto = %d\n",
		IPv6_BYTES(key.ip_dst), IPv6_BYTES(key.ip_src),
		key.port_dst, key.port_src, key.proto);
}

static inline uint16_t
get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid,
		lookup_struct_t *ipv4_l3fwd_lookup_struct)
{
	struct ipv4_5tuple key;
	struct rte_tcp_hdr *tcp;
	struct rte_udp_hdr *udp;
	int ret = 0;

	key.ip_dst = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
	key.ip_src = rte_be_to_cpu_32(ipv4_hdr->src_addr);
	key.proto = ipv4_hdr->next_proto_id;

	switch (ipv4_hdr->next_proto_id) {
	case IPPROTO_TCP:
		tcp = (struct rte_tcp_hdr *)((unsigned char *)ipv4_hdr +
					sizeof(struct rte_ipv4_hdr));
		key.port_dst = rte_be_to_cpu_16(tcp->dst_port);
		key.port_src = rte_be_to_cpu_16(tcp->src_port);
		break;

	case IPPROTO_UDP:
		udp = (struct rte_udp_hdr *)((unsigned char *)ipv4_hdr +
					sizeof(struct rte_ipv4_hdr));
		key.port_dst = rte_be_to_cpu_16(udp->dst_port);
		key.port_src = rte_be_to_cpu_16(udp->src_port);
		break;

	default:
		key.port_dst = 0;
		key.port_src = 0;
		break;
	}

	/* Find destination port */
	ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key);
	return ((ret < 0) ? portid : ipv4_l3fwd_out_if[ret]);
}

static inline uint16_t
get_ipv6_dst_port(struct rte_ipv6_hdr *ipv6_hdr, uint16_t portid,
			lookup_struct_t *ipv6_l3fwd_lookup_struct)
{
	struct ipv6_5tuple key;
	struct rte_tcp_hdr *tcp;
	struct rte_udp_hdr *udp;
	int ret = 0;

	memcpy(key.ip_dst, ipv6_hdr->dst_addr, IPV6_ADDR_LEN);
	memcpy(key.ip_src, ipv6_hdr->src_addr, IPV6_ADDR_LEN);

	key.proto = ipv6_hdr->proto;

	switch (ipv6_hdr->proto) {
	case IPPROTO_TCP:
		tcp = (struct rte_tcp_hdr *)((unsigned char *) ipv6_hdr +
					sizeof(struct rte_ipv6_hdr));
		key.port_dst = rte_be_to_cpu_16(tcp->dst_port);
		key.port_src = rte_be_to_cpu_16(tcp->src_port);
		break;

	case IPPROTO_UDP:
		udp = (struct rte_udp_hdr *)((unsigned char *) ipv6_hdr +
					sizeof(struct rte_ipv6_hdr));
		key.port_dst = rte_be_to_cpu_16(udp->dst_port);
		key.port_src = rte_be_to_cpu_16(udp->src_port);
		break;

	default:
		key.port_dst = 0;
		key.port_src = 0;
		break;
	}

	/* Find destination port */
	ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key);
	return ((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
}
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
static inline uint16_t
get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid,
		lookup_struct_t *ipv4_l3fwd_lookup_struct)
{
	uint32_t next_hop;

	return ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
			rte_be_to_cpu_32(ipv4_hdr->dst_addr), &next_hop) == 0) ?
			next_hop : portid);
}
#endif

static inline void
parse_ptype_one(struct rte_mbuf *m)
{
	struct rte_ether_hdr *eth_hdr;
	uint32_t packet_type = RTE_PTYPE_UNKNOWN;
	uint16_t ether_type;

	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	ether_type = eth_hdr->ether_type;
	if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4))
		packet_type |= RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
	else if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6))
		packet_type |= RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;

	m->packet_type = packet_type;
}

static uint16_t
cb_parse_ptype(uint16_t port __rte_unused, uint16_t queue __rte_unused,
		struct rte_mbuf *pkts[], uint16_t nb_pkts,
		uint16_t max_pkts __rte_unused,
		void *user_param __rte_unused)
{
	unsigned int i;

	for (i = 0; i < nb_pkts; ++i)
		parse_ptype_one(pkts[i]);

	return nb_pkts;
}

static int
add_cb_parse_ptype(uint16_t portid, uint16_t queueid)
{
	printf("Port %d: softly parse packet type info\n", portid);
	if (rte_eth_add_rx_callback(portid, queueid, cb_parse_ptype, NULL))
		return 0;

	printf("Failed to add rx callback: port=%d\n", portid);
	return -1;
}

static inline void
l3fwd_simple_forward(struct rte_mbuf *m, uint16_t portid,
				struct lcore_conf *qconf)
{
	struct rte_ether_hdr *eth_hdr;
	struct rte_ipv4_hdr *ipv4_hdr;
	void *d_addr_bytes;
	uint16_t dst_port;

	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) {
		/* Handle IPv4 headers.*/
		ipv4_hdr =
			rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
						sizeof(struct rte_ether_hdr));

#ifdef DO_RFC_1812_CHECKS
		/* Check to make sure the packet is valid (RFC1812) */
		if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) {
			rte_pktmbuf_free(m);
			return;
		}
#endif

		dst_port = get_ipv4_dst_port(ipv4_hdr, portid,
					qconf->ipv4_lookup_struct);
		if (dst_port >= RTE_MAX_ETHPORTS ||
				(enabled_port_mask & 1 << dst_port) == 0)
			dst_port = portid;

		/* 02:00:00:00:00:xx */
		d_addr_bytes = &eth_hdr->dst_addr.addr_bytes[0];
		*((uint64_t *)d_addr_bytes) =
			0x000000000002 + ((uint64_t)dst_port << 40);

#ifdef DO_RFC_1812_CHECKS
		/* Update time to live and header checksum */
		--(ipv4_hdr->time_to_live);
		++(ipv4_hdr->hdr_checksum);
#endif

		/* src addr */
		rte_ether_addr_copy(&ports_eth_addr[dst_port],
				&eth_hdr->src_addr);

		send_single_packet(m, dst_port);
	} else if (RTE_ETH_IS_IPV6_HDR(m->packet_type)) {
		/* Handle IPv6 headers.*/
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
		struct rte_ipv6_hdr *ipv6_hdr;

		ipv6_hdr =
			rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
						sizeof(struct rte_ether_hdr));

		dst_port = get_ipv6_dst_port(ipv6_hdr, portid,
					qconf->ipv6_lookup_struct);

		if (dst_port >= RTE_MAX_ETHPORTS ||
				(enabled_port_mask & 1 << dst_port) == 0)
			dst_port = portid;

		/* 02:00:00:00:00:xx */
		d_addr_bytes = &eth_hdr->dst_addr.addr_bytes[0];
		*((uint64_t *)d_addr_bytes) =
			0x000000000002 + ((uint64_t)dst_port << 40);

		/* src addr */
		rte_ether_addr_copy(&ports_eth_addr[dst_port],
				&eth_hdr->src_addr);

		send_single_packet(m, dst_port);
#else
		/*
		 * We don't currently handle IPv6 packets in LPM mode,
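		 * so the packet is simply dropped (freed) here.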
		 */
		rte_pktmbuf_free(m);
#endif
	} else
		rte_pktmbuf_free(m);

}

#define MINIMUM_SLEEP_TIME 1
#define SUSPEND_THRESHOLD 300

static inline uint32_t
power_idle_heuristic(uint32_t zero_rx_packet_count)
{
	/* If the zero-poll count is below SUSPEND_THRESHOLD, sleep 1 us */
	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
		return MINIMUM_SLEEP_TIME;
	/*
	 * Otherwise sleep SUSPEND_THRESHOLD us, which covers the minimum
	 * latency of switching from C3/C6 back to C0
	 */
	else
		return SUSPEND_THRESHOLD;
}

static inline enum freq_scale_hint_t
power_freq_scaleup_heuristic(unsigned lcore_id,
			     uint16_t port_id,
			     uint16_t queue_id)
{
	uint32_t rxq_count = rte_eth_rx_queue_count(port_id, queue_id);
/**
 * HW Rx queue size is 128 by default, Rx burst read at maximum 32 entries
 * per iteration
 */
#define FREQ_GEAR1_RX_PACKET_THRESHOLD MAX_PKT_BURST
#define FREQ_GEAR2_RX_PACKET_THRESHOLD (MAX_PKT_BURST*2)
#define FREQ_GEAR3_RX_PACKET_THRESHOLD (MAX_PKT_BURST*3)
#define FREQ_UP_TREND1_ACC 1
#define FREQ_UP_TREND2_ACC 100
#define FREQ_UP_THRESHOLD 10000

	if (likely(rxq_count > FREQ_GEAR3_RX_PACKET_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHEST;
	} else if (likely(rxq_count > FREQ_GEAR2_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND2_ACC;
	else if (likely(rxq_count > FREQ_GEAR1_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND1_ACC;

	if (likely(stats[lcore_id].trend > FREQ_UP_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHER;
	}

	return FREQ_CURRENT;
}

/**
 * force polling thread sleep until one-shot rx interrupt triggers
 * @param num
 *  number of Rx queues this lcore waits on
 * @param lcore
 *  lcore id
 * @return
 *  0 on success
 */
static int
sleep_until_rx_interrupt(int num, int lcore)
{
	/*
	 * we want to track when we are woken up by traffic so that we can go
	 * back to sleep again without log spamming. Avoid cache line sharing
	 * to prevent threads stepping on each others' toes.
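	 *
	 * The epoll event data set up in event_register() carries the port id
	 * in its upper 16 bits and the queue id in its lower 16 bits, which is
	 * how the wakeup source is identified below.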
	 */
	static alignas(RTE_CACHE_LINE_SIZE) struct {
		bool wakeup;
	} status[RTE_MAX_LCORE];
	struct rte_epoll_event event[num];
	int n, i;
	uint16_t port_id;
	uint16_t queue_id;
	void *data;

	if (status[lcore].wakeup) {
		RTE_LOG(INFO, L3FWD_POWER,
				"lcore %u sleeps until interrupt triggers\n",
				rte_lcore_id());
	}

	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, 10);
	for (i = 0; i < n; i++) {
		data = event[i].epdata.data;
		port_id = ((uintptr_t)data) >> (sizeof(uint16_t) * CHAR_BIT);
		queue_id = ((uintptr_t)data) &
			RTE_LEN2MASK((sizeof(uint16_t) * CHAR_BIT), uint16_t);
		RTE_LOG(INFO, L3FWD_POWER,
			"lcore %u is woken up from rx interrupt on"
			" port %d queue %d\n",
			rte_lcore_id(), port_id, queue_id);
	}
	status[lcore].wakeup = n != 0;

	return 0;
}

static void turn_on_off_intr(struct lcore_conf *qconf, bool on)
{
	int i;
	struct lcore_rx_queue *rx_queue;
	uint16_t queue_id;
	uint16_t port_id;

	for (i = 0; i < qconf->n_rx_queue; ++i) {
		rx_queue = &(qconf->rx_queue_list[i]);
		port_id = rx_queue->port_id;
		queue_id = rx_queue->queue_id;

		rte_spinlock_lock(&(locks[port_id]));
		if (on)
			rte_eth_dev_rx_intr_enable(port_id, queue_id);
		else
			rte_eth_dev_rx_intr_disable(port_id, queue_id);
		rte_spinlock_unlock(&(locks[port_id]));
	}
}

static int event_register(struct lcore_conf *qconf)
{
	struct lcore_rx_queue *rx_queue;
	uint16_t queueid;
	uint16_t portid;
	uint32_t data;
	int ret;
	int i;

	for (i = 0; i < qconf->n_rx_queue; ++i) {
		rx_queue = &(qconf->rx_queue_list[i]);
		portid = rx_queue->port_id;
		queueid = rx_queue->queue_id;
		data = portid << (sizeof(uint16_t) * CHAR_BIT) | queueid;

		ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
						RTE_EPOLL_PER_THREAD,
						RTE_INTR_EVENT_ADD,
						(void *)((uintptr_t)data));
		if (ret)
			return ret;
	}

	return 0;
}

/* Main processing loop. 8< */
static int main_intr_loop(__rte_unused void *dummy)
{
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	unsigned int lcore_id;
	uint64_t prev_tsc, diff_tsc, cur_tsc;
	int i, j, nb_rx;
	uint16_t portid, queueid;
	struct lcore_conf *qconf;
	struct lcore_rx_queue *rx_queue;
	uint32_t lcore_rx_idle_count = 0;
	uint32_t lcore_idle_hint = 0;
	int intr_en = 0;

	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
				   US_PER_S * BURST_TX_DRAIN_US;

	prev_tsc = 0;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	if (qconf->n_rx_queue == 0) {
		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n",
				lcore_id);
		return 0;
	}

	RTE_LOG(INFO, L3FWD_POWER, "entering main interrupt loop on lcore %u\n",
			lcore_id);

	for (i = 0; i < qconf->n_rx_queue; i++) {
		portid = qconf->rx_queue_list[i].port_id;
		queueid = qconf->rx_queue_list[i].queue_id;
		RTE_LOG(INFO, L3FWD_POWER,
				" -- lcoreid=%u portid=%u rxqueueid=%" PRIu16 "\n",
				lcore_id, portid, queueid);
	}

	/* add into event wait list */
	if (event_register(qconf) == 0)
		intr_en = 1;
	else
		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");

	while (!is_done()) {
		stats[lcore_id].nb_iteration_looped++;

		cur_tsc = rte_rdtsc();

		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			for (i = 0; i < qconf->n_tx_port; ++i) {
				portid = qconf->tx_port_id[i];
				rte_eth_tx_buffer_flush(portid,
						qconf->tx_queue_id[portid],
						qconf->tx_buffer[portid]);
			}
			prev_tsc = cur_tsc;
		}

start_rx:
		/*
		 * Read packet from RX queues
		 */
		lcore_rx_idle_count = 0;
		for (i = 0; i < qconf->n_rx_queue; ++i) {
			rx_queue = &(qconf->rx_queue_list[i]);
			rx_queue->idle_hint = 0;
			portid = rx_queue->port_id;
			queueid = rx_queue->queue_id;

			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
					MAX_PKT_BURST);

			stats[lcore_id].nb_rx_processed += nb_rx;
			if (unlikely(nb_rx == 0)) {
				/**
				 * no packet received from this Rx queue;
				 * sleep for a while to let the CPU enter
				 * deeper C-states.
				 */
				rx_queue->zero_rx_packet_count++;

				if (rx_queue->zero_rx_packet_count <=
							MIN_ZERO_POLL_COUNT)
					continue;

				rx_queue->idle_hint = power_idle_heuristic(
						rx_queue->zero_rx_packet_count);
				lcore_rx_idle_count++;
			} else {
				rx_queue->zero_rx_packet_count = 0;
			}

			/* Prefetch first packets */
			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j], void *));
			}

			/* Prefetch and forward already prefetched packets */
			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j + PREFETCH_OFFSET],
						void *));
				l3fwd_simple_forward(
						pkts_burst[j], portid, qconf);
			}

			/* Forward remaining prefetched packets */
			for (; j < nb_rx; j++) {
				l3fwd_simple_forward(
						pkts_burst[j], portid, qconf);
			}
		}

		if (unlikely(lcore_rx_idle_count == qconf->n_rx_queue)) {
			/**
			 * All Rx queues were empty in recent consecutive
			 * polls; sleep, but in a conservative manner, i.e.
			 * sleep as little as possible.
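			 * The hint used is the smallest idle hint reported
			 * by any of this lcore's Rx queues.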
			 */
			for (i = 1,
			    lcore_idle_hint = qconf->rx_queue_list[0].idle_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->idle_hint < lcore_idle_hint)
					lcore_idle_hint = rx_queue->idle_hint;
			}

			if (lcore_idle_hint < SUSPEND_THRESHOLD)
				/**
				 * execute "pause" instructions to avoid a
				 * context switch, which generally costs
				 * hundreds of microseconds, for such a short
				 * sleep.
				 */
				rte_delay_us(lcore_idle_hint);
			else {
				/* suspend until rx interrupt triggers */
				if (intr_en) {
					turn_on_off_intr(qconf, 1);
					sleep_until_rx_interrupt(
							qconf->n_rx_queue,
							lcore_id);
					turn_on_off_intr(qconf, 0);
					/**
					 * start receiving packets immediately
					 */
					if (likely(!is_done()))
						goto start_rx;
				}
			}
			stats[lcore_id].sleep_time += lcore_idle_hint;
		}
	}

	return 0;
}
/* >8 End of main processing loop. */

/* main processing loop */
static int
main_telemetry_loop(__rte_unused void *dummy)
{
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	unsigned int lcore_id;
	uint64_t prev_tsc, diff_tsc, cur_tsc, prev_tel_tsc;
	int i, j, nb_rx;
	uint16_t portid, queueid;
	struct lcore_conf *qconf;
	struct lcore_rx_queue *rx_queue;
	uint64_t ep_nep[2] = {0}, fp_nfp[2] = {0};
	uint64_t poll_count;
	enum busy_rate br;

	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
				   US_PER_S * BURST_TX_DRAIN_US;

	poll_count = 0;
	prev_tsc = 0;
	prev_tel_tsc = 0;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	if (qconf->n_rx_queue == 0) {
		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n",
			lcore_id);
		return 0;
	}

	RTE_LOG(INFO, L3FWD_POWER, "entering main telemetry loop on lcore %u\n",
		lcore_id);

	for (i = 0; i < qconf->n_rx_queue; i++) {
		portid = qconf->rx_queue_list[i].port_id;
		queueid = qconf->rx_queue_list[i].queue_id;
		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u "
			"rxqueueid=%" PRIu16 "\n", lcore_id, portid, queueid);
	}

	while (!is_done()) {

		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			for (i = 0; i < qconf->n_tx_port; ++i) {
				portid = qconf->tx_port_id[i];
				rte_eth_tx_buffer_flush(portid,
						qconf->tx_queue_id[portid],
						qconf->tx_buffer[portid]);
			}
			prev_tsc = cur_tsc;
		}

		/*
		 * Read packet from RX queues
		 */
		for (i = 0; i < qconf->n_rx_queue; ++i) {
			rx_queue = &(qconf->rx_queue_list[i]);
			portid = rx_queue->port_id;
			queueid = rx_queue->queue_id;

			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
								MAX_PKT_BURST);
			ep_nep[nb_rx == 0]++;
			fp_nfp[nb_rx == MAX_PKT_BURST]++;
			poll_count++;
			if (unlikely(nb_rx == 0))
				continue;

			/* Prefetch first packets */
			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j], void *));
			}

			/* Prefetch and forward already prefetched packets */
			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
						j + PREFETCH_OFFSET], void *));
				l3fwd_simple_forward(pkts_burst[j], portid,
								qconf);
			}

			/* Forward remaining prefetched packets */
			for (; j < nb_rx; j++) {
				l3fwd_simple_forward(pkts_burst[j], portid,
								qconf);
			}
		}
		if (unlikely(poll_count >= DEFAULT_COUNT)) {
			diff_tsc = cur_tsc - prev_tel_tsc;
			if (diff_tsc >= MAX_CYCLES) {
				br = FULL;
			} else if (diff_tsc > MIN_CYCLES &&
					diff_tsc < MAX_CYCLES) {
				br = (diff_tsc * 100) / MAX_CYCLES;
			} else {
				br = ZERO;
			}
			poll_count = 0;
			prev_tel_tsc = cur_tsc;
			/* update stats for telemetry */
			rte_spinlock_lock(&stats[lcore_id].telemetry_lock);
			stats[lcore_id].ep_nep[0] = ep_nep[0];
			stats[lcore_id].ep_nep[1] = ep_nep[1];
			stats[lcore_id].fp_nfp[0] = fp_nfp[0];
			stats[lcore_id].fp_nfp[1] = fp_nfp[1];
			stats[lcore_id].br = br;
			rte_spinlock_unlock(&stats[lcore_id].telemetry_lock);
		}
	}

	return 0;
}

/* main processing loop */
static int
main_legacy_loop(__rte_unused void *dummy)
{
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	unsigned lcore_id;
	uint64_t prev_tsc, diff_tsc, cur_tsc, tim_res_tsc, hz;
	uint64_t prev_tsc_power = 0, cur_tsc_power, diff_tsc_power;
	int i, j, nb_rx;
	uint16_t portid, queueid;
	struct lcore_conf *qconf;
	struct lcore_rx_queue *rx_queue;
	enum freq_scale_hint_t lcore_scaleup_hint;
	uint32_t lcore_rx_idle_count = 0;
	uint32_t lcore_idle_hint = 0;
	int intr_en = 0;

	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;

	prev_tsc = 0;
	hz = rte_get_timer_hz();
	tim_res_tsc = hz/TIMER_NUMBER_PER_SECOND;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	if (qconf->n_rx_queue == 0) {
		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", lcore_id);
		return 0;
	}

	RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id);

	for (i = 0; i < qconf->n_rx_queue; i++) {
		portid = qconf->rx_queue_list[i].port_id;
		queueid = qconf->rx_queue_list[i].queue_id;
		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u "
			"rxqueueid=%" PRIu16 "\n", lcore_id, portid, queueid);
	}

	/* add into event wait list */
	if (event_register(qconf) == 0)
		intr_en = 1;
	else
		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");

	while (!is_done()) {
		stats[lcore_id].nb_iteration_looped++;

		cur_tsc = rte_rdtsc();
		cur_tsc_power = cur_tsc;

		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			for (i = 0; i < qconf->n_tx_port; ++i) {
				portid = qconf->tx_port_id[i];
				rte_eth_tx_buffer_flush(portid,
						qconf->tx_queue_id[portid],
						qconf->tx_buffer[portid]);
			}
			prev_tsc = cur_tsc;
		}

		diff_tsc_power = cur_tsc_power - prev_tsc_power;
		if (diff_tsc_power > tim_res_tsc) {
			rte_timer_manage();
			prev_tsc_power = cur_tsc_power;
		}

start_rx:
		/*
		 * Read packet from RX queues
		 */
		lcore_scaleup_hint = FREQ_CURRENT;
		lcore_rx_idle_count = 0;
		for (i = 0; i < qconf->n_rx_queue; ++i) {
			rx_queue = &(qconf->rx_queue_list[i]);
			rx_queue->idle_hint = 0;
			portid = rx_queue->port_id;
			queueid = rx_queue->queue_id;

			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
								MAX_PKT_BURST);

			stats[lcore_id].nb_rx_processed += nb_rx;
			if (unlikely(nb_rx == 0)) {
				/**
				 * no packet received from this Rx queue;
				 * sleep for a while to let the CPU enter
				 * deeper C-states.
				 */
				rx_queue->zero_rx_packet_count++;

				if (rx_queue->zero_rx_packet_count <=
							MIN_ZERO_POLL_COUNT)
					continue;

				rx_queue->idle_hint = power_idle_heuristic(
					rx_queue->zero_rx_packet_count);
				lcore_rx_idle_count++;
			} else {
				rx_queue->zero_rx_packet_count = 0;

				/**
				 * do not scale up frequency immediately, as
				 * user-to-kernel space communication is
				 * costly and might impact packet I/O for the
				 * received packets.
				 */
				rx_queue->freq_up_hint =
					power_freq_scaleup_heuristic(lcore_id,
							portid, queueid);
			}

			/* Prefetch first packets */
			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j], void *));
			}

			/* Prefetch and forward already prefetched packets */
			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
						j + PREFETCH_OFFSET], void *));
				l3fwd_simple_forward(pkts_burst[j], portid,
								qconf);
			}

			/* Forward remaining prefetched packets */
			for (; j < nb_rx; j++) {
				l3fwd_simple_forward(pkts_burst[j], portid,
								qconf);
			}
		}

		if (likely(lcore_rx_idle_count != qconf->n_rx_queue)) {
			for (i = 1, lcore_scaleup_hint =
					qconf->rx_queue_list[0].freq_up_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->freq_up_hint >
						lcore_scaleup_hint)
					lcore_scaleup_hint =
						rx_queue->freq_up_hint;
			}

			if (lcore_scaleup_hint == FREQ_HIGHEST) {
				rte_power_freq_max(lcore_id);
			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
				rte_power_freq_up(lcore_id);
			}
		} else {
			/**
			 * All Rx queues were empty in recent consecutive
			 * polls; sleep, but in a conservative manner, i.e.
			 * sleep as little as possible.
			 */
			for (i = 1, lcore_idle_hint =
					qconf->rx_queue_list[0].idle_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->idle_hint < lcore_idle_hint)
					lcore_idle_hint = rx_queue->idle_hint;
			}

			if (lcore_idle_hint < SUSPEND_THRESHOLD)
				/**
				 * execute "pause" instructions to avoid a
				 * context switch, which generally costs
				 * hundreds of microseconds, for such a short
				 * sleep.
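				 * (rte_delay_us() uses the default blocking
				 * implementation here, which busy-waits
				 * rather than yielding to the scheduler.)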
				 */
				rte_delay_us(lcore_idle_hint);
			else {
				/* suspend until rx interrupt triggers */
				if (intr_en) {
					turn_on_off_intr(qconf, 1);
					sleep_until_rx_interrupt(
							qconf->n_rx_queue,
							lcore_id);
					turn_on_off_intr(qconf, 0);
					/**
					 * start receiving packets immediately
					 */
					if (likely(!is_done()))
						goto start_rx;
				}
			}
			stats[lcore_id].sleep_time += lcore_idle_hint;
		}
	}

	return 0;
}

static int
check_lcore_params(void)
{
	uint16_t queue, i;
	uint32_t lcore;
	int socketid;

	for (i = 0; i < nb_lcore_params; ++i) {
		queue = lcore_params[i].queue_id;
		if (queue >= MAX_RX_QUEUE_PER_PORT) {
			printf("invalid queue number: %" PRIu16 "\n", queue);
			return -1;
		}
		lcore = lcore_params[i].lcore_id;
		if (!rte_lcore_is_enabled(lcore)) {
			printf("error: lcore %u is not enabled in lcore "
							"mask\n", lcore);
			return -1;
		}
		socketid = rte_lcore_to_socket_id(lcore);
		if (socketid != 0 && numa_on == 0) {
			printf("warning: lcore %u is on socket %d with numa "
							"off\n", lcore, socketid);
		}
		if (app_mode == APP_MODE_TELEMETRY && lcore == rte_lcore_id()) {
			printf("cannot enable main core %d in config for telemetry mode\n",
				rte_lcore_id());
			return -1;
		}
	}
	return 0;
}

static int
check_port_config(void)
{
	unsigned portid;
	uint16_t i;

	for (i = 0; i < nb_lcore_params; ++i) {
		portid = lcore_params[i].port_id;
		if ((enabled_port_mask & (1 << portid)) == 0) {
			printf("port %u is not enabled in port mask\n",
				portid);
			return -1;
		}
		if (!rte_eth_dev_is_valid_port(portid)) {
			printf("port %u is not present on the board\n",
				portid);
			return -1;
		}
	}
	return 0;
}

static uint16_t
get_port_n_rx_queues(const uint16_t port)
{
	int queue = -1;
	uint16_t i;

	for (i = 0; i < nb_lcore_params; ++i) {
		if (lcore_params[i].port_id == port &&
				lcore_params[i].queue_id > queue)
			queue = lcore_params[i].queue_id;
	}
	return (uint16_t)(++queue);
}

static int
init_lcore_rx_queues(void)
{
	uint16_t i, nb_rx_queue;
	uint32_t lcore;

	for (i = 0; i < nb_lcore_params; ++i) {
		lcore = lcore_params[i].lcore_id;
		nb_rx_queue = lcore_conf[lcore].n_rx_queue;
		if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) {
			printf("error: too many queues (%u) for lcore: %u\n",
				(unsigned int)nb_rx_queue + 1, lcore);
			return -1;
		} else {
			lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id =
				lcore_params[i].port_id;
			lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id =
				lcore_params[i].queue_id;
			lcore_conf[lcore].n_rx_queue++;
		}
	}
	return 0;
}

/* display usage */
static void
print_usage(const char *prgname)
{
	printf ("%s [EAL options] -- -p PORTMASK -P"
		" [--config (port,queue,lcore)[,(port,queue,lcore)]]"
		" [--high-perf-cores CORELIST"
		" [--perf-config (port,queue,hi_perf,lcore_index)[,(port,queue,hi_perf,lcore_index)]]"
		" [--max-pkt-len PKTLEN]\n"
		" -p PORTMASK: hexadecimal bitmask of ports to configure\n"
		" -P: enable promiscuous mode\n"
		" -u: set min/max frequency for uncore to minimum value\n"
		" -U: set min/max frequency for uncore to maximum value\n"
		" -i (frequency index): set min/max frequency for "
		"uncore to specified frequency index\n"
		" --config (port,queue,lcore): rx queues configuration\n"
		" --cpu-resume-latency LATENCY: set CPU resume latency to control C-state selection,"
		" 0 : just allow to enter C0-state\n"
		" --high-perf-cores CORELIST: list of high performance cores\n"
		" --perf-config: similar as config, cores specified as indices"
		" for bins containing high or regular performance cores\n"
		" --no-numa: optional, disable numa awareness\n"
		" --max-pkt-len PKTLEN: maximum packet length in decimal (64-9600)\n"
		" --parse-ptype: parse packet type by software\n"
		" --legacy: use legacy interrupt-based scaling\n"
		" --telemetry: enable telemetry mode, to update"
		" empty polls, full polls, and core busyness to telemetry\n"
		" --interrupt-only: enable interrupt-only mode\n"
		" --pmd-mgmt MODE: enable PMD power management mode. "
		"Currently supported modes: baseline, monitor, pause, scale\n"
		" --max-empty-polls MAX_EMPTY_POLLS: number of empty polls to"
		" wait before entering sleep state\n"
		" --pause-duration DURATION: set the duration, in microseconds,"
		" of the pause callback\n"
		" --scale-freq-min FREQ_MIN: set minimum frequency for scaling mode for"
		" all application lcores (FREQ_MIN must be in kHz, in increments of 100MHz)\n"
		" --scale-freq-max FREQ_MAX: set maximum frequency for scaling mode for"
		" all application lcores (FREQ_MAX must be in kHz, in increments of 100MHz)\n",
		prgname);
}

/*
 * Caller must give the right upper limit so as to ensure receiver variable
 * doesn't overflow.
 */
static int
parse_uint(const char *opt, uint32_t max, uint32_t *res)
{
	char *end = NULL;
	unsigned long val;

	/* parse integer string */
	val = strtoul(opt, &end, 10);
	if ((opt[0] == '\0') || (end == NULL) || (*end != '\0'))
		return -1;

	if (val > max) {
		RTE_LOG(ERR, L3FWD_POWER, "%s parameter shouldn't exceed %u.\n",
			opt, max);
		return -1;
	}

	*res = val;

	return 0;
}

static int
parse_uncore_options(enum uncore_choice choice, const char *argument)
{
	unsigned int die, pkg, max_pkg, max_die;
	int ret = 0;
	ret = rte_power_set_uncore_env(RTE_UNCORE_PM_ENV_AUTO_DETECT);
	if (ret < 0) {
		RTE_LOG(INFO, L3FWD_POWER, "Failed to set uncore env\n");
		return ret;
	}

	max_pkg = rte_power_uncore_get_num_pkgs();
	if (max_pkg == 0)
		return -1;

	for (pkg = 0; pkg < max_pkg; pkg++) {
		max_die = rte_power_uncore_get_num_dies(pkg);
		if (max_die == 0)
			return -1;
		for (die = 0; die < max_die; die++) {
			ret = rte_power_uncore_init(pkg, die);
			if (ret == -1) {
				RTE_LOG(INFO, L3FWD_POWER, "Unable to initialize uncore for pkg %02u die %02u\n"
					, pkg, die);
				return ret;
			}
			if (choice == UNCORE_MIN) {
				ret = rte_power_uncore_freq_min(pkg, die);
				if (ret == -1) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Unable to set the uncore min/max to minimum uncore frequency value for pkg %02u die %02u\n"
						, pkg, die);
					return ret;
				}
			} else if (choice == UNCORE_MAX) {
				ret = rte_power_uncore_freq_max(pkg, die);
				if (ret == -1) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Unable to set uncore min/max to maximum uncore frequency value for pkg %02u die %02u\n"
						, pkg, die);
					return ret;
				}
			} else if (choice == UNCORE_IDX) {
				char *ptr = NULL;
				int frequency_index = strtol(argument, &ptr, 10);
				if (argument == ptr) {
					RTE_LOG(INFO, L3FWD_POWER, "Index given is not a valid number.");
					return -1;
				}
				int freq_array_len = rte_power_uncore_get_num_freqs(pkg, die);
				if (frequency_index > freq_array_len - 1) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Frequency index given out of range, please choose a value from 0 to %d.\n",
						freq_array_len - 1);
					return -1;
				}
				ret = rte_power_set_uncore_freq(pkg, die, frequency_index);
				if (ret == -1) {
					RTE_LOG(INFO, L3FWD_POWER,
						"Unable to set min/max uncore index value for pkg %02u die %02u\n",
						pkg, die);
					return ret;
				}
			} else {
				RTE_LOG(INFO, L3FWD_POWER, "Uncore choice provided invalid\n");
				return -1;
			}
		}
	}

	RTE_LOG(INFO, L3FWD_POWER, "Successfully set max/min/index uncore frequency.\n");
	return ret;
}

static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0'))
		return 0;

	return pm;
}

static int
parse_config(const char *q_arg)
{
	char s[256];
	const char *p, *p0 = q_arg;
	char *end;
	enum fieldnames {
		FLD_PORT = 0,
		FLD_QUEUE,
		FLD_LCORE,
		_NUM_FLD
	};
	unsigned long int_fld[_NUM_FLD];
	char *str_fld[_NUM_FLD];
	int i;
	unsigned size;
	unsigned int max_fld[_NUM_FLD] = {
		RTE_MAX_ETHPORTS,
		RTE_MAX_QUEUES_PER_PORT,
		RTE_MAX_LCORE
	};

	nb_lcore_params = 0;

	while ((p = strchr(p0,'(')) != NULL) {
		++p;
		if((p0 = strchr(p,')')) == NULL)
			return -1;

		size = p0 - p;
		if(size >= sizeof(s))
			return -1;

		snprintf(s, sizeof(s), "%.*s", size, p);
		if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') !=
								_NUM_FLD)
			return -1;
		for (i = 0; i < _NUM_FLD; i++){
			errno = 0;
			int_fld[i] = strtoul(str_fld[i], &end, 0);
			if (errno != 0 || end == str_fld[i] || int_fld[i] > max_fld[i])
				return -1;
		}
		if (nb_lcore_params >= MAX_LCORE_PARAMS) {
			printf("exceeded max number of lcore params: %hu\n",
				nb_lcore_params);
			return -1;
		}
		lcore_params_array[nb_lcore_params].port_id =
				(uint16_t)int_fld[FLD_PORT];
		lcore_params_array[nb_lcore_params].queue_id =
				(uint16_t)int_fld[FLD_QUEUE];
		lcore_params_array[nb_lcore_params].lcore_id =
				(uint32_t)int_fld[FLD_LCORE];
		++nb_lcore_params;
	}
	lcore_params = lcore_params_array;

	return 0;
}

static int
parse_pmd_mgmt_config(const char *name)
{
#define PMD_MGMT_MONITOR "monitor"
#define PMD_MGMT_PAUSE   "pause"
#define PMD_MGMT_SCALE   "scale"
#define PMD_MGMT_BASELINE "baseline"

	if (strncmp(PMD_MGMT_MONITOR, name, sizeof(PMD_MGMT_MONITOR)) == 0) {
		pmgmt_type = RTE_POWER_MGMT_TYPE_MONITOR;
		return 0;
	}

	if (strncmp(PMD_MGMT_PAUSE, name, sizeof(PMD_MGMT_PAUSE)) == 0) {
		pmgmt_type = RTE_POWER_MGMT_TYPE_PAUSE;
		return 0;
	}

	if (strncmp(PMD_MGMT_SCALE, name, sizeof(PMD_MGMT_SCALE)) == 0) {
		pmgmt_type = RTE_POWER_MGMT_TYPE_SCALE;
		return 0;
	}
	if (strncmp(PMD_MGMT_BASELINE, name, sizeof(PMD_MGMT_BASELINE)) == 0) {
		baseline_enabled = true;
		return 0;
	}
	/* unknown PMD power management mode */
	return -1;
}

#define CMD_LINE_OPT_PARSE_PTYPE "parse-ptype"
#define CMD_LINE_OPT_LEGACY "legacy"
#define CMD_LINE_OPT_INTERRUPT_ONLY "interrupt-only"
#define CMD_LINE_OPT_TELEMETRY "telemetry"
#define CMD_LINE_OPT_PMD_MGMT "pmd-mgmt"
#define CMD_LINE_OPT_MAX_PKT_LEN "max-pkt-len"
#define CMD_LINE_OPT_MAX_EMPTY_POLLS "max-empty-polls"
#define CMD_LINE_OPT_PAUSE_DURATION "pause-duration"
#define CMD_LINE_OPT_SCALE_FREQ_MIN "scale-freq-min"
#define CMD_LINE_OPT_SCALE_FREQ_MAX "scale-freq-max"
#define CMD_LINE_OPT_CPU_RESUME_LATENCY "cpu-resume-latency"

/* Parse the argument given in the command line of the application */
static int
parse_args(int argc, char **argv)
{
	int opt, ret;
	char **argvopt;
	int option_index;
	char *prgname = argv[0];
	static struct option lgopts[] = {
		{"config", 1, 0, 0},
		{"perf-config", 1, 0, 0},
		{"high-perf-cores", 1, 0, 0},
		{"no-numa", 0, 0, 0},
		{CMD_LINE_OPT_CPU_RESUME_LATENCY, 1, 0, 0},
		{CMD_LINE_OPT_MAX_PKT_LEN, 1, 0, 0},
		{CMD_LINE_OPT_PARSE_PTYPE, 0, 0, 0},
		{CMD_LINE_OPT_LEGACY, 0, 0, 0},
		{CMD_LINE_OPT_TELEMETRY, 0, 0, 0},
		{CMD_LINE_OPT_INTERRUPT_ONLY, 0, 0, 0},
		{CMD_LINE_OPT_PMD_MGMT, 1, 0, 0},
		{CMD_LINE_OPT_MAX_EMPTY_POLLS, 1, 0, 0},
		{CMD_LINE_OPT_PAUSE_DURATION, 1, 0, 0},
		{CMD_LINE_OPT_SCALE_FREQ_MIN, 1, 0, 0},
		{CMD_LINE_OPT_SCALE_FREQ_MAX, 1, 0, 0},
		{NULL, 0, 0, 0}
	};

	argvopt = argv;

	while ((opt = getopt_long(argc, argvopt, "p:PuUi:",
				lgopts, &option_index)) != EOF) {

		switch (opt) {
		/* portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				printf("invalid portmask\n");
				print_usage(prgname);
				return -1;
			}
			break;
		case 'P':
			printf("Promiscuous mode selected\n");
			promiscuous_on = 1;
			break;
		case 'u':
			enabled_uncore = parse_uncore_options(UNCORE_MIN, NULL);
			if (enabled_uncore < 0) {
				print_usage(prgname);
				return -1;
			}
			break;
		case 'U':
			enabled_uncore = parse_uncore_options(UNCORE_MAX, NULL);
			if (enabled_uncore < 0) {
				print_usage(prgname);
				return -1;
			}
			break;
		case 'i':
			enabled_uncore = parse_uncore_options(UNCORE_IDX, optarg);
			if (enabled_uncore < 0) {
				print_usage(prgname);
				return -1;
			}
			break;
		/* long options */
		case 0:
			if (!strncmp(lgopts[option_index].name, "config", 6)) {
				ret = parse_config(optarg);
				if (ret) {
					printf("invalid config\n");
					print_usage(prgname);
					return -1;
				}
			}

			if (!strncmp(lgopts[option_index].name,
					"perf-config", 11)) {
				ret = parse_perf_config(optarg);
				if (ret) {
					printf("invalid perf-config\n");
					print_usage(prgname);
					return -1;
				}
			}

			if (!strncmp(lgopts[option_index].name,
					"high-perf-cores", 15)) {
				ret = parse_perf_core_list(optarg);
				if (ret) {
					printf("invalid high-perf-cores\n");
					print_usage(prgname);
					return -1;
				}
			}

			if (!strncmp(lgopts[option_index].name,
					"no-numa", 7)) {
				printf("numa is disabled \n");
				numa_on = 0;
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_LEGACY,
					sizeof(CMD_LINE_OPT_LEGACY))) {
				if (app_mode != APP_MODE_DEFAULT) {
					printf(" legacy mode is mutually exclusive with other modes\n");
					return -1;
				}
				app_mode = APP_MODE_LEGACY;
				printf("legacy mode is enabled\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_TELEMETRY,
					sizeof(CMD_LINE_OPT_TELEMETRY))) {
				if (app_mode != APP_MODE_DEFAULT) {
					printf(" telemetry mode is mutually exclusive with other modes\n");
					return -1;
				}
				app_mode = APP_MODE_TELEMETRY;
				printf("telemetry mode is enabled\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_PMD_MGMT,
					sizeof(CMD_LINE_OPT_PMD_MGMT))) {
				if (app_mode != APP_MODE_DEFAULT) {
					printf(" power mgmt mode is mutually exclusive with other modes\n");
					return -1;
				}
				if (parse_pmd_mgmt_config(optarg) < 0) {
					printf(" Invalid PMD power management mode: %s\n",
							optarg);
					return -1;
				}
				app_mode = APP_MODE_PMD_MGMT;
				printf("PMD power mgmt mode is enabled\n");
			}
			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_INTERRUPT_ONLY,
					sizeof(CMD_LINE_OPT_INTERRUPT_ONLY))) {
				if (app_mode != APP_MODE_DEFAULT) {
					printf(" interrupt-only mode is mutually exclusive with other modes\n");
					return -1;
				}
				app_mode = APP_MODE_INTERRUPT;
				printf("interrupt-only mode is enabled\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_MAX_PKT_LEN,
					sizeof(CMD_LINE_OPT_MAX_PKT_LEN))) {
				if (parse_uint(optarg, UINT32_MAX, &max_pkt_len) != 0)
					return -1;
				printf("Custom frame size is configured\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_PARSE_PTYPE,
					sizeof(CMD_LINE_OPT_PARSE_PTYPE))) {
				printf("soft parse-ptype is enabled\n");
				parse_ptype = 1;
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_MAX_EMPTY_POLLS,
					sizeof(CMD_LINE_OPT_MAX_EMPTY_POLLS))) {
				if (parse_uint(optarg, UINT32_MAX, &max_empty_polls) != 0)
					return -1;
				printf("Maximum empty polls configured\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_PAUSE_DURATION,
					sizeof(CMD_LINE_OPT_PAUSE_DURATION))) {
				if (parse_uint(optarg, UINT32_MAX, &pause_duration) != 0)
					return -1;
				printf("Pause duration configured\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_SCALE_FREQ_MIN,
					sizeof(CMD_LINE_OPT_SCALE_FREQ_MIN))) {
				if (parse_uint(optarg, UINT32_MAX, &scale_freq_min) != 0)
					return -1;
				printf("Scaling frequency minimum configured\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_SCALE_FREQ_MAX,
					sizeof(CMD_LINE_OPT_SCALE_FREQ_MAX))) {
				if (parse_uint(optarg, UINT32_MAX, &scale_freq_max) != 0)
					return -1;
				printf("Scaling frequency maximum configured\n");
			}

			if (!strncmp(lgopts[option_index].name,
					CMD_LINE_OPT_CPU_RESUME_LATENCY,
					sizeof(CMD_LINE_OPT_CPU_RESUME_LATENCY))) {
				if (parse_uint(optarg, INT_MAX,
						(uint32_t *)&cpu_resume_latency) != 0)
					return -1;
				printf("PM QoS configured\n");
			}

			break;

		default:
			print_usage(prgname);
			return -1;
		}
	}

	if (optind >= 0)
		argv[optind-1] = prgname;

	ret = optind-1;
	optind = 1; /* reset getopt lib */
	return ret;
}

static void
print_ethaddr(const char *name, const struct rte_ether_addr *eth_addr)
{
	char buf[RTE_ETHER_ADDR_FMT_SIZE];
	rte_ether_format_addr(buf, RTE_ETHER_ADDR_FMT_SIZE, eth_addr);
	printf("%s%s", name, buf);
}

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
static void
setup_hash(int socketid)
{
	struct rte_hash_parameters ipv4_l3fwd_hash_params = {
		.name = NULL,
		.entries = L3FWD_HASH_ENTRIES,
		.key_len = sizeof(struct ipv4_5tuple),
		.hash_func = DEFAULT_HASH_FUNC,
		.hash_func_init_val = 0,
	};

	struct rte_hash_parameters ipv6_l3fwd_hash_params = {
		.name = NULL,
		.entries = L3FWD_HASH_ENTRIES,
		.key_len = sizeof(struct ipv6_5tuple),
		.hash_func = DEFAULT_HASH_FUNC,
		.hash_func_init_val = 0,
	};

	unsigned i;
	int ret;
	char s[64];

	/* create ipv4 hash */
	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
	ipv4_l3fwd_hash_params.name = s;
	ipv4_l3fwd_hash_params.socket_id = socketid;
	ipv4_l3fwd_lookup_struct[socketid] =
		rte_hash_create(&ipv4_l3fwd_hash_params);
	if (ipv4_l3fwd_lookup_struct[socketid] == NULL)
		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on "
				"socket %d\n", socketid);

	/* create ipv6 hash */
	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
	ipv6_l3fwd_hash_params.name = s;
	ipv6_l3fwd_hash_params.socket_id = socketid;
	ipv6_l3fwd_lookup_struct[socketid] =
		rte_hash_create(&ipv6_l3fwd_hash_params);
	if (ipv6_l3fwd_lookup_struct[socketid] == NULL)
		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on "
				"socket %d\n", socketid);


	/* populate the ipv4 hash */
	for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) {
		ret = rte_hash_add_key (ipv4_l3fwd_lookup_struct[socketid],
				(void *) &ipv4_l3fwd_route_array[i].key);
		if (ret < 0) {
			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
				"l3fwd hash on socket %d\n", i, socketid);
		}
		ipv4_l3fwd_out_if[ret] = ipv4_l3fwd_route_array[i].if_out;
		printf("Hash: Adding key\n");
		print_ipv4_key(ipv4_l3fwd_route_array[i].key);
	}

	/* populate the ipv6 hash */
	for (i = 0; i < RTE_DIM(ipv6_l3fwd_route_array); i++) {
		ret = rte_hash_add_key (ipv6_l3fwd_lookup_struct[socketid],
				(void *) &ipv6_l3fwd_route_array[i].key);
		if (ret < 0) {
			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
				"l3fwd hash on socket %d\n", i, socketid);
		}
		ipv6_l3fwd_out_if[ret] = ipv6_l3fwd_route_array[i].if_out;
		printf("Hash: Adding key\n");
		print_ipv6_key(ipv6_l3fwd_route_array[i].key);
	}
}
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
static void
setup_lpm(int socketid)
{
	unsigned i;
	int ret;
	char s[64];

	/* create the LPM table */
	struct rte_lpm_config lpm_ipv4_config;

	lpm_ipv4_config.max_rules = IPV4_L3FWD_LPM_MAX_RULES;
	lpm_ipv4_config.number_tbl8s = 256;
	lpm_ipv4_config.flags = 0;

	snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid);
	ipv4_l3fwd_lookup_struct[socketid] =
			rte_lpm_create(s, socketid, &lpm_ipv4_config);
	if (ipv4_l3fwd_lookup_struct[socketid] == NULL)
		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table"
				" on socket %d\n", socketid);

	/* populate the LPM table */
	for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) {
		ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid],
			ipv4_l3fwd_route_array[i].ip,
			ipv4_l3fwd_route_array[i].depth,
			ipv4_l3fwd_route_array[i].if_out);

2080 if (ret < 0) { 2081 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the " 2082 "l3fwd LPM table on socket %d\n", 2083 i, socketid); 2084 } 2085 2086 printf("LPM: Adding route 0x%08x / %d (%d)\n", 2087 (unsigned)ipv4_l3fwd_route_array[i].ip, 2088 ipv4_l3fwd_route_array[i].depth, 2089 ipv4_l3fwd_route_array[i].if_out); 2090 } 2091 } 2092 #endif 2093 2094 static int 2095 init_mem(unsigned nb_mbuf) 2096 { 2097 struct lcore_conf *qconf; 2098 int socketid; 2099 unsigned lcore_id; 2100 char s[64]; 2101 2102 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2103 if (rte_lcore_is_enabled(lcore_id) == 0) 2104 continue; 2105 2106 if (numa_on) 2107 socketid = rte_lcore_to_socket_id(lcore_id); 2108 else 2109 socketid = 0; 2110 2111 if (socketid >= NB_SOCKETS) { 2112 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is " 2113 "out of range %d\n", socketid, 2114 lcore_id, NB_SOCKETS); 2115 } 2116 if (pktmbuf_pool[socketid] == NULL) { 2117 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 2118 pktmbuf_pool[socketid] = 2119 rte_pktmbuf_pool_create(s, nb_mbuf, 2120 MEMPOOL_CACHE_SIZE, 0, 2121 RTE_MBUF_DEFAULT_BUF_SIZE, 2122 socketid); 2123 if (pktmbuf_pool[socketid] == NULL) 2124 rte_exit(EXIT_FAILURE, 2125 "Cannot init mbuf pool on socket %d\n", 2126 socketid); 2127 else 2128 printf("Allocated mbuf pool on socket %d\n", 2129 socketid); 2130 2131 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2132 setup_lpm(socketid); 2133 #else 2134 setup_hash(socketid); 2135 #endif 2136 } 2137 qconf = &lcore_conf[lcore_id]; 2138 qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid]; 2139 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2140 qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid]; 2141 #endif 2142 } 2143 return 0; 2144 } 2145 2146 /* Check the link status of all ports in up to 9s, and print them finally */ 2147 static void 2148 check_all_ports_link_status(uint32_t port_mask) 2149 { 2150 #define CHECK_INTERVAL 100 /* 100ms */ 2151 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 2152 uint8_t count, all_ports_up, print_flag = 0; 2153 uint16_t portid; 2154 struct rte_eth_link link; 2155 int ret; 2156 char link_status_text[RTE_ETH_LINK_MAX_STR_LEN]; 2157 2158 printf("\nChecking link status"); 2159 fflush(stdout); 2160 for (count = 0; count <= MAX_CHECK_TIME; count++) { 2161 all_ports_up = 1; 2162 RTE_ETH_FOREACH_DEV(portid) { 2163 if ((port_mask & (1 << portid)) == 0) 2164 continue; 2165 memset(&link, 0, sizeof(link)); 2166 ret = rte_eth_link_get_nowait(portid, &link); 2167 if (ret < 0) { 2168 all_ports_up = 0; 2169 if (print_flag == 1) 2170 printf("Port %u link get failed: %s\n", 2171 portid, rte_strerror(-ret)); 2172 continue; 2173 } 2174 /* print link status if flag set */ 2175 if (print_flag == 1) { 2176 rte_eth_link_to_str(link_status_text, 2177 sizeof(link_status_text), &link); 2178 printf("Port %d %s\n", portid, 2179 link_status_text); 2180 continue; 2181 } 2182 /* clear all_ports_up flag if any link down */ 2183 if (link.link_status == RTE_ETH_LINK_DOWN) { 2184 all_ports_up = 0; 2185 break; 2186 } 2187 } 2188 /* after finally printing all link status, get out */ 2189 if (print_flag == 1) 2190 break; 2191 2192 if (all_ports_up == 0) { 2193 printf("."); 2194 fflush(stdout); 2195 rte_delay_ms(CHECK_INTERVAL); 2196 } 2197 2198 /* set the print_flag if all ports up or timeout */ 2199 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 2200 print_flag = 1; 2201 printf("done\n"); 2202 } 2203 } 2204 } 2205 2206 static int check_ptype(uint16_t portid) 2207 { 2208 int i, 
ret; 2209 int ptype_l3_ipv4 = 0; 2210 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2211 int ptype_l3_ipv6 = 0; 2212 #endif 2213 uint32_t ptype_mask = RTE_PTYPE_L3_MASK; 2214 2215 ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, NULL, 0); 2216 if (ret <= 0) 2217 return 0; 2218 2219 uint32_t ptypes[ret]; 2220 2221 ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, ptypes, ret); 2222 for (i = 0; i < ret; ++i) { 2223 if (ptypes[i] & RTE_PTYPE_L3_IPV4) 2224 ptype_l3_ipv4 = 1; 2225 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2226 if (ptypes[i] & RTE_PTYPE_L3_IPV6) 2227 ptype_l3_ipv6 = 1; 2228 #endif 2229 } 2230 2231 if (ptype_l3_ipv4 == 0) 2232 printf("port %d cannot parse RTE_PTYPE_L3_IPV4\n", portid); 2233 2234 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2235 if (ptype_l3_ipv6 == 0) 2236 printf("port %d cannot parse RTE_PTYPE_L3_IPV6\n", portid); 2237 #endif 2238 2239 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2240 if (ptype_l3_ipv4) 2241 #else /* APP_LOOKUP_EXACT_MATCH */ 2242 if (ptype_l3_ipv4 && ptype_l3_ipv6) 2243 #endif 2244 return 1; 2245 2246 return 0; 2247 2248 } 2249 2250 static int 2251 init_power_library(void) 2252 { 2253 enum power_management_env env; 2254 unsigned int lcore_id; 2255 int ret = 0; 2256 2257 RTE_LCORE_FOREACH(lcore_id) { 2258 /* init power management library */ 2259 ret = rte_power_init(lcore_id); 2260 if (ret) { 2261 RTE_LOG(ERR, L3FWD_POWER, 2262 "Library initialization failed on core %u\n", 2263 lcore_id); 2264 return ret; 2265 } 2266 /* we're not supporting the VM channel mode */ 2267 env = rte_power_get_env(); 2268 if (env != PM_ENV_ACPI_CPUFREQ && 2269 env != PM_ENV_PSTATE_CPUFREQ && 2270 env != PM_ENV_AMD_PSTATE_CPUFREQ && 2271 env != PM_ENV_CPPC_CPUFREQ) { 2272 RTE_LOG(ERR, L3FWD_POWER, 2273 "Only ACPI and PSTATE mode are supported\n"); 2274 return -1; 2275 } 2276 } 2277 2278 if (cpu_resume_latency != -1) { 2279 RTE_LCORE_FOREACH(lcore_id) { 2280 /* Back old CPU resume latency. */ 2281 ret = rte_power_qos_get_cpu_resume_latency(lcore_id); 2282 if (ret < 0) { 2283 RTE_LOG(ERR, L3FWD_POWER, 2284 "Failed to get cpu resume latency on lcore-%u, ret=%d.\n", 2285 lcore_id, ret); 2286 } 2287 resume_latency_bk[lcore_id] = ret; 2288 2289 /* 2290 * Set the cpu resume latency of the worker lcore based 2291 * on user's request. If set strict latency (0), just 2292 * allow the CPU to enter the shallowest idle state to 2293 * improve performance. 
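			 * A larger latency bound lets the core enter deeper idle
			 * states, trading wake-up time for additional power savings.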
2294 */ 2295 ret = rte_power_qos_set_cpu_resume_latency(lcore_id, 2296 cpu_resume_latency); 2297 if (ret != 0) { 2298 RTE_LOG(ERR, L3FWD_POWER, 2299 "Failed to set cpu resume latency on lcore-%u, ret=%d.\n", 2300 lcore_id, ret); 2301 return ret; 2302 } 2303 } 2304 } 2305 2306 return ret; 2307 } 2308 2309 static int 2310 deinit_power_library(void) 2311 { 2312 unsigned int lcore_id, max_pkg, max_die, die, pkg; 2313 int ret = 0; 2314 2315 RTE_LCORE_FOREACH(lcore_id) { 2316 /* deinit power management library */ 2317 ret = rte_power_exit(lcore_id); 2318 if (ret) { 2319 RTE_LOG(ERR, L3FWD_POWER, 2320 "Library deinitialization failed on core %u\n", 2321 lcore_id); 2322 return ret; 2323 } 2324 } 2325 2326 /* if uncore option was set */ 2327 if (enabled_uncore == 0) { 2328 max_pkg = rte_power_uncore_get_num_pkgs(); 2329 if (max_pkg == 0) 2330 return -1; 2331 for (pkg = 0; pkg < max_pkg; pkg++) { 2332 max_die = rte_power_uncore_get_num_dies(pkg); 2333 if (max_die == 0) 2334 return -1; 2335 for (die = 0; die < max_die; die++) { 2336 ret = rte_power_uncore_exit(pkg, die); 2337 if (ret < 0) { 2338 RTE_LOG(ERR, L3FWD_POWER, "Failed to exit uncore deinit successfully for pkg %02u die %02u\n" 2339 , pkg, die); 2340 return -1; 2341 } 2342 } 2343 } 2344 } 2345 2346 if (cpu_resume_latency != -1) { 2347 RTE_LCORE_FOREACH(lcore_id) { 2348 /* Restore the original value. */ 2349 rte_power_qos_set_cpu_resume_latency(lcore_id, 2350 resume_latency_bk[lcore_id]); 2351 } 2352 } 2353 2354 return ret; 2355 } 2356 2357 static void 2358 get_current_stat_values(uint64_t *values) 2359 { 2360 unsigned int lcore_id = rte_lcore_id(); 2361 struct lcore_conf *qconf; 2362 uint64_t app_eps = 0, app_fps = 0, app_br = 0; 2363 uint64_t count = 0; 2364 2365 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2366 qconf = &lcore_conf[lcore_id]; 2367 if (qconf->n_rx_queue == 0) 2368 continue; 2369 count++; 2370 rte_spinlock_lock(&stats[lcore_id].telemetry_lock); 2371 app_eps += stats[lcore_id].ep_nep[1]; 2372 app_fps += stats[lcore_id].fp_nfp[1]; 2373 app_br += stats[lcore_id].br; 2374 rte_spinlock_unlock(&stats[lcore_id].telemetry_lock); 2375 } 2376 2377 if (count > 0) { 2378 values[0] = app_eps/count; 2379 values[1] = app_fps/count; 2380 values[2] = app_br/count; 2381 } else 2382 memset(values, 0, sizeof(uint64_t) * NUM_TELSTATS); 2383 2384 } 2385 2386 static void 2387 update_telemetry(__rte_unused struct rte_timer *tim, 2388 __rte_unused void *arg) 2389 { 2390 int ret; 2391 uint64_t values[NUM_TELSTATS] = {0}; 2392 2393 get_current_stat_values(values); 2394 ret = rte_metrics_update_values(RTE_METRICS_GLOBAL, telstats_index, 2395 values, RTE_DIM(values)); 2396 if (ret < 0) 2397 RTE_LOG(WARNING, L3FWD_POWER, "failed to update metrics\n"); 2398 } 2399 2400 static int 2401 handle_app_stats(const char *cmd __rte_unused, 2402 const char *params __rte_unused, 2403 struct rte_tel_data *d) 2404 { 2405 uint64_t values[NUM_TELSTATS] = {0}; 2406 uint32_t i; 2407 2408 rte_tel_data_start_dict(d); 2409 get_current_stat_values(values); 2410 for (i = 0; i < NUM_TELSTATS; i++) 2411 rte_tel_data_add_dict_uint(d, telstats_strings[i].name, 2412 values[i]); 2413 return 0; 2414 } 2415 2416 static void 2417 telemetry_setup_timer(void) 2418 { 2419 int lcore_id = rte_lcore_id(); 2420 uint64_t hz = rte_get_timer_hz(); 2421 uint64_t ticks; 2422 2423 ticks = hz / TELEMETRY_INTERVALS_PER_SEC; 2424 rte_timer_reset_sync(&telemetry_timer, 2425 ticks, 2426 PERIODICAL, 2427 lcore_id, 2428 update_telemetry, 2429 NULL); 2430 } 2431 2432 static int 2433 launch_timer(unsigned int 
lcore_id) 2434 { 2435 int64_t prev_tsc = 0, cur_tsc, diff_tsc, cycles_10ms; 2436 2437 RTE_SET_USED(lcore_id); 2438 2439 2440 if (rte_get_main_lcore() != lcore_id) { 2441 rte_panic("timer on lcore:%d which is not main core:%d\n", 2442 lcore_id, 2443 rte_get_main_lcore()); 2444 } 2445 2446 RTE_LOG(INFO, L3FWD_POWER, "Bring up the Timer\n"); 2447 2448 telemetry_setup_timer(); 2449 2450 cycles_10ms = rte_get_timer_hz() / 100; 2451 2452 while (!is_done()) { 2453 cur_tsc = rte_rdtsc(); 2454 diff_tsc = cur_tsc - prev_tsc; 2455 if (diff_tsc > cycles_10ms) { 2456 rte_timer_manage(); 2457 prev_tsc = cur_tsc; 2458 cycles_10ms = rte_get_timer_hz() / 100; 2459 } 2460 } 2461 2462 RTE_LOG(INFO, L3FWD_POWER, "Timer_subsystem is done\n"); 2463 2464 return 0; 2465 } 2466 2467 static int 2468 autodetect_mode(void) 2469 { 2470 RTE_LOG(NOTICE, L3FWD_POWER, "Operating mode not specified, probing frequency scaling support...\n"); 2471 2472 /* 2473 * Empty poll and telemetry modes have to be specifically requested to 2474 * be enabled, but we can auto-detect between interrupt mode with or 2475 * without frequency scaling. Any of ACPI, pstate and CPPC can be used. 2476 */ 2477 if (rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ)) 2478 return APP_MODE_LEGACY; 2479 if (rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ)) 2480 return APP_MODE_LEGACY; 2481 if (rte_power_check_env_supported(PM_ENV_AMD_PSTATE_CPUFREQ)) 2482 return APP_MODE_LEGACY; 2483 if (rte_power_check_env_supported(PM_ENV_CPPC_CPUFREQ)) 2484 return APP_MODE_LEGACY; 2485 2486 RTE_LOG(NOTICE, L3FWD_POWER, "Frequency scaling not supported, selecting interrupt-only mode\n"); 2487 2488 return APP_MODE_INTERRUPT; 2489 } 2490 2491 static const char * 2492 mode_to_str(enum appmode mode) 2493 { 2494 switch (mode) { 2495 case APP_MODE_LEGACY: 2496 return "legacy"; 2497 case APP_MODE_TELEMETRY: 2498 return "telemetry"; 2499 case APP_MODE_INTERRUPT: 2500 return "interrupt-only"; 2501 case APP_MODE_PMD_MGMT: 2502 return "pmd mgmt"; 2503 default: 2504 return "invalid"; 2505 } 2506 } 2507 2508 static uint32_t 2509 eth_dev_get_overhead_len(uint32_t max_rx_pktlen, uint16_t max_mtu) 2510 { 2511 uint32_t overhead_len; 2512 2513 if (max_mtu != UINT16_MAX && max_rx_pktlen > max_mtu) 2514 overhead_len = max_rx_pktlen - max_mtu; 2515 else 2516 overhead_len = RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN; 2517 2518 return overhead_len; 2519 } 2520 2521 static int 2522 config_port_max_pkt_len(struct rte_eth_conf *conf, 2523 struct rte_eth_dev_info *dev_info) 2524 { 2525 uint32_t overhead_len; 2526 2527 if (max_pkt_len == 0) 2528 return 0; 2529 2530 if (max_pkt_len < RTE_ETHER_MIN_LEN || max_pkt_len > MAX_JUMBO_PKT_LEN) 2531 return -1; 2532 2533 overhead_len = eth_dev_get_overhead_len(dev_info->max_rx_pktlen, 2534 dev_info->max_mtu); 2535 conf->rxmode.mtu = max_pkt_len - overhead_len; 2536 2537 if (conf->rxmode.mtu > RTE_ETHER_MTU) 2538 conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS; 2539 2540 return 0; 2541 } 2542 2543 /* Power library initialized in the main routine. 
8< */ 2544 int 2545 main(int argc, char **argv) 2546 { 2547 struct lcore_conf *qconf; 2548 struct rte_eth_dev_info dev_info; 2549 struct rte_eth_txconf *txconf; 2550 int ret; 2551 uint16_t nb_ports; 2552 uint16_t queueid; 2553 unsigned lcore_id; 2554 uint64_t hz; 2555 uint32_t n_tx_queue, nb_lcores; 2556 uint32_t dev_rxq_num, dev_txq_num; 2557 uint8_t socketid; 2558 uint16_t portid, nb_rx_queue, queue; 2559 const char *ptr_strings[NUM_TELSTATS]; 2560 2561 /* init EAL */ 2562 ret = rte_eal_init(argc, argv); 2563 if (ret < 0) 2564 rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n"); 2565 argc -= ret; 2566 argv += ret; 2567 2568 /* catch SIGINT and restore cpufreq governor to ondemand */ 2569 signal(SIGINT, signal_exit_now); 2570 2571 /* init RTE timer library to be used late */ 2572 rte_timer_subsystem_init(); 2573 2574 /* if we're running pmd-mgmt mode, don't default to baseline mode */ 2575 baseline_enabled = false; 2576 2577 /* parse application arguments (after the EAL ones) */ 2578 ret = parse_args(argc, argv); 2579 if (ret < 0) 2580 rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n"); 2581 2582 if (app_mode == APP_MODE_DEFAULT) 2583 app_mode = autodetect_mode(); 2584 2585 RTE_LOG(INFO, L3FWD_POWER, "Selected operation mode: %s\n", 2586 mode_to_str(app_mode)); 2587 2588 /* only legacy mode relies on power library */ 2589 if ((app_mode == APP_MODE_LEGACY) && init_power_library()) 2590 rte_exit(EXIT_FAILURE, "init_power_library failed\n"); 2591 2592 if (update_lcore_params() < 0) 2593 rte_exit(EXIT_FAILURE, "update_lcore_params failed\n"); 2594 2595 if (check_lcore_params() < 0) 2596 rte_exit(EXIT_FAILURE, "check_lcore_params failed\n"); 2597 2598 ret = init_lcore_rx_queues(); 2599 if (ret < 0) 2600 rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n"); 2601 2602 nb_ports = rte_eth_dev_count_avail(); 2603 2604 if (check_port_config() < 0) 2605 rte_exit(EXIT_FAILURE, "check_port_config failed\n"); 2606 2607 nb_lcores = rte_lcore_count(); 2608 2609 /* initialize all ports */ 2610 RTE_ETH_FOREACH_DEV(portid) { 2611 struct rte_eth_conf local_port_conf = port_conf; 2612 /* not all app modes need interrupts */ 2613 bool need_intr = app_mode == APP_MODE_LEGACY || 2614 app_mode == APP_MODE_INTERRUPT; 2615 2616 /* skip ports that are not enabled */ 2617 if ((enabled_port_mask & (1 << portid)) == 0) { 2618 printf("\nSkipping disabled port %d\n", portid); 2619 continue; 2620 } 2621 2622 /* init port */ 2623 printf("Initializing port %d ... ", portid ); 2624 fflush(stdout); 2625 2626 ret = rte_eth_dev_info_get(portid, &dev_info); 2627 if (ret != 0) 2628 rte_exit(EXIT_FAILURE, 2629 "Error during getting device (port %u) info: %s\n", 2630 portid, strerror(-ret)); 2631 2632 dev_rxq_num = dev_info.max_rx_queues; 2633 dev_txq_num = dev_info.max_tx_queues; 2634 2635 nb_rx_queue = get_port_n_rx_queues(portid); 2636 if (nb_rx_queue > dev_rxq_num) 2637 rte_exit(EXIT_FAILURE, 2638 "Cannot configure not existed rxq: " 2639 "port=%d\n", portid); 2640 2641 n_tx_queue = nb_lcores; 2642 if (n_tx_queue > dev_txq_num) 2643 n_tx_queue = dev_txq_num; 2644 printf("Creating queues: nb_rxq=%d nb_txq=%u... 
", 2645 nb_rx_queue, (unsigned)n_tx_queue ); 2646 /* If number of Rx queue is 0, no need to enable Rx interrupt */ 2647 if (nb_rx_queue == 0) 2648 need_intr = false; 2649 2650 if (need_intr) 2651 local_port_conf.intr_conf.rxq = 1; 2652 2653 ret = rte_eth_dev_info_get(portid, &dev_info); 2654 if (ret != 0) 2655 rte_exit(EXIT_FAILURE, 2656 "Error during getting device (port %u) info: %s\n", 2657 portid, strerror(-ret)); 2658 2659 ret = config_port_max_pkt_len(&local_port_conf, &dev_info); 2660 if (ret != 0) 2661 rte_exit(EXIT_FAILURE, 2662 "Invalid max packet length: %u (port %u)\n", 2663 max_pkt_len, portid); 2664 2665 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 2666 local_port_conf.txmode.offloads |= 2667 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; 2668 2669 local_port_conf.rx_adv_conf.rss_conf.rss_hf &= 2670 dev_info.flow_type_rss_offloads; 2671 if (local_port_conf.rx_adv_conf.rss_conf.rss_hf != 2672 port_conf.rx_adv_conf.rss_conf.rss_hf) { 2673 printf("Port %u modified RSS hash function based on hardware support," 2674 "requested:%#"PRIx64" configured:%#"PRIx64"\n", 2675 portid, 2676 port_conf.rx_adv_conf.rss_conf.rss_hf, 2677 local_port_conf.rx_adv_conf.rss_conf.rss_hf); 2678 } 2679 2680 if (local_port_conf.rx_adv_conf.rss_conf.rss_hf == 0) 2681 local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE; 2682 local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa; 2683 port_conf.rxmode.offloads = local_port_conf.rxmode.offloads; 2684 2685 ret = rte_eth_dev_configure(portid, nb_rx_queue, 2686 (uint16_t)n_tx_queue, &local_port_conf); 2687 if (ret < 0) 2688 rte_exit(EXIT_FAILURE, "Cannot configure device: " 2689 "err=%d, port=%d\n", ret, portid); 2690 2691 ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, 2692 &nb_txd); 2693 if (ret < 0) 2694 rte_exit(EXIT_FAILURE, 2695 "Cannot adjust number of descriptors: err=%d, port=%d\n", 2696 ret, portid); 2697 2698 ret = rte_eth_macaddr_get(portid, &ports_eth_addr[portid]); 2699 if (ret < 0) 2700 rte_exit(EXIT_FAILURE, 2701 "Cannot get MAC address: err=%d, port=%d\n", 2702 ret, portid); 2703 2704 print_ethaddr(" Address:", &ports_eth_addr[portid]); 2705 printf(", "); 2706 2707 /* init memory */ 2708 ret = init_mem(NB_MBUF); 2709 if (ret < 0) 2710 rte_exit(EXIT_FAILURE, "init_mem failed\n"); 2711 2712 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2713 if (rte_lcore_is_enabled(lcore_id) == 0) 2714 continue; 2715 2716 /* Initialize TX buffers */ 2717 qconf = &lcore_conf[lcore_id]; 2718 qconf->tx_buffer[portid] = rte_zmalloc_socket("tx_buffer", 2719 RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0, 2720 rte_eth_dev_socket_id(portid)); 2721 if (qconf->tx_buffer[portid] == NULL) 2722 rte_exit(EXIT_FAILURE, "Can't allocate tx buffer for port %u\n", 2723 portid); 2724 2725 rte_eth_tx_buffer_init(qconf->tx_buffer[portid], MAX_PKT_BURST); 2726 } 2727 2728 /* init one TX queue per couple (lcore,port) */ 2729 queueid = 0; 2730 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2731 if (rte_lcore_is_enabled(lcore_id) == 0) 2732 continue; 2733 2734 if (queueid >= dev_txq_num) 2735 continue; 2736 2737 if (numa_on) 2738 socketid = \ 2739 (uint8_t)rte_lcore_to_socket_id(lcore_id); 2740 else 2741 socketid = 0; 2742 2743 printf("txq=%u,%d,%d ", lcore_id, queueid, socketid); 2744 fflush(stdout); 2745 2746 txconf = &dev_info.default_txconf; 2747 txconf->offloads = local_port_conf.txmode.offloads; 2748 ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd, 2749 socketid, txconf); 2750 if (ret < 0) 2751 rte_exit(EXIT_FAILURE, 2752 
"rte_eth_tx_queue_setup: err=%d, " 2753 "port=%d\n", ret, portid); 2754 2755 qconf = &lcore_conf[lcore_id]; 2756 qconf->tx_queue_id[portid] = queueid; 2757 queueid++; 2758 2759 qconf->tx_port_id[qconf->n_tx_port] = portid; 2760 qconf->n_tx_port++; 2761 } 2762 printf("\n"); 2763 } 2764 2765 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2766 if (rte_lcore_is_enabled(lcore_id) == 0) 2767 continue; 2768 2769 if (app_mode == APP_MODE_LEGACY) { 2770 /* init timer structures for each enabled lcore */ 2771 rte_timer_init(&power_timers[lcore_id]); 2772 hz = rte_get_timer_hz(); 2773 rte_timer_reset(&power_timers[lcore_id], 2774 hz/TIMER_NUMBER_PER_SECOND, 2775 SINGLE, lcore_id, 2776 power_timer_cb, NULL); 2777 } 2778 qconf = &lcore_conf[lcore_id]; 2779 printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); 2780 fflush(stdout); 2781 2782 /* init RX queues */ 2783 for(queue = 0; queue < qconf->n_rx_queue; ++queue) { 2784 struct rte_eth_rxconf rxq_conf; 2785 2786 portid = qconf->rx_queue_list[queue].port_id; 2787 queueid = qconf->rx_queue_list[queue].queue_id; 2788 2789 if (numa_on) 2790 socketid = \ 2791 (uint8_t)rte_lcore_to_socket_id(lcore_id); 2792 else 2793 socketid = 0; 2794 2795 printf("rxq=%d,%d,%d ", portid, queueid, socketid); 2796 fflush(stdout); 2797 2798 ret = rte_eth_dev_info_get(portid, &dev_info); 2799 if (ret != 0) 2800 rte_exit(EXIT_FAILURE, 2801 "Error during getting device (port %u) info: %s\n", 2802 portid, strerror(-ret)); 2803 2804 rxq_conf = dev_info.default_rxconf; 2805 rxq_conf.offloads = port_conf.rxmode.offloads; 2806 ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd, 2807 socketid, &rxq_conf, 2808 pktmbuf_pool[socketid]); 2809 if (ret < 0) 2810 rte_exit(EXIT_FAILURE, 2811 "rte_eth_rx_queue_setup: err=%d, " 2812 "port=%d\n", ret, portid); 2813 2814 if (parse_ptype) { 2815 if (add_cb_parse_ptype(portid, queueid) < 0) 2816 rte_exit(EXIT_FAILURE, 2817 "Fail to add ptype cb\n"); 2818 } 2819 2820 if (app_mode == APP_MODE_PMD_MGMT && !baseline_enabled) { 2821 /* Set power_pmd_mgmt configs passed by user */ 2822 rte_power_pmd_mgmt_set_emptypoll_max(max_empty_polls); 2823 ret = rte_power_pmd_mgmt_set_pause_duration(pause_duration); 2824 if (ret < 0) 2825 rte_exit(EXIT_FAILURE, 2826 "Error setting pause_duration: err=%d, lcore=%d\n", 2827 ret, lcore_id); 2828 2829 ret = rte_power_pmd_mgmt_set_scaling_freq_min(lcore_id, 2830 scale_freq_min); 2831 if (ret < 0) 2832 rte_exit(EXIT_FAILURE, 2833 "Error setting scaling freq min: err=%d, lcore=%d\n", 2834 ret, lcore_id); 2835 2836 ret = rte_power_pmd_mgmt_set_scaling_freq_max(lcore_id, 2837 scale_freq_max); 2838 if (ret < 0) 2839 rte_exit(EXIT_FAILURE, 2840 "Error setting scaling freq max: err=%d, lcore %d\n", 2841 ret, lcore_id); 2842 2843 ret = rte_power_ethdev_pmgmt_queue_enable( 2844 lcore_id, portid, queueid, 2845 pmgmt_type); 2846 if (ret < 0) 2847 rte_exit(EXIT_FAILURE, 2848 "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", 2849 ret, portid); 2850 } 2851 } 2852 } 2853 /* >8 End of power library initialization. */ 2854 2855 printf("\n"); 2856 2857 /* start ports */ 2858 RTE_ETH_FOREACH_DEV(portid) { 2859 if ((enabled_port_mask & (1 << portid)) == 0) { 2860 continue; 2861 } 2862 /* Start device */ 2863 ret = rte_eth_dev_start(portid); 2864 if (ret < 0) 2865 rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, " 2866 "port=%d\n", ret, portid); 2867 /* 2868 * If enabled, put device in promiscuous mode. 
2869 * This allows IO forwarding mode to forward packets 2870 * to itself through 2 cross-connected ports of the 2871 * target machine. 2872 */ 2873 if (promiscuous_on) { 2874 ret = rte_eth_promiscuous_enable(portid); 2875 if (ret != 0) 2876 rte_exit(EXIT_FAILURE, 2877 "rte_eth_promiscuous_enable: err=%s, port=%u\n", 2878 rte_strerror(-ret), portid); 2879 } 2880 /* initialize spinlock for each port */ 2881 rte_spinlock_init(&(locks[portid])); 2882 2883 if (!parse_ptype) 2884 if (!check_ptype(portid)) 2885 rte_exit(EXIT_FAILURE, 2886 "PMD can not provide needed ptypes\n"); 2887 } 2888 2889 check_all_ports_link_status(enabled_port_mask); 2890 2891 /* launch per-lcore init on every lcore */ 2892 if (app_mode == APP_MODE_LEGACY) { 2893 rte_eal_mp_remote_launch(main_legacy_loop, NULL, CALL_MAIN); 2894 } else if (app_mode == APP_MODE_TELEMETRY) { 2895 unsigned int i; 2896 2897 /* Init metrics library */ 2898 rte_metrics_init(rte_socket_id()); 2899 /** Register stats with metrics library */ 2900 for (i = 0; i < NUM_TELSTATS; i++) 2901 ptr_strings[i] = telstats_strings[i].name; 2902 2903 ret = rte_metrics_reg_names(ptr_strings, NUM_TELSTATS); 2904 if (ret >= 0) 2905 telstats_index = ret; 2906 else 2907 rte_exit(EXIT_FAILURE, "failed to register metrics names"); 2908 2909 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2910 rte_spinlock_init(&stats[lcore_id].telemetry_lock); 2911 } 2912 rte_timer_init(&telemetry_timer); 2913 rte_telemetry_register_cmd("/l3fwd-power/stats", 2914 handle_app_stats, 2915 "Returns global power stats. Parameters: None"); 2916 rte_eal_mp_remote_launch(main_telemetry_loop, NULL, 2917 SKIP_MAIN); 2918 } else if (app_mode == APP_MODE_INTERRUPT) { 2919 rte_eal_mp_remote_launch(main_intr_loop, NULL, CALL_MAIN); 2920 } else if (app_mode == APP_MODE_PMD_MGMT) { 2921 /* reuse telemetry loop for PMD power management mode */ 2922 rte_eal_mp_remote_launch(main_telemetry_loop, NULL, CALL_MAIN); 2923 } 2924 2925 if (app_mode == APP_MODE_TELEMETRY) 2926 launch_timer(rte_lcore_id()); 2927 2928 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2929 if (rte_eal_wait_lcore(lcore_id) < 0) 2930 return -1; 2931 } 2932 2933 if (app_mode == APP_MODE_PMD_MGMT) { 2934 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2935 if (rte_lcore_is_enabled(lcore_id) == 0) 2936 continue; 2937 qconf = &lcore_conf[lcore_id]; 2938 for (queue = 0; queue < qconf->n_rx_queue; ++queue) { 2939 portid = qconf->rx_queue_list[queue].port_id; 2940 queueid = qconf->rx_queue_list[queue].queue_id; 2941 2942 rte_power_ethdev_pmgmt_queue_disable(lcore_id, 2943 portid, queueid); 2944 } 2945 } 2946 } 2947 2948 RTE_ETH_FOREACH_DEV(portid) 2949 { 2950 if ((enabled_port_mask & (1 << portid)) == 0) 2951 continue; 2952 2953 ret = rte_eth_dev_stop(portid); 2954 if (ret != 0) 2955 RTE_LOG(ERR, L3FWD_POWER, "rte_eth_dev_stop: err=%d, port=%u\n", 2956 ret, portid); 2957 2958 rte_eth_dev_close(portid); 2959 } 2960 2961 if ((app_mode == APP_MODE_LEGACY) && deinit_power_library()) 2962 rte_exit(EXIT_FAILURE, "deinit_power_library failed\n"); 2963 2964 if (rte_eal_cleanup() < 0) 2965 RTE_LOG(ERR, L3FWD_POWER, "EAL cleanup failed\n"); 2966 2967 return 0; 2968 } 2969
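/*
 * Illustrative invocation (an assumption for documentation purposes, not
 * taken from this file; the binary name and option spellings follow the
 * usual l3fwd-power sample-app conventions and may need adjusting for a
 * given build and NIC setup):
 *
 *   ./dpdk-l3fwd-power -l 1-3 -n 4 -- -p 0x3 -P \
 *       --config="(0,0,2),(1,0,3)" --telemetry
 *
 * EAL arguments precede the "--" separator; the application arguments
 * select the port mask (-p), promiscuous mode (-P), the
 * (port,queue,lcore) mapping (--config) and at most one of the mutually
 * exclusive operating modes parsed above (telemetry, interrupt-only,
 * pmd-mgmt, legacy); if no mode is given, autodetect_mode() chooses one.
 */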