1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2018 Intel Corporation 3 */ 4 5 #include <stdio.h> 6 #include <stdlib.h> 7 #include <stdint.h> 8 #include <inttypes.h> 9 #include <sys/types.h> 10 #include <string.h> 11 #include <sys/queue.h> 12 #include <stdarg.h> 13 #include <errno.h> 14 #include <getopt.h> 15 #include <unistd.h> 16 #include <signal.h> 17 #include <math.h> 18 19 #include <rte_common.h> 20 #include <rte_byteorder.h> 21 #include <rte_log.h> 22 #include <rte_malloc.h> 23 #include <rte_memory.h> 24 #include <rte_memcpy.h> 25 #include <rte_eal.h> 26 #include <rte_launch.h> 27 #include <rte_cycles.h> 28 #include <rte_prefetch.h> 29 #include <rte_lcore.h> 30 #include <rte_per_lcore.h> 31 #include <rte_branch_prediction.h> 32 #include <rte_interrupts.h> 33 #include <rte_random.h> 34 #include <rte_debug.h> 35 #include <rte_ether.h> 36 #include <rte_ethdev.h> 37 #include <rte_mempool.h> 38 #include <rte_mbuf.h> 39 #include <rte_ip.h> 40 #include <rte_tcp.h> 41 #include <rte_udp.h> 42 #include <rte_string_fns.h> 43 #include <rte_timer.h> 44 #include <rte_power.h> 45 #include <rte_spinlock.h> 46 #include <rte_power_empty_poll.h> 47 #include <rte_metrics.h> 48 #include <rte_telemetry.h> 49 #include <rte_power_pmd_mgmt.h> 50 51 #include "perf_core.h" 52 #include "main.h" 53 54 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1 55 56 #define MAX_PKT_BURST 32 57 58 #define MIN_ZERO_POLL_COUNT 10 59 60 /* 100 ms interval */ 61 #define TIMER_NUMBER_PER_SECOND 10 62 /* (10ms) */ 63 #define INTERVALS_PER_SECOND 100 64 /* 100000 us */ 65 #define SCALING_PERIOD (1000000/TIMER_NUMBER_PER_SECOND) 66 #define SCALING_DOWN_TIME_RATIO_THRESHOLD 0.25 67 68 #define APP_LOOKUP_EXACT_MATCH 0 69 #define APP_LOOKUP_LPM 1 70 #define DO_RFC_1812_CHECKS 71 72 #ifndef APP_LOOKUP_METHOD 73 #define APP_LOOKUP_METHOD APP_LOOKUP_LPM 74 #endif 75 76 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 77 #include <rte_hash.h> 78 #elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 79 #include <rte_lpm.h> 80 #else 81 #error "APP_LOOKUP_METHOD set to incorrect value" 82 #endif 83 84 #ifndef IPv6_BYTES 85 #define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\ 86 "%02x%02x:%02x%02x:%02x%02x:%02x%02x" 87 #define IPv6_BYTES(addr) \ 88 addr[0], addr[1], addr[2], addr[3], \ 89 addr[4], addr[5], addr[6], addr[7], \ 90 addr[8], addr[9], addr[10], addr[11],\ 91 addr[12], addr[13],addr[14], addr[15] 92 #endif 93 94 #define MAX_JUMBO_PKT_LEN 9600 95 96 #define IPV6_ADDR_LEN 16 97 98 #define MEMPOOL_CACHE_SIZE 256 99 100 /* 101 * This expression is used to calculate the number of mbufs needed depending on 102 * user input, taking into account memory for rx and tx hardware rings, cache 103 * per lcore and mtable per port per lcore. RTE_MAX is used to ensure that 104 * NB_MBUF never goes below a minimum value of 8192. 105 */ 106 107 #define NB_MBUF RTE_MAX ( \ 108 (nb_ports*nb_rx_queue*nb_rxd + \ 109 nb_ports*nb_lcores*MAX_PKT_BURST + \ 110 nb_ports*n_tx_queue*nb_txd + \ 111 nb_lcores*MEMPOOL_CACHE_SIZE), \ 112 (unsigned)8192) 113 114 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 115 116 #define NB_SOCKETS 8 117 118 /* Configure how many packets ahead to prefetch, when reading packets */ 119 #define PREFETCH_OFFSET 3 120 121 /* 122 * Configurable number of RX/TX ring descriptors 123 */ 124 #define RTE_TEST_RX_DESC_DEFAULT 1024 125 #define RTE_TEST_TX_DESC_DEFAULT 1024 126 127 /* 128 * These two thresholds were decided on by running the training algorithm on 129 * a 2.5GHz Xeon. 
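 * These values become the defaults for ep_med_edpi and ep_hgh_edpi in parse_ep_config().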
These defaults can be overridden by supplying non-zero values 130 * for the med_threshold and high_threshold parameters on the command line. 131 */ 132 #define EMPTY_POLL_MED_THRESHOLD 350000UL 133 #define EMPTY_POLL_HGH_THRESHOLD 580000UL 134 135 #define NUM_TELSTATS RTE_DIM(telstats_strings) 136 137 static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT; 138 static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT; 139 140 /* ethernet addresses of ports */ 141 static struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; 142 143 /* ethernet addresses of ports */ 144 static rte_spinlock_t locks[RTE_MAX_ETHPORTS]; 145 146 /* mask of enabled ports */ 147 static uint32_t enabled_port_mask = 0; 148 /* Ports set in promiscuous mode off by default. */ 149 static int promiscuous_on = 0; 150 /* NUMA is enabled by default. */ 151 static int numa_on = 1; 152 static bool empty_poll_stop; 153 static bool empty_poll_train; 154 volatile bool quit_signal; 155 static struct ep_params *ep_params; 156 static struct ep_policy policy; 157 static long ep_med_edpi, ep_hgh_edpi; 158 /* timer to update telemetry every 500ms */ 159 static struct rte_timer telemetry_timer; 160 161 /* stats index returned by metrics lib */ 162 int telstats_index; 163 164 struct telstats_name { 165 char name[RTE_ETH_XSTATS_NAME_SIZE]; 166 }; 167 168 /* telemetry stats to be reported */ 169 const struct telstats_name telstats_strings[] = { 170 {"empty_poll"}, 171 {"full_poll"}, 172 {"busy_percent"} 173 }; 174 175 /* core busyness in percentage */ 176 enum busy_rate { 177 ZERO = 0, 178 PARTIAL = 50, 179 FULL = 100 180 }; 181 182 /* reference poll count to measure core busyness */ 183 #define DEFAULT_COUNT 10000 184 /* 185 * reference CYCLES to be used to 186 * measure core busyness based on poll count 187 */ 188 #define MIN_CYCLES 1500000ULL 189 #define MAX_CYCLES 22000000ULL 190 191 /* (500ms) */ 192 #define TELEMETRY_INTERVALS_PER_SEC 2 193 194 static int parse_ptype; /**< Parse packet type using rx callback, and */ 195 /**< disabled by default */ 196 197 enum appmode { 198 APP_MODE_DEFAULT = 0, 199 APP_MODE_LEGACY, 200 APP_MODE_EMPTY_POLL, 201 APP_MODE_TELEMETRY, 202 APP_MODE_INTERRUPT, 203 APP_MODE_PMD_MGMT 204 }; 205 206 enum appmode app_mode; 207 208 static enum rte_power_pmd_mgmt_type pmgmt_type; 209 bool baseline_enabled; 210 211 enum freq_scale_hint_t 212 { 213 FREQ_LOWER = -1, 214 FREQ_CURRENT = 0, 215 FREQ_HIGHER = 1, 216 FREQ_HIGHEST = 2 217 }; 218 219 struct lcore_rx_queue { 220 uint16_t port_id; 221 uint8_t queue_id; 222 enum freq_scale_hint_t freq_up_hint; 223 uint32_t zero_rx_packet_count; 224 uint32_t idle_hint; 225 } __rte_cache_aligned; 226 227 #define MAX_RX_QUEUE_PER_LCORE 16 228 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS 229 #define MAX_RX_QUEUE_PER_PORT 128 230 231 #define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16 232 233 234 struct lcore_params lcore_params_array[MAX_LCORE_PARAMS]; 235 static struct lcore_params lcore_params_array_default[] = { 236 {0, 0, 2}, 237 {0, 1, 2}, 238 {0, 2, 2}, 239 {1, 0, 2}, 240 {1, 1, 2}, 241 {1, 2, 2}, 242 {2, 0, 2}, 243 {3, 0, 3}, 244 {3, 1, 3}, 245 }; 246 247 struct lcore_params *lcore_params = lcore_params_array_default; 248 uint16_t nb_lcore_params = RTE_DIM(lcore_params_array_default); 249 250 static struct rte_eth_conf port_conf = { 251 .rxmode = { 252 .mq_mode = RTE_ETH_MQ_RX_RSS, 253 .split_hdr_size = 0, 254 .offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM, 255 }, 256 .rx_adv_conf = { 257 .rss_conf = { 258 .rss_key = NULL, 259 .rss_hf = RTE_ETH_RSS_UDP, 260 }, 261 }, 262 .txmode = { 263 .mq_mode 
= RTE_ETH_MQ_TX_NONE, 264 } 265 }; 266 267 static uint32_t max_pkt_len; 268 static uint32_t max_empty_polls = 512; 269 static uint32_t pause_duration = 1; 270 static uint32_t scale_freq_min; 271 static uint32_t scale_freq_max; 272 273 static struct rte_mempool * pktmbuf_pool[NB_SOCKETS]; 274 275 276 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 277 278 #ifdef RTE_ARCH_X86 279 #include <rte_hash_crc.h> 280 #define DEFAULT_HASH_FUNC rte_hash_crc 281 #else 282 #include <rte_jhash.h> 283 #define DEFAULT_HASH_FUNC rte_jhash 284 #endif 285 286 struct ipv4_5tuple { 287 uint32_t ip_dst; 288 uint32_t ip_src; 289 uint16_t port_dst; 290 uint16_t port_src; 291 uint8_t proto; 292 } __rte_packed; 293 294 struct ipv6_5tuple { 295 uint8_t ip_dst[IPV6_ADDR_LEN]; 296 uint8_t ip_src[IPV6_ADDR_LEN]; 297 uint16_t port_dst; 298 uint16_t port_src; 299 uint8_t proto; 300 } __rte_packed; 301 302 struct ipv4_l3fwd_route { 303 struct ipv4_5tuple key; 304 uint8_t if_out; 305 }; 306 307 struct ipv6_l3fwd_route { 308 struct ipv6_5tuple key; 309 uint8_t if_out; 310 }; 311 312 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { 313 {{RTE_IPV4(100,10,0,1), RTE_IPV4(200,10,0,1), 101, 11, IPPROTO_TCP}, 0}, 314 {{RTE_IPV4(100,20,0,2), RTE_IPV4(200,20,0,2), 102, 12, IPPROTO_TCP}, 1}, 315 {{RTE_IPV4(100,30,0,3), RTE_IPV4(200,30,0,3), 103, 13, IPPROTO_TCP}, 2}, 316 {{RTE_IPV4(100,40,0,4), RTE_IPV4(200,40,0,4), 104, 14, IPPROTO_TCP}, 3}, 317 }; 318 319 static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = { 320 { 321 { 322 {0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 323 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 324 {0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 325 0x02, 0x1e, 0x67, 0xff, 0xfe, 0x0d, 0xb6, 0x0a}, 326 1, 10, IPPROTO_UDP 327 }, 4 328 }, 329 }; 330 331 typedef struct rte_hash lookup_struct_t; 332 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; 333 static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS]; 334 335 #define L3FWD_HASH_ENTRIES 1024 336 337 static uint16_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; 338 static uint16_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; 339 #endif 340 341 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 342 struct ipv4_l3fwd_route { 343 uint32_t ip; 344 uint8_t depth; 345 uint8_t if_out; 346 }; 347 348 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { 349 {RTE_IPV4(1,1,1,0), 24, 0}, 350 {RTE_IPV4(2,1,1,0), 24, 1}, 351 {RTE_IPV4(3,1,1,0), 24, 2}, 352 {RTE_IPV4(4,1,1,0), 24, 3}, 353 {RTE_IPV4(5,1,1,0), 24, 4}, 354 {RTE_IPV4(6,1,1,0), 24, 5}, 355 {RTE_IPV4(7,1,1,0), 24, 6}, 356 {RTE_IPV4(8,1,1,0), 24, 7}, 357 }; 358 359 #define IPV4_L3FWD_LPM_MAX_RULES 1024 360 361 typedef struct rte_lpm lookup_struct_t; 362 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; 363 #endif 364 365 struct lcore_conf { 366 uint16_t n_rx_queue; 367 struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; 368 uint16_t n_tx_port; 369 uint16_t tx_port_id[RTE_MAX_ETHPORTS]; 370 uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; 371 struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_ETHPORTS]; 372 lookup_struct_t * ipv4_lookup_struct; 373 lookup_struct_t * ipv6_lookup_struct; 374 } __rte_cache_aligned; 375 376 struct lcore_stats { 377 /* total sleep time in ms since last frequency scaling down */ 378 uint32_t sleep_time; 379 /* number of long sleep recently */ 380 uint32_t nb_long_sleep; 381 /* freq. 
scaling up trend */
	uint32_t trend;
	/* total packets processed recently */
	uint64_t nb_rx_processed;
	/* total iterations looped recently */
	uint64_t nb_iteration_looped;
	/*
	 * Represents empty and non-empty polls
	 * of rte_eth_rx_burst();
	 * ep_nep[0] holds non-empty polls
	 * i.e. 0 < nb_rx <= MAX_PKT_BURST
	 * ep_nep[1] holds empty polls.
	 * i.e. nb_rx == 0
	 */
	uint64_t ep_nep[2];
	/*
	 * Represents full and empty+partial
	 * polls of rte_eth_rx_burst();
	 * fp_nfp[0] holds empty+partial polls.
	 * i.e. 0 <= nb_rx < MAX_PKT_BURST
	 * fp_nfp[1] holds full polls
	 * i.e. nb_rx == MAX_PKT_BURST
	 */
	uint64_t fp_nfp[2];
	enum busy_rate br;
	rte_spinlock_t telemetry_lock;
} __rte_cache_aligned;

static struct lcore_conf lcore_conf[RTE_MAX_LCORE] __rte_cache_aligned;
static struct lcore_stats stats[RTE_MAX_LCORE] __rte_cache_aligned;
static struct rte_timer power_timers[RTE_MAX_LCORE];

static inline uint32_t power_idle_heuristic(uint32_t zero_rx_packet_count);
static inline enum freq_scale_hint_t power_freq_scaleup_heuristic( \
		unsigned int lcore_id, uint16_t port_id, uint16_t queue_id);


/*
 * These defaults use the max frequency index (1), a medium index (9)
 * and a typical low frequency index (14). They can be adjusted to use
 * different indexes via the relevant command line parameters.
 */
static uint8_t freq_tlb[] = {14, 9, 1};

static int is_done(void)
{
	return quit_signal;
}

/* exit signal handler */
static void
signal_exit_now(int sigtype)
{
	if (sigtype == SIGINT)
		quit_signal = true;
}

/* Frequency scale down timer callback */
static void
power_timer_cb(__rte_unused struct rte_timer *tim,
			__rte_unused void *arg)
{
	uint64_t hz;
	float sleep_time_ratio;
	unsigned lcore_id = rte_lcore_id();

	/* ratio of time spent sleeping during the last scaling period */
	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
				(float)SCALING_PERIOD;
	/**
	 * Scale down the frequency by one step if the lcore slept for a
	 * large fraction of the last period.
	 */
	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
		if (rte_power_freq_down)
			rte_power_freq_down(lcore_id);
	}
	else if ((unsigned)(stats[lcore_id].nb_rx_processed /
		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
		/**
		 * Scale down by one step if the average number of packets
		 * per iteration is below expectation.
		 */
		if (rte_power_freq_down)
			rte_power_freq_down(lcore_id);
	}

	/**
	 * Reinitialize the timer according to the current frequency so that
	 * the timer interval stays relatively fixed.
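	 * The period is hz / TIMER_NUMBER_PER_SECOND cycles, i.e. 100 ms.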
472 */ 473 hz = rte_get_timer_hz(); 474 rte_timer_reset(&power_timers[lcore_id], hz/TIMER_NUMBER_PER_SECOND, 475 SINGLE, lcore_id, power_timer_cb, NULL); 476 477 stats[lcore_id].nb_rx_processed = 0; 478 stats[lcore_id].nb_iteration_looped = 0; 479 480 stats[lcore_id].sleep_time = 0; 481 } 482 483 /* Enqueue a single packet, and send burst if queue is filled */ 484 static inline int 485 send_single_packet(struct rte_mbuf *m, uint16_t port) 486 { 487 uint32_t lcore_id; 488 struct lcore_conf *qconf; 489 490 lcore_id = rte_lcore_id(); 491 qconf = &lcore_conf[lcore_id]; 492 493 rte_eth_tx_buffer(port, qconf->tx_queue_id[port], 494 qconf->tx_buffer[port], m); 495 496 return 0; 497 } 498 499 #ifdef DO_RFC_1812_CHECKS 500 static inline int 501 is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) 502 { 503 /* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */ 504 /* 505 * 1. The packet length reported by the Link Layer must be large 506 * enough to hold the minimum length legal IP datagram (20 bytes). 507 */ 508 if (link_len < sizeof(struct rte_ipv4_hdr)) 509 return -1; 510 511 /* 2. The IP checksum must be correct. */ 512 /* if this is not checked in H/W, check it. */ 513 if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) { 514 uint16_t actual_cksum, expected_cksum; 515 actual_cksum = pkt->hdr_checksum; 516 pkt->hdr_checksum = 0; 517 expected_cksum = rte_ipv4_cksum(pkt); 518 if (actual_cksum != expected_cksum) 519 return -2; 520 } 521 522 /* 523 * 3. The IP version number must be 4. If the version number is not 4 524 * then the packet may be another version of IP, such as IPng or 525 * ST-II. 526 */ 527 if (((pkt->version_ihl) >> 4) != 4) 528 return -3; 529 /* 530 * 4. The IP header length field must be large enough to hold the 531 * minimum length legal IP datagram (20 bytes = 5 words). 532 */ 533 if ((pkt->version_ihl & 0xf) < 5) 534 return -4; 535 536 /* 537 * 5. The IP total length field must be large enough to hold the IP 538 * datagram header, whose length is specified in the IP header length 539 * field. 
540 */ 541 if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct rte_ipv4_hdr)) 542 return -5; 543 544 return 0; 545 } 546 #endif 547 548 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 549 static void 550 print_ipv4_key(struct ipv4_5tuple key) 551 { 552 printf("IP dst = %08x, IP src = %08x, port dst = %d, port src = %d, " 553 "proto = %d\n", (unsigned)key.ip_dst, (unsigned)key.ip_src, 554 key.port_dst, key.port_src, key.proto); 555 } 556 static void 557 print_ipv6_key(struct ipv6_5tuple key) 558 { 559 printf( "IP dst = " IPv6_BYTES_FMT ", IP src = " IPv6_BYTES_FMT ", " 560 "port dst = %d, port src = %d, proto = %d\n", 561 IPv6_BYTES(key.ip_dst), IPv6_BYTES(key.ip_src), 562 key.port_dst, key.port_src, key.proto); 563 } 564 565 static inline uint16_t 566 get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid, 567 lookup_struct_t * ipv4_l3fwd_lookup_struct) 568 { 569 struct ipv4_5tuple key; 570 struct rte_tcp_hdr *tcp; 571 struct rte_udp_hdr *udp; 572 int ret = 0; 573 574 key.ip_dst = rte_be_to_cpu_32(ipv4_hdr->dst_addr); 575 key.ip_src = rte_be_to_cpu_32(ipv4_hdr->src_addr); 576 key.proto = ipv4_hdr->next_proto_id; 577 578 switch (ipv4_hdr->next_proto_id) { 579 case IPPROTO_TCP: 580 tcp = (struct rte_tcp_hdr *)((unsigned char *)ipv4_hdr + 581 sizeof(struct rte_ipv4_hdr)); 582 key.port_dst = rte_be_to_cpu_16(tcp->dst_port); 583 key.port_src = rte_be_to_cpu_16(tcp->src_port); 584 break; 585 586 case IPPROTO_UDP: 587 udp = (struct rte_udp_hdr *)((unsigned char *)ipv4_hdr + 588 sizeof(struct rte_ipv4_hdr)); 589 key.port_dst = rte_be_to_cpu_16(udp->dst_port); 590 key.port_src = rte_be_to_cpu_16(udp->src_port); 591 break; 592 593 default: 594 key.port_dst = 0; 595 key.port_src = 0; 596 break; 597 } 598 599 /* Find destination port */ 600 ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key); 601 return ((ret < 0) ? portid : ipv4_l3fwd_out_if[ret]); 602 } 603 604 static inline uint16_t 605 get_ipv6_dst_port(struct rte_ipv6_hdr *ipv6_hdr, uint16_t portid, 606 lookup_struct_t *ipv6_l3fwd_lookup_struct) 607 { 608 struct ipv6_5tuple key; 609 struct rte_tcp_hdr *tcp; 610 struct rte_udp_hdr *udp; 611 int ret = 0; 612 613 memcpy(key.ip_dst, ipv6_hdr->dst_addr, IPV6_ADDR_LEN); 614 memcpy(key.ip_src, ipv6_hdr->src_addr, IPV6_ADDR_LEN); 615 616 key.proto = ipv6_hdr->proto; 617 618 switch (ipv6_hdr->proto) { 619 case IPPROTO_TCP: 620 tcp = (struct rte_tcp_hdr *)((unsigned char *) ipv6_hdr + 621 sizeof(struct rte_ipv6_hdr)); 622 key.port_dst = rte_be_to_cpu_16(tcp->dst_port); 623 key.port_src = rte_be_to_cpu_16(tcp->src_port); 624 break; 625 626 case IPPROTO_UDP: 627 udp = (struct rte_udp_hdr *)((unsigned char *) ipv6_hdr + 628 sizeof(struct rte_ipv6_hdr)); 629 key.port_dst = rte_be_to_cpu_16(udp->dst_port); 630 key.port_src = rte_be_to_cpu_16(udp->src_port); 631 break; 632 633 default: 634 key.port_dst = 0; 635 key.port_src = 0; 636 break; 637 } 638 639 /* Find destination port */ 640 ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key); 641 return ((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]); 642 } 643 #endif 644 645 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 646 static inline uint16_t 647 get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid, 648 lookup_struct_t *ipv4_l3fwd_lookup_struct) 649 { 650 uint32_t next_hop; 651 652 return ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct, 653 rte_be_to_cpu_32(ipv4_hdr->dst_addr), &next_hop) == 0)? 
654 next_hop : portid); 655 } 656 #endif 657 658 static inline void 659 parse_ptype_one(struct rte_mbuf *m) 660 { 661 struct rte_ether_hdr *eth_hdr; 662 uint32_t packet_type = RTE_PTYPE_UNKNOWN; 663 uint16_t ether_type; 664 665 eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 666 ether_type = eth_hdr->ether_type; 667 if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) 668 packet_type |= RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 669 else if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6)) 670 packet_type |= RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 671 672 m->packet_type = packet_type; 673 } 674 675 static uint16_t 676 cb_parse_ptype(uint16_t port __rte_unused, uint16_t queue __rte_unused, 677 struct rte_mbuf *pkts[], uint16_t nb_pkts, 678 uint16_t max_pkts __rte_unused, 679 void *user_param __rte_unused) 680 { 681 unsigned int i; 682 683 for (i = 0; i < nb_pkts; ++i) 684 parse_ptype_one(pkts[i]); 685 686 return nb_pkts; 687 } 688 689 static int 690 add_cb_parse_ptype(uint16_t portid, uint16_t queueid) 691 { 692 printf("Port %d: softly parse packet type info\n", portid); 693 if (rte_eth_add_rx_callback(portid, queueid, cb_parse_ptype, NULL)) 694 return 0; 695 696 printf("Failed to add rx callback: port=%d\n", portid); 697 return -1; 698 } 699 700 static inline void 701 l3fwd_simple_forward(struct rte_mbuf *m, uint16_t portid, 702 struct lcore_conf *qconf) 703 { 704 struct rte_ether_hdr *eth_hdr; 705 struct rte_ipv4_hdr *ipv4_hdr; 706 void *d_addr_bytes; 707 uint16_t dst_port; 708 709 eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 710 711 if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) { 712 /* Handle IPv4 headers.*/ 713 ipv4_hdr = 714 rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *, 715 sizeof(struct rte_ether_hdr)); 716 717 #ifdef DO_RFC_1812_CHECKS 718 /* Check to make sure the packet is valid (RFC1812) */ 719 if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) { 720 rte_pktmbuf_free(m); 721 return; 722 } 723 #endif 724 725 dst_port = get_ipv4_dst_port(ipv4_hdr, portid, 726 qconf->ipv4_lookup_struct); 727 if (dst_port >= RTE_MAX_ETHPORTS || 728 (enabled_port_mask & 1 << dst_port) == 0) 729 dst_port = portid; 730 731 /* 02:00:00:00:00:xx */ 732 d_addr_bytes = ð_hdr->dst_addr.addr_bytes[0]; 733 *((uint64_t *)d_addr_bytes) = 734 0x000000000002 + ((uint64_t)dst_port << 40); 735 736 #ifdef DO_RFC_1812_CHECKS 737 /* Update time to live and header checksum */ 738 --(ipv4_hdr->time_to_live); 739 ++(ipv4_hdr->hdr_checksum); 740 #endif 741 742 /* src addr */ 743 rte_ether_addr_copy(&ports_eth_addr[dst_port], 744 ð_hdr->src_addr); 745 746 send_single_packet(m, dst_port); 747 } else if (RTE_ETH_IS_IPV6_HDR(m->packet_type)) { 748 /* Handle IPv6 headers.*/ 749 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 750 struct rte_ipv6_hdr *ipv6_hdr; 751 752 ipv6_hdr = 753 rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *, 754 sizeof(struct rte_ether_hdr)); 755 756 dst_port = get_ipv6_dst_port(ipv6_hdr, portid, 757 qconf->ipv6_lookup_struct); 758 759 if (dst_port >= RTE_MAX_ETHPORTS || 760 (enabled_port_mask & 1 << dst_port) == 0) 761 dst_port = portid; 762 763 /* 02:00:00:00:00:xx */ 764 d_addr_bytes = ð_hdr->dst_addr.addr_bytes[0]; 765 *((uint64_t *)d_addr_bytes) = 766 0x000000000002 + ((uint64_t)dst_port << 40); 767 768 /* src addr */ 769 rte_ether_addr_copy(&ports_eth_addr[dst_port], 770 ð_hdr->src_addr); 771 772 send_single_packet(m, dst_port); 773 #else 774 /* We don't currently handle IPv6 packets in LPM mode. 
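		 * Such packets are simply dropped.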
		 */
		rte_pktmbuf_free(m);
#endif
	} else
		rte_pktmbuf_free(m);

}

#define MINIMUM_SLEEP_TIME         1
#define SUSPEND_THRESHOLD          300

static inline uint32_t
power_idle_heuristic(uint32_t zero_rx_packet_count)
{
	/* If the zero-rx poll count is below SUSPEND_THRESHOLD, sleep 1 us */
	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
		return MINIMUM_SLEEP_TIME;
	/* Otherwise sleep SUSPEND_THRESHOLD (300) us, which is roughly the
	 * minimum latency of switching from C3/C6 back to C0
	 */
	else
		return SUSPEND_THRESHOLD;
}

static inline enum freq_scale_hint_t
power_freq_scaleup_heuristic(unsigned lcore_id,
			     uint16_t port_id,
			     uint16_t queue_id)
{
	uint32_t rxq_count = rte_eth_rx_queue_count(port_id, queue_id);
	/**
	 * HW Rx queue size is 128 by default; an Rx burst reads at most
	 * 32 entries per iteration
	 */
#define FREQ_GEAR1_RX_PACKET_THRESHOLD             MAX_PKT_BURST
#define FREQ_GEAR2_RX_PACKET_THRESHOLD             (MAX_PKT_BURST*2)
#define FREQ_GEAR3_RX_PACKET_THRESHOLD             (MAX_PKT_BURST*3)
#define FREQ_UP_TREND1_ACC   1
#define FREQ_UP_TREND2_ACC   100
#define FREQ_UP_THRESHOLD    10000

	if (likely(rxq_count > FREQ_GEAR3_RX_PACKET_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHEST;
	} else if (likely(rxq_count > FREQ_GEAR2_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND2_ACC;
	else if (likely(rxq_count > FREQ_GEAR1_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND1_ACC;

	if (likely(stats[lcore_id].trend > FREQ_UP_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHER;
	}

	return FREQ_CURRENT;
}

/**
 * Force the polling thread to sleep until a one-shot Rx interrupt triggers.
 * @param num
 *  Number of Rx queues assigned to this lcore (maximum events to wait for).
 * @param lcore
 *  Lcore id.
 * @return
 *  0 on success
 */
static int
sleep_until_rx_interrupt(int num, int lcore)
{
	/*
	 * We want to track when we are woken up by traffic so that we can go
	 * back to sleep again without log spamming. Avoid cache line sharing
	 * to prevent threads stepping on each other's toes.
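	 * Hence the per-lcore, cache-aligned wakeup status array below.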
847 */ 848 static struct { 849 bool wakeup; 850 } __rte_cache_aligned status[RTE_MAX_LCORE]; 851 struct rte_epoll_event event[num]; 852 int n, i; 853 uint16_t port_id; 854 uint8_t queue_id; 855 void *data; 856 857 if (status[lcore].wakeup) { 858 RTE_LOG(INFO, L3FWD_POWER, 859 "lcore %u sleeps until interrupt triggers\n", 860 rte_lcore_id()); 861 } 862 863 n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, 10); 864 for (i = 0; i < n; i++) { 865 data = event[i].epdata.data; 866 port_id = ((uintptr_t)data) >> CHAR_BIT; 867 queue_id = ((uintptr_t)data) & 868 RTE_LEN2MASK(CHAR_BIT, uint8_t); 869 RTE_LOG(INFO, L3FWD_POWER, 870 "lcore %u is waked up from rx interrupt on" 871 " port %d queue %d\n", 872 rte_lcore_id(), port_id, queue_id); 873 } 874 status[lcore].wakeup = n != 0; 875 876 return 0; 877 } 878 879 static void turn_on_off_intr(struct lcore_conf *qconf, bool on) 880 { 881 int i; 882 struct lcore_rx_queue *rx_queue; 883 uint8_t queue_id; 884 uint16_t port_id; 885 886 for (i = 0; i < qconf->n_rx_queue; ++i) { 887 rx_queue = &(qconf->rx_queue_list[i]); 888 port_id = rx_queue->port_id; 889 queue_id = rx_queue->queue_id; 890 891 rte_spinlock_lock(&(locks[port_id])); 892 if (on) 893 rte_eth_dev_rx_intr_enable(port_id, queue_id); 894 else 895 rte_eth_dev_rx_intr_disable(port_id, queue_id); 896 rte_spinlock_unlock(&(locks[port_id])); 897 } 898 } 899 900 static int event_register(struct lcore_conf *qconf) 901 { 902 struct lcore_rx_queue *rx_queue; 903 uint8_t queueid; 904 uint16_t portid; 905 uint32_t data; 906 int ret; 907 int i; 908 909 for (i = 0; i < qconf->n_rx_queue; ++i) { 910 rx_queue = &(qconf->rx_queue_list[i]); 911 portid = rx_queue->port_id; 912 queueid = rx_queue->queue_id; 913 data = portid << CHAR_BIT | queueid; 914 915 ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid, 916 RTE_EPOLL_PER_THREAD, 917 RTE_INTR_EVENT_ADD, 918 (void *)((uintptr_t)data)); 919 if (ret) 920 return ret; 921 } 922 923 return 0; 924 } 925 926 /* Main processing loop. 
8< */ 927 static int main_intr_loop(__rte_unused void *dummy) 928 { 929 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 930 unsigned int lcore_id; 931 uint64_t prev_tsc, diff_tsc, cur_tsc; 932 int i, j, nb_rx; 933 uint8_t queueid; 934 uint16_t portid; 935 struct lcore_conf *qconf; 936 struct lcore_rx_queue *rx_queue; 937 uint32_t lcore_rx_idle_count = 0; 938 uint32_t lcore_idle_hint = 0; 939 int intr_en = 0; 940 941 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 942 US_PER_S * BURST_TX_DRAIN_US; 943 944 prev_tsc = 0; 945 946 lcore_id = rte_lcore_id(); 947 qconf = &lcore_conf[lcore_id]; 948 949 if (qconf->n_rx_queue == 0) { 950 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 951 lcore_id); 952 return 0; 953 } 954 955 RTE_LOG(INFO, L3FWD_POWER, "entering main interrupt loop on lcore %u\n", 956 lcore_id); 957 958 for (i = 0; i < qconf->n_rx_queue; i++) { 959 portid = qconf->rx_queue_list[i].port_id; 960 queueid = qconf->rx_queue_list[i].queue_id; 961 RTE_LOG(INFO, L3FWD_POWER, 962 " -- lcoreid=%u portid=%u rxqueueid=%hhu\n", 963 lcore_id, portid, queueid); 964 } 965 966 /* add into event wait list */ 967 if (event_register(qconf) == 0) 968 intr_en = 1; 969 else 970 RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n"); 971 972 while (!is_done()) { 973 stats[lcore_id].nb_iteration_looped++; 974 975 cur_tsc = rte_rdtsc(); 976 977 /* 978 * TX burst queue drain 979 */ 980 diff_tsc = cur_tsc - prev_tsc; 981 if (unlikely(diff_tsc > drain_tsc)) { 982 for (i = 0; i < qconf->n_tx_port; ++i) { 983 portid = qconf->tx_port_id[i]; 984 rte_eth_tx_buffer_flush(portid, 985 qconf->tx_queue_id[portid], 986 qconf->tx_buffer[portid]); 987 } 988 prev_tsc = cur_tsc; 989 } 990 991 start_rx: 992 /* 993 * Read packet from RX queues 994 */ 995 lcore_rx_idle_count = 0; 996 for (i = 0; i < qconf->n_rx_queue; ++i) { 997 rx_queue = &(qconf->rx_queue_list[i]); 998 rx_queue->idle_hint = 0; 999 portid = rx_queue->port_id; 1000 queueid = rx_queue->queue_id; 1001 1002 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1003 MAX_PKT_BURST); 1004 1005 stats[lcore_id].nb_rx_processed += nb_rx; 1006 if (unlikely(nb_rx == 0)) { 1007 /** 1008 * no packet received from rx queue, try to 1009 * sleep for a while forcing CPU enter deeper 1010 * C states. 1011 */ 1012 rx_queue->zero_rx_packet_count++; 1013 1014 if (rx_queue->zero_rx_packet_count <= 1015 MIN_ZERO_POLL_COUNT) 1016 continue; 1017 1018 rx_queue->idle_hint = power_idle_heuristic( 1019 rx_queue->zero_rx_packet_count); 1020 lcore_rx_idle_count++; 1021 } else { 1022 rx_queue->zero_rx_packet_count = 0; 1023 } 1024 1025 /* Prefetch first packets */ 1026 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1027 rte_prefetch0(rte_pktmbuf_mtod( 1028 pkts_burst[j], void *)); 1029 } 1030 1031 /* Prefetch and forward already prefetched packets */ 1032 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1033 rte_prefetch0(rte_pktmbuf_mtod( 1034 pkts_burst[j + PREFETCH_OFFSET], 1035 void *)); 1036 l3fwd_simple_forward( 1037 pkts_burst[j], portid, qconf); 1038 } 1039 1040 /* Forward remaining prefetched packets */ 1041 for (; j < nb_rx; j++) { 1042 l3fwd_simple_forward( 1043 pkts_burst[j], portid, qconf); 1044 } 1045 } 1046 1047 if (unlikely(lcore_rx_idle_count == qconf->n_rx_queue)) { 1048 /** 1049 * All Rx queues empty in recent consecutive polls, 1050 * sleep in a conservative manner, meaning sleep as 1051 * less as possible. 
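			 * The smallest idle hint among this lcore's Rx queues
			 * is used as the sleep time.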
1052 */ 1053 for (i = 1, 1054 lcore_idle_hint = qconf->rx_queue_list[0].idle_hint; 1055 i < qconf->n_rx_queue; ++i) { 1056 rx_queue = &(qconf->rx_queue_list[i]); 1057 if (rx_queue->idle_hint < lcore_idle_hint) 1058 lcore_idle_hint = rx_queue->idle_hint; 1059 } 1060 1061 if (lcore_idle_hint < SUSPEND_THRESHOLD) 1062 /** 1063 * execute "pause" instruction to avoid context 1064 * switch which generally take hundred of 1065 * microseconds for short sleep. 1066 */ 1067 rte_delay_us(lcore_idle_hint); 1068 else { 1069 /* suspend until rx interrupt triggers */ 1070 if (intr_en) { 1071 turn_on_off_intr(qconf, 1); 1072 sleep_until_rx_interrupt( 1073 qconf->n_rx_queue, 1074 lcore_id); 1075 turn_on_off_intr(qconf, 0); 1076 /** 1077 * start receiving packets immediately 1078 */ 1079 if (likely(!is_done())) 1080 goto start_rx; 1081 } 1082 } 1083 stats[lcore_id].sleep_time += lcore_idle_hint; 1084 } 1085 } 1086 1087 return 0; 1088 } 1089 /* >8 End of main processing loop. */ 1090 1091 /* main processing loop */ 1092 static int 1093 main_telemetry_loop(__rte_unused void *dummy) 1094 { 1095 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1096 unsigned int lcore_id; 1097 uint64_t prev_tsc, diff_tsc, cur_tsc, prev_tel_tsc; 1098 int i, j, nb_rx; 1099 uint8_t queueid; 1100 uint16_t portid; 1101 struct lcore_conf *qconf; 1102 struct lcore_rx_queue *rx_queue; 1103 uint64_t ep_nep[2] = {0}, fp_nfp[2] = {0}; 1104 uint64_t poll_count; 1105 enum busy_rate br; 1106 1107 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 1108 US_PER_S * BURST_TX_DRAIN_US; 1109 1110 poll_count = 0; 1111 prev_tsc = 0; 1112 prev_tel_tsc = 0; 1113 1114 lcore_id = rte_lcore_id(); 1115 qconf = &lcore_conf[lcore_id]; 1116 1117 if (qconf->n_rx_queue == 0) { 1118 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 1119 lcore_id); 1120 return 0; 1121 } 1122 1123 RTE_LOG(INFO, L3FWD_POWER, "entering main telemetry loop on lcore %u\n", 1124 lcore_id); 1125 1126 for (i = 0; i < qconf->n_rx_queue; i++) { 1127 portid = qconf->rx_queue_list[i].port_id; 1128 queueid = qconf->rx_queue_list[i].queue_id; 1129 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1130 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1131 } 1132 1133 while (!is_done()) { 1134 1135 cur_tsc = rte_rdtsc(); 1136 /* 1137 * TX burst queue drain 1138 */ 1139 diff_tsc = cur_tsc - prev_tsc; 1140 if (unlikely(diff_tsc > drain_tsc)) { 1141 for (i = 0; i < qconf->n_tx_port; ++i) { 1142 portid = qconf->tx_port_id[i]; 1143 rte_eth_tx_buffer_flush(portid, 1144 qconf->tx_queue_id[portid], 1145 qconf->tx_buffer[portid]); 1146 } 1147 prev_tsc = cur_tsc; 1148 } 1149 1150 /* 1151 * Read packet from RX queues 1152 */ 1153 for (i = 0; i < qconf->n_rx_queue; ++i) { 1154 rx_queue = &(qconf->rx_queue_list[i]); 1155 portid = rx_queue->port_id; 1156 queueid = rx_queue->queue_id; 1157 1158 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1159 MAX_PKT_BURST); 1160 ep_nep[nb_rx == 0]++; 1161 fp_nfp[nb_rx == MAX_PKT_BURST]++; 1162 poll_count++; 1163 if (unlikely(nb_rx == 0)) 1164 continue; 1165 1166 /* Prefetch first packets */ 1167 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1168 rte_prefetch0(rte_pktmbuf_mtod( 1169 pkts_burst[j], void *)); 1170 } 1171 1172 /* Prefetch and forward already prefetched packets */ 1173 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1174 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1175 j + PREFETCH_OFFSET], void *)); 1176 l3fwd_simple_forward(pkts_burst[j], portid, 1177 qconf); 1178 } 1179 1180 /* Forward remaining prefetched packets */ 1181 
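			/* these were prefetched in the loops above but not yet forwarded */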
for (; j < nb_rx; j++) { 1182 l3fwd_simple_forward(pkts_burst[j], portid, 1183 qconf); 1184 } 1185 } 1186 if (unlikely(poll_count >= DEFAULT_COUNT)) { 1187 diff_tsc = cur_tsc - prev_tel_tsc; 1188 if (diff_tsc >= MAX_CYCLES) { 1189 br = FULL; 1190 } else if (diff_tsc > MIN_CYCLES && 1191 diff_tsc < MAX_CYCLES) { 1192 br = (diff_tsc * 100) / MAX_CYCLES; 1193 } else { 1194 br = ZERO; 1195 } 1196 poll_count = 0; 1197 prev_tel_tsc = cur_tsc; 1198 /* update stats for telemetry */ 1199 rte_spinlock_lock(&stats[lcore_id].telemetry_lock); 1200 stats[lcore_id].ep_nep[0] = ep_nep[0]; 1201 stats[lcore_id].ep_nep[1] = ep_nep[1]; 1202 stats[lcore_id].fp_nfp[0] = fp_nfp[0]; 1203 stats[lcore_id].fp_nfp[1] = fp_nfp[1]; 1204 stats[lcore_id].br = br; 1205 rte_spinlock_unlock(&stats[lcore_id].telemetry_lock); 1206 } 1207 } 1208 1209 return 0; 1210 } 1211 /* main processing loop */ 1212 static int 1213 main_empty_poll_loop(__rte_unused void *dummy) 1214 { 1215 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1216 unsigned int lcore_id; 1217 uint64_t prev_tsc, diff_tsc, cur_tsc; 1218 int i, j, nb_rx; 1219 uint8_t queueid; 1220 uint16_t portid; 1221 struct lcore_conf *qconf; 1222 struct lcore_rx_queue *rx_queue; 1223 1224 const uint64_t drain_tsc = 1225 (rte_get_tsc_hz() + US_PER_S - 1) / 1226 US_PER_S * BURST_TX_DRAIN_US; 1227 1228 prev_tsc = 0; 1229 1230 lcore_id = rte_lcore_id(); 1231 qconf = &lcore_conf[lcore_id]; 1232 1233 if (qconf->n_rx_queue == 0) { 1234 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 1235 lcore_id); 1236 return 0; 1237 } 1238 1239 for (i = 0; i < qconf->n_rx_queue; i++) { 1240 portid = qconf->rx_queue_list[i].port_id; 1241 queueid = qconf->rx_queue_list[i].queue_id; 1242 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1243 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1244 } 1245 1246 while (!is_done()) { 1247 stats[lcore_id].nb_iteration_looped++; 1248 1249 cur_tsc = rte_rdtsc(); 1250 /* 1251 * TX burst queue drain 1252 */ 1253 diff_tsc = cur_tsc - prev_tsc; 1254 if (unlikely(diff_tsc > drain_tsc)) { 1255 for (i = 0; i < qconf->n_tx_port; ++i) { 1256 portid = qconf->tx_port_id[i]; 1257 rte_eth_tx_buffer_flush(portid, 1258 qconf->tx_queue_id[portid], 1259 qconf->tx_buffer[portid]); 1260 } 1261 prev_tsc = cur_tsc; 1262 } 1263 1264 /* 1265 * Read packet from RX queues 1266 */ 1267 for (i = 0; i < qconf->n_rx_queue; ++i) { 1268 rx_queue = &(qconf->rx_queue_list[i]); 1269 rx_queue->idle_hint = 0; 1270 portid = rx_queue->port_id; 1271 queueid = rx_queue->queue_id; 1272 1273 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1274 MAX_PKT_BURST); 1275 1276 stats[lcore_id].nb_rx_processed += nb_rx; 1277 1278 if (nb_rx == 0) { 1279 1280 rte_power_empty_poll_stat_update(lcore_id); 1281 1282 continue; 1283 } else { 1284 rte_power_poll_stat_update(lcore_id, nb_rx); 1285 } 1286 1287 1288 /* Prefetch first packets */ 1289 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1290 rte_prefetch0(rte_pktmbuf_mtod( 1291 pkts_burst[j], void *)); 1292 } 1293 1294 /* Prefetch and forward already prefetched packets */ 1295 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1296 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1297 j + PREFETCH_OFFSET], 1298 void *)); 1299 l3fwd_simple_forward(pkts_burst[j], portid, 1300 qconf); 1301 } 1302 1303 /* Forward remaining prefetched packets */ 1304 for (; j < nb_rx; j++) { 1305 l3fwd_simple_forward(pkts_burst[j], portid, 1306 qconf); 1307 } 1308 1309 } 1310 1311 } 1312 1313 return 0; 1314 } 1315 /* main processing loop */ 1316 static int 1317 
main_legacy_loop(__rte_unused void *dummy) 1318 { 1319 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1320 unsigned lcore_id; 1321 uint64_t prev_tsc, diff_tsc, cur_tsc, tim_res_tsc, hz; 1322 uint64_t prev_tsc_power = 0, cur_tsc_power, diff_tsc_power; 1323 int i, j, nb_rx; 1324 uint8_t queueid; 1325 uint16_t portid; 1326 struct lcore_conf *qconf; 1327 struct lcore_rx_queue *rx_queue; 1328 enum freq_scale_hint_t lcore_scaleup_hint; 1329 uint32_t lcore_rx_idle_count = 0; 1330 uint32_t lcore_idle_hint = 0; 1331 int intr_en = 0; 1332 1333 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1334 1335 prev_tsc = 0; 1336 hz = rte_get_timer_hz(); 1337 tim_res_tsc = hz/TIMER_NUMBER_PER_SECOND; 1338 1339 lcore_id = rte_lcore_id(); 1340 qconf = &lcore_conf[lcore_id]; 1341 1342 if (qconf->n_rx_queue == 0) { 1343 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", lcore_id); 1344 return 0; 1345 } 1346 1347 RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id); 1348 1349 for (i = 0; i < qconf->n_rx_queue; i++) { 1350 portid = qconf->rx_queue_list[i].port_id; 1351 queueid = qconf->rx_queue_list[i].queue_id; 1352 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1353 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1354 } 1355 1356 /* add into event wait list */ 1357 if (event_register(qconf) == 0) 1358 intr_en = 1; 1359 else 1360 RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n"); 1361 1362 while (!is_done()) { 1363 stats[lcore_id].nb_iteration_looped++; 1364 1365 cur_tsc = rte_rdtsc(); 1366 cur_tsc_power = cur_tsc; 1367 1368 /* 1369 * TX burst queue drain 1370 */ 1371 diff_tsc = cur_tsc - prev_tsc; 1372 if (unlikely(diff_tsc > drain_tsc)) { 1373 for (i = 0; i < qconf->n_tx_port; ++i) { 1374 portid = qconf->tx_port_id[i]; 1375 rte_eth_tx_buffer_flush(portid, 1376 qconf->tx_queue_id[portid], 1377 qconf->tx_buffer[portid]); 1378 } 1379 prev_tsc = cur_tsc; 1380 } 1381 1382 diff_tsc_power = cur_tsc_power - prev_tsc_power; 1383 if (diff_tsc_power > tim_res_tsc) { 1384 rte_timer_manage(); 1385 prev_tsc_power = cur_tsc_power; 1386 } 1387 1388 start_rx: 1389 /* 1390 * Read packet from RX queues 1391 */ 1392 lcore_scaleup_hint = FREQ_CURRENT; 1393 lcore_rx_idle_count = 0; 1394 for (i = 0; i < qconf->n_rx_queue; ++i) { 1395 rx_queue = &(qconf->rx_queue_list[i]); 1396 rx_queue->idle_hint = 0; 1397 portid = rx_queue->port_id; 1398 queueid = rx_queue->queue_id; 1399 1400 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1401 MAX_PKT_BURST); 1402 1403 stats[lcore_id].nb_rx_processed += nb_rx; 1404 if (unlikely(nb_rx == 0)) { 1405 /** 1406 * no packet received from rx queue, try to 1407 * sleep for a while forcing CPU enter deeper 1408 * C states. 1409 */ 1410 rx_queue->zero_rx_packet_count++; 1411 1412 if (rx_queue->zero_rx_packet_count <= 1413 MIN_ZERO_POLL_COUNT) 1414 continue; 1415 1416 rx_queue->idle_hint = power_idle_heuristic(\ 1417 rx_queue->zero_rx_packet_count); 1418 lcore_rx_idle_count++; 1419 } else { 1420 rx_queue->zero_rx_packet_count = 0; 1421 1422 /** 1423 * do not scale up frequency immediately as 1424 * user to kernel space communication is costly 1425 * which might impact packet I/O for received 1426 * packets. 
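				 * Instead, record a scale-up hint here and act
				 * on it after the whole burst is forwarded.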
				 */
				rx_queue->freq_up_hint =
					power_freq_scaleup_heuristic(lcore_id,
							portid, queueid);
			}

			/* Prefetch first packets */
			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j], void *));
			}

			/* Prefetch and forward already prefetched packets */
			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
						j + PREFETCH_OFFSET], void *));
				l3fwd_simple_forward(pkts_burst[j], portid,
								qconf);
			}

			/* Forward remaining prefetched packets */
			for (; j < nb_rx; j++) {
				l3fwd_simple_forward(pkts_burst[j], portid,
								qconf);
			}
		}

		if (likely(lcore_rx_idle_count != qconf->n_rx_queue)) {
			for (i = 1, lcore_scaleup_hint =
				qconf->rx_queue_list[0].freq_up_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->freq_up_hint >
						lcore_scaleup_hint)
					lcore_scaleup_hint =
						rx_queue->freq_up_hint;
			}

			if (lcore_scaleup_hint == FREQ_HIGHEST) {
				if (rte_power_freq_max)
					rte_power_freq_max(lcore_id);
			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
				if (rte_power_freq_up)
					rte_power_freq_up(lcore_id);
			}
		} else {
			/**
			 * All Rx queues empty in recent consecutive polls,
			 * sleep in a conservative manner, meaning sleep as
			 * little as possible.
			 */
			for (i = 1, lcore_idle_hint =
				qconf->rx_queue_list[0].idle_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->idle_hint < lcore_idle_hint)
					lcore_idle_hint = rx_queue->idle_hint;
			}

			if (lcore_idle_hint < SUSPEND_THRESHOLD)
				/**
				 * execute "pause" instructions to avoid a
				 * context switch, which generally takes
				 * hundreds of microseconds, for short sleeps.
1491 */ 1492 rte_delay_us(lcore_idle_hint); 1493 else { 1494 /* suspend until rx interrupt triggers */ 1495 if (intr_en) { 1496 turn_on_off_intr(qconf, 1); 1497 sleep_until_rx_interrupt( 1498 qconf->n_rx_queue, 1499 lcore_id); 1500 turn_on_off_intr(qconf, 0); 1501 /** 1502 * start receiving packets immediately 1503 */ 1504 if (likely(!is_done())) 1505 goto start_rx; 1506 } 1507 } 1508 stats[lcore_id].sleep_time += lcore_idle_hint; 1509 } 1510 } 1511 1512 return 0; 1513 } 1514 1515 static int 1516 check_lcore_params(void) 1517 { 1518 uint8_t queue, lcore; 1519 uint16_t i; 1520 int socketid; 1521 1522 for (i = 0; i < nb_lcore_params; ++i) { 1523 queue = lcore_params[i].queue_id; 1524 if (queue >= MAX_RX_QUEUE_PER_PORT) { 1525 printf("invalid queue number: %hhu\n", queue); 1526 return -1; 1527 } 1528 lcore = lcore_params[i].lcore_id; 1529 if (!rte_lcore_is_enabled(lcore)) { 1530 printf("error: lcore %hhu is not enabled in lcore " 1531 "mask\n", lcore); 1532 return -1; 1533 } 1534 if ((socketid = rte_lcore_to_socket_id(lcore) != 0) && 1535 (numa_on == 0)) { 1536 printf("warning: lcore %hhu is on socket %d with numa " 1537 "off\n", lcore, socketid); 1538 } 1539 if (app_mode == APP_MODE_TELEMETRY && lcore == rte_lcore_id()) { 1540 printf("cannot enable main core %d in config for telemetry mode\n", 1541 rte_lcore_id()); 1542 return -1; 1543 } 1544 } 1545 return 0; 1546 } 1547 1548 static int 1549 check_port_config(void) 1550 { 1551 unsigned portid; 1552 uint16_t i; 1553 1554 for (i = 0; i < nb_lcore_params; ++i) { 1555 portid = lcore_params[i].port_id; 1556 if ((enabled_port_mask & (1 << portid)) == 0) { 1557 printf("port %u is not enabled in port mask\n", 1558 portid); 1559 return -1; 1560 } 1561 if (!rte_eth_dev_is_valid_port(portid)) { 1562 printf("port %u is not present on the board\n", 1563 portid); 1564 return -1; 1565 } 1566 } 1567 return 0; 1568 } 1569 1570 static uint8_t 1571 get_port_n_rx_queues(const uint16_t port) 1572 { 1573 int queue = -1; 1574 uint16_t i; 1575 1576 for (i = 0; i < nb_lcore_params; ++i) { 1577 if (lcore_params[i].port_id == port && 1578 lcore_params[i].queue_id > queue) 1579 queue = lcore_params[i].queue_id; 1580 } 1581 return (uint8_t)(++queue); 1582 } 1583 1584 static int 1585 init_lcore_rx_queues(void) 1586 { 1587 uint16_t i, nb_rx_queue; 1588 uint8_t lcore; 1589 1590 for (i = 0; i < nb_lcore_params; ++i) { 1591 lcore = lcore_params[i].lcore_id; 1592 nb_rx_queue = lcore_conf[lcore].n_rx_queue; 1593 if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) { 1594 printf("error: too many queues (%u) for lcore: %u\n", 1595 (unsigned)nb_rx_queue + 1, (unsigned)lcore); 1596 return -1; 1597 } else { 1598 lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id = 1599 lcore_params[i].port_id; 1600 lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id = 1601 lcore_params[i].queue_id; 1602 lcore_conf[lcore].n_rx_queue++; 1603 } 1604 } 1605 return 0; 1606 } 1607 1608 /* display usage */ 1609 static void 1610 print_usage(const char *prgname) 1611 { 1612 printf ("%s [EAL options] -- -p PORTMASK -P" 1613 " [--config (port,queue,lcore)[,(port,queue,lcore]]" 1614 " [--high-perf-cores CORELIST" 1615 " [--perf-config (port,queue,hi_perf,lcore_index)[,(port,queue,hi_perf,lcore_index]]" 1616 " [--max-pkt-len PKTLEN]\n" 1617 " -p PORTMASK: hexadecimal bitmask of ports to configure\n" 1618 " -P: enable promiscuous mode\n" 1619 " --config (port,queue,lcore): rx queues configuration\n" 1620 " --high-perf-cores CORELIST: list of high performance cores\n" 1621 " --perf-config: similar as config, cores 
specified as indices" 1622 " for bins containing high or regular performance cores\n" 1623 " --no-numa: optional, disable numa awareness\n" 1624 " --max-pkt-len PKTLEN: maximum packet length in decimal (64-9600)\n" 1625 " --parse-ptype: parse packet type by software\n" 1626 " --legacy: use legacy interrupt-based scaling\n" 1627 " --empty-poll: enable empty poll detection" 1628 " follow (training_flag, high_threshold, med_threshold)\n" 1629 " --telemetry: enable telemetry mode, to update" 1630 " empty polls, full polls, and core busyness to telemetry\n" 1631 " --interrupt-only: enable interrupt-only mode\n" 1632 " --pmd-mgmt MODE: enable PMD power management mode. " 1633 "Currently supported modes: baseline, monitor, pause, scale\n" 1634 " --max-empty-polls MAX_EMPTY_POLLS: number of empty polls to" 1635 " wait before entering sleep state\n" 1636 " --pause-duration DURATION: set the duration, in microseconds," 1637 " of the pause callback\n" 1638 " --scale-freq-min FREQ_MIN: set minimum frequency for scaling mode for" 1639 " all application lcores (FREQ_MIN must be in kHz, in increments of 100MHz)\n" 1640 " --scale-freq-max FREQ_MAX: set maximum frequency for scaling mode for" 1641 " all application lcores (FREQ_MAX must be in kHz, in increments of 100MHz)\n", 1642 prgname); 1643 } 1644 1645 static int 1646 parse_int(const char *opt) 1647 { 1648 char *end = NULL; 1649 unsigned long val; 1650 1651 /* parse integer string */ 1652 val = strtoul(opt, &end, 10); 1653 if ((opt[0] == '\0') || (end == NULL) || (*end != '\0')) 1654 return -1; 1655 1656 return val; 1657 } 1658 1659 static int parse_max_pkt_len(const char *pktlen) 1660 { 1661 char *end = NULL; 1662 unsigned long len; 1663 1664 /* parse decimal string */ 1665 len = strtoul(pktlen, &end, 10); 1666 if ((pktlen[0] == '\0') || (end == NULL) || (*end != '\0')) 1667 return -1; 1668 1669 if (len == 0) 1670 return -1; 1671 1672 return len; 1673 } 1674 1675 static int 1676 parse_portmask(const char *portmask) 1677 { 1678 char *end = NULL; 1679 unsigned long pm; 1680 1681 /* parse hexadecimal string */ 1682 pm = strtoul(portmask, &end, 16); 1683 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0')) 1684 return 0; 1685 1686 return pm; 1687 } 1688 1689 static int 1690 parse_config(const char *q_arg) 1691 { 1692 char s[256]; 1693 const char *p, *p0 = q_arg; 1694 char *end; 1695 enum fieldnames { 1696 FLD_PORT = 0, 1697 FLD_QUEUE, 1698 FLD_LCORE, 1699 _NUM_FLD 1700 }; 1701 unsigned long int_fld[_NUM_FLD]; 1702 char *str_fld[_NUM_FLD]; 1703 int i; 1704 unsigned size; 1705 1706 nb_lcore_params = 0; 1707 1708 while ((p = strchr(p0,'(')) != NULL) { 1709 ++p; 1710 if((p0 = strchr(p,')')) == NULL) 1711 return -1; 1712 1713 size = p0 - p; 1714 if(size >= sizeof(s)) 1715 return -1; 1716 1717 snprintf(s, sizeof(s), "%.*s", size, p); 1718 if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') != 1719 _NUM_FLD) 1720 return -1; 1721 for (i = 0; i < _NUM_FLD; i++){ 1722 errno = 0; 1723 int_fld[i] = strtoul(str_fld[i], &end, 0); 1724 if (errno != 0 || end == str_fld[i] || int_fld[i] > 1725 255) 1726 return -1; 1727 } 1728 if (nb_lcore_params >= MAX_LCORE_PARAMS) { 1729 printf("exceeded max number of lcore params: %hu\n", 1730 nb_lcore_params); 1731 return -1; 1732 } 1733 lcore_params_array[nb_lcore_params].port_id = 1734 (uint8_t)int_fld[FLD_PORT]; 1735 lcore_params_array[nb_lcore_params].queue_id = 1736 (uint8_t)int_fld[FLD_QUEUE]; 1737 lcore_params_array[nb_lcore_params].lcore_id = 1738 (uint8_t)int_fld[FLD_LCORE]; 1739 ++nb_lcore_params; 1740 } 1741 
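	/* switch from the built-in defaults to the table populated above */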
lcore_params = lcore_params_array; 1742 1743 return 0; 1744 } 1745 1746 static int 1747 parse_pmd_mgmt_config(const char *name) 1748 { 1749 #define PMD_MGMT_MONITOR "monitor" 1750 #define PMD_MGMT_PAUSE "pause" 1751 #define PMD_MGMT_SCALE "scale" 1752 #define PMD_MGMT_BASELINE "baseline" 1753 1754 if (strncmp(PMD_MGMT_MONITOR, name, sizeof(PMD_MGMT_MONITOR)) == 0) { 1755 pmgmt_type = RTE_POWER_MGMT_TYPE_MONITOR; 1756 return 0; 1757 } 1758 1759 if (strncmp(PMD_MGMT_PAUSE, name, sizeof(PMD_MGMT_PAUSE)) == 0) { 1760 pmgmt_type = RTE_POWER_MGMT_TYPE_PAUSE; 1761 return 0; 1762 } 1763 1764 if (strncmp(PMD_MGMT_SCALE, name, sizeof(PMD_MGMT_SCALE)) == 0) { 1765 pmgmt_type = RTE_POWER_MGMT_TYPE_SCALE; 1766 return 0; 1767 } 1768 if (strncmp(PMD_MGMT_BASELINE, name, sizeof(PMD_MGMT_BASELINE)) == 0) { 1769 baseline_enabled = true; 1770 return 0; 1771 } 1772 /* unknown PMD power management mode */ 1773 return -1; 1774 } 1775 1776 static int 1777 parse_ep_config(const char *q_arg) 1778 { 1779 char s[256]; 1780 const char *p = q_arg; 1781 char *end; 1782 int num_arg; 1783 1784 char *str_fld[3]; 1785 1786 int training_flag; 1787 int med_edpi; 1788 int hgh_edpi; 1789 1790 ep_med_edpi = EMPTY_POLL_MED_THRESHOLD; 1791 ep_hgh_edpi = EMPTY_POLL_HGH_THRESHOLD; 1792 1793 strlcpy(s, p, sizeof(s)); 1794 1795 num_arg = rte_strsplit(s, sizeof(s), str_fld, 3, ','); 1796 1797 empty_poll_train = false; 1798 1799 if (num_arg == 0) 1800 return 0; 1801 1802 if (num_arg == 3) { 1803 1804 training_flag = strtoul(str_fld[0], &end, 0); 1805 med_edpi = strtoul(str_fld[1], &end, 0); 1806 hgh_edpi = strtoul(str_fld[2], &end, 0); 1807 1808 if (training_flag == 1) 1809 empty_poll_train = true; 1810 1811 if (med_edpi > 0) 1812 ep_med_edpi = med_edpi; 1813 1814 if (hgh_edpi > 0) 1815 ep_hgh_edpi = hgh_edpi; 1816 1817 } else { 1818 1819 return -1; 1820 } 1821 1822 return 0; 1823 1824 } 1825 #define CMD_LINE_OPT_PARSE_PTYPE "parse-ptype" 1826 #define CMD_LINE_OPT_LEGACY "legacy" 1827 #define CMD_LINE_OPT_EMPTY_POLL "empty-poll" 1828 #define CMD_LINE_OPT_INTERRUPT_ONLY "interrupt-only" 1829 #define CMD_LINE_OPT_TELEMETRY "telemetry" 1830 #define CMD_LINE_OPT_PMD_MGMT "pmd-mgmt" 1831 #define CMD_LINE_OPT_MAX_PKT_LEN "max-pkt-len" 1832 #define CMD_LINE_OPT_MAX_EMPTY_POLLS "max-empty-polls" 1833 #define CMD_LINE_OPT_PAUSE_DURATION "pause-duration" 1834 #define CMD_LINE_OPT_SCALE_FREQ_MIN "scale-freq-min" 1835 #define CMD_LINE_OPT_SCALE_FREQ_MAX "scale-freq-max" 1836 1837 /* Parse the argument given in the command line of the application */ 1838 static int 1839 parse_args(int argc, char **argv) 1840 { 1841 int opt, ret; 1842 char **argvopt; 1843 int option_index; 1844 uint32_t limit; 1845 char *prgname = argv[0]; 1846 static struct option lgopts[] = { 1847 {"config", 1, 0, 0}, 1848 {"perf-config", 1, 0, 0}, 1849 {"high-perf-cores", 1, 0, 0}, 1850 {"no-numa", 0, 0, 0}, 1851 {CMD_LINE_OPT_MAX_PKT_LEN, 1, 0, 0}, 1852 {CMD_LINE_OPT_EMPTY_POLL, 1, 0, 0}, 1853 {CMD_LINE_OPT_PARSE_PTYPE, 0, 0, 0}, 1854 {CMD_LINE_OPT_LEGACY, 0, 0, 0}, 1855 {CMD_LINE_OPT_TELEMETRY, 0, 0, 0}, 1856 {CMD_LINE_OPT_INTERRUPT_ONLY, 0, 0, 0}, 1857 {CMD_LINE_OPT_PMD_MGMT, 1, 0, 0}, 1858 {CMD_LINE_OPT_MAX_EMPTY_POLLS, 1, 0, 0}, 1859 {CMD_LINE_OPT_PAUSE_DURATION, 1, 0, 0}, 1860 {CMD_LINE_OPT_SCALE_FREQ_MIN, 1, 0, 0}, 1861 {CMD_LINE_OPT_SCALE_FREQ_MAX, 1, 0, 0}, 1862 {NULL, 0, 0, 0} 1863 }; 1864 1865 argvopt = argv; 1866 1867 while ((opt = getopt_long(argc, argvopt, "p:l:m:h:P", 1868 lgopts, &option_index)) != EOF) { 1869 1870 switch (opt) { 1871 /* portmask */ 1872 case 'p': 
1873 enabled_port_mask = parse_portmask(optarg); 1874 if (enabled_port_mask == 0) { 1875 printf("invalid portmask\n"); 1876 print_usage(prgname); 1877 return -1; 1878 } 1879 break; 1880 case 'P': 1881 printf("Promiscuous mode selected\n"); 1882 promiscuous_on = 1; 1883 break; 1884 case 'l': 1885 limit = parse_max_pkt_len(optarg); 1886 freq_tlb[LOW] = limit; 1887 break; 1888 case 'm': 1889 limit = parse_max_pkt_len(optarg); 1890 freq_tlb[MED] = limit; 1891 break; 1892 case 'h': 1893 limit = parse_max_pkt_len(optarg); 1894 freq_tlb[HGH] = limit; 1895 break; 1896 /* long options */ 1897 case 0: 1898 if (!strncmp(lgopts[option_index].name, "config", 6)) { 1899 ret = parse_config(optarg); 1900 if (ret) { 1901 printf("invalid config\n"); 1902 print_usage(prgname); 1903 return -1; 1904 } 1905 } 1906 1907 if (!strncmp(lgopts[option_index].name, 1908 "perf-config", 11)) { 1909 ret = parse_perf_config(optarg); 1910 if (ret) { 1911 printf("invalid perf-config\n"); 1912 print_usage(prgname); 1913 return -1; 1914 } 1915 } 1916 1917 if (!strncmp(lgopts[option_index].name, 1918 "high-perf-cores", 15)) { 1919 ret = parse_perf_core_list(optarg); 1920 if (ret) { 1921 printf("invalid high-perf-cores\n"); 1922 print_usage(prgname); 1923 return -1; 1924 } 1925 } 1926 1927 if (!strncmp(lgopts[option_index].name, 1928 "no-numa", 7)) { 1929 printf("numa is disabled \n"); 1930 numa_on = 0; 1931 } 1932 1933 if (!strncmp(lgopts[option_index].name, 1934 CMD_LINE_OPT_LEGACY, 1935 sizeof(CMD_LINE_OPT_LEGACY))) { 1936 if (app_mode != APP_MODE_DEFAULT) { 1937 printf(" legacy mode is mutually exclusive with other modes\n"); 1938 return -1; 1939 } 1940 app_mode = APP_MODE_LEGACY; 1941 printf("legacy mode is enabled\n"); 1942 } 1943 1944 if (!strncmp(lgopts[option_index].name, 1945 CMD_LINE_OPT_EMPTY_POLL, 10)) { 1946 if (app_mode != APP_MODE_DEFAULT) { 1947 printf(" empty-poll mode is mutually exclusive with other modes\n"); 1948 return -1; 1949 } 1950 app_mode = APP_MODE_EMPTY_POLL; 1951 ret = parse_ep_config(optarg); 1952 1953 if (ret) { 1954 printf("invalid empty poll config\n"); 1955 print_usage(prgname); 1956 return -1; 1957 } 1958 printf("empty-poll is enabled\n"); 1959 } 1960 1961 if (!strncmp(lgopts[option_index].name, 1962 CMD_LINE_OPT_TELEMETRY, 1963 sizeof(CMD_LINE_OPT_TELEMETRY))) { 1964 if (app_mode != APP_MODE_DEFAULT) { 1965 printf(" telemetry mode is mutually exclusive with other modes\n"); 1966 return -1; 1967 } 1968 app_mode = APP_MODE_TELEMETRY; 1969 printf("telemetry mode is enabled\n"); 1970 } 1971 1972 if (!strncmp(lgopts[option_index].name, 1973 CMD_LINE_OPT_PMD_MGMT, 1974 sizeof(CMD_LINE_OPT_PMD_MGMT))) { 1975 if (app_mode != APP_MODE_DEFAULT) { 1976 printf(" power mgmt mode is mutually exclusive with other modes\n"); 1977 return -1; 1978 } 1979 if (parse_pmd_mgmt_config(optarg) < 0) { 1980 printf(" Invalid PMD power management mode: %s\n", 1981 optarg); 1982 return -1; 1983 } 1984 app_mode = APP_MODE_PMD_MGMT; 1985 printf("PMD power mgmt mode is enabled\n"); 1986 } 1987 if (!strncmp(lgopts[option_index].name, 1988 CMD_LINE_OPT_INTERRUPT_ONLY, 1989 sizeof(CMD_LINE_OPT_INTERRUPT_ONLY))) { 1990 if (app_mode != APP_MODE_DEFAULT) { 1991 printf(" interrupt-only mode is mutually exclusive with other modes\n"); 1992 return -1; 1993 } 1994 app_mode = APP_MODE_INTERRUPT; 1995 printf("interrupt-only mode is enabled\n"); 1996 } 1997 1998 if (!strncmp(lgopts[option_index].name, 1999 CMD_LINE_OPT_MAX_PKT_LEN, 2000 sizeof(CMD_LINE_OPT_MAX_PKT_LEN))) { 2001 printf("Custom frame size is configured\n"); 2002 
max_pkt_len = parse_max_pkt_len(optarg); 2003 } 2004 2005 if (!strncmp(lgopts[option_index].name, 2006 CMD_LINE_OPT_PARSE_PTYPE, 2007 sizeof(CMD_LINE_OPT_PARSE_PTYPE))) { 2008 printf("soft parse-ptype is enabled\n"); 2009 parse_ptype = 1; 2010 } 2011 2012 if (!strncmp(lgopts[option_index].name, 2013 CMD_LINE_OPT_MAX_EMPTY_POLLS, 2014 sizeof(CMD_LINE_OPT_MAX_EMPTY_POLLS))) { 2015 printf("Maximum empty polls configured\n"); 2016 max_empty_polls = parse_int(optarg); 2017 } 2018 2019 if (!strncmp(lgopts[option_index].name, 2020 CMD_LINE_OPT_PAUSE_DURATION, 2021 sizeof(CMD_LINE_OPT_PAUSE_DURATION))) { 2022 printf("Pause duration configured\n"); 2023 pause_duration = parse_int(optarg); 2024 } 2025 2026 if (!strncmp(lgopts[option_index].name, 2027 CMD_LINE_OPT_SCALE_FREQ_MIN, 2028 sizeof(CMD_LINE_OPT_SCALE_FREQ_MIN))) { 2029 printf("Scaling frequency minimum configured\n"); 2030 scale_freq_min = parse_int(optarg); 2031 } 2032 2033 if (!strncmp(lgopts[option_index].name, 2034 CMD_LINE_OPT_SCALE_FREQ_MAX, 2035 sizeof(CMD_LINE_OPT_SCALE_FREQ_MAX))) { 2036 printf("Scaling frequency maximum configured\n"); 2037 scale_freq_max = parse_int(optarg); 2038 } 2039 2040 break; 2041 2042 default: 2043 print_usage(prgname); 2044 return -1; 2045 } 2046 } 2047 2048 if (optind >= 0) 2049 argv[optind-1] = prgname; 2050 2051 ret = optind-1; 2052 optind = 1; /* reset getopt lib */ 2053 return ret; 2054 } 2055 2056 static void 2057 print_ethaddr(const char *name, const struct rte_ether_addr *eth_addr) 2058 { 2059 char buf[RTE_ETHER_ADDR_FMT_SIZE]; 2060 rte_ether_format_addr(buf, RTE_ETHER_ADDR_FMT_SIZE, eth_addr); 2061 printf("%s%s", name, buf); 2062 } 2063 2064 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2065 static void 2066 setup_hash(int socketid) 2067 { 2068 struct rte_hash_parameters ipv4_l3fwd_hash_params = { 2069 .name = NULL, 2070 .entries = L3FWD_HASH_ENTRIES, 2071 .key_len = sizeof(struct ipv4_5tuple), 2072 .hash_func = DEFAULT_HASH_FUNC, 2073 .hash_func_init_val = 0, 2074 }; 2075 2076 struct rte_hash_parameters ipv6_l3fwd_hash_params = { 2077 .name = NULL, 2078 .entries = L3FWD_HASH_ENTRIES, 2079 .key_len = sizeof(struct ipv6_5tuple), 2080 .hash_func = DEFAULT_HASH_FUNC, 2081 .hash_func_init_val = 0, 2082 }; 2083 2084 unsigned i; 2085 int ret; 2086 char s[64]; 2087 2088 /* create ipv4 hash */ 2089 snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid); 2090 ipv4_l3fwd_hash_params.name = s; 2091 ipv4_l3fwd_hash_params.socket_id = socketid; 2092 ipv4_l3fwd_lookup_struct[socketid] = 2093 rte_hash_create(&ipv4_l3fwd_hash_params); 2094 if (ipv4_l3fwd_lookup_struct[socketid] == NULL) 2095 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on " 2096 "socket %d\n", socketid); 2097 2098 /* create ipv6 hash */ 2099 snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid); 2100 ipv6_l3fwd_hash_params.name = s; 2101 ipv6_l3fwd_hash_params.socket_id = socketid; 2102 ipv6_l3fwd_lookup_struct[socketid] = 2103 rte_hash_create(&ipv6_l3fwd_hash_params); 2104 if (ipv6_l3fwd_lookup_struct[socketid] == NULL) 2105 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on " 2106 "socket %d\n", socketid); 2107 2108 2109 /* populate the ipv4 hash */ 2110 for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) { 2111 ret = rte_hash_add_key (ipv4_l3fwd_lookup_struct[socketid], 2112 (void *) &ipv4_l3fwd_route_array[i].key); 2113 if (ret < 0) { 2114 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the" 2115 "l3fwd hash on socket %d\n", i, socketid); 2116 } 2117 ipv4_l3fwd_out_if[ret] = ipv4_l3fwd_route_array[i].if_out; 2118 
printf("Hash: Adding key\n"); 2119 print_ipv4_key(ipv4_l3fwd_route_array[i].key); 2120 } 2121 2122 /* populate the ipv6 hash */ 2123 for (i = 0; i < RTE_DIM(ipv6_l3fwd_route_array); i++) { 2124 ret = rte_hash_add_key (ipv6_l3fwd_lookup_struct[socketid], 2125 (void *) &ipv6_l3fwd_route_array[i].key); 2126 if (ret < 0) { 2127 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the" 2128 "l3fwd hash on socket %d\n", i, socketid); 2129 } 2130 ipv6_l3fwd_out_if[ret] = ipv6_l3fwd_route_array[i].if_out; 2131 printf("Hash: Adding key\n"); 2132 print_ipv6_key(ipv6_l3fwd_route_array[i].key); 2133 } 2134 } 2135 #endif 2136 2137 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2138 static void 2139 setup_lpm(int socketid) 2140 { 2141 unsigned i; 2142 int ret; 2143 char s[64]; 2144 2145 /* create the LPM table */ 2146 struct rte_lpm_config lpm_ipv4_config; 2147 2148 lpm_ipv4_config.max_rules = IPV4_L3FWD_LPM_MAX_RULES; 2149 lpm_ipv4_config.number_tbl8s = 256; 2150 lpm_ipv4_config.flags = 0; 2151 2152 snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid); 2153 ipv4_l3fwd_lookup_struct[socketid] = 2154 rte_lpm_create(s, socketid, &lpm_ipv4_config); 2155 if (ipv4_l3fwd_lookup_struct[socketid] == NULL) 2156 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table" 2157 " on socket %d\n", socketid); 2158 2159 /* populate the LPM table */ 2160 for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) { 2161 ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid], 2162 ipv4_l3fwd_route_array[i].ip, 2163 ipv4_l3fwd_route_array[i].depth, 2164 ipv4_l3fwd_route_array[i].if_out); 2165 2166 if (ret < 0) { 2167 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the " 2168 "l3fwd LPM table on socket %d\n", 2169 i, socketid); 2170 } 2171 2172 printf("LPM: Adding route 0x%08x / %d (%d)\n", 2173 (unsigned)ipv4_l3fwd_route_array[i].ip, 2174 ipv4_l3fwd_route_array[i].depth, 2175 ipv4_l3fwd_route_array[i].if_out); 2176 } 2177 } 2178 #endif 2179 2180 static int 2181 init_mem(unsigned nb_mbuf) 2182 { 2183 struct lcore_conf *qconf; 2184 int socketid; 2185 unsigned lcore_id; 2186 char s[64]; 2187 2188 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2189 if (rte_lcore_is_enabled(lcore_id) == 0) 2190 continue; 2191 2192 if (numa_on) 2193 socketid = rte_lcore_to_socket_id(lcore_id); 2194 else 2195 socketid = 0; 2196 2197 if (socketid >= NB_SOCKETS) { 2198 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is " 2199 "out of range %d\n", socketid, 2200 lcore_id, NB_SOCKETS); 2201 } 2202 if (pktmbuf_pool[socketid] == NULL) { 2203 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 2204 pktmbuf_pool[socketid] = 2205 rte_pktmbuf_pool_create(s, nb_mbuf, 2206 MEMPOOL_CACHE_SIZE, 0, 2207 RTE_MBUF_DEFAULT_BUF_SIZE, 2208 socketid); 2209 if (pktmbuf_pool[socketid] == NULL) 2210 rte_exit(EXIT_FAILURE, 2211 "Cannot init mbuf pool on socket %d\n", 2212 socketid); 2213 else 2214 printf("Allocated mbuf pool on socket %d\n", 2215 socketid); 2216 2217 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2218 setup_lpm(socketid); 2219 #else 2220 setup_hash(socketid); 2221 #endif 2222 } 2223 qconf = &lcore_conf[lcore_id]; 2224 qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid]; 2225 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2226 qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid]; 2227 #endif 2228 } 2229 return 0; 2230 } 2231 2232 /* Check the link status of all ports in up to 9s, and print them finally */ 2233 static void 2234 check_all_ports_link_status(uint32_t port_mask) 2235 { 2236 #define CHECK_INTERVAL 100 /* 100ms */ 2237 #define 
	uint8_t count, all_ports_up, print_flag = 0;
	uint16_t portid;
	struct rte_eth_link link;
	int ret;
	char link_status_text[RTE_ETH_LINK_MAX_STR_LEN];

	printf("\nChecking link status");
	fflush(stdout);
	for (count = 0; count <= MAX_CHECK_TIME; count++) {
		all_ports_up = 1;
		RTE_ETH_FOREACH_DEV(portid) {
			if ((port_mask & (1 << portid)) == 0)
				continue;
			memset(&link, 0, sizeof(link));
			ret = rte_eth_link_get_nowait(portid, &link);
			if (ret < 0) {
				all_ports_up = 0;
				if (print_flag == 1)
					printf("Port %u link get failed: %s\n",
						portid, rte_strerror(-ret));
				continue;
			}
			/* print link status if flag set */
			if (print_flag == 1) {
				rte_eth_link_to_str(link_status_text,
					sizeof(link_status_text), &link);
				printf("Port %d %s\n", portid,
					link_status_text);
				continue;
			}
			/* clear all_ports_up flag if any link down */
			if (link.link_status == RTE_ETH_LINK_DOWN) {
				all_ports_up = 0;
				break;
			}
		}
		/* after finally printing all link status, get out */
		if (print_flag == 1)
			break;

		if (all_ports_up == 0) {
			printf(".");
			fflush(stdout);
			rte_delay_ms(CHECK_INTERVAL);
		}

		/* set the print_flag if all ports up or timeout */
		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
			print_flag = 1;
			printf("done\n");
		}
	}
}

static int check_ptype(uint16_t portid)
{
	int i, ret;
	int ptype_l3_ipv4 = 0;
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
	int ptype_l3_ipv6 = 0;
#endif
	uint32_t ptype_mask = RTE_PTYPE_L3_MASK;

	ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, NULL, 0);
	if (ret <= 0)
		return 0;

	uint32_t ptypes[ret];

	ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, ptypes, ret);
	for (i = 0; i < ret; ++i) {
		if (ptypes[i] & RTE_PTYPE_L3_IPV4)
			ptype_l3_ipv4 = 1;
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
		if (ptypes[i] & RTE_PTYPE_L3_IPV6)
			ptype_l3_ipv6 = 1;
#endif
	}

	if (ptype_l3_ipv4 == 0)
		printf("port %d cannot parse RTE_PTYPE_L3_IPV4\n", portid);

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
	if (ptype_l3_ipv6 == 0)
		printf("port %d cannot parse RTE_PTYPE_L3_IPV6\n", portid);
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
	if (ptype_l3_ipv4)
#else /* APP_LOOKUP_EXACT_MATCH */
	if (ptype_l3_ipv4 && ptype_l3_ipv6)
#endif
		return 1;

	return 0;
}

static int
init_power_library(void)
{
	enum power_management_env env;
	unsigned int lcore_id;
	int ret = 0;

	RTE_LCORE_FOREACH(lcore_id) {
		/* init power management library */
		ret = rte_power_init(lcore_id);
		if (ret) {
			RTE_LOG(ERR, POWER,
				"Library initialization failed on core %u\n",
				lcore_id);
			return ret;
		}
		/* we're not supporting the VM channel mode */
		env = rte_power_get_env();
		if (env != PM_ENV_ACPI_CPUFREQ &&
				env != PM_ENV_PSTATE_CPUFREQ) {
			RTE_LOG(ERR, POWER,
				"Only ACPI and PSTATE modes are supported\n");
			return -1;
		}
	}
	return ret;
}

static int
deinit_power_library(void)
{
	unsigned int lcore_id;
	int ret = 0;

	RTE_LCORE_FOREACH(lcore_id) {
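		/*
		 * For the ACPI/PSTATE environments used here, rte_power_exit()
		 * also restores the cpufreq governor that rte_power_init()
		 * saved, so a clean shutdown leaves the host's frequency
		 * scaling configuration as it was found.
		 */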
		/* deinit power management library */
		ret = rte_power_exit(lcore_id);
		if (ret) {
			RTE_LOG(ERR, POWER,
				"Library deinitialization failed on core %u\n",
				lcore_id);
			return ret;
		}
	}
	return ret;
}

static void
get_current_stat_values(uint64_t *values)
{
	unsigned int lcore_id = rte_lcore_id();
	struct lcore_conf *qconf;
	uint64_t app_eps = 0, app_fps = 0, app_br = 0;
	uint64_t count = 0;

	RTE_LCORE_FOREACH_WORKER(lcore_id) {
		qconf = &lcore_conf[lcore_id];
		if (qconf->n_rx_queue == 0)
			continue;
		count++;
		rte_spinlock_lock(&stats[lcore_id].telemetry_lock);
		app_eps += stats[lcore_id].ep_nep[1];
		app_fps += stats[lcore_id].fp_nfp[1];
		app_br += stats[lcore_id].br;
		rte_spinlock_unlock(&stats[lcore_id].telemetry_lock);
	}

	if (count > 0) {
		values[0] = app_eps/count;
		values[1] = app_fps/count;
		values[2] = app_br/count;
	} else
		memset(values, 0, sizeof(uint64_t) * NUM_TELSTATS);
}

static void
update_telemetry(__rte_unused struct rte_timer *tim,
		__rte_unused void *arg)
{
	int ret;
	uint64_t values[NUM_TELSTATS] = {0};

	get_current_stat_values(values);
	ret = rte_metrics_update_values(RTE_METRICS_GLOBAL, telstats_index,
					values, RTE_DIM(values));
	if (ret < 0)
		RTE_LOG(WARNING, POWER, "failed to update metrics\n");
}

static int
handle_app_stats(const char *cmd __rte_unused,
		const char *params __rte_unused,
		struct rte_tel_data *d)
{
	uint64_t values[NUM_TELSTATS] = {0};
	uint32_t i;

	rte_tel_data_start_dict(d);
	get_current_stat_values(values);
	for (i = 0; i < NUM_TELSTATS; i++)
		rte_tel_data_add_dict_u64(d, telstats_strings[i].name,
				values[i]);
	return 0;
}

static void
telemetry_setup_timer(void)
{
	int lcore_id = rte_lcore_id();
	uint64_t hz = rte_get_timer_hz();
	uint64_t ticks;

	ticks = hz / TELEMETRY_INTERVALS_PER_SEC;
	rte_timer_reset_sync(&telemetry_timer,
			ticks,
			PERIODICAL,
			lcore_id,
			update_telemetry,
			NULL);
}

static void
empty_poll_setup_timer(void)
{
	int lcore_id = rte_lcore_id();
	uint64_t hz = rte_get_timer_hz();

	struct ep_params *ep_ptr = ep_params;

	ep_ptr->interval_ticks = hz / INTERVALS_PER_SECOND;

	rte_timer_reset_sync(&ep_ptr->timer0,
			ep_ptr->interval_ticks,
			PERIODICAL,
			lcore_id,
			rte_empty_poll_detection,
			(void *)ep_ptr);
}

static int
launch_timer(unsigned int lcore_id)
{
	int64_t prev_tsc = 0, cur_tsc, diff_tsc, cycles_10ms;

	RTE_SET_USED(lcore_id);

	if (rte_get_main_lcore() != lcore_id) {
		rte_panic("timer on lcore:%d which is not main core:%d\n",
				lcore_id,
				rte_get_main_lcore());
	}

	RTE_LOG(INFO, POWER, "Bringing up the timer\n");

	if (app_mode == APP_MODE_EMPTY_POLL)
		empty_poll_setup_timer();
	else
		telemetry_setup_timer();

	cycles_10ms = rte_get_timer_hz() / 100;

	while (!is_done()) {
		cur_tsc = rte_rdtsc();
		diff_tsc = cur_tsc - prev_tsc;
		if (diff_tsc > cycles_10ms) {
			rte_timer_manage();
			prev_tsc = cur_tsc;
			cycles_10ms = rte_get_timer_hz() / 100;
		}
	}

	RTE_LOG(INFO, POWER, "Timer subsystem is done\n");

	return 0;
}

static int
autodetect_mode(void)
{
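	/*
	 * Reached only when no explicit mode option was given on the command
	 * line, i.e. app_mode is still APP_MODE_DEFAULT when main() calls this.
	 */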
	RTE_LOG(NOTICE, L3FWD_POWER, "Operating mode not specified, probing frequency scaling support...\n");

	/*
	 * Empty poll and telemetry modes have to be specifically requested to
	 * be enabled, but we can auto-detect between interrupt mode with or
	 * without frequency scaling. Both ACPI and pstate can be used.
	 */
	if (rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ))
		return APP_MODE_LEGACY;
	if (rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ))
		return APP_MODE_LEGACY;

	RTE_LOG(NOTICE, L3FWD_POWER, "Frequency scaling not supported, selecting interrupt-only mode\n");

	return APP_MODE_INTERRUPT;
}

static const char *
mode_to_str(enum appmode mode)
{
	switch (mode) {
	case APP_MODE_LEGACY:
		return "legacy";
	case APP_MODE_EMPTY_POLL:
		return "empty poll";
	case APP_MODE_TELEMETRY:
		return "telemetry";
	case APP_MODE_INTERRUPT:
		return "interrupt-only";
	case APP_MODE_PMD_MGMT:
		return "pmd mgmt";
	default:
		return "invalid";
	}
}

static uint32_t
eth_dev_get_overhead_len(uint32_t max_rx_pktlen, uint16_t max_mtu)
{
	uint32_t overhead_len;

	if (max_mtu != UINT16_MAX && max_rx_pktlen > max_mtu)
		overhead_len = max_rx_pktlen - max_mtu;
	else
		overhead_len = RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;

	return overhead_len;
}

static int
config_port_max_pkt_len(struct rte_eth_conf *conf,
		struct rte_eth_dev_info *dev_info)
{
	uint32_t overhead_len;

	if (max_pkt_len == 0)
		return 0;

	if (max_pkt_len < RTE_ETHER_MIN_LEN || max_pkt_len > MAX_JUMBO_PKT_LEN)
		return -1;

	overhead_len = eth_dev_get_overhead_len(dev_info->max_rx_pktlen,
			dev_info->max_mtu);
	conf->rxmode.mtu = max_pkt_len - overhead_len;

	if (conf->rxmode.mtu > RTE_ETHER_MTU)
		conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS;

	return 0;
}

/* Power library initialized in the main routine. 8< */
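/*
 * Application entry point: initialise EAL and the timer subsystem, parse the
 * application arguments, pick the power-management mode, configure every
 * enabled port (RX/TX queues, mbuf pools, optional Rx interrupts and PMD
 * power management), start the ports, launch the per-lcore loop that matches
 * the selected mode, and finally tear everything down again.
 */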
8< */ 2588 int 2589 main(int argc, char **argv) 2590 { 2591 struct lcore_conf *qconf; 2592 struct rte_eth_dev_info dev_info; 2593 struct rte_eth_txconf *txconf; 2594 int ret; 2595 uint16_t nb_ports; 2596 uint16_t queueid; 2597 unsigned lcore_id; 2598 uint64_t hz; 2599 uint32_t n_tx_queue, nb_lcores; 2600 uint32_t dev_rxq_num, dev_txq_num; 2601 uint8_t nb_rx_queue, queue, socketid; 2602 uint16_t portid; 2603 const char *ptr_strings[NUM_TELSTATS]; 2604 2605 /* init EAL */ 2606 ret = rte_eal_init(argc, argv); 2607 if (ret < 0) 2608 rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n"); 2609 argc -= ret; 2610 argv += ret; 2611 2612 /* catch SIGINT and restore cpufreq governor to ondemand */ 2613 signal(SIGINT, signal_exit_now); 2614 2615 /* init RTE timer library to be used late */ 2616 rte_timer_subsystem_init(); 2617 2618 /* if we're running pmd-mgmt mode, don't default to baseline mode */ 2619 baseline_enabled = false; 2620 2621 /* parse application arguments (after the EAL ones) */ 2622 ret = parse_args(argc, argv); 2623 if (ret < 0) 2624 rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n"); 2625 2626 if (app_mode == APP_MODE_DEFAULT) 2627 app_mode = autodetect_mode(); 2628 2629 RTE_LOG(INFO, L3FWD_POWER, "Selected operation mode: %s\n", 2630 mode_to_str(app_mode)); 2631 2632 /* only legacy and empty poll mode rely on power library */ 2633 if ((app_mode == APP_MODE_LEGACY || app_mode == APP_MODE_EMPTY_POLL) && 2634 init_power_library()) 2635 rte_exit(EXIT_FAILURE, "init_power_library failed\n"); 2636 2637 if (update_lcore_params() < 0) 2638 rte_exit(EXIT_FAILURE, "update_lcore_params failed\n"); 2639 2640 if (check_lcore_params() < 0) 2641 rte_exit(EXIT_FAILURE, "check_lcore_params failed\n"); 2642 2643 ret = init_lcore_rx_queues(); 2644 if (ret < 0) 2645 rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n"); 2646 2647 nb_ports = rte_eth_dev_count_avail(); 2648 2649 if (check_port_config() < 0) 2650 rte_exit(EXIT_FAILURE, "check_port_config failed\n"); 2651 2652 nb_lcores = rte_lcore_count(); 2653 2654 /* initialize all ports */ 2655 RTE_ETH_FOREACH_DEV(portid) { 2656 struct rte_eth_conf local_port_conf = port_conf; 2657 /* not all app modes need interrupts */ 2658 bool need_intr = app_mode == APP_MODE_LEGACY || 2659 app_mode == APP_MODE_INTERRUPT; 2660 2661 /* skip ports that are not enabled */ 2662 if ((enabled_port_mask & (1 << portid)) == 0) { 2663 printf("\nSkipping disabled port %d\n", portid); 2664 continue; 2665 } 2666 2667 /* init port */ 2668 printf("Initializing port %d ... ", portid ); 2669 fflush(stdout); 2670 2671 ret = rte_eth_dev_info_get(portid, &dev_info); 2672 if (ret != 0) 2673 rte_exit(EXIT_FAILURE, 2674 "Error during getting device (port %u) info: %s\n", 2675 portid, strerror(-ret)); 2676 2677 dev_rxq_num = dev_info.max_rx_queues; 2678 dev_txq_num = dev_info.max_tx_queues; 2679 2680 nb_rx_queue = get_port_n_rx_queues(portid); 2681 if (nb_rx_queue > dev_rxq_num) 2682 rte_exit(EXIT_FAILURE, 2683 "Cannot configure not existed rxq: " 2684 "port=%d\n", portid); 2685 2686 n_tx_queue = nb_lcores; 2687 if (n_tx_queue > dev_txq_num) 2688 n_tx_queue = dev_txq_num; 2689 printf("Creating queues: nb_rxq=%d nb_txq=%u... 
", 2690 nb_rx_queue, (unsigned)n_tx_queue ); 2691 /* If number of Rx queue is 0, no need to enable Rx interrupt */ 2692 if (nb_rx_queue == 0) 2693 need_intr = false; 2694 2695 if (need_intr) 2696 local_port_conf.intr_conf.rxq = 1; 2697 2698 ret = rte_eth_dev_info_get(portid, &dev_info); 2699 if (ret != 0) 2700 rte_exit(EXIT_FAILURE, 2701 "Error during getting device (port %u) info: %s\n", 2702 portid, strerror(-ret)); 2703 2704 ret = config_port_max_pkt_len(&local_port_conf, &dev_info); 2705 if (ret != 0) 2706 rte_exit(EXIT_FAILURE, 2707 "Invalid max packet length: %u (port %u)\n", 2708 max_pkt_len, portid); 2709 2710 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 2711 local_port_conf.txmode.offloads |= 2712 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; 2713 2714 local_port_conf.rx_adv_conf.rss_conf.rss_hf &= 2715 dev_info.flow_type_rss_offloads; 2716 if (local_port_conf.rx_adv_conf.rss_conf.rss_hf != 2717 port_conf.rx_adv_conf.rss_conf.rss_hf) { 2718 printf("Port %u modified RSS hash function based on hardware support," 2719 "requested:%#"PRIx64" configured:%#"PRIx64"\n", 2720 portid, 2721 port_conf.rx_adv_conf.rss_conf.rss_hf, 2722 local_port_conf.rx_adv_conf.rss_conf.rss_hf); 2723 } 2724 2725 if (local_port_conf.rx_adv_conf.rss_conf.rss_hf == 0) 2726 local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE; 2727 local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa; 2728 port_conf.rxmode.offloads = local_port_conf.rxmode.offloads; 2729 2730 ret = rte_eth_dev_configure(portid, nb_rx_queue, 2731 (uint16_t)n_tx_queue, &local_port_conf); 2732 if (ret < 0) 2733 rte_exit(EXIT_FAILURE, "Cannot configure device: " 2734 "err=%d, port=%d\n", ret, portid); 2735 2736 ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, 2737 &nb_txd); 2738 if (ret < 0) 2739 rte_exit(EXIT_FAILURE, 2740 "Cannot adjust number of descriptors: err=%d, port=%d\n", 2741 ret, portid); 2742 2743 ret = rte_eth_macaddr_get(portid, &ports_eth_addr[portid]); 2744 if (ret < 0) 2745 rte_exit(EXIT_FAILURE, 2746 "Cannot get MAC address: err=%d, port=%d\n", 2747 ret, portid); 2748 2749 print_ethaddr(" Address:", &ports_eth_addr[portid]); 2750 printf(", "); 2751 2752 /* init memory */ 2753 ret = init_mem(NB_MBUF); 2754 if (ret < 0) 2755 rte_exit(EXIT_FAILURE, "init_mem failed\n"); 2756 2757 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2758 if (rte_lcore_is_enabled(lcore_id) == 0) 2759 continue; 2760 2761 /* Initialize TX buffers */ 2762 qconf = &lcore_conf[lcore_id]; 2763 qconf->tx_buffer[portid] = rte_zmalloc_socket("tx_buffer", 2764 RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0, 2765 rte_eth_dev_socket_id(portid)); 2766 if (qconf->tx_buffer[portid] == NULL) 2767 rte_exit(EXIT_FAILURE, "Can't allocate tx buffer for port %u\n", 2768 portid); 2769 2770 rte_eth_tx_buffer_init(qconf->tx_buffer[portid], MAX_PKT_BURST); 2771 } 2772 2773 /* init one TX queue per couple (lcore,port) */ 2774 queueid = 0; 2775 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2776 if (rte_lcore_is_enabled(lcore_id) == 0) 2777 continue; 2778 2779 if (queueid >= dev_txq_num) 2780 continue; 2781 2782 if (numa_on) 2783 socketid = \ 2784 (uint8_t)rte_lcore_to_socket_id(lcore_id); 2785 else 2786 socketid = 0; 2787 2788 printf("txq=%u,%d,%d ", lcore_id, queueid, socketid); 2789 fflush(stdout); 2790 2791 txconf = &dev_info.default_txconf; 2792 txconf->offloads = local_port_conf.txmode.offloads; 2793 ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd, 2794 socketid, txconf); 2795 if (ret < 0) 2796 rte_exit(EXIT_FAILURE, 2797 
"rte_eth_tx_queue_setup: err=%d, " 2798 "port=%d\n", ret, portid); 2799 2800 qconf = &lcore_conf[lcore_id]; 2801 qconf->tx_queue_id[portid] = queueid; 2802 queueid++; 2803 2804 qconf->tx_port_id[qconf->n_tx_port] = portid; 2805 qconf->n_tx_port++; 2806 } 2807 printf("\n"); 2808 } 2809 2810 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2811 if (rte_lcore_is_enabled(lcore_id) == 0) 2812 continue; 2813 2814 if (app_mode == APP_MODE_LEGACY) { 2815 /* init timer structures for each enabled lcore */ 2816 rte_timer_init(&power_timers[lcore_id]); 2817 hz = rte_get_timer_hz(); 2818 rte_timer_reset(&power_timers[lcore_id], 2819 hz/TIMER_NUMBER_PER_SECOND, 2820 SINGLE, lcore_id, 2821 power_timer_cb, NULL); 2822 } 2823 qconf = &lcore_conf[lcore_id]; 2824 printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); 2825 fflush(stdout); 2826 2827 /* init RX queues */ 2828 for(queue = 0; queue < qconf->n_rx_queue; ++queue) { 2829 struct rte_eth_rxconf rxq_conf; 2830 2831 portid = qconf->rx_queue_list[queue].port_id; 2832 queueid = qconf->rx_queue_list[queue].queue_id; 2833 2834 if (numa_on) 2835 socketid = \ 2836 (uint8_t)rte_lcore_to_socket_id(lcore_id); 2837 else 2838 socketid = 0; 2839 2840 printf("rxq=%d,%d,%d ", portid, queueid, socketid); 2841 fflush(stdout); 2842 2843 ret = rte_eth_dev_info_get(portid, &dev_info); 2844 if (ret != 0) 2845 rte_exit(EXIT_FAILURE, 2846 "Error during getting device (port %u) info: %s\n", 2847 portid, strerror(-ret)); 2848 2849 rxq_conf = dev_info.default_rxconf; 2850 rxq_conf.offloads = port_conf.rxmode.offloads; 2851 ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd, 2852 socketid, &rxq_conf, 2853 pktmbuf_pool[socketid]); 2854 if (ret < 0) 2855 rte_exit(EXIT_FAILURE, 2856 "rte_eth_rx_queue_setup: err=%d, " 2857 "port=%d\n", ret, portid); 2858 2859 if (parse_ptype) { 2860 if (add_cb_parse_ptype(portid, queueid) < 0) 2861 rte_exit(EXIT_FAILURE, 2862 "Fail to add ptype cb\n"); 2863 } 2864 2865 if (app_mode == APP_MODE_PMD_MGMT && !baseline_enabled) { 2866 /* Set power_pmd_mgmt configs passed by user */ 2867 rte_power_pmd_mgmt_set_emptypoll_max(max_empty_polls); 2868 ret = rte_power_pmd_mgmt_set_pause_duration(pause_duration); 2869 if (ret < 0) 2870 rte_exit(EXIT_FAILURE, 2871 "Error setting pause_duration: err=%d, lcore=%d\n", 2872 ret, lcore_id); 2873 2874 ret = rte_power_pmd_mgmt_set_scaling_freq_min(lcore_id, 2875 scale_freq_min); 2876 if (ret < 0) 2877 rte_exit(EXIT_FAILURE, 2878 "Error setting scaling freq min: err=%d, lcore=%d\n", 2879 ret, lcore_id); 2880 2881 ret = rte_power_pmd_mgmt_set_scaling_freq_max(lcore_id, 2882 scale_freq_max); 2883 if (ret < 0) 2884 rte_exit(EXIT_FAILURE, 2885 "Error setting scaling freq max: err=%d, lcore %d\n", 2886 ret, lcore_id); 2887 2888 ret = rte_power_ethdev_pmgmt_queue_enable( 2889 lcore_id, portid, queueid, 2890 pmgmt_type); 2891 if (ret < 0) 2892 rte_exit(EXIT_FAILURE, 2893 "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", 2894 ret, portid); 2895 } 2896 } 2897 } 2898 /* >8 End of power library initialization. */ 2899 2900 printf("\n"); 2901 2902 /* start ports */ 2903 RTE_ETH_FOREACH_DEV(portid) { 2904 if ((enabled_port_mask & (1 << portid)) == 0) { 2905 continue; 2906 } 2907 /* Start device */ 2908 ret = rte_eth_dev_start(portid); 2909 if (ret < 0) 2910 rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, " 2911 "port=%d\n", ret, portid); 2912 /* 2913 * If enabled, put device in promiscuous mode. 
		/*
		 * If enabled, put device in promiscuous mode.
		 * This allows IO forwarding mode to forward packets
		 * to itself through 2 cross-connected ports of the
		 * target machine.
		 */
		if (promiscuous_on) {
			ret = rte_eth_promiscuous_enable(portid);
			if (ret != 0)
				rte_exit(EXIT_FAILURE,
					"rte_eth_promiscuous_enable: err=%s, port=%u\n",
					rte_strerror(-ret), portid);
		}
		/* initialize spinlock for each port */
		rte_spinlock_init(&(locks[portid]));

		if (!parse_ptype)
			if (!check_ptype(portid))
				rte_exit(EXIT_FAILURE,
					"PMD cannot provide needed ptypes\n");
	}

	check_all_ports_link_status(enabled_port_mask);

	if (app_mode == APP_MODE_EMPTY_POLL) {

		if (empty_poll_train) {
			policy.state = TRAINING;
		} else {
			policy.state = MED_NORMAL;
			policy.med_base_edpi = ep_med_edpi;
			policy.hgh_base_edpi = ep_hgh_edpi;
		}

		ret = rte_power_empty_poll_stat_init(&ep_params,
				freq_tlb,
				&policy);
		if (ret < 0)
			rte_exit(EXIT_FAILURE, "empty poll init failed\n");
	}

	/* launch per-lcore init on every lcore */
	if (app_mode == APP_MODE_LEGACY) {
		rte_eal_mp_remote_launch(main_legacy_loop, NULL, CALL_MAIN);
	} else if (app_mode == APP_MODE_EMPTY_POLL) {
		empty_poll_stop = false;
		rte_eal_mp_remote_launch(main_empty_poll_loop, NULL,
				SKIP_MAIN);
	} else if (app_mode == APP_MODE_TELEMETRY) {
		unsigned int i;

		/* Init metrics library */
		rte_metrics_init(rte_socket_id());
		/* Register stats with metrics library */
		for (i = 0; i < NUM_TELSTATS; i++)
			ptr_strings[i] = telstats_strings[i].name;

		ret = rte_metrics_reg_names(ptr_strings, NUM_TELSTATS);
		if (ret >= 0)
			telstats_index = ret;
		else
			rte_exit(EXIT_FAILURE, "failed to register metrics names\n");

		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			rte_spinlock_init(&stats[lcore_id].telemetry_lock);
		}
		rte_timer_init(&telemetry_timer);
		rte_telemetry_register_cmd("/l3fwd-power/stats",
				handle_app_stats,
				"Returns global power stats. Parameters: None");
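		/*
		 * The stats registered above can then be read at runtime over
		 * the telemetry socket (for example with the dpdk-telemetry.py
		 * client) by querying /l3fwd-power/stats, which returns the
		 * averaged empty-poll, full-poll and busy-percent values
		 * produced by handle_app_stats().
		 */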
Parameters: None"); 2983 rte_eal_mp_remote_launch(main_telemetry_loop, NULL, 2984 SKIP_MAIN); 2985 } else if (app_mode == APP_MODE_INTERRUPT) { 2986 rte_eal_mp_remote_launch(main_intr_loop, NULL, CALL_MAIN); 2987 } else if (app_mode == APP_MODE_PMD_MGMT) { 2988 /* reuse telemetry loop for PMD power management mode */ 2989 rte_eal_mp_remote_launch(main_telemetry_loop, NULL, CALL_MAIN); 2990 } 2991 2992 if (app_mode == APP_MODE_EMPTY_POLL || app_mode == APP_MODE_TELEMETRY) 2993 launch_timer(rte_lcore_id()); 2994 2995 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2996 if (rte_eal_wait_lcore(lcore_id) < 0) 2997 return -1; 2998 } 2999 3000 if (app_mode == APP_MODE_PMD_MGMT) { 3001 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 3002 if (rte_lcore_is_enabled(lcore_id) == 0) 3003 continue; 3004 qconf = &lcore_conf[lcore_id]; 3005 for (queue = 0; queue < qconf->n_rx_queue; ++queue) { 3006 portid = qconf->rx_queue_list[queue].port_id; 3007 queueid = qconf->rx_queue_list[queue].queue_id; 3008 3009 rte_power_ethdev_pmgmt_queue_disable(lcore_id, 3010 portid, queueid); 3011 } 3012 } 3013 } 3014 3015 RTE_ETH_FOREACH_DEV(portid) 3016 { 3017 if ((enabled_port_mask & (1 << portid)) == 0) 3018 continue; 3019 3020 ret = rte_eth_dev_stop(portid); 3021 if (ret != 0) 3022 RTE_LOG(ERR, L3FWD_POWER, "rte_eth_dev_stop: err=%d, port=%u\n", 3023 ret, portid); 3024 3025 rte_eth_dev_close(portid); 3026 } 3027 3028 if (app_mode == APP_MODE_EMPTY_POLL) 3029 rte_power_empty_poll_stat_free(); 3030 3031 if ((app_mode == APP_MODE_LEGACY || app_mode == APP_MODE_EMPTY_POLL) && 3032 deinit_power_library()) 3033 rte_exit(EXIT_FAILURE, "deinit_power_library failed\n"); 3034 3035 if (rte_eal_cleanup() < 0) 3036 RTE_LOG(ERR, L3FWD_POWER, "EAL cleanup failed\n"); 3037 3038 return 0; 3039 } 3040