1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2018 Intel Corporation 3 */ 4 5 #include <stdio.h> 6 #include <stdlib.h> 7 #include <stdint.h> 8 #include <inttypes.h> 9 #include <sys/types.h> 10 #include <string.h> 11 #include <sys/queue.h> 12 #include <stdarg.h> 13 #include <errno.h> 14 #include <getopt.h> 15 #include <unistd.h> 16 #include <signal.h> 17 #include <math.h> 18 19 #include <rte_common.h> 20 #include <rte_byteorder.h> 21 #include <rte_log.h> 22 #include <rte_malloc.h> 23 #include <rte_memory.h> 24 #include <rte_memcpy.h> 25 #include <rte_eal.h> 26 #include <rte_launch.h> 27 #include <rte_cycles.h> 28 #include <rte_prefetch.h> 29 #include <rte_lcore.h> 30 #include <rte_per_lcore.h> 31 #include <rte_branch_prediction.h> 32 #include <rte_interrupts.h> 33 #include <rte_random.h> 34 #include <rte_debug.h> 35 #include <rte_ether.h> 36 #include <rte_ethdev.h> 37 #include <rte_mempool.h> 38 #include <rte_mbuf.h> 39 #include <rte_ip.h> 40 #include <rte_tcp.h> 41 #include <rte_udp.h> 42 #include <rte_string_fns.h> 43 #include <rte_timer.h> 44 #include <rte_power.h> 45 #include <rte_spinlock.h> 46 #include <rte_power_empty_poll.h> 47 #include <rte_metrics.h> 48 #include <rte_telemetry.h> 49 #include <rte_power_pmd_mgmt.h> 50 51 #include "perf_core.h" 52 #include "main.h" 53 54 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1 55 56 #define MAX_PKT_BURST 32 57 58 #define MIN_ZERO_POLL_COUNT 10 59 60 /* 100 ms interval */ 61 #define TIMER_NUMBER_PER_SECOND 10 62 /* (10ms) */ 63 #define INTERVALS_PER_SECOND 100 64 /* 100000 us */ 65 #define SCALING_PERIOD (1000000/TIMER_NUMBER_PER_SECOND) 66 #define SCALING_DOWN_TIME_RATIO_THRESHOLD 0.25 67 68 #define APP_LOOKUP_EXACT_MATCH 0 69 #define APP_LOOKUP_LPM 1 70 #define DO_RFC_1812_CHECKS 71 72 #ifndef APP_LOOKUP_METHOD 73 #define APP_LOOKUP_METHOD APP_LOOKUP_LPM 74 #endif 75 76 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 77 #include <rte_hash.h> 78 #elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 79 #include <rte_lpm.h> 80 #else 81 #error "APP_LOOKUP_METHOD set to incorrect value" 82 #endif 83 84 #ifndef IPv6_BYTES 85 #define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\ 86 "%02x%02x:%02x%02x:%02x%02x:%02x%02x" 87 #define IPv6_BYTES(addr) \ 88 addr[0], addr[1], addr[2], addr[3], \ 89 addr[4], addr[5], addr[6], addr[7], \ 90 addr[8], addr[9], addr[10], addr[11],\ 91 addr[12], addr[13],addr[14], addr[15] 92 #endif 93 94 #define MAX_JUMBO_PKT_LEN 9600 95 96 #define IPV6_ADDR_LEN 16 97 98 #define MEMPOOL_CACHE_SIZE 256 99 100 /* 101 * This expression is used to calculate the number of mbufs needed depending on 102 * user input, taking into account memory for rx and tx hardware rings, cache 103 * per lcore and mtable per port per lcore. RTE_MAX is used to ensure that 104 * NB_MBUF never goes below a minimum value of 8192. 105 */ 106 107 #define NB_MBUF RTE_MAX ( \ 108 (nb_ports*nb_rx_queue*nb_rxd + \ 109 nb_ports*nb_lcores*MAX_PKT_BURST + \ 110 nb_ports*n_tx_queue*nb_txd + \ 111 nb_lcores*MEMPOOL_CACHE_SIZE), \ 112 (unsigned)8192) 113 114 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 115 116 #define NB_SOCKETS 8 117 118 /* Configure how many packets ahead to prefetch, when reading packets */ 119 #define PREFETCH_OFFSET 3 120 121 /* 122 * Configurable number of RX/TX ring descriptors 123 */ 124 #define RTE_TEST_RX_DESC_DEFAULT 1024 125 #define RTE_TEST_TX_DESC_DEFAULT 1024 126 127 /* 128 * These two thresholds were decided on by running the training algorithm on 129 * a 2.5GHz Xeon. 
These defaults can be overridden by supplying non-zero values 130 * for the med_threshold and high_threshold parameters on the command line. 131 */ 132 #define EMPTY_POLL_MED_THRESHOLD 350000UL 133 #define EMPTY_POLL_HGH_THRESHOLD 580000UL 134 135 #define NUM_TELSTATS RTE_DIM(telstats_strings) 136 137 static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT; 138 static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT; 139 140 /* ethernet addresses of ports */ 141 static struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; 142 143 /* ethernet addresses of ports */ 144 static rte_spinlock_t locks[RTE_MAX_ETHPORTS]; 145 146 /* mask of enabled ports */ 147 static uint32_t enabled_port_mask = 0; 148 /* Ports set in promiscuous mode off by default. */ 149 static int promiscuous_on = 0; 150 /* NUMA is enabled by default. */ 151 static int numa_on = 1; 152 static bool empty_poll_stop; 153 static bool empty_poll_train; 154 volatile bool quit_signal; 155 static struct ep_params *ep_params; 156 static struct ep_policy policy; 157 static long ep_med_edpi, ep_hgh_edpi; 158 /* timer to update telemetry every 500ms */ 159 static struct rte_timer telemetry_timer; 160 161 /* stats index returned by metrics lib */ 162 int telstats_index; 163 164 struct telstats_name { 165 char name[RTE_ETH_XSTATS_NAME_SIZE]; 166 }; 167 168 /* telemetry stats to be reported */ 169 const struct telstats_name telstats_strings[] = { 170 {"empty_poll"}, 171 {"full_poll"}, 172 {"busy_percent"} 173 }; 174 175 /* core busyness in percentage */ 176 enum busy_rate { 177 ZERO = 0, 178 PARTIAL = 50, 179 FULL = 100 180 }; 181 182 /* reference poll count to measure core busyness */ 183 #define DEFAULT_COUNT 10000 184 /* 185 * reference CYCLES to be used to 186 * measure core busyness based on poll count 187 */ 188 #define MIN_CYCLES 1500000ULL 189 #define MAX_CYCLES 22000000ULL 190 191 /* (500ms) */ 192 #define TELEMETRY_INTERVALS_PER_SEC 2 193 194 static int parse_ptype; /**< Parse packet type using rx callback, and */ 195 /**< disabled by default */ 196 197 enum appmode { 198 APP_MODE_DEFAULT = 0, 199 APP_MODE_LEGACY, 200 APP_MODE_EMPTY_POLL, 201 APP_MODE_TELEMETRY, 202 APP_MODE_INTERRUPT, 203 APP_MODE_PMD_MGMT 204 }; 205 206 enum appmode app_mode; 207 208 static enum rte_power_pmd_mgmt_type pmgmt_type; 209 bool baseline_enabled; 210 211 enum freq_scale_hint_t 212 { 213 FREQ_LOWER = -1, 214 FREQ_CURRENT = 0, 215 FREQ_HIGHER = 1, 216 FREQ_HIGHEST = 2 217 }; 218 219 struct lcore_rx_queue { 220 uint16_t port_id; 221 uint8_t queue_id; 222 enum freq_scale_hint_t freq_up_hint; 223 uint32_t zero_rx_packet_count; 224 uint32_t idle_hint; 225 } __rte_cache_aligned; 226 227 #define MAX_RX_QUEUE_PER_LCORE 16 228 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS 229 #define MAX_RX_QUEUE_PER_PORT 128 230 231 #define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16 232 233 234 struct lcore_params lcore_params_array[MAX_LCORE_PARAMS]; 235 static struct lcore_params lcore_params_array_default[] = { 236 {0, 0, 2}, 237 {0, 1, 2}, 238 {0, 2, 2}, 239 {1, 0, 2}, 240 {1, 1, 2}, 241 {1, 2, 2}, 242 {2, 0, 2}, 243 {3, 0, 3}, 244 {3, 1, 3}, 245 }; 246 247 struct lcore_params *lcore_params = lcore_params_array_default; 248 uint16_t nb_lcore_params = RTE_DIM(lcore_params_array_default); 249 250 static struct rte_eth_conf port_conf = { 251 .rxmode = { 252 .mq_mode = RTE_ETH_MQ_RX_RSS, 253 .split_hdr_size = 0, 254 .offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM, 255 }, 256 .rx_adv_conf = { 257 .rss_conf = { 258 .rss_key = NULL, 259 .rss_hf = RTE_ETH_RSS_UDP, 260 }, 261 }, 262 .txmode = { 263 .mq_mode 
= RTE_ETH_MQ_TX_NONE, 264 } 265 }; 266 267 static uint32_t max_pkt_len; 268 269 static struct rte_mempool * pktmbuf_pool[NB_SOCKETS]; 270 271 272 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 273 274 #ifdef RTE_ARCH_X86 275 #include <rte_hash_crc.h> 276 #define DEFAULT_HASH_FUNC rte_hash_crc 277 #else 278 #include <rte_jhash.h> 279 #define DEFAULT_HASH_FUNC rte_jhash 280 #endif 281 282 struct ipv4_5tuple { 283 uint32_t ip_dst; 284 uint32_t ip_src; 285 uint16_t port_dst; 286 uint16_t port_src; 287 uint8_t proto; 288 } __rte_packed; 289 290 struct ipv6_5tuple { 291 uint8_t ip_dst[IPV6_ADDR_LEN]; 292 uint8_t ip_src[IPV6_ADDR_LEN]; 293 uint16_t port_dst; 294 uint16_t port_src; 295 uint8_t proto; 296 } __rte_packed; 297 298 struct ipv4_l3fwd_route { 299 struct ipv4_5tuple key; 300 uint8_t if_out; 301 }; 302 303 struct ipv6_l3fwd_route { 304 struct ipv6_5tuple key; 305 uint8_t if_out; 306 }; 307 308 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { 309 {{RTE_IPV4(100,10,0,1), RTE_IPV4(200,10,0,1), 101, 11, IPPROTO_TCP}, 0}, 310 {{RTE_IPV4(100,20,0,2), RTE_IPV4(200,20,0,2), 102, 12, IPPROTO_TCP}, 1}, 311 {{RTE_IPV4(100,30,0,3), RTE_IPV4(200,30,0,3), 103, 13, IPPROTO_TCP}, 2}, 312 {{RTE_IPV4(100,40,0,4), RTE_IPV4(200,40,0,4), 104, 14, IPPROTO_TCP}, 3}, 313 }; 314 315 static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = { 316 { 317 { 318 {0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 319 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 320 {0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 321 0x02, 0x1e, 0x67, 0xff, 0xfe, 0x0d, 0xb6, 0x0a}, 322 1, 10, IPPROTO_UDP 323 }, 4 324 }, 325 }; 326 327 typedef struct rte_hash lookup_struct_t; 328 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; 329 static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS]; 330 331 #define L3FWD_HASH_ENTRIES 1024 332 333 static uint16_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; 334 static uint16_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; 335 #endif 336 337 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 338 struct ipv4_l3fwd_route { 339 uint32_t ip; 340 uint8_t depth; 341 uint8_t if_out; 342 }; 343 344 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { 345 {RTE_IPV4(1,1,1,0), 24, 0}, 346 {RTE_IPV4(2,1,1,0), 24, 1}, 347 {RTE_IPV4(3,1,1,0), 24, 2}, 348 {RTE_IPV4(4,1,1,0), 24, 3}, 349 {RTE_IPV4(5,1,1,0), 24, 4}, 350 {RTE_IPV4(6,1,1,0), 24, 5}, 351 {RTE_IPV4(7,1,1,0), 24, 6}, 352 {RTE_IPV4(8,1,1,0), 24, 7}, 353 }; 354 355 #define IPV4_L3FWD_LPM_MAX_RULES 1024 356 357 typedef struct rte_lpm lookup_struct_t; 358 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; 359 #endif 360 361 struct lcore_conf { 362 uint16_t n_rx_queue; 363 struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; 364 uint16_t n_tx_port; 365 uint16_t tx_port_id[RTE_MAX_ETHPORTS]; 366 uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; 367 struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_ETHPORTS]; 368 lookup_struct_t * ipv4_lookup_struct; 369 lookup_struct_t * ipv6_lookup_struct; 370 } __rte_cache_aligned; 371 372 struct lcore_stats { 373 /* total sleep time in ms since last frequency scaling down */ 374 uint32_t sleep_time; 375 /* number of long sleep recently */ 376 uint32_t nb_long_sleep; 377 /* freq. 
scaling up trend */
	uint32_t trend;
	/* total packets processed recently */
	uint64_t nb_rx_processed;
	/* total iterations looped recently */
	uint64_t nb_iteration_looped;
	/*
	 * Represents empty and non-empty polls
	 * of rte_eth_rx_burst();
	 * ep_nep[0] holds non-empty polls
	 * i.e. 0 < nb_rx <= MAX_PKT_BURST
	 * ep_nep[1] holds empty polls
	 * i.e. nb_rx == 0
	 */
	uint64_t ep_nep[2];
	/*
	 * Represents full and empty+partial
	 * polls of rte_eth_rx_burst();
	 * fp_nfp[0] holds empty+partial polls
	 * i.e. 0 <= nb_rx < MAX_PKT_BURST
	 * fp_nfp[1] holds full polls
	 * i.e. nb_rx == MAX_PKT_BURST
	 */
	uint64_t fp_nfp[2];
	enum busy_rate br;
	rte_spinlock_t telemetry_lock;
} __rte_cache_aligned;

static struct lcore_conf lcore_conf[RTE_MAX_LCORE] __rte_cache_aligned;
static struct lcore_stats stats[RTE_MAX_LCORE] __rte_cache_aligned;
static struct rte_timer power_timers[RTE_MAX_LCORE];

static inline uint32_t power_idle_heuristic(uint32_t zero_rx_packet_count);
static inline enum freq_scale_hint_t power_freq_scaleup_heuristic( \
		unsigned int lcore_id, uint16_t port_id, uint16_t queue_id);


/*
 * These defaults use the maximum frequency index (1), a medium index (9)
 * and a typical low frequency index (14). They can be adjusted to use
 * different indexes with the relevant command line parameters.
 */
static uint8_t freq_tlb[] = {14, 9, 1};

static int is_done(void)
{
	return quit_signal;
}

/* exit signal handler */
static void
signal_exit_now(int sigtype)
{

	if (sigtype == SIGINT)
		quit_signal = true;

}

/* Frequency scale down timer callback */
static void
power_timer_cb(__rte_unused struct rte_timer *tim,
			  __rte_unused void *arg)
{
	uint64_t hz;
	float sleep_time_ratio;
	unsigned lcore_id = rte_lcore_id();

	/* ratio of the time spent sleeping during the last scaling period */
	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
			(float)SCALING_PERIOD;
	/**
	 * Scale the frequency down one step if the lcore slept for a large
	 * share of the period.
	 */
	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
		if (rte_power_freq_down)
			rte_power_freq_down(lcore_id);
	}
	else if ((unsigned)(stats[lcore_id].nb_rx_processed /
		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
		/**
		 * Scale down one step if the average number of packets per
		 * iteration is less than expected.
		 */
		if (rte_power_freq_down)
			rte_power_freq_down(lcore_id);
	}

	/**
	 * Re-arm the timer according to the current frequency so that the
	 * timer interval stays roughly fixed.
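	 * The timer runs in SINGLE (one-shot) mode, so the callback re-arms
	 * itself below with a period of hz / TIMER_NUMBER_PER_SECOND ticks,
	 * i.e. roughly 100 ms of wall-clock time per expiry, before clearing
	 * the per-lcore counters for the next period.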
468 */ 469 hz = rte_get_timer_hz(); 470 rte_timer_reset(&power_timers[lcore_id], hz/TIMER_NUMBER_PER_SECOND, 471 SINGLE, lcore_id, power_timer_cb, NULL); 472 473 stats[lcore_id].nb_rx_processed = 0; 474 stats[lcore_id].nb_iteration_looped = 0; 475 476 stats[lcore_id].sleep_time = 0; 477 } 478 479 /* Enqueue a single packet, and send burst if queue is filled */ 480 static inline int 481 send_single_packet(struct rte_mbuf *m, uint16_t port) 482 { 483 uint32_t lcore_id; 484 struct lcore_conf *qconf; 485 486 lcore_id = rte_lcore_id(); 487 qconf = &lcore_conf[lcore_id]; 488 489 rte_eth_tx_buffer(port, qconf->tx_queue_id[port], 490 qconf->tx_buffer[port], m); 491 492 return 0; 493 } 494 495 #ifdef DO_RFC_1812_CHECKS 496 static inline int 497 is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len) 498 { 499 /* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */ 500 /* 501 * 1. The packet length reported by the Link Layer must be large 502 * enough to hold the minimum length legal IP datagram (20 bytes). 503 */ 504 if (link_len < sizeof(struct rte_ipv4_hdr)) 505 return -1; 506 507 /* 2. The IP checksum must be correct. */ 508 /* if this is not checked in H/W, check it. */ 509 if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) { 510 uint16_t actual_cksum, expected_cksum; 511 actual_cksum = pkt->hdr_checksum; 512 pkt->hdr_checksum = 0; 513 expected_cksum = rte_ipv4_cksum(pkt); 514 if (actual_cksum != expected_cksum) 515 return -2; 516 } 517 518 /* 519 * 3. The IP version number must be 4. If the version number is not 4 520 * then the packet may be another version of IP, such as IPng or 521 * ST-II. 522 */ 523 if (((pkt->version_ihl) >> 4) != 4) 524 return -3; 525 /* 526 * 4. The IP header length field must be large enough to hold the 527 * minimum length legal IP datagram (20 bytes = 5 words). 528 */ 529 if ((pkt->version_ihl & 0xf) < 5) 530 return -4; 531 532 /* 533 * 5. The IP total length field must be large enough to hold the IP 534 * datagram header, whose length is specified in the IP header length 535 * field. 
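	 * total_length is carried in network byte order, hence the byte-order
	 * conversion before it is compared against the 20-byte minimum header
	 * size below. The RFC 1812 TTL handling happens later, in
	 * l3fwd_simple_forward(), where the TTL is decremented and the header
	 * checksum updated incrementally.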
536 */ 537 if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct rte_ipv4_hdr)) 538 return -5; 539 540 return 0; 541 } 542 #endif 543 544 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 545 static void 546 print_ipv4_key(struct ipv4_5tuple key) 547 { 548 printf("IP dst = %08x, IP src = %08x, port dst = %d, port src = %d, " 549 "proto = %d\n", (unsigned)key.ip_dst, (unsigned)key.ip_src, 550 key.port_dst, key.port_src, key.proto); 551 } 552 static void 553 print_ipv6_key(struct ipv6_5tuple key) 554 { 555 printf( "IP dst = " IPv6_BYTES_FMT ", IP src = " IPv6_BYTES_FMT ", " 556 "port dst = %d, port src = %d, proto = %d\n", 557 IPv6_BYTES(key.ip_dst), IPv6_BYTES(key.ip_src), 558 key.port_dst, key.port_src, key.proto); 559 } 560 561 static inline uint16_t 562 get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid, 563 lookup_struct_t * ipv4_l3fwd_lookup_struct) 564 { 565 struct ipv4_5tuple key; 566 struct rte_tcp_hdr *tcp; 567 struct rte_udp_hdr *udp; 568 int ret = 0; 569 570 key.ip_dst = rte_be_to_cpu_32(ipv4_hdr->dst_addr); 571 key.ip_src = rte_be_to_cpu_32(ipv4_hdr->src_addr); 572 key.proto = ipv4_hdr->next_proto_id; 573 574 switch (ipv4_hdr->next_proto_id) { 575 case IPPROTO_TCP: 576 tcp = (struct rte_tcp_hdr *)((unsigned char *)ipv4_hdr + 577 sizeof(struct rte_ipv4_hdr)); 578 key.port_dst = rte_be_to_cpu_16(tcp->dst_port); 579 key.port_src = rte_be_to_cpu_16(tcp->src_port); 580 break; 581 582 case IPPROTO_UDP: 583 udp = (struct rte_udp_hdr *)((unsigned char *)ipv4_hdr + 584 sizeof(struct rte_ipv4_hdr)); 585 key.port_dst = rte_be_to_cpu_16(udp->dst_port); 586 key.port_src = rte_be_to_cpu_16(udp->src_port); 587 break; 588 589 default: 590 key.port_dst = 0; 591 key.port_src = 0; 592 break; 593 } 594 595 /* Find destination port */ 596 ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key); 597 return ((ret < 0) ? portid : ipv4_l3fwd_out_if[ret]); 598 } 599 600 static inline uint16_t 601 get_ipv6_dst_port(struct rte_ipv6_hdr *ipv6_hdr, uint16_t portid, 602 lookup_struct_t *ipv6_l3fwd_lookup_struct) 603 { 604 struct ipv6_5tuple key; 605 struct rte_tcp_hdr *tcp; 606 struct rte_udp_hdr *udp; 607 int ret = 0; 608 609 memcpy(key.ip_dst, ipv6_hdr->dst_addr, IPV6_ADDR_LEN); 610 memcpy(key.ip_src, ipv6_hdr->src_addr, IPV6_ADDR_LEN); 611 612 key.proto = ipv6_hdr->proto; 613 614 switch (ipv6_hdr->proto) { 615 case IPPROTO_TCP: 616 tcp = (struct rte_tcp_hdr *)((unsigned char *) ipv6_hdr + 617 sizeof(struct rte_ipv6_hdr)); 618 key.port_dst = rte_be_to_cpu_16(tcp->dst_port); 619 key.port_src = rte_be_to_cpu_16(tcp->src_port); 620 break; 621 622 case IPPROTO_UDP: 623 udp = (struct rte_udp_hdr *)((unsigned char *) ipv6_hdr + 624 sizeof(struct rte_ipv6_hdr)); 625 key.port_dst = rte_be_to_cpu_16(udp->dst_port); 626 key.port_src = rte_be_to_cpu_16(udp->src_port); 627 break; 628 629 default: 630 key.port_dst = 0; 631 key.port_src = 0; 632 break; 633 } 634 635 /* Find destination port */ 636 ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key); 637 return ((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]); 638 } 639 #endif 640 641 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 642 static inline uint16_t 643 get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid, 644 lookup_struct_t *ipv4_l3fwd_lookup_struct) 645 { 646 uint32_t next_hop; 647 648 return ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct, 649 rte_be_to_cpu_32(ipv4_hdr->dst_addr), &next_hop) == 0)? 
650 next_hop : portid); 651 } 652 #endif 653 654 static inline void 655 parse_ptype_one(struct rte_mbuf *m) 656 { 657 struct rte_ether_hdr *eth_hdr; 658 uint32_t packet_type = RTE_PTYPE_UNKNOWN; 659 uint16_t ether_type; 660 661 eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 662 ether_type = eth_hdr->ether_type; 663 if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) 664 packet_type |= RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 665 else if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6)) 666 packet_type |= RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 667 668 m->packet_type = packet_type; 669 } 670 671 static uint16_t 672 cb_parse_ptype(uint16_t port __rte_unused, uint16_t queue __rte_unused, 673 struct rte_mbuf *pkts[], uint16_t nb_pkts, 674 uint16_t max_pkts __rte_unused, 675 void *user_param __rte_unused) 676 { 677 unsigned int i; 678 679 for (i = 0; i < nb_pkts; ++i) 680 parse_ptype_one(pkts[i]); 681 682 return nb_pkts; 683 } 684 685 static int 686 add_cb_parse_ptype(uint16_t portid, uint16_t queueid) 687 { 688 printf("Port %d: softly parse packet type info\n", portid); 689 if (rte_eth_add_rx_callback(portid, queueid, cb_parse_ptype, NULL)) 690 return 0; 691 692 printf("Failed to add rx callback: port=%d\n", portid); 693 return -1; 694 } 695 696 static inline void 697 l3fwd_simple_forward(struct rte_mbuf *m, uint16_t portid, 698 struct lcore_conf *qconf) 699 { 700 struct rte_ether_hdr *eth_hdr; 701 struct rte_ipv4_hdr *ipv4_hdr; 702 void *d_addr_bytes; 703 uint16_t dst_port; 704 705 eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 706 707 if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) { 708 /* Handle IPv4 headers.*/ 709 ipv4_hdr = 710 rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *, 711 sizeof(struct rte_ether_hdr)); 712 713 #ifdef DO_RFC_1812_CHECKS 714 /* Check to make sure the packet is valid (RFC1812) */ 715 if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) { 716 rte_pktmbuf_free(m); 717 return; 718 } 719 #endif 720 721 dst_port = get_ipv4_dst_port(ipv4_hdr, portid, 722 qconf->ipv4_lookup_struct); 723 if (dst_port >= RTE_MAX_ETHPORTS || 724 (enabled_port_mask & 1 << dst_port) == 0) 725 dst_port = portid; 726 727 /* 02:00:00:00:00:xx */ 728 d_addr_bytes = ð_hdr->dst_addr.addr_bytes[0]; 729 *((uint64_t *)d_addr_bytes) = 730 0x000000000002 + ((uint64_t)dst_port << 40); 731 732 #ifdef DO_RFC_1812_CHECKS 733 /* Update time to live and header checksum */ 734 --(ipv4_hdr->time_to_live); 735 ++(ipv4_hdr->hdr_checksum); 736 #endif 737 738 /* src addr */ 739 rte_ether_addr_copy(&ports_eth_addr[dst_port], 740 ð_hdr->src_addr); 741 742 send_single_packet(m, dst_port); 743 } else if (RTE_ETH_IS_IPV6_HDR(m->packet_type)) { 744 /* Handle IPv6 headers.*/ 745 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 746 struct rte_ipv6_hdr *ipv6_hdr; 747 748 ipv6_hdr = 749 rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *, 750 sizeof(struct rte_ether_hdr)); 751 752 dst_port = get_ipv6_dst_port(ipv6_hdr, portid, 753 qconf->ipv6_lookup_struct); 754 755 if (dst_port >= RTE_MAX_ETHPORTS || 756 (enabled_port_mask & 1 << dst_port) == 0) 757 dst_port = portid; 758 759 /* 02:00:00:00:00:xx */ 760 d_addr_bytes = ð_hdr->dst_addr.addr_bytes[0]; 761 *((uint64_t *)d_addr_bytes) = 762 0x000000000002 + ((uint64_t)dst_port << 40); 763 764 /* src addr */ 765 rte_ether_addr_copy(&ports_eth_addr[dst_port], 766 ð_hdr->src_addr); 767 768 send_single_packet(m, dst_port); 769 #else 770 /* We don't currently handle IPv6 packets in LPM mode. 
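 * Such packets are simply dropped just below; the exact-match (hash) build
 * above forwards IPv6 through get_ipv6_dst_port() instead.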
 */
		rte_pktmbuf_free(m);
#endif
	} else
		rte_pktmbuf_free(m);

}

#define MINIMUM_SLEEP_TIME         1
#define SUSPEND_THRESHOLD          300

static inline uint32_t
power_idle_heuristic(uint32_t zero_rx_packet_count)
{
	/*
	 * If fewer than SUSPEND_THRESHOLD (300) consecutive empty polls have
	 * been seen, suggest a short 1 us sleep.
	 */
	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
		return MINIMUM_SLEEP_TIME;
	/*
	 * Otherwise suggest a SUSPEND_THRESHOLD (300) us sleep, long enough
	 * to cover the latency of switching from C3/C6 back to C0.
	 */
	else
		return SUSPEND_THRESHOLD;
}

static inline enum freq_scale_hint_t
power_freq_scaleup_heuristic(unsigned lcore_id,
			     uint16_t port_id,
			     uint16_t queue_id)
{
	uint32_t rxq_count = rte_eth_rx_queue_count(port_id, queue_id);
/**
 * HW Rx queue size is 128 by default, and an Rx burst reads at most 32
 * entries per iteration.
 */
#define FREQ_GEAR1_RX_PACKET_THRESHOLD             MAX_PKT_BURST
#define FREQ_GEAR2_RX_PACKET_THRESHOLD             (MAX_PKT_BURST*2)
#define FREQ_GEAR3_RX_PACKET_THRESHOLD             (MAX_PKT_BURST*3)
#define FREQ_UP_TREND1_ACC   1
#define FREQ_UP_TREND2_ACC   100
#define FREQ_UP_THRESHOLD    10000

	if (likely(rxq_count > FREQ_GEAR3_RX_PACKET_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHEST;
	} else if (likely(rxq_count > FREQ_GEAR2_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND2_ACC;
	else if (likely(rxq_count > FREQ_GEAR1_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND1_ACC;

	if (likely(stats[lcore_id].trend > FREQ_UP_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHER;
	}

	return FREQ_CURRENT;
}

/**
 * Force the polling thread to sleep until a one-shot rx interrupt triggers.
 * @param num
 *   Maximum number of Rx interrupt events to wait for.
 * @param lcore
 *   Id of the lcore calling this function.
 * @return
 *   0 on success
 */
static int
sleep_until_rx_interrupt(int num, int lcore)
{
	/*
	 * we want to track when we are woken up by traffic so that we can go
	 * back to sleep again without log spamming. Avoid cache line sharing
	 * to prevent threads stepping on each other's toes.
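	 * Each lcore gets its own cache-aligned status entry, so writing the
	 * wakeup flag never invalidates another core's cache line. The flag
	 * only gates the "sleeps until interrupt" log message; it has no
	 * effect on the epoll wait itself.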
843 */ 844 static struct { 845 bool wakeup; 846 } __rte_cache_aligned status[RTE_MAX_LCORE]; 847 struct rte_epoll_event event[num]; 848 int n, i; 849 uint16_t port_id; 850 uint8_t queue_id; 851 void *data; 852 853 if (status[lcore].wakeup) { 854 RTE_LOG(INFO, L3FWD_POWER, 855 "lcore %u sleeps until interrupt triggers\n", 856 rte_lcore_id()); 857 } 858 859 n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, 10); 860 for (i = 0; i < n; i++) { 861 data = event[i].epdata.data; 862 port_id = ((uintptr_t)data) >> CHAR_BIT; 863 queue_id = ((uintptr_t)data) & 864 RTE_LEN2MASK(CHAR_BIT, uint8_t); 865 RTE_LOG(INFO, L3FWD_POWER, 866 "lcore %u is waked up from rx interrupt on" 867 " port %d queue %d\n", 868 rte_lcore_id(), port_id, queue_id); 869 } 870 status[lcore].wakeup = n != 0; 871 872 return 0; 873 } 874 875 static void turn_on_off_intr(struct lcore_conf *qconf, bool on) 876 { 877 int i; 878 struct lcore_rx_queue *rx_queue; 879 uint8_t queue_id; 880 uint16_t port_id; 881 882 for (i = 0; i < qconf->n_rx_queue; ++i) { 883 rx_queue = &(qconf->rx_queue_list[i]); 884 port_id = rx_queue->port_id; 885 queue_id = rx_queue->queue_id; 886 887 rte_spinlock_lock(&(locks[port_id])); 888 if (on) 889 rte_eth_dev_rx_intr_enable(port_id, queue_id); 890 else 891 rte_eth_dev_rx_intr_disable(port_id, queue_id); 892 rte_spinlock_unlock(&(locks[port_id])); 893 } 894 } 895 896 static int event_register(struct lcore_conf *qconf) 897 { 898 struct lcore_rx_queue *rx_queue; 899 uint8_t queueid; 900 uint16_t portid; 901 uint32_t data; 902 int ret; 903 int i; 904 905 for (i = 0; i < qconf->n_rx_queue; ++i) { 906 rx_queue = &(qconf->rx_queue_list[i]); 907 portid = rx_queue->port_id; 908 queueid = rx_queue->queue_id; 909 data = portid << CHAR_BIT | queueid; 910 911 ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid, 912 RTE_EPOLL_PER_THREAD, 913 RTE_INTR_EVENT_ADD, 914 (void *)((uintptr_t)data)); 915 if (ret) 916 return ret; 917 } 918 919 return 0; 920 } 921 922 /* Main processing loop. 
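 * The epoll event data registered in event_register() packs the port id into
 * the upper bits and the queue id into the low CHAR_BIT bits
 * (data = portid << CHAR_BIT | queueid); sleep_until_rx_interrupt() decodes
 * it the same way when logging which port/queue woke the lcore.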
8< */ 923 static int main_intr_loop(__rte_unused void *dummy) 924 { 925 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 926 unsigned int lcore_id; 927 uint64_t prev_tsc, diff_tsc, cur_tsc; 928 int i, j, nb_rx; 929 uint8_t queueid; 930 uint16_t portid; 931 struct lcore_conf *qconf; 932 struct lcore_rx_queue *rx_queue; 933 uint32_t lcore_rx_idle_count = 0; 934 uint32_t lcore_idle_hint = 0; 935 int intr_en = 0; 936 937 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 938 US_PER_S * BURST_TX_DRAIN_US; 939 940 prev_tsc = 0; 941 942 lcore_id = rte_lcore_id(); 943 qconf = &lcore_conf[lcore_id]; 944 945 if (qconf->n_rx_queue == 0) { 946 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 947 lcore_id); 948 return 0; 949 } 950 951 RTE_LOG(INFO, L3FWD_POWER, "entering main interrupt loop on lcore %u\n", 952 lcore_id); 953 954 for (i = 0; i < qconf->n_rx_queue; i++) { 955 portid = qconf->rx_queue_list[i].port_id; 956 queueid = qconf->rx_queue_list[i].queue_id; 957 RTE_LOG(INFO, L3FWD_POWER, 958 " -- lcoreid=%u portid=%u rxqueueid=%hhu\n", 959 lcore_id, portid, queueid); 960 } 961 962 /* add into event wait list */ 963 if (event_register(qconf) == 0) 964 intr_en = 1; 965 else 966 RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n"); 967 968 while (!is_done()) { 969 stats[lcore_id].nb_iteration_looped++; 970 971 cur_tsc = rte_rdtsc(); 972 973 /* 974 * TX burst queue drain 975 */ 976 diff_tsc = cur_tsc - prev_tsc; 977 if (unlikely(diff_tsc > drain_tsc)) { 978 for (i = 0; i < qconf->n_tx_port; ++i) { 979 portid = qconf->tx_port_id[i]; 980 rte_eth_tx_buffer_flush(portid, 981 qconf->tx_queue_id[portid], 982 qconf->tx_buffer[portid]); 983 } 984 prev_tsc = cur_tsc; 985 } 986 987 start_rx: 988 /* 989 * Read packet from RX queues 990 */ 991 lcore_rx_idle_count = 0; 992 for (i = 0; i < qconf->n_rx_queue; ++i) { 993 rx_queue = &(qconf->rx_queue_list[i]); 994 rx_queue->idle_hint = 0; 995 portid = rx_queue->port_id; 996 queueid = rx_queue->queue_id; 997 998 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 999 MAX_PKT_BURST); 1000 1001 stats[lcore_id].nb_rx_processed += nb_rx; 1002 if (unlikely(nb_rx == 0)) { 1003 /** 1004 * no packet received from rx queue, try to 1005 * sleep for a while forcing CPU enter deeper 1006 * C states. 1007 */ 1008 rx_queue->zero_rx_packet_count++; 1009 1010 if (rx_queue->zero_rx_packet_count <= 1011 MIN_ZERO_POLL_COUNT) 1012 continue; 1013 1014 rx_queue->idle_hint = power_idle_heuristic( 1015 rx_queue->zero_rx_packet_count); 1016 lcore_rx_idle_count++; 1017 } else { 1018 rx_queue->zero_rx_packet_count = 0; 1019 } 1020 1021 /* Prefetch first packets */ 1022 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1023 rte_prefetch0(rte_pktmbuf_mtod( 1024 pkts_burst[j], void *)); 1025 } 1026 1027 /* Prefetch and forward already prefetched packets */ 1028 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1029 rte_prefetch0(rte_pktmbuf_mtod( 1030 pkts_burst[j + PREFETCH_OFFSET], 1031 void *)); 1032 l3fwd_simple_forward( 1033 pkts_burst[j], portid, qconf); 1034 } 1035 1036 /* Forward remaining prefetched packets */ 1037 for (; j < nb_rx; j++) { 1038 l3fwd_simple_forward( 1039 pkts_burst[j], portid, qconf); 1040 } 1041 } 1042 1043 if (unlikely(lcore_rx_idle_count == qconf->n_rx_queue)) { 1044 /** 1045 * All Rx queues empty in recent consecutive polls, 1046 * sleep in a conservative manner, meaning sleep as 1047 * less as possible. 
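			 * The lcore uses the smallest idle_hint across its Rx
			 * queues, so it never sleeps longer than the least-idle
			 * queue suggests: short hints busy-wait in
			 * rte_delay_us(), while hints of SUSPEND_THRESHOLD or
			 * more switch to interrupt-based sleep when interrupts
			 * were registered successfully.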
1048 */ 1049 for (i = 1, 1050 lcore_idle_hint = qconf->rx_queue_list[0].idle_hint; 1051 i < qconf->n_rx_queue; ++i) { 1052 rx_queue = &(qconf->rx_queue_list[i]); 1053 if (rx_queue->idle_hint < lcore_idle_hint) 1054 lcore_idle_hint = rx_queue->idle_hint; 1055 } 1056 1057 if (lcore_idle_hint < SUSPEND_THRESHOLD) 1058 /** 1059 * execute "pause" instruction to avoid context 1060 * switch which generally take hundred of 1061 * microseconds for short sleep. 1062 */ 1063 rte_delay_us(lcore_idle_hint); 1064 else { 1065 /* suspend until rx interrupt triggers */ 1066 if (intr_en) { 1067 turn_on_off_intr(qconf, 1); 1068 sleep_until_rx_interrupt( 1069 qconf->n_rx_queue, 1070 lcore_id); 1071 turn_on_off_intr(qconf, 0); 1072 /** 1073 * start receiving packets immediately 1074 */ 1075 if (likely(!is_done())) 1076 goto start_rx; 1077 } 1078 } 1079 stats[lcore_id].sleep_time += lcore_idle_hint; 1080 } 1081 } 1082 1083 return 0; 1084 } 1085 /* >8 End of main processing loop. */ 1086 1087 /* main processing loop */ 1088 static int 1089 main_telemetry_loop(__rte_unused void *dummy) 1090 { 1091 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1092 unsigned int lcore_id; 1093 uint64_t prev_tsc, diff_tsc, cur_tsc, prev_tel_tsc; 1094 int i, j, nb_rx; 1095 uint8_t queueid; 1096 uint16_t portid; 1097 struct lcore_conf *qconf; 1098 struct lcore_rx_queue *rx_queue; 1099 uint64_t ep_nep[2] = {0}, fp_nfp[2] = {0}; 1100 uint64_t poll_count; 1101 enum busy_rate br; 1102 1103 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 1104 US_PER_S * BURST_TX_DRAIN_US; 1105 1106 poll_count = 0; 1107 prev_tsc = 0; 1108 prev_tel_tsc = 0; 1109 1110 lcore_id = rte_lcore_id(); 1111 qconf = &lcore_conf[lcore_id]; 1112 1113 if (qconf->n_rx_queue == 0) { 1114 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 1115 lcore_id); 1116 return 0; 1117 } 1118 1119 RTE_LOG(INFO, L3FWD_POWER, "entering main telemetry loop on lcore %u\n", 1120 lcore_id); 1121 1122 for (i = 0; i < qconf->n_rx_queue; i++) { 1123 portid = qconf->rx_queue_list[i].port_id; 1124 queueid = qconf->rx_queue_list[i].queue_id; 1125 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1126 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1127 } 1128 1129 while (!is_done()) { 1130 1131 cur_tsc = rte_rdtsc(); 1132 /* 1133 * TX burst queue drain 1134 */ 1135 diff_tsc = cur_tsc - prev_tsc; 1136 if (unlikely(diff_tsc > drain_tsc)) { 1137 for (i = 0; i < qconf->n_tx_port; ++i) { 1138 portid = qconf->tx_port_id[i]; 1139 rte_eth_tx_buffer_flush(portid, 1140 qconf->tx_queue_id[portid], 1141 qconf->tx_buffer[portid]); 1142 } 1143 prev_tsc = cur_tsc; 1144 } 1145 1146 /* 1147 * Read packet from RX queues 1148 */ 1149 for (i = 0; i < qconf->n_rx_queue; ++i) { 1150 rx_queue = &(qconf->rx_queue_list[i]); 1151 portid = rx_queue->port_id; 1152 queueid = rx_queue->queue_id; 1153 1154 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1155 MAX_PKT_BURST); 1156 ep_nep[nb_rx == 0]++; 1157 fp_nfp[nb_rx == MAX_PKT_BURST]++; 1158 poll_count++; 1159 if (unlikely(nb_rx == 0)) 1160 continue; 1161 1162 /* Prefetch first packets */ 1163 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1164 rte_prefetch0(rte_pktmbuf_mtod( 1165 pkts_burst[j], void *)); 1166 } 1167 1168 /* Prefetch and forward already prefetched packets */ 1169 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1170 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1171 j + PREFETCH_OFFSET], void *)); 1172 l3fwd_simple_forward(pkts_burst[j], portid, 1173 qconf); 1174 } 1175 1176 /* Forward remaining prefetched packets */ 1177 
for (; j < nb_rx; j++) { 1178 l3fwd_simple_forward(pkts_burst[j], portid, 1179 qconf); 1180 } 1181 } 1182 if (unlikely(poll_count >= DEFAULT_COUNT)) { 1183 diff_tsc = cur_tsc - prev_tel_tsc; 1184 if (diff_tsc >= MAX_CYCLES) { 1185 br = FULL; 1186 } else if (diff_tsc > MIN_CYCLES && 1187 diff_tsc < MAX_CYCLES) { 1188 br = (diff_tsc * 100) / MAX_CYCLES; 1189 } else { 1190 br = ZERO; 1191 } 1192 poll_count = 0; 1193 prev_tel_tsc = cur_tsc; 1194 /* update stats for telemetry */ 1195 rte_spinlock_lock(&stats[lcore_id].telemetry_lock); 1196 stats[lcore_id].ep_nep[0] = ep_nep[0]; 1197 stats[lcore_id].ep_nep[1] = ep_nep[1]; 1198 stats[lcore_id].fp_nfp[0] = fp_nfp[0]; 1199 stats[lcore_id].fp_nfp[1] = fp_nfp[1]; 1200 stats[lcore_id].br = br; 1201 rte_spinlock_unlock(&stats[lcore_id].telemetry_lock); 1202 } 1203 } 1204 1205 return 0; 1206 } 1207 /* main processing loop */ 1208 static int 1209 main_empty_poll_loop(__rte_unused void *dummy) 1210 { 1211 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1212 unsigned int lcore_id; 1213 uint64_t prev_tsc, diff_tsc, cur_tsc; 1214 int i, j, nb_rx; 1215 uint8_t queueid; 1216 uint16_t portid; 1217 struct lcore_conf *qconf; 1218 struct lcore_rx_queue *rx_queue; 1219 1220 const uint64_t drain_tsc = 1221 (rte_get_tsc_hz() + US_PER_S - 1) / 1222 US_PER_S * BURST_TX_DRAIN_US; 1223 1224 prev_tsc = 0; 1225 1226 lcore_id = rte_lcore_id(); 1227 qconf = &lcore_conf[lcore_id]; 1228 1229 if (qconf->n_rx_queue == 0) { 1230 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 1231 lcore_id); 1232 return 0; 1233 } 1234 1235 for (i = 0; i < qconf->n_rx_queue; i++) { 1236 portid = qconf->rx_queue_list[i].port_id; 1237 queueid = qconf->rx_queue_list[i].queue_id; 1238 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1239 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1240 } 1241 1242 while (!is_done()) { 1243 stats[lcore_id].nb_iteration_looped++; 1244 1245 cur_tsc = rte_rdtsc(); 1246 /* 1247 * TX burst queue drain 1248 */ 1249 diff_tsc = cur_tsc - prev_tsc; 1250 if (unlikely(diff_tsc > drain_tsc)) { 1251 for (i = 0; i < qconf->n_tx_port; ++i) { 1252 portid = qconf->tx_port_id[i]; 1253 rte_eth_tx_buffer_flush(portid, 1254 qconf->tx_queue_id[portid], 1255 qconf->tx_buffer[portid]); 1256 } 1257 prev_tsc = cur_tsc; 1258 } 1259 1260 /* 1261 * Read packet from RX queues 1262 */ 1263 for (i = 0; i < qconf->n_rx_queue; ++i) { 1264 rx_queue = &(qconf->rx_queue_list[i]); 1265 rx_queue->idle_hint = 0; 1266 portid = rx_queue->port_id; 1267 queueid = rx_queue->queue_id; 1268 1269 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1270 MAX_PKT_BURST); 1271 1272 stats[lcore_id].nb_rx_processed += nb_rx; 1273 1274 if (nb_rx == 0) { 1275 1276 rte_power_empty_poll_stat_update(lcore_id); 1277 1278 continue; 1279 } else { 1280 rte_power_poll_stat_update(lcore_id, nb_rx); 1281 } 1282 1283 1284 /* Prefetch first packets */ 1285 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1286 rte_prefetch0(rte_pktmbuf_mtod( 1287 pkts_burst[j], void *)); 1288 } 1289 1290 /* Prefetch and forward already prefetched packets */ 1291 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1292 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1293 j + PREFETCH_OFFSET], 1294 void *)); 1295 l3fwd_simple_forward(pkts_burst[j], portid, 1296 qconf); 1297 } 1298 1299 /* Forward remaining prefetched packets */ 1300 for (; j < nb_rx; j++) { 1301 l3fwd_simple_forward(pkts_burst[j], portid, 1302 qconf); 1303 } 1304 1305 } 1306 1307 } 1308 1309 return 0; 1310 } 1311 /* main processing loop */ 1312 static int 1313 
main_legacy_loop(__rte_unused void *dummy) 1314 { 1315 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1316 unsigned lcore_id; 1317 uint64_t prev_tsc, diff_tsc, cur_tsc, tim_res_tsc, hz; 1318 uint64_t prev_tsc_power = 0, cur_tsc_power, diff_tsc_power; 1319 int i, j, nb_rx; 1320 uint8_t queueid; 1321 uint16_t portid; 1322 struct lcore_conf *qconf; 1323 struct lcore_rx_queue *rx_queue; 1324 enum freq_scale_hint_t lcore_scaleup_hint; 1325 uint32_t lcore_rx_idle_count = 0; 1326 uint32_t lcore_idle_hint = 0; 1327 int intr_en = 0; 1328 1329 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1330 1331 prev_tsc = 0; 1332 hz = rte_get_timer_hz(); 1333 tim_res_tsc = hz/TIMER_NUMBER_PER_SECOND; 1334 1335 lcore_id = rte_lcore_id(); 1336 qconf = &lcore_conf[lcore_id]; 1337 1338 if (qconf->n_rx_queue == 0) { 1339 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", lcore_id); 1340 return 0; 1341 } 1342 1343 RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id); 1344 1345 for (i = 0; i < qconf->n_rx_queue; i++) { 1346 portid = qconf->rx_queue_list[i].port_id; 1347 queueid = qconf->rx_queue_list[i].queue_id; 1348 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1349 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1350 } 1351 1352 /* add into event wait list */ 1353 if (event_register(qconf) == 0) 1354 intr_en = 1; 1355 else 1356 RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n"); 1357 1358 while (!is_done()) { 1359 stats[lcore_id].nb_iteration_looped++; 1360 1361 cur_tsc = rte_rdtsc(); 1362 cur_tsc_power = cur_tsc; 1363 1364 /* 1365 * TX burst queue drain 1366 */ 1367 diff_tsc = cur_tsc - prev_tsc; 1368 if (unlikely(diff_tsc > drain_tsc)) { 1369 for (i = 0; i < qconf->n_tx_port; ++i) { 1370 portid = qconf->tx_port_id[i]; 1371 rte_eth_tx_buffer_flush(portid, 1372 qconf->tx_queue_id[portid], 1373 qconf->tx_buffer[portid]); 1374 } 1375 prev_tsc = cur_tsc; 1376 } 1377 1378 diff_tsc_power = cur_tsc_power - prev_tsc_power; 1379 if (diff_tsc_power > tim_res_tsc) { 1380 rte_timer_manage(); 1381 prev_tsc_power = cur_tsc_power; 1382 } 1383 1384 start_rx: 1385 /* 1386 * Read packet from RX queues 1387 */ 1388 lcore_scaleup_hint = FREQ_CURRENT; 1389 lcore_rx_idle_count = 0; 1390 for (i = 0; i < qconf->n_rx_queue; ++i) { 1391 rx_queue = &(qconf->rx_queue_list[i]); 1392 rx_queue->idle_hint = 0; 1393 portid = rx_queue->port_id; 1394 queueid = rx_queue->queue_id; 1395 1396 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1397 MAX_PKT_BURST); 1398 1399 stats[lcore_id].nb_rx_processed += nb_rx; 1400 if (unlikely(nb_rx == 0)) { 1401 /** 1402 * no packet received from rx queue, try to 1403 * sleep for a while forcing CPU enter deeper 1404 * C states. 1405 */ 1406 rx_queue->zero_rx_packet_count++; 1407 1408 if (rx_queue->zero_rx_packet_count <= 1409 MIN_ZERO_POLL_COUNT) 1410 continue; 1411 1412 rx_queue->idle_hint = power_idle_heuristic(\ 1413 rx_queue->zero_rx_packet_count); 1414 lcore_rx_idle_count++; 1415 } else { 1416 rx_queue->zero_rx_packet_count = 0; 1417 1418 /** 1419 * do not scale up frequency immediately as 1420 * user to kernel space communication is costly 1421 * which might impact packet I/O for received 1422 * packets. 
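				 * Instead, power_freq_scaleup_heuristic()
				 * only records a per-queue freq_up_hint here;
				 * the highest hint across this lcore's Rx
				 * queues is applied once per iteration, after
				 * the Rx loop (FREQ_HIGHEST -> rte_power_freq_max(),
				 * FREQ_HIGHER -> rte_power_freq_up()).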
				 */
				rx_queue->freq_up_hint =
					power_freq_scaleup_heuristic(lcore_id,
							portid, queueid);
			}

			/* Prefetch first packets */
			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
				rte_prefetch0(rte_pktmbuf_mtod(
						pkts_burst[j], void *));
			}

			/* Prefetch and forward already prefetched packets */
			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
						j + PREFETCH_OFFSET], void *));
				l3fwd_simple_forward(pkts_burst[j], portid,
						qconf);
			}

			/* Forward remaining prefetched packets */
			for (; j < nb_rx; j++) {
				l3fwd_simple_forward(pkts_burst[j], portid,
						qconf);
			}
		}

		if (likely(lcore_rx_idle_count != qconf->n_rx_queue)) {
			for (i = 1, lcore_scaleup_hint =
					qconf->rx_queue_list[0].freq_up_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->freq_up_hint >
						lcore_scaleup_hint)
					lcore_scaleup_hint =
						rx_queue->freq_up_hint;
			}

			if (lcore_scaleup_hint == FREQ_HIGHEST) {
				if (rte_power_freq_max)
					rte_power_freq_max(lcore_id);
			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
				if (rte_power_freq_up)
					rte_power_freq_up(lcore_id);
			}
		} else {
			/**
			 * All Rx queues empty in recent consecutive polls,
			 * sleep in a conservative manner, meaning sleep as
			 * little as possible.
			 */
			for (i = 1, lcore_idle_hint =
					qconf->rx_queue_list[0].idle_hint;
					i < qconf->n_rx_queue; ++i) {
				rx_queue = &(qconf->rx_queue_list[i]);
				if (rx_queue->idle_hint < lcore_idle_hint)
					lcore_idle_hint = rx_queue->idle_hint;
			}

			if (lcore_idle_hint < SUSPEND_THRESHOLD)
				/**
				 * Execute the "pause" instruction instead of
				 * sleeping via the kernel, since a context
				 * switch generally takes hundreds of
				 * microseconds and would dwarf such a short
				 * sleep.
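				 * rte_delay_us() spins by default rather than
				 * yielding the CPU, so the lcore stays
				 * responsive; for longer idle hints the
				 * interrupt path below blocks in
				 * rte_epoll_wait() instead and lets the core
				 * sleep until traffic arrives.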
1487 */ 1488 rte_delay_us(lcore_idle_hint); 1489 else { 1490 /* suspend until rx interrupt triggers */ 1491 if (intr_en) { 1492 turn_on_off_intr(qconf, 1); 1493 sleep_until_rx_interrupt( 1494 qconf->n_rx_queue, 1495 lcore_id); 1496 turn_on_off_intr(qconf, 0); 1497 /** 1498 * start receiving packets immediately 1499 */ 1500 if (likely(!is_done())) 1501 goto start_rx; 1502 } 1503 } 1504 stats[lcore_id].sleep_time += lcore_idle_hint; 1505 } 1506 } 1507 1508 return 0; 1509 } 1510 1511 static int 1512 check_lcore_params(void) 1513 { 1514 uint8_t queue, lcore; 1515 uint16_t i; 1516 int socketid; 1517 1518 for (i = 0; i < nb_lcore_params; ++i) { 1519 queue = lcore_params[i].queue_id; 1520 if (queue >= MAX_RX_QUEUE_PER_PORT) { 1521 printf("invalid queue number: %hhu\n", queue); 1522 return -1; 1523 } 1524 lcore = lcore_params[i].lcore_id; 1525 if (!rte_lcore_is_enabled(lcore)) { 1526 printf("error: lcore %hhu is not enabled in lcore " 1527 "mask\n", lcore); 1528 return -1; 1529 } 1530 if ((socketid = rte_lcore_to_socket_id(lcore) != 0) && 1531 (numa_on == 0)) { 1532 printf("warning: lcore %hhu is on socket %d with numa " 1533 "off\n", lcore, socketid); 1534 } 1535 if (app_mode == APP_MODE_TELEMETRY && lcore == rte_lcore_id()) { 1536 printf("cannot enable main core %d in config for telemetry mode\n", 1537 rte_lcore_id()); 1538 return -1; 1539 } 1540 } 1541 return 0; 1542 } 1543 1544 static int 1545 check_port_config(void) 1546 { 1547 unsigned portid; 1548 uint16_t i; 1549 1550 for (i = 0; i < nb_lcore_params; ++i) { 1551 portid = lcore_params[i].port_id; 1552 if ((enabled_port_mask & (1 << portid)) == 0) { 1553 printf("port %u is not enabled in port mask\n", 1554 portid); 1555 return -1; 1556 } 1557 if (!rte_eth_dev_is_valid_port(portid)) { 1558 printf("port %u is not present on the board\n", 1559 portid); 1560 return -1; 1561 } 1562 } 1563 return 0; 1564 } 1565 1566 static uint8_t 1567 get_port_n_rx_queues(const uint16_t port) 1568 { 1569 int queue = -1; 1570 uint16_t i; 1571 1572 for (i = 0; i < nb_lcore_params; ++i) { 1573 if (lcore_params[i].port_id == port && 1574 lcore_params[i].queue_id > queue) 1575 queue = lcore_params[i].queue_id; 1576 } 1577 return (uint8_t)(++queue); 1578 } 1579 1580 static int 1581 init_lcore_rx_queues(void) 1582 { 1583 uint16_t i, nb_rx_queue; 1584 uint8_t lcore; 1585 1586 for (i = 0; i < nb_lcore_params; ++i) { 1587 lcore = lcore_params[i].lcore_id; 1588 nb_rx_queue = lcore_conf[lcore].n_rx_queue; 1589 if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) { 1590 printf("error: too many queues (%u) for lcore: %u\n", 1591 (unsigned)nb_rx_queue + 1, (unsigned)lcore); 1592 return -1; 1593 } else { 1594 lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id = 1595 lcore_params[i].port_id; 1596 lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id = 1597 lcore_params[i].queue_id; 1598 lcore_conf[lcore].n_rx_queue++; 1599 } 1600 } 1601 return 0; 1602 } 1603 1604 /* display usage */ 1605 static void 1606 print_usage(const char *prgname) 1607 { 1608 printf ("%s [EAL options] -- -p PORTMASK -P" 1609 " [--config (port,queue,lcore)[,(port,queue,lcore]]" 1610 " [--high-perf-cores CORELIST" 1611 " [--perf-config (port,queue,hi_perf,lcore_index)[,(port,queue,hi_perf,lcore_index]]" 1612 " [--max-pkt-len PKTLEN]\n" 1613 " -p PORTMASK: hexadecimal bitmask of ports to configure\n" 1614 " -P: enable promiscuous mode\n" 1615 " --config (port,queue,lcore): rx queues configuration\n" 1616 " --high-perf-cores CORELIST: list of high performance cores\n" 1617 " --perf-config: similar as config, cores 
specified as indices" 1618 " for bins containing high or regular performance cores\n" 1619 " --no-numa: optional, disable numa awareness\n" 1620 " --max-pkt-len PKTLEN: maximum packet length in decimal (64-9600)\n" 1621 " --parse-ptype: parse packet type by software\n" 1622 " --legacy: use legacy interrupt-based scaling\n" 1623 " --empty-poll: enable empty poll detection" 1624 " follow (training_flag, high_threshold, med_threshold)\n" 1625 " --telemetry: enable telemetry mode, to update" 1626 " empty polls, full polls, and core busyness to telemetry\n" 1627 " --interrupt-only: enable interrupt-only mode\n" 1628 " --pmd-mgmt MODE: enable PMD power management mode. " 1629 "Currently supported modes: baseline, monitor, pause, scale\n", 1630 prgname); 1631 } 1632 1633 static int parse_max_pkt_len(const char *pktlen) 1634 { 1635 char *end = NULL; 1636 unsigned long len; 1637 1638 /* parse decimal string */ 1639 len = strtoul(pktlen, &end, 10); 1640 if ((pktlen[0] == '\0') || (end == NULL) || (*end != '\0')) 1641 return -1; 1642 1643 if (len == 0) 1644 return -1; 1645 1646 return len; 1647 } 1648 1649 static int 1650 parse_portmask(const char *portmask) 1651 { 1652 char *end = NULL; 1653 unsigned long pm; 1654 1655 /* parse hexadecimal string */ 1656 pm = strtoul(portmask, &end, 16); 1657 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0')) 1658 return 0; 1659 1660 return pm; 1661 } 1662 1663 static int 1664 parse_config(const char *q_arg) 1665 { 1666 char s[256]; 1667 const char *p, *p0 = q_arg; 1668 char *end; 1669 enum fieldnames { 1670 FLD_PORT = 0, 1671 FLD_QUEUE, 1672 FLD_LCORE, 1673 _NUM_FLD 1674 }; 1675 unsigned long int_fld[_NUM_FLD]; 1676 char *str_fld[_NUM_FLD]; 1677 int i; 1678 unsigned size; 1679 1680 nb_lcore_params = 0; 1681 1682 while ((p = strchr(p0,'(')) != NULL) { 1683 ++p; 1684 if((p0 = strchr(p,')')) == NULL) 1685 return -1; 1686 1687 size = p0 - p; 1688 if(size >= sizeof(s)) 1689 return -1; 1690 1691 snprintf(s, sizeof(s), "%.*s", size, p); 1692 if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') != 1693 _NUM_FLD) 1694 return -1; 1695 for (i = 0; i < _NUM_FLD; i++){ 1696 errno = 0; 1697 int_fld[i] = strtoul(str_fld[i], &end, 0); 1698 if (errno != 0 || end == str_fld[i] || int_fld[i] > 1699 255) 1700 return -1; 1701 } 1702 if (nb_lcore_params >= MAX_LCORE_PARAMS) { 1703 printf("exceeded max number of lcore params: %hu\n", 1704 nb_lcore_params); 1705 return -1; 1706 } 1707 lcore_params_array[nb_lcore_params].port_id = 1708 (uint8_t)int_fld[FLD_PORT]; 1709 lcore_params_array[nb_lcore_params].queue_id = 1710 (uint8_t)int_fld[FLD_QUEUE]; 1711 lcore_params_array[nb_lcore_params].lcore_id = 1712 (uint8_t)int_fld[FLD_LCORE]; 1713 ++nb_lcore_params; 1714 } 1715 lcore_params = lcore_params_array; 1716 1717 return 0; 1718 } 1719 1720 static int 1721 parse_pmd_mgmt_config(const char *name) 1722 { 1723 #define PMD_MGMT_MONITOR "monitor" 1724 #define PMD_MGMT_PAUSE "pause" 1725 #define PMD_MGMT_SCALE "scale" 1726 #define PMD_MGMT_BASELINE "baseline" 1727 1728 if (strncmp(PMD_MGMT_MONITOR, name, sizeof(PMD_MGMT_MONITOR)) == 0) { 1729 pmgmt_type = RTE_POWER_MGMT_TYPE_MONITOR; 1730 return 0; 1731 } 1732 1733 if (strncmp(PMD_MGMT_PAUSE, name, sizeof(PMD_MGMT_PAUSE)) == 0) { 1734 pmgmt_type = RTE_POWER_MGMT_TYPE_PAUSE; 1735 return 0; 1736 } 1737 1738 if (strncmp(PMD_MGMT_SCALE, name, sizeof(PMD_MGMT_SCALE)) == 0) { 1739 pmgmt_type = RTE_POWER_MGMT_TYPE_SCALE; 1740 return 0; 1741 } 1742 if (strncmp(PMD_MGMT_BASELINE, name, sizeof(PMD_MGMT_BASELINE)) == 0) { 1743 baseline_enabled = 
true; 1744 return 0; 1745 } 1746 /* unknown PMD power management mode */ 1747 return -1; 1748 } 1749 1750 static int 1751 parse_ep_config(const char *q_arg) 1752 { 1753 char s[256]; 1754 const char *p = q_arg; 1755 char *end; 1756 int num_arg; 1757 1758 char *str_fld[3]; 1759 1760 int training_flag; 1761 int med_edpi; 1762 int hgh_edpi; 1763 1764 ep_med_edpi = EMPTY_POLL_MED_THRESHOLD; 1765 ep_hgh_edpi = EMPTY_POLL_HGH_THRESHOLD; 1766 1767 strlcpy(s, p, sizeof(s)); 1768 1769 num_arg = rte_strsplit(s, sizeof(s), str_fld, 3, ','); 1770 1771 empty_poll_train = false; 1772 1773 if (num_arg == 0) 1774 return 0; 1775 1776 if (num_arg == 3) { 1777 1778 training_flag = strtoul(str_fld[0], &end, 0); 1779 med_edpi = strtoul(str_fld[1], &end, 0); 1780 hgh_edpi = strtoul(str_fld[2], &end, 0); 1781 1782 if (training_flag == 1) 1783 empty_poll_train = true; 1784 1785 if (med_edpi > 0) 1786 ep_med_edpi = med_edpi; 1787 1788 if (hgh_edpi > 0) 1789 ep_hgh_edpi = hgh_edpi; 1790 1791 } else { 1792 1793 return -1; 1794 } 1795 1796 return 0; 1797 1798 } 1799 #define CMD_LINE_OPT_PARSE_PTYPE "parse-ptype" 1800 #define CMD_LINE_OPT_LEGACY "legacy" 1801 #define CMD_LINE_OPT_EMPTY_POLL "empty-poll" 1802 #define CMD_LINE_OPT_INTERRUPT_ONLY "interrupt-only" 1803 #define CMD_LINE_OPT_TELEMETRY "telemetry" 1804 #define CMD_LINE_OPT_PMD_MGMT "pmd-mgmt" 1805 #define CMD_LINE_OPT_MAX_PKT_LEN "max-pkt-len" 1806 1807 /* Parse the argument given in the command line of the application */ 1808 static int 1809 parse_args(int argc, char **argv) 1810 { 1811 int opt, ret; 1812 char **argvopt; 1813 int option_index; 1814 uint32_t limit; 1815 char *prgname = argv[0]; 1816 static struct option lgopts[] = { 1817 {"config", 1, 0, 0}, 1818 {"perf-config", 1, 0, 0}, 1819 {"high-perf-cores", 1, 0, 0}, 1820 {"no-numa", 0, 0, 0}, 1821 {CMD_LINE_OPT_MAX_PKT_LEN, 1, 0, 0}, 1822 {CMD_LINE_OPT_EMPTY_POLL, 1, 0, 0}, 1823 {CMD_LINE_OPT_PARSE_PTYPE, 0, 0, 0}, 1824 {CMD_LINE_OPT_LEGACY, 0, 0, 0}, 1825 {CMD_LINE_OPT_TELEMETRY, 0, 0, 0}, 1826 {CMD_LINE_OPT_INTERRUPT_ONLY, 0, 0, 0}, 1827 {CMD_LINE_OPT_PMD_MGMT, 1, 0, 0}, 1828 {NULL, 0, 0, 0} 1829 }; 1830 1831 argvopt = argv; 1832 1833 while ((opt = getopt_long(argc, argvopt, "p:l:m:h:P", 1834 lgopts, &option_index)) != EOF) { 1835 1836 switch (opt) { 1837 /* portmask */ 1838 case 'p': 1839 enabled_port_mask = parse_portmask(optarg); 1840 if (enabled_port_mask == 0) { 1841 printf("invalid portmask\n"); 1842 print_usage(prgname); 1843 return -1; 1844 } 1845 break; 1846 case 'P': 1847 printf("Promiscuous mode selected\n"); 1848 promiscuous_on = 1; 1849 break; 1850 case 'l': 1851 limit = parse_max_pkt_len(optarg); 1852 freq_tlb[LOW] = limit; 1853 break; 1854 case 'm': 1855 limit = parse_max_pkt_len(optarg); 1856 freq_tlb[MED] = limit; 1857 break; 1858 case 'h': 1859 limit = parse_max_pkt_len(optarg); 1860 freq_tlb[HGH] = limit; 1861 break; 1862 /* long options */ 1863 case 0: 1864 if (!strncmp(lgopts[option_index].name, "config", 6)) { 1865 ret = parse_config(optarg); 1866 if (ret) { 1867 printf("invalid config\n"); 1868 print_usage(prgname); 1869 return -1; 1870 } 1871 } 1872 1873 if (!strncmp(lgopts[option_index].name, 1874 "perf-config", 11)) { 1875 ret = parse_perf_config(optarg); 1876 if (ret) { 1877 printf("invalid perf-config\n"); 1878 print_usage(prgname); 1879 return -1; 1880 } 1881 } 1882 1883 if (!strncmp(lgopts[option_index].name, 1884 "high-perf-cores", 15)) { 1885 ret = parse_perf_core_list(optarg); 1886 if (ret) { 1887 printf("invalid high-perf-cores\n"); 1888 print_usage(prgname); 1889 
return -1; 1890 } 1891 } 1892 1893 if (!strncmp(lgopts[option_index].name, 1894 "no-numa", 7)) { 1895 printf("numa is disabled \n"); 1896 numa_on = 0; 1897 } 1898 1899 if (!strncmp(lgopts[option_index].name, 1900 CMD_LINE_OPT_LEGACY, 1901 sizeof(CMD_LINE_OPT_LEGACY))) { 1902 if (app_mode != APP_MODE_DEFAULT) { 1903 printf(" legacy mode is mutually exclusive with other modes\n"); 1904 return -1; 1905 } 1906 app_mode = APP_MODE_LEGACY; 1907 printf("legacy mode is enabled\n"); 1908 } 1909 1910 if (!strncmp(lgopts[option_index].name, 1911 CMD_LINE_OPT_EMPTY_POLL, 10)) { 1912 if (app_mode != APP_MODE_DEFAULT) { 1913 printf(" empty-poll mode is mutually exclusive with other modes\n"); 1914 return -1; 1915 } 1916 app_mode = APP_MODE_EMPTY_POLL; 1917 ret = parse_ep_config(optarg); 1918 1919 if (ret) { 1920 printf("invalid empty poll config\n"); 1921 print_usage(prgname); 1922 return -1; 1923 } 1924 printf("empty-poll is enabled\n"); 1925 } 1926 1927 if (!strncmp(lgopts[option_index].name, 1928 CMD_LINE_OPT_TELEMETRY, 1929 sizeof(CMD_LINE_OPT_TELEMETRY))) { 1930 if (app_mode != APP_MODE_DEFAULT) { 1931 printf(" telemetry mode is mutually exclusive with other modes\n"); 1932 return -1; 1933 } 1934 app_mode = APP_MODE_TELEMETRY; 1935 printf("telemetry mode is enabled\n"); 1936 } 1937 1938 if (!strncmp(lgopts[option_index].name, 1939 CMD_LINE_OPT_PMD_MGMT, 1940 sizeof(CMD_LINE_OPT_PMD_MGMT))) { 1941 if (app_mode != APP_MODE_DEFAULT) { 1942 printf(" power mgmt mode is mutually exclusive with other modes\n"); 1943 return -1; 1944 } 1945 if (parse_pmd_mgmt_config(optarg) < 0) { 1946 printf(" Invalid PMD power management mode: %s\n", 1947 optarg); 1948 return -1; 1949 } 1950 app_mode = APP_MODE_PMD_MGMT; 1951 printf("PMD power mgmt mode is enabled\n"); 1952 } 1953 if (!strncmp(lgopts[option_index].name, 1954 CMD_LINE_OPT_INTERRUPT_ONLY, 1955 sizeof(CMD_LINE_OPT_INTERRUPT_ONLY))) { 1956 if (app_mode != APP_MODE_DEFAULT) { 1957 printf(" interrupt-only mode is mutually exclusive with other modes\n"); 1958 return -1; 1959 } 1960 app_mode = APP_MODE_INTERRUPT; 1961 printf("interrupt-only mode is enabled\n"); 1962 } 1963 1964 if (!strncmp(lgopts[option_index].name, 1965 CMD_LINE_OPT_MAX_PKT_LEN, 1966 sizeof(CMD_LINE_OPT_MAX_PKT_LEN))) { 1967 printf("Custom frame size is configured\n"); 1968 max_pkt_len = parse_max_pkt_len(optarg); 1969 } 1970 1971 if (!strncmp(lgopts[option_index].name, 1972 CMD_LINE_OPT_PARSE_PTYPE, 1973 sizeof(CMD_LINE_OPT_PARSE_PTYPE))) { 1974 printf("soft parse-ptype is enabled\n"); 1975 parse_ptype = 1; 1976 } 1977 1978 break; 1979 1980 default: 1981 print_usage(prgname); 1982 return -1; 1983 } 1984 } 1985 1986 if (optind >= 0) 1987 argv[optind-1] = prgname; 1988 1989 ret = optind-1; 1990 optind = 1; /* reset getopt lib */ 1991 return ret; 1992 } 1993 1994 static void 1995 print_ethaddr(const char *name, const struct rte_ether_addr *eth_addr) 1996 { 1997 char buf[RTE_ETHER_ADDR_FMT_SIZE]; 1998 rte_ether_format_addr(buf, RTE_ETHER_ADDR_FMT_SIZE, eth_addr); 1999 printf("%s%s", name, buf); 2000 } 2001 2002 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2003 static void 2004 setup_hash(int socketid) 2005 { 2006 struct rte_hash_parameters ipv4_l3fwd_hash_params = { 2007 .name = NULL, 2008 .entries = L3FWD_HASH_ENTRIES, 2009 .key_len = sizeof(struct ipv4_5tuple), 2010 .hash_func = DEFAULT_HASH_FUNC, 2011 .hash_func_init_val = 0, 2012 }; 2013 2014 struct rte_hash_parameters ipv6_l3fwd_hash_params = { 2015 .name = NULL, 2016 .entries = L3FWD_HASH_ENTRIES, 2017 .key_len = sizeof(struct 
ipv6_5tuple), 2018 .hash_func = DEFAULT_HASH_FUNC, 2019 .hash_func_init_val = 0, 2020 }; 2021 2022 unsigned i; 2023 int ret; 2024 char s[64]; 2025 2026 /* create ipv4 hash */ 2027 snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid); 2028 ipv4_l3fwd_hash_params.name = s; 2029 ipv4_l3fwd_hash_params.socket_id = socketid; 2030 ipv4_l3fwd_lookup_struct[socketid] = 2031 rte_hash_create(&ipv4_l3fwd_hash_params); 2032 if (ipv4_l3fwd_lookup_struct[socketid] == NULL) 2033 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on " 2034 "socket %d\n", socketid); 2035 2036 /* create ipv6 hash */ 2037 snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid); 2038 ipv6_l3fwd_hash_params.name = s; 2039 ipv6_l3fwd_hash_params.socket_id = socketid; 2040 ipv6_l3fwd_lookup_struct[socketid] = 2041 rte_hash_create(&ipv6_l3fwd_hash_params); 2042 if (ipv6_l3fwd_lookup_struct[socketid] == NULL) 2043 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on " 2044 "socket %d\n", socketid); 2045 2046 2047 /* populate the ipv4 hash */ 2048 for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) { 2049 ret = rte_hash_add_key (ipv4_l3fwd_lookup_struct[socketid], 2050 (void *) &ipv4_l3fwd_route_array[i].key); 2051 if (ret < 0) { 2052 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the" 2053 "l3fwd hash on socket %d\n", i, socketid); 2054 } 2055 ipv4_l3fwd_out_if[ret] = ipv4_l3fwd_route_array[i].if_out; 2056 printf("Hash: Adding key\n"); 2057 print_ipv4_key(ipv4_l3fwd_route_array[i].key); 2058 } 2059 2060 /* populate the ipv6 hash */ 2061 for (i = 0; i < RTE_DIM(ipv6_l3fwd_route_array); i++) { 2062 ret = rte_hash_add_key (ipv6_l3fwd_lookup_struct[socketid], 2063 (void *) &ipv6_l3fwd_route_array[i].key); 2064 if (ret < 0) { 2065 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the" 2066 "l3fwd hash on socket %d\n", i, socketid); 2067 } 2068 ipv6_l3fwd_out_if[ret] = ipv6_l3fwd_route_array[i].if_out; 2069 printf("Hash: Adding key\n"); 2070 print_ipv6_key(ipv6_l3fwd_route_array[i].key); 2071 } 2072 } 2073 #endif 2074 2075 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2076 static void 2077 setup_lpm(int socketid) 2078 { 2079 unsigned i; 2080 int ret; 2081 char s[64]; 2082 2083 /* create the LPM table */ 2084 struct rte_lpm_config lpm_ipv4_config; 2085 2086 lpm_ipv4_config.max_rules = IPV4_L3FWD_LPM_MAX_RULES; 2087 lpm_ipv4_config.number_tbl8s = 256; 2088 lpm_ipv4_config.flags = 0; 2089 2090 snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid); 2091 ipv4_l3fwd_lookup_struct[socketid] = 2092 rte_lpm_create(s, socketid, &lpm_ipv4_config); 2093 if (ipv4_l3fwd_lookup_struct[socketid] == NULL) 2094 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table" 2095 " on socket %d\n", socketid); 2096 2097 /* populate the LPM table */ 2098 for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) { 2099 ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid], 2100 ipv4_l3fwd_route_array[i].ip, 2101 ipv4_l3fwd_route_array[i].depth, 2102 ipv4_l3fwd_route_array[i].if_out); 2103 2104 if (ret < 0) { 2105 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the " 2106 "l3fwd LPM table on socket %d\n", 2107 i, socketid); 2108 } 2109 2110 printf("LPM: Adding route 0x%08x / %d (%d)\n", 2111 (unsigned)ipv4_l3fwd_route_array[i].ip, 2112 ipv4_l3fwd_route_array[i].depth, 2113 ipv4_l3fwd_route_array[i].if_out); 2114 } 2115 } 2116 #endif 2117 2118 static int 2119 init_mem(unsigned nb_mbuf) 2120 { 2121 struct lcore_conf *qconf; 2122 int socketid; 2123 unsigned lcore_id; 2124 char s[64]; 2125 2126 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 
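		/*
		 * One mbuf pool is created per NUMA socket (or a single pool
		 * on socket 0 when --no-numa is used), and the LPM table or
		 * hash lookup structures are set up alongside it. Each
		 * enabled lcore then gets its lookup-structure pointers set
		 * to those of its own socket.
		 */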
2127 if (rte_lcore_is_enabled(lcore_id) == 0) 2128 continue; 2129 2130 if (numa_on) 2131 socketid = rte_lcore_to_socket_id(lcore_id); 2132 else 2133 socketid = 0; 2134 2135 if (socketid >= NB_SOCKETS) { 2136 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is " 2137 "out of range %d\n", socketid, 2138 lcore_id, NB_SOCKETS); 2139 } 2140 if (pktmbuf_pool[socketid] == NULL) { 2141 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 2142 pktmbuf_pool[socketid] = 2143 rte_pktmbuf_pool_create(s, nb_mbuf, 2144 MEMPOOL_CACHE_SIZE, 0, 2145 RTE_MBUF_DEFAULT_BUF_SIZE, 2146 socketid); 2147 if (pktmbuf_pool[socketid] == NULL) 2148 rte_exit(EXIT_FAILURE, 2149 "Cannot init mbuf pool on socket %d\n", 2150 socketid); 2151 else 2152 printf("Allocated mbuf pool on socket %d\n", 2153 socketid); 2154 2155 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2156 setup_lpm(socketid); 2157 #else 2158 setup_hash(socketid); 2159 #endif 2160 } 2161 qconf = &lcore_conf[lcore_id]; 2162 qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid]; 2163 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2164 qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid]; 2165 #endif 2166 } 2167 return 0; 2168 } 2169 2170 /* Check the link status of all ports in up to 9s, and print them finally */ 2171 static void 2172 check_all_ports_link_status(uint32_t port_mask) 2173 { 2174 #define CHECK_INTERVAL 100 /* 100ms */ 2175 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 2176 uint8_t count, all_ports_up, print_flag = 0; 2177 uint16_t portid; 2178 struct rte_eth_link link; 2179 int ret; 2180 char link_status_text[RTE_ETH_LINK_MAX_STR_LEN]; 2181 2182 printf("\nChecking link status"); 2183 fflush(stdout); 2184 for (count = 0; count <= MAX_CHECK_TIME; count++) { 2185 all_ports_up = 1; 2186 RTE_ETH_FOREACH_DEV(portid) { 2187 if ((port_mask & (1 << portid)) == 0) 2188 continue; 2189 memset(&link, 0, sizeof(link)); 2190 ret = rte_eth_link_get_nowait(portid, &link); 2191 if (ret < 0) { 2192 all_ports_up = 0; 2193 if (print_flag == 1) 2194 printf("Port %u link get failed: %s\n", 2195 portid, rte_strerror(-ret)); 2196 continue; 2197 } 2198 /* print link status if flag set */ 2199 if (print_flag == 1) { 2200 rte_eth_link_to_str(link_status_text, 2201 sizeof(link_status_text), &link); 2202 printf("Port %d %s\n", portid, 2203 link_status_text); 2204 continue; 2205 } 2206 /* clear all_ports_up flag if any link down */ 2207 if (link.link_status == RTE_ETH_LINK_DOWN) { 2208 all_ports_up = 0; 2209 break; 2210 } 2211 } 2212 /* after finally printing all link status, get out */ 2213 if (print_flag == 1) 2214 break; 2215 2216 if (all_ports_up == 0) { 2217 printf("."); 2218 fflush(stdout); 2219 rte_delay_ms(CHECK_INTERVAL); 2220 } 2221 2222 /* set the print_flag if all ports up or timeout */ 2223 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 2224 print_flag = 1; 2225 printf("done\n"); 2226 } 2227 } 2228 } 2229 2230 static int check_ptype(uint16_t portid) 2231 { 2232 int i, ret; 2233 int ptype_l3_ipv4 = 0; 2234 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2235 int ptype_l3_ipv6 = 0; 2236 #endif 2237 uint32_t ptype_mask = RTE_PTYPE_L3_MASK; 2238 2239 ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, NULL, 0); 2240 if (ret <= 0) 2241 return 0; 2242 2243 uint32_t ptypes[ret]; 2244 2245 ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, ptypes, ret); 2246 for (i = 0; i < ret; ++i) { 2247 if (ptypes[i] & RTE_PTYPE_L3_IPV4) 2248 ptype_l3_ipv4 = 1; 2249 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2250 if (ptypes[i] & 
RTE_PTYPE_L3_IPV6) 2251 ptype_l3_ipv6 = 1; 2252 #endif 2253 } 2254 2255 if (ptype_l3_ipv4 == 0) 2256 printf("port %d cannot parse RTE_PTYPE_L3_IPV4\n", portid); 2257 2258 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2259 if (ptype_l3_ipv6 == 0) 2260 printf("port %d cannot parse RTE_PTYPE_L3_IPV6\n", portid); 2261 #endif 2262 2263 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2264 if (ptype_l3_ipv4) 2265 #else /* APP_LOOKUP_EXACT_MATCH */ 2266 if (ptype_l3_ipv4 && ptype_l3_ipv6) 2267 #endif 2268 return 1; 2269 2270 return 0; 2271 2272 } 2273 2274 static int 2275 init_power_library(void) 2276 { 2277 enum power_management_env env; 2278 unsigned int lcore_id; 2279 int ret = 0; 2280 2281 RTE_LCORE_FOREACH(lcore_id) { 2282 /* init power management library */ 2283 ret = rte_power_init(lcore_id); 2284 if (ret) { 2285 RTE_LOG(ERR, POWER, 2286 "Library initialization failed on core %u\n", 2287 lcore_id); 2288 return ret; 2289 } 2290 /* we're not supporting the VM channel mode */ 2291 env = rte_power_get_env(); 2292 if (env != PM_ENV_ACPI_CPUFREQ && 2293 env != PM_ENV_PSTATE_CPUFREQ) { 2294 RTE_LOG(ERR, POWER, 2295 "Only ACPI and PSTATE modes are supported\n"); 2296 return -1; 2297 } 2298 } 2299 return ret; 2300 } 2301 2302 static int 2303 deinit_power_library(void) 2304 { 2305 unsigned int lcore_id; 2306 int ret = 0; 2307 2308 RTE_LCORE_FOREACH(lcore_id) { 2309 /* deinit power management library */ 2310 ret = rte_power_exit(lcore_id); 2311 if (ret) { 2312 RTE_LOG(ERR, POWER, 2313 "Library deinitialization failed on core %u\n", 2314 lcore_id); 2315 return ret; 2316 } 2317 } 2318 return ret; 2319 } 2320 2321 static void 2322 get_current_stat_values(uint64_t *values) 2323 { 2324 unsigned int lcore_id = rte_lcore_id(); 2325 struct lcore_conf *qconf; 2326 uint64_t app_eps = 0, app_fps = 0, app_br = 0; 2327 uint64_t count = 0; 2328 2329 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2330 qconf = &lcore_conf[lcore_id]; 2331 if (qconf->n_rx_queue == 0) 2332 continue; 2333 count++; 2334 rte_spinlock_lock(&stats[lcore_id].telemetry_lock); 2335 app_eps += stats[lcore_id].ep_nep[1]; 2336 app_fps += stats[lcore_id].fp_nfp[1]; 2337 app_br += stats[lcore_id].br; 2338 rte_spinlock_unlock(&stats[lcore_id].telemetry_lock); 2339 } 2340 2341 if (count > 0) { 2342 values[0] = app_eps/count; 2343 values[1] = app_fps/count; 2344 values[2] = app_br/count; 2345 } else 2346 memset(values, 0, sizeof(uint64_t) * NUM_TELSTATS); 2347 2348 } 2349 2350 static void 2351 update_telemetry(__rte_unused struct rte_timer *tim, 2352 __rte_unused void *arg) 2353 { 2354 int ret; 2355 uint64_t values[NUM_TELSTATS] = {0}; 2356 2357 get_current_stat_values(values); 2358 ret = rte_metrics_update_values(RTE_METRICS_GLOBAL, telstats_index, 2359 values, RTE_DIM(values)); 2360 if (ret < 0) 2361 RTE_LOG(WARNING, POWER, "failed to update metrics\n"); 2362 } 2363 2364 static int 2365 handle_app_stats(const char *cmd __rte_unused, 2366 const char *params __rte_unused, 2367 struct rte_tel_data *d) 2368 { 2369 uint64_t values[NUM_TELSTATS] = {0}; 2370 uint32_t i; 2371 2372 rte_tel_data_start_dict(d); 2373 get_current_stat_values(values); 2374 for (i = 0; i < NUM_TELSTATS; i++) 2375 rte_tel_data_add_dict_u64(d, telstats_strings[i].name, 2376 values[i]); 2377 return 0; 2378 } 2379 2380 static void 2381 telemetry_setup_timer(void) 2382 { 2383 int lcore_id = rte_lcore_id(); 2384 uint64_t hz = rte_get_timer_hz(); 2385 uint64_t ticks; 2386 2387 ticks = hz / TELEMETRY_INTERVALS_PER_SEC; 2388 rte_timer_reset_sync(&telemetry_timer, 2389 ticks, 2390 PERIODICAL, 2391
lcore_id, 2392 update_telemetry, 2393 NULL); 2394 } 2395 static void 2396 empty_poll_setup_timer(void) 2397 { 2398 int lcore_id = rte_lcore_id(); 2399 uint64_t hz = rte_get_timer_hz(); 2400 2401 struct ep_params *ep_ptr = ep_params; 2402 2403 ep_ptr->interval_ticks = hz / INTERVALS_PER_SECOND; 2404 2405 rte_timer_reset_sync(&ep_ptr->timer0, 2406 ep_ptr->interval_ticks, 2407 PERIODICAL, 2408 lcore_id, 2409 rte_empty_poll_detection, 2410 (void *)ep_ptr); 2411 2412 } 2413 static int 2414 launch_timer(unsigned int lcore_id) 2415 { 2416 int64_t prev_tsc = 0, cur_tsc, diff_tsc, cycles_10ms; 2417 2418 RTE_SET_USED(lcore_id); 2419 2420 2421 if (rte_get_main_lcore() != lcore_id) { 2422 rte_panic("timer on lcore:%d which is not main core:%d\n", 2423 lcore_id, 2424 rte_get_main_lcore()); 2425 } 2426 2427 RTE_LOG(INFO, POWER, "Bring up the Timer\n"); 2428 2429 if (app_mode == APP_MODE_EMPTY_POLL) 2430 empty_poll_setup_timer(); 2431 else 2432 telemetry_setup_timer(); 2433 2434 cycles_10ms = rte_get_timer_hz() / 100; 2435 2436 while (!is_done()) { 2437 cur_tsc = rte_rdtsc(); 2438 diff_tsc = cur_tsc - prev_tsc; 2439 if (diff_tsc > cycles_10ms) { 2440 rte_timer_manage(); 2441 prev_tsc = cur_tsc; 2442 cycles_10ms = rte_get_timer_hz() / 100; 2443 } 2444 } 2445 2446 RTE_LOG(INFO, POWER, "Timer_subsystem is done\n"); 2447 2448 return 0; 2449 } 2450 2451 static int 2452 autodetect_mode(void) 2453 { 2454 RTE_LOG(NOTICE, L3FWD_POWER, "Operating mode not specified, probing frequency scaling support...\n"); 2455 2456 /* 2457 * Empty poll and telemetry modes have to be specifically requested to 2458 * be enabled, but we can auto-detect between interrupt mode with or 2459 * without frequency scaling. Both ACPI and pstate can be used. 2460 */ 2461 if (rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ)) 2462 return APP_MODE_LEGACY; 2463 if (rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ)) 2464 return APP_MODE_LEGACY; 2465 2466 RTE_LOG(NOTICE, L3FWD_POWER, "Frequency scaling not supported, selecting interrupt-only mode\n"); 2467 2468 return APP_MODE_INTERRUPT; 2469 } 2470 2471 static const char * 2472 mode_to_str(enum appmode mode) 2473 { 2474 switch (mode) { 2475 case APP_MODE_LEGACY: 2476 return "legacy"; 2477 case APP_MODE_EMPTY_POLL: 2478 return "empty poll"; 2479 case APP_MODE_TELEMETRY: 2480 return "telemetry"; 2481 case APP_MODE_INTERRUPT: 2482 return "interrupt-only"; 2483 case APP_MODE_PMD_MGMT: 2484 return "pmd mgmt"; 2485 default: 2486 return "invalid"; 2487 } 2488 } 2489 2490 static uint32_t 2491 eth_dev_get_overhead_len(uint32_t max_rx_pktlen, uint16_t max_mtu) 2492 { 2493 uint32_t overhead_len; 2494 2495 if (max_mtu != UINT16_MAX && max_rx_pktlen > max_mtu) 2496 overhead_len = max_rx_pktlen - max_mtu; 2497 else 2498 overhead_len = RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN; 2499 2500 return overhead_len; 2501 } 2502 2503 static int 2504 config_port_max_pkt_len(struct rte_eth_conf *conf, 2505 struct rte_eth_dev_info *dev_info) 2506 { 2507 uint32_t overhead_len; 2508 2509 if (max_pkt_len == 0) 2510 return 0; 2511 2512 if (max_pkt_len < RTE_ETHER_MIN_LEN || max_pkt_len > MAX_JUMBO_PKT_LEN) 2513 return -1; 2514 2515 overhead_len = eth_dev_get_overhead_len(dev_info->max_rx_pktlen, 2516 dev_info->max_mtu); 2517 conf->rxmode.mtu = max_pkt_len - overhead_len; 2518 2519 if (conf->rxmode.mtu > RTE_ETHER_MTU) 2520 conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS; 2521 2522 return 0; 2523 } 2524 2525 /* Power library initialized in the main routine. 
8< */ 2526 int 2527 main(int argc, char **argv) 2528 { 2529 struct lcore_conf *qconf; 2530 struct rte_eth_dev_info dev_info; 2531 struct rte_eth_txconf *txconf; 2532 int ret; 2533 uint16_t nb_ports; 2534 uint16_t queueid; 2535 unsigned lcore_id; 2536 uint64_t hz; 2537 uint32_t n_tx_queue, nb_lcores; 2538 uint32_t dev_rxq_num, dev_txq_num; 2539 uint8_t nb_rx_queue, queue, socketid; 2540 uint16_t portid; 2541 const char *ptr_strings[NUM_TELSTATS]; 2542 2543 /* init EAL */ 2544 ret = rte_eal_init(argc, argv); 2545 if (ret < 0) 2546 rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n"); 2547 argc -= ret; 2548 argv += ret; 2549 2550 /* catch SIGINT and restore cpufreq governor to ondemand */ 2551 signal(SIGINT, signal_exit_now); 2552 2553 /* init RTE timer library to be used later */ 2554 rte_timer_subsystem_init(); 2555 2556 /* if we're running pmd-mgmt mode, don't default to baseline mode */ 2557 baseline_enabled = false; 2558 2559 /* parse application arguments (after the EAL ones) */ 2560 ret = parse_args(argc, argv); 2561 if (ret < 0) 2562 rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n"); 2563 2564 if (app_mode == APP_MODE_DEFAULT) 2565 app_mode = autodetect_mode(); 2566 2567 RTE_LOG(INFO, L3FWD_POWER, "Selected operation mode: %s\n", 2568 mode_to_str(app_mode)); 2569 2570 /* only legacy and empty poll modes rely on the power library */ 2571 if ((app_mode == APP_MODE_LEGACY || app_mode == APP_MODE_EMPTY_POLL) && 2572 init_power_library()) 2573 rte_exit(EXIT_FAILURE, "init_power_library failed\n"); 2574 2575 if (update_lcore_params() < 0) 2576 rte_exit(EXIT_FAILURE, "update_lcore_params failed\n"); 2577 2578 if (check_lcore_params() < 0) 2579 rte_exit(EXIT_FAILURE, "check_lcore_params failed\n"); 2580 2581 ret = init_lcore_rx_queues(); 2582 if (ret < 0) 2583 rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n"); 2584 2585 nb_ports = rte_eth_dev_count_avail(); 2586 2587 if (check_port_config() < 0) 2588 rte_exit(EXIT_FAILURE, "check_port_config failed\n"); 2589 2590 nb_lcores = rte_lcore_count(); 2591 2592 /* initialize all ports */ 2593 RTE_ETH_FOREACH_DEV(portid) { 2594 struct rte_eth_conf local_port_conf = port_conf; 2595 /* not all app modes need interrupts */ 2596 bool need_intr = app_mode == APP_MODE_LEGACY || 2597 app_mode == APP_MODE_INTERRUPT; 2598 2599 /* skip ports that are not enabled */ 2600 if ((enabled_port_mask & (1 << portid)) == 0) { 2601 printf("\nSkipping disabled port %d\n", portid); 2602 continue; 2603 } 2604 2605 /* init port */ 2606 printf("Initializing port %d ... ", portid ); 2607 fflush(stdout); 2608 2609 ret = rte_eth_dev_info_get(portid, &dev_info); 2610 if (ret != 0) 2611 rte_exit(EXIT_FAILURE, 2612 "Error during getting device (port %u) info: %s\n", 2613 portid, strerror(-ret)); 2614 2615 dev_rxq_num = dev_info.max_rx_queues; 2616 dev_txq_num = dev_info.max_tx_queues; 2617 2618 nb_rx_queue = get_port_n_rx_queues(portid); 2619 if (nb_rx_queue > dev_rxq_num) 2620 rte_exit(EXIT_FAILURE, 2621 "Cannot configure non-existent rxq: " 2622 "port=%d\n", portid); 2623 2624 n_tx_queue = nb_lcores; 2625 if (n_tx_queue > dev_txq_num) 2626 n_tx_queue = dev_txq_num; 2627 printf("Creating queues: nb_rxq=%d nb_txq=%u... 
", 2628 nb_rx_queue, (unsigned)n_tx_queue ); 2629 /* If number of Rx queue is 0, no need to enable Rx interrupt */ 2630 if (nb_rx_queue == 0) 2631 need_intr = false; 2632 2633 if (need_intr) 2634 local_port_conf.intr_conf.rxq = 1; 2635 2636 ret = rte_eth_dev_info_get(portid, &dev_info); 2637 if (ret != 0) 2638 rte_exit(EXIT_FAILURE, 2639 "Error during getting device (port %u) info: %s\n", 2640 portid, strerror(-ret)); 2641 2642 ret = config_port_max_pkt_len(&local_port_conf, &dev_info); 2643 if (ret != 0) 2644 rte_exit(EXIT_FAILURE, 2645 "Invalid max packet length: %u (port %u)\n", 2646 max_pkt_len, portid); 2647 2648 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 2649 local_port_conf.txmode.offloads |= 2650 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; 2651 2652 local_port_conf.rx_adv_conf.rss_conf.rss_hf &= 2653 dev_info.flow_type_rss_offloads; 2654 if (local_port_conf.rx_adv_conf.rss_conf.rss_hf != 2655 port_conf.rx_adv_conf.rss_conf.rss_hf) { 2656 printf("Port %u modified RSS hash function based on hardware support," 2657 "requested:%#"PRIx64" configured:%#"PRIx64"\n", 2658 portid, 2659 port_conf.rx_adv_conf.rss_conf.rss_hf, 2660 local_port_conf.rx_adv_conf.rss_conf.rss_hf); 2661 } 2662 2663 if (local_port_conf.rx_adv_conf.rss_conf.rss_hf == 0) 2664 local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE; 2665 local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa; 2666 port_conf.rxmode.offloads = local_port_conf.rxmode.offloads; 2667 2668 ret = rte_eth_dev_configure(portid, nb_rx_queue, 2669 (uint16_t)n_tx_queue, &local_port_conf); 2670 if (ret < 0) 2671 rte_exit(EXIT_FAILURE, "Cannot configure device: " 2672 "err=%d, port=%d\n", ret, portid); 2673 2674 ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, 2675 &nb_txd); 2676 if (ret < 0) 2677 rte_exit(EXIT_FAILURE, 2678 "Cannot adjust number of descriptors: err=%d, port=%d\n", 2679 ret, portid); 2680 2681 ret = rte_eth_macaddr_get(portid, &ports_eth_addr[portid]); 2682 if (ret < 0) 2683 rte_exit(EXIT_FAILURE, 2684 "Cannot get MAC address: err=%d, port=%d\n", 2685 ret, portid); 2686 2687 print_ethaddr(" Address:", &ports_eth_addr[portid]); 2688 printf(", "); 2689 2690 /* init memory */ 2691 ret = init_mem(NB_MBUF); 2692 if (ret < 0) 2693 rte_exit(EXIT_FAILURE, "init_mem failed\n"); 2694 2695 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2696 if (rte_lcore_is_enabled(lcore_id) == 0) 2697 continue; 2698 2699 /* Initialize TX buffers */ 2700 qconf = &lcore_conf[lcore_id]; 2701 qconf->tx_buffer[portid] = rte_zmalloc_socket("tx_buffer", 2702 RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0, 2703 rte_eth_dev_socket_id(portid)); 2704 if (qconf->tx_buffer[portid] == NULL) 2705 rte_exit(EXIT_FAILURE, "Can't allocate tx buffer for port %u\n", 2706 portid); 2707 2708 rte_eth_tx_buffer_init(qconf->tx_buffer[portid], MAX_PKT_BURST); 2709 } 2710 2711 /* init one TX queue per couple (lcore,port) */ 2712 queueid = 0; 2713 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2714 if (rte_lcore_is_enabled(lcore_id) == 0) 2715 continue; 2716 2717 if (queueid >= dev_txq_num) 2718 continue; 2719 2720 if (numa_on) 2721 socketid = \ 2722 (uint8_t)rte_lcore_to_socket_id(lcore_id); 2723 else 2724 socketid = 0; 2725 2726 printf("txq=%u,%d,%d ", lcore_id, queueid, socketid); 2727 fflush(stdout); 2728 2729 txconf = &dev_info.default_txconf; 2730 txconf->offloads = local_port_conf.txmode.offloads; 2731 ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd, 2732 socketid, txconf); 2733 if (ret < 0) 2734 rte_exit(EXIT_FAILURE, 2735 
"rte_eth_tx_queue_setup: err=%d, " 2736 "port=%d\n", ret, portid); 2737 2738 qconf = &lcore_conf[lcore_id]; 2739 qconf->tx_queue_id[portid] = queueid; 2740 queueid++; 2741 2742 qconf->tx_port_id[qconf->n_tx_port] = portid; 2743 qconf->n_tx_port++; 2744 } 2745 printf("\n"); 2746 } 2747 2748 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2749 if (rte_lcore_is_enabled(lcore_id) == 0) 2750 continue; 2751 2752 if (app_mode == APP_MODE_LEGACY) { 2753 /* init timer structures for each enabled lcore */ 2754 rte_timer_init(&power_timers[lcore_id]); 2755 hz = rte_get_timer_hz(); 2756 rte_timer_reset(&power_timers[lcore_id], 2757 hz/TIMER_NUMBER_PER_SECOND, 2758 SINGLE, lcore_id, 2759 power_timer_cb, NULL); 2760 } 2761 qconf = &lcore_conf[lcore_id]; 2762 printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); 2763 fflush(stdout); 2764 2765 /* init RX queues */ 2766 for(queue = 0; queue < qconf->n_rx_queue; ++queue) { 2767 struct rte_eth_rxconf rxq_conf; 2768 2769 portid = qconf->rx_queue_list[queue].port_id; 2770 queueid = qconf->rx_queue_list[queue].queue_id; 2771 2772 if (numa_on) 2773 socketid = \ 2774 (uint8_t)rte_lcore_to_socket_id(lcore_id); 2775 else 2776 socketid = 0; 2777 2778 printf("rxq=%d,%d,%d ", portid, queueid, socketid); 2779 fflush(stdout); 2780 2781 ret = rte_eth_dev_info_get(portid, &dev_info); 2782 if (ret != 0) 2783 rte_exit(EXIT_FAILURE, 2784 "Error during getting device (port %u) info: %s\n", 2785 portid, strerror(-ret)); 2786 2787 rxq_conf = dev_info.default_rxconf; 2788 rxq_conf.offloads = port_conf.rxmode.offloads; 2789 ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd, 2790 socketid, &rxq_conf, 2791 pktmbuf_pool[socketid]); 2792 if (ret < 0) 2793 rte_exit(EXIT_FAILURE, 2794 "rte_eth_rx_queue_setup: err=%d, " 2795 "port=%d\n", ret, portid); 2796 2797 if (parse_ptype) { 2798 if (add_cb_parse_ptype(portid, queueid) < 0) 2799 rte_exit(EXIT_FAILURE, 2800 "Fail to add ptype cb\n"); 2801 } 2802 2803 if (app_mode == APP_MODE_PMD_MGMT && !baseline_enabled) { 2804 ret = rte_power_ethdev_pmgmt_queue_enable( 2805 lcore_id, portid, queueid, 2806 pmgmt_type); 2807 if (ret < 0) 2808 rte_exit(EXIT_FAILURE, 2809 "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", 2810 ret, portid); 2811 } 2812 } 2813 } 2814 /* >8 End of power library initialization. */ 2815 2816 printf("\n"); 2817 2818 /* start ports */ 2819 RTE_ETH_FOREACH_DEV(portid) { 2820 if ((enabled_port_mask & (1 << portid)) == 0) { 2821 continue; 2822 } 2823 /* Start device */ 2824 ret = rte_eth_dev_start(portid); 2825 if (ret < 0) 2826 rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, " 2827 "port=%d\n", ret, portid); 2828 /* 2829 * If enabled, put device in promiscuous mode. 2830 * This allows IO forwarding mode to forward packets 2831 * to itself through 2 cross-connected ports of the 2832 * target machine. 
2833 */ 2834 if (promiscuous_on) { 2835 ret = rte_eth_promiscuous_enable(portid); 2836 if (ret != 0) 2837 rte_exit(EXIT_FAILURE, 2838 "rte_eth_promiscuous_enable: err=%s, port=%u\n", 2839 rte_strerror(-ret), portid); 2840 } 2841 /* initialize spinlock for each port */ 2842 rte_spinlock_init(&(locks[portid])); 2843 2844 if (!parse_ptype) 2845 if (!check_ptype(portid)) 2846 rte_exit(EXIT_FAILURE, 2847 "PMD can not provide needed ptypes\n"); 2848 } 2849 2850 check_all_ports_link_status(enabled_port_mask); 2851 2852 if (app_mode == APP_MODE_EMPTY_POLL) { 2853 2854 if (empty_poll_train) { 2855 policy.state = TRAINING; 2856 } else { 2857 policy.state = MED_NORMAL; 2858 policy.med_base_edpi = ep_med_edpi; 2859 policy.hgh_base_edpi = ep_hgh_edpi; 2860 } 2861 2862 ret = rte_power_empty_poll_stat_init(&ep_params, 2863 freq_tlb, 2864 &policy); 2865 if (ret < 0) 2866 rte_exit(EXIT_FAILURE, "empty poll init failed"); 2867 } 2868 2869 2870 /* launch per-lcore init on every lcore */ 2871 if (app_mode == APP_MODE_LEGACY) { 2872 rte_eal_mp_remote_launch(main_legacy_loop, NULL, CALL_MAIN); 2873 } else if (app_mode == APP_MODE_EMPTY_POLL) { 2874 empty_poll_stop = false; 2875 rte_eal_mp_remote_launch(main_empty_poll_loop, NULL, 2876 SKIP_MAIN); 2877 } else if (app_mode == APP_MODE_TELEMETRY) { 2878 unsigned int i; 2879 2880 /* Init metrics library */ 2881 rte_metrics_init(rte_socket_id()); 2882 /** Register stats with metrics library */ 2883 for (i = 0; i < NUM_TELSTATS; i++) 2884 ptr_strings[i] = telstats_strings[i].name; 2885 2886 ret = rte_metrics_reg_names(ptr_strings, NUM_TELSTATS); 2887 if (ret >= 0) 2888 telstats_index = ret; 2889 else 2890 rte_exit(EXIT_FAILURE, "failed to register metrics names"); 2891 2892 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2893 rte_spinlock_init(&stats[lcore_id].telemetry_lock); 2894 } 2895 rte_timer_init(&telemetry_timer); 2896 rte_telemetry_register_cmd("/l3fwd-power/stats", 2897 handle_app_stats, 2898 "Returns global power stats. 
Parameters: None"); 2899 rte_eal_mp_remote_launch(main_telemetry_loop, NULL, 2900 SKIP_MAIN); 2901 } else if (app_mode == APP_MODE_INTERRUPT) { 2902 rte_eal_mp_remote_launch(main_intr_loop, NULL, CALL_MAIN); 2903 } else if (app_mode == APP_MODE_PMD_MGMT) { 2904 /* reuse telemetry loop for PMD power management mode */ 2905 rte_eal_mp_remote_launch(main_telemetry_loop, NULL, CALL_MAIN); 2906 } 2907 2908 if (app_mode == APP_MODE_EMPTY_POLL || app_mode == APP_MODE_TELEMETRY) 2909 launch_timer(rte_lcore_id()); 2910 2911 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2912 if (rte_eal_wait_lcore(lcore_id) < 0) 2913 return -1; 2914 } 2915 2916 if (app_mode == APP_MODE_PMD_MGMT) { 2917 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2918 if (rte_lcore_is_enabled(lcore_id) == 0) 2919 continue; 2920 qconf = &lcore_conf[lcore_id]; 2921 for (queue = 0; queue < qconf->n_rx_queue; ++queue) { 2922 portid = qconf->rx_queue_list[queue].port_id; 2923 queueid = qconf->rx_queue_list[queue].queue_id; 2924 2925 rte_power_ethdev_pmgmt_queue_disable(lcore_id, 2926 portid, queueid); 2927 } 2928 } 2929 } 2930 2931 RTE_ETH_FOREACH_DEV(portid) 2932 { 2933 if ((enabled_port_mask & (1 << portid)) == 0) 2934 continue; 2935 2936 ret = rte_eth_dev_stop(portid); 2937 if (ret != 0) 2938 RTE_LOG(ERR, L3FWD_POWER, "rte_eth_dev_stop: err=%d, port=%u\n", 2939 ret, portid); 2940 2941 rte_eth_dev_close(portid); 2942 } 2943 2944 if (app_mode == APP_MODE_EMPTY_POLL) 2945 rte_power_empty_poll_stat_free(); 2946 2947 if ((app_mode == APP_MODE_LEGACY || app_mode == APP_MODE_EMPTY_POLL) && 2948 deinit_power_library()) 2949 rte_exit(EXIT_FAILURE, "deinit_power_library failed\n"); 2950 2951 if (rte_eal_cleanup() < 0) 2952 RTE_LOG(ERR, L3FWD_POWER, "EAL cleanup failed\n"); 2953 2954 return 0; 2955 } 2956