1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2018 Intel Corporation 3 */ 4 5 #include <stdio.h> 6 #include <stdlib.h> 7 #include <stdint.h> 8 #include <inttypes.h> 9 #include <sys/types.h> 10 #include <string.h> 11 #include <sys/queue.h> 12 #include <stdarg.h> 13 #include <errno.h> 14 #include <getopt.h> 15 #include <unistd.h> 16 #include <signal.h> 17 #include <math.h> 18 19 #include <rte_common.h> 20 #include <rte_byteorder.h> 21 #include <rte_log.h> 22 #include <rte_malloc.h> 23 #include <rte_memory.h> 24 #include <rte_memcpy.h> 25 #include <rte_eal.h> 26 #include <rte_launch.h> 27 #include <rte_cycles.h> 28 #include <rte_prefetch.h> 29 #include <rte_lcore.h> 30 #include <rte_per_lcore.h> 31 #include <rte_branch_prediction.h> 32 #include <rte_interrupts.h> 33 #include <rte_random.h> 34 #include <rte_debug.h> 35 #include <rte_ether.h> 36 #include <rte_ethdev.h> 37 #include <rte_mempool.h> 38 #include <rte_mbuf.h> 39 #include <rte_ip.h> 40 #include <rte_tcp.h> 41 #include <rte_udp.h> 42 #include <rte_string_fns.h> 43 #include <rte_timer.h> 44 #include <rte_power.h> 45 #include <rte_spinlock.h> 46 #include <rte_power_empty_poll.h> 47 #include <rte_metrics.h> 48 #include <rte_telemetry.h> 49 #include <rte_power_pmd_mgmt.h> 50 51 #include "perf_core.h" 52 #include "main.h" 53 54 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1 55 56 #define MAX_PKT_BURST 32 57 58 #define MIN_ZERO_POLL_COUNT 10 59 60 /* 100 ms interval */ 61 #define TIMER_NUMBER_PER_SECOND 10 62 /* (10ms) */ 63 #define INTERVALS_PER_SECOND 100 64 /* 100000 us */ 65 #define SCALING_PERIOD (1000000/TIMER_NUMBER_PER_SECOND) 66 #define SCALING_DOWN_TIME_RATIO_THRESHOLD 0.25 67 68 #define APP_LOOKUP_EXACT_MATCH 0 69 #define APP_LOOKUP_LPM 1 70 #define DO_RFC_1812_CHECKS 71 72 #ifndef APP_LOOKUP_METHOD 73 #define APP_LOOKUP_METHOD APP_LOOKUP_LPM 74 #endif 75 76 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 77 #include <rte_hash.h> 78 #elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 79 #include <rte_lpm.h> 80 #else 81 #error "APP_LOOKUP_METHOD set to incorrect value" 82 #endif 83 84 #ifndef IPv6_BYTES 85 #define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\ 86 "%02x%02x:%02x%02x:%02x%02x:%02x%02x" 87 #define IPv6_BYTES(addr) \ 88 addr[0], addr[1], addr[2], addr[3], \ 89 addr[4], addr[5], addr[6], addr[7], \ 90 addr[8], addr[9], addr[10], addr[11],\ 91 addr[12], addr[13],addr[14], addr[15] 92 #endif 93 94 #define MAX_JUMBO_PKT_LEN 9600 95 96 #define IPV6_ADDR_LEN 16 97 98 #define MEMPOOL_CACHE_SIZE 256 99 100 /* 101 * This expression is used to calculate the number of mbufs needed depending on 102 * user input, taking into account memory for rx and tx hardware rings, cache 103 * per lcore and mtable per port per lcore. RTE_MAX is used to ensure that 104 * NB_MBUF never goes below a minimum value of 8192. 105 */ 106 107 #define NB_MBUF RTE_MAX ( \ 108 (nb_ports*nb_rx_queue*nb_rxd + \ 109 nb_ports*nb_lcores*MAX_PKT_BURST + \ 110 nb_ports*n_tx_queue*nb_txd + \ 111 nb_lcores*MEMPOOL_CACHE_SIZE), \ 112 (unsigned)8192) 113 114 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 115 116 #define NB_SOCKETS 8 117 118 /* Configure how many packets ahead to prefetch, when reading packets */ 119 #define PREFETCH_OFFSET 3 120 121 /* 122 * Configurable number of RX/TX ring descriptors 123 */ 124 #define RTE_TEST_RX_DESC_DEFAULT 1024 125 #define RTE_TEST_TX_DESC_DEFAULT 1024 126 127 /* 128 * These two thresholds were decided on by running the training algorithm on 129 * a 2.5GHz Xeon. 
These defaults can be overridden by supplying non-zero values 130 * for the med_threshold and high_threshold parameters on the command line. 131 */ 132 #define EMPTY_POLL_MED_THRESHOLD 350000UL 133 #define EMPTY_POLL_HGH_THRESHOLD 580000UL 134 135 #define NUM_TELSTATS RTE_DIM(telstats_strings) 136 137 static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT; 138 static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT; 139 140 /* ethernet addresses of ports */ 141 static struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS]; 142 143 /* ethernet addresses of ports */ 144 static rte_spinlock_t locks[RTE_MAX_ETHPORTS]; 145 146 /* mask of enabled ports */ 147 static uint32_t enabled_port_mask = 0; 148 /* Ports set in promiscuous mode off by default. */ 149 static int promiscuous_on = 0; 150 /* NUMA is enabled by default. */ 151 static int numa_on = 1; 152 static bool empty_poll_stop; 153 static bool empty_poll_train; 154 volatile bool quit_signal; 155 static struct ep_params *ep_params; 156 static struct ep_policy policy; 157 static long ep_med_edpi, ep_hgh_edpi; 158 /* timer to update telemetry every 500ms */ 159 static struct rte_timer telemetry_timer; 160 161 /* stats index returned by metrics lib */ 162 int telstats_index; 163 164 struct telstats_name { 165 char name[RTE_ETH_XSTATS_NAME_SIZE]; 166 }; 167 168 /* telemetry stats to be reported */ 169 const struct telstats_name telstats_strings[] = { 170 {"empty_poll"}, 171 {"full_poll"}, 172 {"busy_percent"} 173 }; 174 175 /* core busyness in percentage */ 176 enum busy_rate { 177 ZERO = 0, 178 PARTIAL = 50, 179 FULL = 100 180 }; 181 182 /* reference poll count to measure core busyness */ 183 #define DEFAULT_COUNT 10000 184 /* 185 * reference CYCLES to be used to 186 * measure core busyness based on poll count 187 */ 188 #define MIN_CYCLES 1500000ULL 189 #define MAX_CYCLES 22000000ULL 190 191 /* (500ms) */ 192 #define TELEMETRY_INTERVALS_PER_SEC 2 193 194 static int parse_ptype; /**< Parse packet type using rx callback, and */ 195 /**< disabled by default */ 196 197 enum appmode { 198 APP_MODE_DEFAULT = 0, 199 APP_MODE_LEGACY, 200 APP_MODE_EMPTY_POLL, 201 APP_MODE_TELEMETRY, 202 APP_MODE_INTERRUPT, 203 APP_MODE_PMD_MGMT 204 }; 205 206 enum appmode app_mode; 207 208 static enum rte_power_pmd_mgmt_type pmgmt_type; 209 bool baseline_enabled; 210 211 enum freq_scale_hint_t 212 { 213 FREQ_LOWER = -1, 214 FREQ_CURRENT = 0, 215 FREQ_HIGHER = 1, 216 FREQ_HIGHEST = 2 217 }; 218 219 struct lcore_rx_queue { 220 uint16_t port_id; 221 uint8_t queue_id; 222 enum freq_scale_hint_t freq_up_hint; 223 uint32_t zero_rx_packet_count; 224 uint32_t idle_hint; 225 } __rte_cache_aligned; 226 227 #define MAX_RX_QUEUE_PER_LCORE 16 228 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS 229 #define MAX_RX_QUEUE_PER_PORT 128 230 231 #define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16 232 233 234 struct lcore_params lcore_params_array[MAX_LCORE_PARAMS]; 235 static struct lcore_params lcore_params_array_default[] = { 236 {0, 0, 2}, 237 {0, 1, 2}, 238 {0, 2, 2}, 239 {1, 0, 2}, 240 {1, 1, 2}, 241 {1, 2, 2}, 242 {2, 0, 2}, 243 {3, 0, 3}, 244 {3, 1, 3}, 245 }; 246 247 struct lcore_params *lcore_params = lcore_params_array_default; 248 uint16_t nb_lcore_params = RTE_DIM(lcore_params_array_default); 249 250 static struct rte_eth_conf port_conf = { 251 .rxmode = { 252 .mq_mode = RTE_ETH_MQ_RX_RSS, 253 .split_hdr_size = 0, 254 .offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM, 255 }, 256 .rx_adv_conf = { 257 .rss_conf = { 258 .rss_key = NULL, 259 .rss_hf = RTE_ETH_RSS_UDP, 260 }, 261 }, 262 .txmode = { 263 .mq_mode 
= RTE_ETH_MQ_TX_NONE, 264 } 265 }; 266 267 static uint32_t max_pkt_len; 268 269 static struct rte_mempool * pktmbuf_pool[NB_SOCKETS]; 270 271 272 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 273 274 #ifdef RTE_ARCH_X86 275 #include <rte_hash_crc.h> 276 #define DEFAULT_HASH_FUNC rte_hash_crc 277 #else 278 #include <rte_jhash.h> 279 #define DEFAULT_HASH_FUNC rte_jhash 280 #endif 281 282 struct ipv4_5tuple { 283 uint32_t ip_dst; 284 uint32_t ip_src; 285 uint16_t port_dst; 286 uint16_t port_src; 287 uint8_t proto; 288 } __rte_packed; 289 290 struct ipv6_5tuple { 291 uint8_t ip_dst[IPV6_ADDR_LEN]; 292 uint8_t ip_src[IPV6_ADDR_LEN]; 293 uint16_t port_dst; 294 uint16_t port_src; 295 uint8_t proto; 296 } __rte_packed; 297 298 struct ipv4_l3fwd_route { 299 struct ipv4_5tuple key; 300 uint8_t if_out; 301 }; 302 303 struct ipv6_l3fwd_route { 304 struct ipv6_5tuple key; 305 uint8_t if_out; 306 }; 307 308 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { 309 {{RTE_IPV4(100,10,0,1), RTE_IPV4(200,10,0,1), 101, 11, IPPROTO_TCP}, 0}, 310 {{RTE_IPV4(100,20,0,2), RTE_IPV4(200,20,0,2), 102, 12, IPPROTO_TCP}, 1}, 311 {{RTE_IPV4(100,30,0,3), RTE_IPV4(200,30,0,3), 103, 13, IPPROTO_TCP}, 2}, 312 {{RTE_IPV4(100,40,0,4), RTE_IPV4(200,40,0,4), 104, 14, IPPROTO_TCP}, 3}, 313 }; 314 315 static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = { 316 { 317 { 318 {0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 319 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05}, 320 {0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 321 0x02, 0x1e, 0x67, 0xff, 0xfe, 0x0d, 0xb6, 0x0a}, 322 1, 10, IPPROTO_UDP 323 }, 4 324 }, 325 }; 326 327 typedef struct rte_hash lookup_struct_t; 328 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; 329 static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS]; 330 331 #define L3FWD_HASH_ENTRIES 1024 332 333 static uint16_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; 334 static uint16_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned; 335 #endif 336 337 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 338 struct ipv4_l3fwd_route { 339 uint32_t ip; 340 uint8_t depth; 341 uint8_t if_out; 342 }; 343 344 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = { 345 {RTE_IPV4(1,1,1,0), 24, 0}, 346 {RTE_IPV4(2,1,1,0), 24, 1}, 347 {RTE_IPV4(3,1,1,0), 24, 2}, 348 {RTE_IPV4(4,1,1,0), 24, 3}, 349 {RTE_IPV4(5,1,1,0), 24, 4}, 350 {RTE_IPV4(6,1,1,0), 24, 5}, 351 {RTE_IPV4(7,1,1,0), 24, 6}, 352 {RTE_IPV4(8,1,1,0), 24, 7}, 353 }; 354 355 #define IPV4_L3FWD_LPM_MAX_RULES 1024 356 357 typedef struct rte_lpm lookup_struct_t; 358 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS]; 359 #endif 360 361 struct lcore_conf { 362 uint16_t n_rx_queue; 363 struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; 364 uint16_t n_tx_port; 365 uint16_t tx_port_id[RTE_MAX_ETHPORTS]; 366 uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; 367 struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_ETHPORTS]; 368 lookup_struct_t * ipv4_lookup_struct; 369 lookup_struct_t * ipv6_lookup_struct; 370 } __rte_cache_aligned; 371 372 struct lcore_stats { 373 /* total sleep time in ms since last frequency scaling down */ 374 uint32_t sleep_time; 375 /* number of long sleep recently */ 376 uint32_t nb_long_sleep; 377 /* freq. 
scaling up trend */
	uint32_t trend;
	/* total packets processed recently */
	uint64_t nb_rx_processed;
	/* total iterations looped recently */
	uint64_t nb_iteration_looped;
	/*
	 * Represents empty and non empty polls
	 * of rte_eth_rx_burst();
	 * ep_nep[0] holds non empty polls
	 * i.e. 0 < nb_rx <= MAX_PKT_BURST
	 * ep_nep[1] holds empty polls.
	 * i.e. nb_rx == 0
	 */
	uint64_t ep_nep[2];
	/*
	 * Represents full and empty+partial
	 * polls of rte_eth_rx_burst();
	 * fp_nfp[0] holds empty+partial polls.
	 * i.e. 0 <= nb_rx < MAX_PKT_BURST
	 * fp_nfp[1] holds full polls
	 * i.e. nb_rx == MAX_PKT_BURST
	 */
	uint64_t fp_nfp[2];
	enum busy_rate br;
	rte_spinlock_t telemetry_lock;
} __rte_cache_aligned;

static struct lcore_conf lcore_conf[RTE_MAX_LCORE] __rte_cache_aligned;
static struct lcore_stats stats[RTE_MAX_LCORE] __rte_cache_aligned;
static struct rte_timer power_timers[RTE_MAX_LCORE];

static inline uint32_t power_idle_heuristic(uint32_t zero_rx_packet_count);
static inline enum freq_scale_hint_t power_freq_scaleup_heuristic(
		unsigned int lcore_id, uint16_t port_id, uint16_t queue_id);


/*
 * These defaults are using the max frequency index (1), a medium index (9)
 * and a typical low frequency index (14). These can be adjusted to use
 * different indexes using the relevant command line parameters.
 */
static uint8_t freq_tlb[] = {14, 9, 1};

static int is_done(void)
{
	return quit_signal;
}

/* exit signal handler */
static void
signal_exit_now(int sigtype)
{
	if (sigtype == SIGINT)
		quit_signal = true;
}

/* Frequency scale down timer callback */
static void
power_timer_cb(__rte_unused struct rte_timer *tim,
			  __rte_unused void *arg)
{
	uint64_t hz;
	float sleep_time_ratio;
	unsigned lcore_id = rte_lcore_id();

	/* accumulate total execution time in us when callback is invoked */
	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
			(float)SCALING_PERIOD;
	/*
	 * Scale down the frequency by one step if the lcore slept for a
	 * large share of the last scaling period.
	 */
	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
		if (rte_power_freq_down)
			rte_power_freq_down(lcore_id);
	} else if ((unsigned)(stats[lcore_id].nb_rx_processed /
		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
		/*
		 * Scale down by one step if the average number of packets
		 * per iteration is less than expected.
		 */
		if (rte_power_freq_down)
			rte_power_freq_down(lcore_id);
	}

	/*
	 * Re-arm the timer according to the current frequency so that the
	 * timer interval stays roughly fixed.
	 */
	hz = rte_get_timer_hz();
	rte_timer_reset(&power_timers[lcore_id], hz/TIMER_NUMBER_PER_SECOND,
				SINGLE, lcore_id, power_timer_cb, NULL);

	stats[lcore_id].nb_rx_processed = 0;
	stats[lcore_id].nb_iteration_looped = 0;

	stats[lcore_id].sleep_time = 0;
}
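/*
 * Note: a minimal sketch (not part of this listing) of how a per-lcore
 * scale-down timer like the one above is typically armed before the worker
 * enters its main loop; the callback then re-arms itself with
 * rte_timer_reset() in SINGLE mode, giving each lcore a roughly fixed
 * 100 ms sampling period:
 *
 *	rte_timer_subsystem_init();
 *	rte_timer_init(&power_timers[lcore_id]);
 *	rte_timer_reset(&power_timers[lcore_id],
 *			rte_get_timer_hz() / TIMER_NUMBER_PER_SECOND,
 *			SINGLE, lcore_id, power_timer_cb, NULL);
 *
 * rte_timer_manage() must be called periodically on the same lcore (the
 * legacy loop below does this) for the callback to actually fire.
 */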
/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint16_t port)
{
	uint32_t lcore_id;
	struct lcore_conf *qconf;

	lcore_id = rte_lcore_id();
	qconf = &lcore_conf[lcore_id];

	rte_eth_tx_buffer(port, qconf->tx_queue_id[port],
			qconf->tx_buffer[port], m);

	return 0;
}

#ifdef DO_RFC_1812_CHECKS
static inline int
is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len)
{
	/* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */
	/*
	 * 1. The packet length reported by the Link Layer must be large
	 * enough to hold the minimum length legal IP datagram (20 bytes).
	 */
	if (link_len < sizeof(struct rte_ipv4_hdr))
		return -1;

	/* 2. The IP checksum must be correct. */
	/* this is checked in H/W */

	/*
	 * 3. The IP version number must be 4. If the version number is not 4
	 * then the packet may be another version of IP, such as IPng or
	 * ST-II.
	 */
	if (((pkt->version_ihl) >> 4) != 4)
		return -3;
	/*
	 * 4. The IP header length field must be large enough to hold the
	 * minimum length legal IP datagram (20 bytes = 5 words).
	 */
	if ((pkt->version_ihl & 0xf) < 5)
		return -4;

	/*
	 * 5. The IP total length field must be large enough to hold the IP
	 * datagram header, whose length is specified in the IP header length
	 * field.
	 */
	if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct rte_ipv4_hdr))
		return -5;

	return 0;
}
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
static void
print_ipv4_key(struct ipv4_5tuple key)
{
	printf("IP dst = %08x, IP src = %08x, port dst = %d, port src = %d, "
		"proto = %d\n", (unsigned)key.ip_dst, (unsigned)key.ip_src,
				key.port_dst, key.port_src, key.proto);
}

static void
print_ipv6_key(struct ipv6_5tuple key)
{
	printf("IP dst = " IPv6_BYTES_FMT ", IP src = " IPv6_BYTES_FMT ", "
		"port dst = %d, port src = %d, proto = %d\n",
		IPv6_BYTES(key.ip_dst), IPv6_BYTES(key.ip_src),
		key.port_dst, key.port_src, key.proto);
}

static inline uint16_t
get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid,
		lookup_struct_t *ipv4_l3fwd_lookup_struct)
{
	struct ipv4_5tuple key;
	struct rte_tcp_hdr *tcp;
	struct rte_udp_hdr *udp;
	int ret = 0;

	key.ip_dst = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
	key.ip_src = rte_be_to_cpu_32(ipv4_hdr->src_addr);
	key.proto = ipv4_hdr->next_proto_id;

	switch (ipv4_hdr->next_proto_id) {
	case IPPROTO_TCP:
		tcp = (struct rte_tcp_hdr *)((unsigned char *)ipv4_hdr +
					sizeof(struct rte_ipv4_hdr));
		key.port_dst = rte_be_to_cpu_16(tcp->dst_port);
		key.port_src = rte_be_to_cpu_16(tcp->src_port);
		break;

	case IPPROTO_UDP:
		udp = (struct rte_udp_hdr *)((unsigned char *)ipv4_hdr +
					sizeof(struct rte_ipv4_hdr));
		key.port_dst = rte_be_to_cpu_16(udp->dst_port);
		key.port_src = rte_be_to_cpu_16(udp->src_port);
		break;

	default:
		key.port_dst = 0;
		key.port_src = 0;
		break;
	}

	/* Find destination port */
	ret
= rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key); 589 return ((ret < 0) ? portid : ipv4_l3fwd_out_if[ret]); 590 } 591 592 static inline uint16_t 593 get_ipv6_dst_port(struct rte_ipv6_hdr *ipv6_hdr, uint16_t portid, 594 lookup_struct_t *ipv6_l3fwd_lookup_struct) 595 { 596 struct ipv6_5tuple key; 597 struct rte_tcp_hdr *tcp; 598 struct rte_udp_hdr *udp; 599 int ret = 0; 600 601 memcpy(key.ip_dst, ipv6_hdr->dst_addr, IPV6_ADDR_LEN); 602 memcpy(key.ip_src, ipv6_hdr->src_addr, IPV6_ADDR_LEN); 603 604 key.proto = ipv6_hdr->proto; 605 606 switch (ipv6_hdr->proto) { 607 case IPPROTO_TCP: 608 tcp = (struct rte_tcp_hdr *)((unsigned char *) ipv6_hdr + 609 sizeof(struct rte_ipv6_hdr)); 610 key.port_dst = rte_be_to_cpu_16(tcp->dst_port); 611 key.port_src = rte_be_to_cpu_16(tcp->src_port); 612 break; 613 614 case IPPROTO_UDP: 615 udp = (struct rte_udp_hdr *)((unsigned char *) ipv6_hdr + 616 sizeof(struct rte_ipv6_hdr)); 617 key.port_dst = rte_be_to_cpu_16(udp->dst_port); 618 key.port_src = rte_be_to_cpu_16(udp->src_port); 619 break; 620 621 default: 622 key.port_dst = 0; 623 key.port_src = 0; 624 break; 625 } 626 627 /* Find destination port */ 628 ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key); 629 return ((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]); 630 } 631 #endif 632 633 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 634 static inline uint16_t 635 get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid, 636 lookup_struct_t *ipv4_l3fwd_lookup_struct) 637 { 638 uint32_t next_hop; 639 640 return ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct, 641 rte_be_to_cpu_32(ipv4_hdr->dst_addr), &next_hop) == 0)? 642 next_hop : portid); 643 } 644 #endif 645 646 static inline void 647 parse_ptype_one(struct rte_mbuf *m) 648 { 649 struct rte_ether_hdr *eth_hdr; 650 uint32_t packet_type = RTE_PTYPE_UNKNOWN; 651 uint16_t ether_type; 652 653 eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 654 ether_type = eth_hdr->ether_type; 655 if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) 656 packet_type |= RTE_PTYPE_L3_IPV4_EXT_UNKNOWN; 657 else if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6)) 658 packet_type |= RTE_PTYPE_L3_IPV6_EXT_UNKNOWN; 659 660 m->packet_type = packet_type; 661 } 662 663 static uint16_t 664 cb_parse_ptype(uint16_t port __rte_unused, uint16_t queue __rte_unused, 665 struct rte_mbuf *pkts[], uint16_t nb_pkts, 666 uint16_t max_pkts __rte_unused, 667 void *user_param __rte_unused) 668 { 669 unsigned int i; 670 671 for (i = 0; i < nb_pkts; ++i) 672 parse_ptype_one(pkts[i]); 673 674 return nb_pkts; 675 } 676 677 static int 678 add_cb_parse_ptype(uint16_t portid, uint16_t queueid) 679 { 680 printf("Port %d: softly parse packet type info\n", portid); 681 if (rte_eth_add_rx_callback(portid, queueid, cb_parse_ptype, NULL)) 682 return 0; 683 684 printf("Failed to add rx callback: port=%d\n", portid); 685 return -1; 686 } 687 688 static inline void 689 l3fwd_simple_forward(struct rte_mbuf *m, uint16_t portid, 690 struct lcore_conf *qconf) 691 { 692 struct rte_ether_hdr *eth_hdr; 693 struct rte_ipv4_hdr *ipv4_hdr; 694 void *d_addr_bytes; 695 uint16_t dst_port; 696 697 eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 698 699 if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) { 700 /* Handle IPv4 headers.*/ 701 ipv4_hdr = 702 rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *, 703 sizeof(struct rte_ether_hdr)); 704 705 #ifdef DO_RFC_1812_CHECKS 706 /* Check to make sure the packet is valid (RFC1812) */ 707 if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) { 708 
			rte_pktmbuf_free(m);
			return;
		}
#endif

		dst_port = get_ipv4_dst_port(ipv4_hdr, portid,
					qconf->ipv4_lookup_struct);
		if (dst_port >= RTE_MAX_ETHPORTS ||
				(enabled_port_mask & 1 << dst_port) == 0)
			dst_port = portid;

		/* 02:00:00:00:00:xx */
		d_addr_bytes = &eth_hdr->dst_addr.addr_bytes[0];
		*((uint64_t *)d_addr_bytes) =
			0x000000000002 + ((uint64_t)dst_port << 40);

#ifdef DO_RFC_1812_CHECKS
		/* Update time to live and header checksum */
		--(ipv4_hdr->time_to_live);
		++(ipv4_hdr->hdr_checksum);
#endif

		/* src addr */
		rte_ether_addr_copy(&ports_eth_addr[dst_port],
				&eth_hdr->src_addr);

		send_single_packet(m, dst_port);
	} else if (RTE_ETH_IS_IPV6_HDR(m->packet_type)) {
		/* Handle IPv6 headers.*/
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
		struct rte_ipv6_hdr *ipv6_hdr;

		ipv6_hdr =
			rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
						sizeof(struct rte_ether_hdr));

		dst_port = get_ipv6_dst_port(ipv6_hdr, portid,
					qconf->ipv6_lookup_struct);

		if (dst_port >= RTE_MAX_ETHPORTS ||
				(enabled_port_mask & 1 << dst_port) == 0)
			dst_port = portid;

		/* 02:00:00:00:00:xx */
		d_addr_bytes = &eth_hdr->dst_addr.addr_bytes[0];
		*((uint64_t *)d_addr_bytes) =
			0x000000000002 + ((uint64_t)dst_port << 40);

		/* src addr */
		rte_ether_addr_copy(&ports_eth_addr[dst_port],
				&eth_hdr->src_addr);

		send_single_packet(m, dst_port);
#else
		/* We don't currently handle IPv6 packets in LPM mode. */
		rte_pktmbuf_free(m);
#endif
	} else
		rte_pktmbuf_free(m);

}

#define MINIMUM_SLEEP_TIME         1
#define SUSPEND_THRESHOLD          300

static inline uint32_t
power_idle_heuristic(uint32_t zero_rx_packet_count)
{
	/* If the zero poll count is below the threshold, sleep for 1 us */
	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
		return MINIMUM_SLEEP_TIME;
	/*
	 * Otherwise sleep for SUSPEND_THRESHOLD us, long enough to cover
	 * the latency of switching from C3/C6 back to C0.
	 */
	else
		return SUSPEND_THRESHOLD;
}

static inline enum freq_scale_hint_t
power_freq_scaleup_heuristic(unsigned lcore_id,
			     uint16_t port_id,
			     uint16_t queue_id)
{
	uint32_t rxq_count = rte_eth_rx_queue_count(port_id, queue_id);
/**
 * HW Rx queue size is 128 by default, Rx burst read at maximum 32 entries
 * per iteration
 */
#define FREQ_GEAR1_RX_PACKET_THRESHOLD             MAX_PKT_BURST
#define FREQ_GEAR2_RX_PACKET_THRESHOLD             (MAX_PKT_BURST*2)
#define FREQ_GEAR3_RX_PACKET_THRESHOLD             (MAX_PKT_BURST*3)
#define FREQ_UP_TREND1_ACC   1
#define FREQ_UP_TREND2_ACC   100
#define FREQ_UP_THRESHOLD    10000

	if (likely(rxq_count > FREQ_GEAR3_RX_PACKET_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHEST;
	} else if (likely(rxq_count > FREQ_GEAR2_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND2_ACC;
	else if (likely(rxq_count > FREQ_GEAR1_RX_PACKET_THRESHOLD))
		stats[lcore_id].trend += FREQ_UP_TREND1_ACC;

	if (likely(stats[lcore_id].trend > FREQ_UP_THRESHOLD)) {
		stats[lcore_id].trend = 0;
		return FREQ_HIGHER;
	}

	return FREQ_CURRENT;
}
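/*
 * Worked example of the scale-up heuristic above (illustrative only, using
 * the defaults defined in this file): with MAX_PKT_BURST == 32 the gear
 * thresholds are 32, 64 and 96 queued descriptors. A backlog above 96
 * requests FREQ_HIGHEST immediately; a backlog in (64, 96] adds 100 to the
 * trend, so just over 100 consecutive such polls exceed FREQ_UP_THRESHOLD
 * (10000) and request FREQ_HIGHER; a backlog in (32, 64] adds only 1, so
 * about 10000 such polls are needed before stepping up.
 */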
/**
 * Force polling thread sleep until one-shot rx interrupt triggers.
 * @param num
 *  Number of Rx queues polled by this lcore (maximum events to wait for).
 * @param lcore
 *  Lcore id.
 * @return
 *  0 on success
 */
static int
sleep_until_rx_interrupt(int num, int lcore)
{
	/*
	 * we want to track when we are woken up by traffic so that we can go
	 * back to sleep again without log spamming. Avoid cache line sharing
	 * to prevent threads stepping on each others' toes.
	 */
	static struct {
		bool wakeup;
	} __rte_cache_aligned status[RTE_MAX_LCORE];
	struct rte_epoll_event event[num];
	int n, i;
	uint16_t port_id;
	uint8_t queue_id;
	void *data;

	if (status[lcore].wakeup) {
		RTE_LOG(INFO, L3FWD_POWER,
				"lcore %u sleeps until interrupt triggers\n",
				rte_lcore_id());
	}

	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, 10);
	for (i = 0; i < n; i++) {
		data = event[i].epdata.data;
		port_id = ((uintptr_t)data) >> CHAR_BIT;
		queue_id = ((uintptr_t)data) &
			RTE_LEN2MASK(CHAR_BIT, uint8_t);
		RTE_LOG(INFO, L3FWD_POWER,
			"lcore %u is woken up from rx interrupt on"
			" port %d queue %d\n",
			rte_lcore_id(), port_id, queue_id);
	}
	status[lcore].wakeup = n != 0;

	return 0;
}

static void turn_on_off_intr(struct lcore_conf *qconf, bool on)
{
	int i;
	struct lcore_rx_queue *rx_queue;
	uint8_t queue_id;
	uint16_t port_id;

	for (i = 0; i < qconf->n_rx_queue; ++i) {
		rx_queue = &(qconf->rx_queue_list[i]);
		port_id = rx_queue->port_id;
		queue_id = rx_queue->queue_id;

		rte_spinlock_lock(&(locks[port_id]));
		if (on)
			rte_eth_dev_rx_intr_enable(port_id, queue_id);
		else
			rte_eth_dev_rx_intr_disable(port_id, queue_id);
		rte_spinlock_unlock(&(locks[port_id]));
	}
}

static int event_register(struct lcore_conf *qconf)
{
	struct lcore_rx_queue *rx_queue;
	uint8_t queueid;
	uint16_t portid;
	uint32_t data;
	int ret;
	int i;

	for (i = 0; i < qconf->n_rx_queue; ++i) {
		rx_queue = &(qconf->rx_queue_list[i]);
		portid = rx_queue->port_id;
		queueid = rx_queue->queue_id;
		data = portid << CHAR_BIT | queueid;

		ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
						RTE_EPOLL_PER_THREAD,
						RTE_INTR_EVENT_ADD,
						(void *)((uintptr_t)data));
		if (ret)
			return ret;
	}

	return 0;
}

/* Main processing loop.
8< */ 915 static int main_intr_loop(__rte_unused void *dummy) 916 { 917 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 918 unsigned int lcore_id; 919 uint64_t prev_tsc, diff_tsc, cur_tsc; 920 int i, j, nb_rx; 921 uint8_t queueid; 922 uint16_t portid; 923 struct lcore_conf *qconf; 924 struct lcore_rx_queue *rx_queue; 925 uint32_t lcore_rx_idle_count = 0; 926 uint32_t lcore_idle_hint = 0; 927 int intr_en = 0; 928 929 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 930 US_PER_S * BURST_TX_DRAIN_US; 931 932 prev_tsc = 0; 933 934 lcore_id = rte_lcore_id(); 935 qconf = &lcore_conf[lcore_id]; 936 937 if (qconf->n_rx_queue == 0) { 938 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 939 lcore_id); 940 return 0; 941 } 942 943 RTE_LOG(INFO, L3FWD_POWER, "entering main interrupt loop on lcore %u\n", 944 lcore_id); 945 946 for (i = 0; i < qconf->n_rx_queue; i++) { 947 portid = qconf->rx_queue_list[i].port_id; 948 queueid = qconf->rx_queue_list[i].queue_id; 949 RTE_LOG(INFO, L3FWD_POWER, 950 " -- lcoreid=%u portid=%u rxqueueid=%hhu\n", 951 lcore_id, portid, queueid); 952 } 953 954 /* add into event wait list */ 955 if (event_register(qconf) == 0) 956 intr_en = 1; 957 else 958 RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n"); 959 960 while (!is_done()) { 961 stats[lcore_id].nb_iteration_looped++; 962 963 cur_tsc = rte_rdtsc(); 964 965 /* 966 * TX burst queue drain 967 */ 968 diff_tsc = cur_tsc - prev_tsc; 969 if (unlikely(diff_tsc > drain_tsc)) { 970 for (i = 0; i < qconf->n_tx_port; ++i) { 971 portid = qconf->tx_port_id[i]; 972 rte_eth_tx_buffer_flush(portid, 973 qconf->tx_queue_id[portid], 974 qconf->tx_buffer[portid]); 975 } 976 prev_tsc = cur_tsc; 977 } 978 979 start_rx: 980 /* 981 * Read packet from RX queues 982 */ 983 lcore_rx_idle_count = 0; 984 for (i = 0; i < qconf->n_rx_queue; ++i) { 985 rx_queue = &(qconf->rx_queue_list[i]); 986 rx_queue->idle_hint = 0; 987 portid = rx_queue->port_id; 988 queueid = rx_queue->queue_id; 989 990 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 991 MAX_PKT_BURST); 992 993 stats[lcore_id].nb_rx_processed += nb_rx; 994 if (unlikely(nb_rx == 0)) { 995 /** 996 * no packet received from rx queue, try to 997 * sleep for a while forcing CPU enter deeper 998 * C states. 999 */ 1000 rx_queue->zero_rx_packet_count++; 1001 1002 if (rx_queue->zero_rx_packet_count <= 1003 MIN_ZERO_POLL_COUNT) 1004 continue; 1005 1006 rx_queue->idle_hint = power_idle_heuristic( 1007 rx_queue->zero_rx_packet_count); 1008 lcore_rx_idle_count++; 1009 } else { 1010 rx_queue->zero_rx_packet_count = 0; 1011 } 1012 1013 /* Prefetch first packets */ 1014 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1015 rte_prefetch0(rte_pktmbuf_mtod( 1016 pkts_burst[j], void *)); 1017 } 1018 1019 /* Prefetch and forward already prefetched packets */ 1020 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1021 rte_prefetch0(rte_pktmbuf_mtod( 1022 pkts_burst[j + PREFETCH_OFFSET], 1023 void *)); 1024 l3fwd_simple_forward( 1025 pkts_burst[j], portid, qconf); 1026 } 1027 1028 /* Forward remaining prefetched packets */ 1029 for (; j < nb_rx; j++) { 1030 l3fwd_simple_forward( 1031 pkts_burst[j], portid, qconf); 1032 } 1033 } 1034 1035 if (unlikely(lcore_rx_idle_count == qconf->n_rx_queue)) { 1036 /** 1037 * All Rx queues empty in recent consecutive polls, 1038 * sleep in a conservative manner, meaning sleep as 1039 * less as possible. 
1040 */ 1041 for (i = 1, 1042 lcore_idle_hint = qconf->rx_queue_list[0].idle_hint; 1043 i < qconf->n_rx_queue; ++i) { 1044 rx_queue = &(qconf->rx_queue_list[i]); 1045 if (rx_queue->idle_hint < lcore_idle_hint) 1046 lcore_idle_hint = rx_queue->idle_hint; 1047 } 1048 1049 if (lcore_idle_hint < SUSPEND_THRESHOLD) 1050 /** 1051 * execute "pause" instruction to avoid context 1052 * switch which generally take hundred of 1053 * microseconds for short sleep. 1054 */ 1055 rte_delay_us(lcore_idle_hint); 1056 else { 1057 /* suspend until rx interrupt triggers */ 1058 if (intr_en) { 1059 turn_on_off_intr(qconf, 1); 1060 sleep_until_rx_interrupt( 1061 qconf->n_rx_queue, 1062 lcore_id); 1063 turn_on_off_intr(qconf, 0); 1064 /** 1065 * start receiving packets immediately 1066 */ 1067 if (likely(!is_done())) 1068 goto start_rx; 1069 } 1070 } 1071 stats[lcore_id].sleep_time += lcore_idle_hint; 1072 } 1073 } 1074 1075 return 0; 1076 } 1077 /* >8 End of main processing loop. */ 1078 1079 /* main processing loop */ 1080 static int 1081 main_telemetry_loop(__rte_unused void *dummy) 1082 { 1083 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1084 unsigned int lcore_id; 1085 uint64_t prev_tsc, diff_tsc, cur_tsc, prev_tel_tsc; 1086 int i, j, nb_rx; 1087 uint8_t queueid; 1088 uint16_t portid; 1089 struct lcore_conf *qconf; 1090 struct lcore_rx_queue *rx_queue; 1091 uint64_t ep_nep[2] = {0}, fp_nfp[2] = {0}; 1092 uint64_t poll_count; 1093 enum busy_rate br; 1094 1095 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 1096 US_PER_S * BURST_TX_DRAIN_US; 1097 1098 poll_count = 0; 1099 prev_tsc = 0; 1100 prev_tel_tsc = 0; 1101 1102 lcore_id = rte_lcore_id(); 1103 qconf = &lcore_conf[lcore_id]; 1104 1105 if (qconf->n_rx_queue == 0) { 1106 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 1107 lcore_id); 1108 return 0; 1109 } 1110 1111 RTE_LOG(INFO, L3FWD_POWER, "entering main telemetry loop on lcore %u\n", 1112 lcore_id); 1113 1114 for (i = 0; i < qconf->n_rx_queue; i++) { 1115 portid = qconf->rx_queue_list[i].port_id; 1116 queueid = qconf->rx_queue_list[i].queue_id; 1117 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1118 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1119 } 1120 1121 while (!is_done()) { 1122 1123 cur_tsc = rte_rdtsc(); 1124 /* 1125 * TX burst queue drain 1126 */ 1127 diff_tsc = cur_tsc - prev_tsc; 1128 if (unlikely(diff_tsc > drain_tsc)) { 1129 for (i = 0; i < qconf->n_tx_port; ++i) { 1130 portid = qconf->tx_port_id[i]; 1131 rte_eth_tx_buffer_flush(portid, 1132 qconf->tx_queue_id[portid], 1133 qconf->tx_buffer[portid]); 1134 } 1135 prev_tsc = cur_tsc; 1136 } 1137 1138 /* 1139 * Read packet from RX queues 1140 */ 1141 for (i = 0; i < qconf->n_rx_queue; ++i) { 1142 rx_queue = &(qconf->rx_queue_list[i]); 1143 portid = rx_queue->port_id; 1144 queueid = rx_queue->queue_id; 1145 1146 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1147 MAX_PKT_BURST); 1148 ep_nep[nb_rx == 0]++; 1149 fp_nfp[nb_rx == MAX_PKT_BURST]++; 1150 poll_count++; 1151 if (unlikely(nb_rx == 0)) 1152 continue; 1153 1154 /* Prefetch first packets */ 1155 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1156 rte_prefetch0(rte_pktmbuf_mtod( 1157 pkts_burst[j], void *)); 1158 } 1159 1160 /* Prefetch and forward already prefetched packets */ 1161 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1162 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1163 j + PREFETCH_OFFSET], void *)); 1164 l3fwd_simple_forward(pkts_burst[j], portid, 1165 qconf); 1166 } 1167 1168 /* Forward remaining prefetched packets */ 1169 
for (; j < nb_rx; j++) { 1170 l3fwd_simple_forward(pkts_burst[j], portid, 1171 qconf); 1172 } 1173 } 1174 if (unlikely(poll_count >= DEFAULT_COUNT)) { 1175 diff_tsc = cur_tsc - prev_tel_tsc; 1176 if (diff_tsc >= MAX_CYCLES) { 1177 br = FULL; 1178 } else if (diff_tsc > MIN_CYCLES && 1179 diff_tsc < MAX_CYCLES) { 1180 br = (diff_tsc * 100) / MAX_CYCLES; 1181 } else { 1182 br = ZERO; 1183 } 1184 poll_count = 0; 1185 prev_tel_tsc = cur_tsc; 1186 /* update stats for telemetry */ 1187 rte_spinlock_lock(&stats[lcore_id].telemetry_lock); 1188 stats[lcore_id].ep_nep[0] = ep_nep[0]; 1189 stats[lcore_id].ep_nep[1] = ep_nep[1]; 1190 stats[lcore_id].fp_nfp[0] = fp_nfp[0]; 1191 stats[lcore_id].fp_nfp[1] = fp_nfp[1]; 1192 stats[lcore_id].br = br; 1193 rte_spinlock_unlock(&stats[lcore_id].telemetry_lock); 1194 } 1195 } 1196 1197 return 0; 1198 } 1199 /* main processing loop */ 1200 static int 1201 main_empty_poll_loop(__rte_unused void *dummy) 1202 { 1203 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1204 unsigned int lcore_id; 1205 uint64_t prev_tsc, diff_tsc, cur_tsc; 1206 int i, j, nb_rx; 1207 uint8_t queueid; 1208 uint16_t portid; 1209 struct lcore_conf *qconf; 1210 struct lcore_rx_queue *rx_queue; 1211 1212 const uint64_t drain_tsc = 1213 (rte_get_tsc_hz() + US_PER_S - 1) / 1214 US_PER_S * BURST_TX_DRAIN_US; 1215 1216 prev_tsc = 0; 1217 1218 lcore_id = rte_lcore_id(); 1219 qconf = &lcore_conf[lcore_id]; 1220 1221 if (qconf->n_rx_queue == 0) { 1222 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", 1223 lcore_id); 1224 return 0; 1225 } 1226 1227 for (i = 0; i < qconf->n_rx_queue; i++) { 1228 portid = qconf->rx_queue_list[i].port_id; 1229 queueid = qconf->rx_queue_list[i].queue_id; 1230 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1231 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1232 } 1233 1234 while (!is_done()) { 1235 stats[lcore_id].nb_iteration_looped++; 1236 1237 cur_tsc = rte_rdtsc(); 1238 /* 1239 * TX burst queue drain 1240 */ 1241 diff_tsc = cur_tsc - prev_tsc; 1242 if (unlikely(diff_tsc > drain_tsc)) { 1243 for (i = 0; i < qconf->n_tx_port; ++i) { 1244 portid = qconf->tx_port_id[i]; 1245 rte_eth_tx_buffer_flush(portid, 1246 qconf->tx_queue_id[portid], 1247 qconf->tx_buffer[portid]); 1248 } 1249 prev_tsc = cur_tsc; 1250 } 1251 1252 /* 1253 * Read packet from RX queues 1254 */ 1255 for (i = 0; i < qconf->n_rx_queue; ++i) { 1256 rx_queue = &(qconf->rx_queue_list[i]); 1257 rx_queue->idle_hint = 0; 1258 portid = rx_queue->port_id; 1259 queueid = rx_queue->queue_id; 1260 1261 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1262 MAX_PKT_BURST); 1263 1264 stats[lcore_id].nb_rx_processed += nb_rx; 1265 1266 if (nb_rx == 0) { 1267 1268 rte_power_empty_poll_stat_update(lcore_id); 1269 1270 continue; 1271 } else { 1272 rte_power_poll_stat_update(lcore_id, nb_rx); 1273 } 1274 1275 1276 /* Prefetch first packets */ 1277 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1278 rte_prefetch0(rte_pktmbuf_mtod( 1279 pkts_burst[j], void *)); 1280 } 1281 1282 /* Prefetch and forward already prefetched packets */ 1283 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1284 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1285 j + PREFETCH_OFFSET], 1286 void *)); 1287 l3fwd_simple_forward(pkts_burst[j], portid, 1288 qconf); 1289 } 1290 1291 /* Forward remaining prefetched packets */ 1292 for (; j < nb_rx; j++) { 1293 l3fwd_simple_forward(pkts_burst[j], portid, 1294 qconf); 1295 } 1296 1297 } 1298 1299 } 1300 1301 return 0; 1302 } 1303 /* main processing loop */ 1304 static int 1305 
main_legacy_loop(__rte_unused void *dummy) 1306 { 1307 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1308 unsigned lcore_id; 1309 uint64_t prev_tsc, diff_tsc, cur_tsc, tim_res_tsc, hz; 1310 uint64_t prev_tsc_power = 0, cur_tsc_power, diff_tsc_power; 1311 int i, j, nb_rx; 1312 uint8_t queueid; 1313 uint16_t portid; 1314 struct lcore_conf *qconf; 1315 struct lcore_rx_queue *rx_queue; 1316 enum freq_scale_hint_t lcore_scaleup_hint; 1317 uint32_t lcore_rx_idle_count = 0; 1318 uint32_t lcore_idle_hint = 0; 1319 int intr_en = 0; 1320 1321 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; 1322 1323 prev_tsc = 0; 1324 hz = rte_get_timer_hz(); 1325 tim_res_tsc = hz/TIMER_NUMBER_PER_SECOND; 1326 1327 lcore_id = rte_lcore_id(); 1328 qconf = &lcore_conf[lcore_id]; 1329 1330 if (qconf->n_rx_queue == 0) { 1331 RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", lcore_id); 1332 return 0; 1333 } 1334 1335 RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id); 1336 1337 for (i = 0; i < qconf->n_rx_queue; i++) { 1338 portid = qconf->rx_queue_list[i].port_id; 1339 queueid = qconf->rx_queue_list[i].queue_id; 1340 RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u " 1341 "rxqueueid=%hhu\n", lcore_id, portid, queueid); 1342 } 1343 1344 /* add into event wait list */ 1345 if (event_register(qconf) == 0) 1346 intr_en = 1; 1347 else 1348 RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n"); 1349 1350 while (!is_done()) { 1351 stats[lcore_id].nb_iteration_looped++; 1352 1353 cur_tsc = rte_rdtsc(); 1354 cur_tsc_power = cur_tsc; 1355 1356 /* 1357 * TX burst queue drain 1358 */ 1359 diff_tsc = cur_tsc - prev_tsc; 1360 if (unlikely(diff_tsc > drain_tsc)) { 1361 for (i = 0; i < qconf->n_tx_port; ++i) { 1362 portid = qconf->tx_port_id[i]; 1363 rte_eth_tx_buffer_flush(portid, 1364 qconf->tx_queue_id[portid], 1365 qconf->tx_buffer[portid]); 1366 } 1367 prev_tsc = cur_tsc; 1368 } 1369 1370 diff_tsc_power = cur_tsc_power - prev_tsc_power; 1371 if (diff_tsc_power > tim_res_tsc) { 1372 rte_timer_manage(); 1373 prev_tsc_power = cur_tsc_power; 1374 } 1375 1376 start_rx: 1377 /* 1378 * Read packet from RX queues 1379 */ 1380 lcore_scaleup_hint = FREQ_CURRENT; 1381 lcore_rx_idle_count = 0; 1382 for (i = 0; i < qconf->n_rx_queue; ++i) { 1383 rx_queue = &(qconf->rx_queue_list[i]); 1384 rx_queue->idle_hint = 0; 1385 portid = rx_queue->port_id; 1386 queueid = rx_queue->queue_id; 1387 1388 nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst, 1389 MAX_PKT_BURST); 1390 1391 stats[lcore_id].nb_rx_processed += nb_rx; 1392 if (unlikely(nb_rx == 0)) { 1393 /** 1394 * no packet received from rx queue, try to 1395 * sleep for a while forcing CPU enter deeper 1396 * C states. 1397 */ 1398 rx_queue->zero_rx_packet_count++; 1399 1400 if (rx_queue->zero_rx_packet_count <= 1401 MIN_ZERO_POLL_COUNT) 1402 continue; 1403 1404 rx_queue->idle_hint = power_idle_heuristic(\ 1405 rx_queue->zero_rx_packet_count); 1406 lcore_rx_idle_count++; 1407 } else { 1408 rx_queue->zero_rx_packet_count = 0; 1409 1410 /** 1411 * do not scale up frequency immediately as 1412 * user to kernel space communication is costly 1413 * which might impact packet I/O for received 1414 * packets. 
1415 */ 1416 rx_queue->freq_up_hint = 1417 power_freq_scaleup_heuristic(lcore_id, 1418 portid, queueid); 1419 } 1420 1421 /* Prefetch first packets */ 1422 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1423 rte_prefetch0(rte_pktmbuf_mtod( 1424 pkts_burst[j], void *)); 1425 } 1426 1427 /* Prefetch and forward already prefetched packets */ 1428 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1429 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1430 j + PREFETCH_OFFSET], void *)); 1431 l3fwd_simple_forward(pkts_burst[j], portid, 1432 qconf); 1433 } 1434 1435 /* Forward remaining prefetched packets */ 1436 for (; j < nb_rx; j++) { 1437 l3fwd_simple_forward(pkts_burst[j], portid, 1438 qconf); 1439 } 1440 } 1441 1442 if (likely(lcore_rx_idle_count != qconf->n_rx_queue)) { 1443 for (i = 1, lcore_scaleup_hint = 1444 qconf->rx_queue_list[0].freq_up_hint; 1445 i < qconf->n_rx_queue; ++i) { 1446 rx_queue = &(qconf->rx_queue_list[i]); 1447 if (rx_queue->freq_up_hint > 1448 lcore_scaleup_hint) 1449 lcore_scaleup_hint = 1450 rx_queue->freq_up_hint; 1451 } 1452 1453 if (lcore_scaleup_hint == FREQ_HIGHEST) { 1454 if (rte_power_freq_max) 1455 rte_power_freq_max(lcore_id); 1456 } else if (lcore_scaleup_hint == FREQ_HIGHER) { 1457 if (rte_power_freq_up) 1458 rte_power_freq_up(lcore_id); 1459 } 1460 } else { 1461 /** 1462 * All Rx queues empty in recent consecutive polls, 1463 * sleep in a conservative manner, meaning sleep as 1464 * less as possible. 1465 */ 1466 for (i = 1, lcore_idle_hint = 1467 qconf->rx_queue_list[0].idle_hint; 1468 i < qconf->n_rx_queue; ++i) { 1469 rx_queue = &(qconf->rx_queue_list[i]); 1470 if (rx_queue->idle_hint < lcore_idle_hint) 1471 lcore_idle_hint = rx_queue->idle_hint; 1472 } 1473 1474 if (lcore_idle_hint < SUSPEND_THRESHOLD) 1475 /** 1476 * execute "pause" instruction to avoid context 1477 * switch which generally take hundred of 1478 * microseconds for short sleep. 
				 */
				rte_delay_us(lcore_idle_hint);
			else {
				/* suspend until rx interrupt triggers */
				if (intr_en) {
					turn_on_off_intr(qconf, 1);
					sleep_until_rx_interrupt(
							qconf->n_rx_queue,
							lcore_id);
					turn_on_off_intr(qconf, 0);
					/* start receiving packets immediately */
					if (likely(!is_done()))
						goto start_rx;
				}
			}
			stats[lcore_id].sleep_time += lcore_idle_hint;
		}
	}

	return 0;
}

static int
check_lcore_params(void)
{
	uint8_t queue, lcore;
	uint16_t i;
	int socketid;

	for (i = 0; i < nb_lcore_params; ++i) {
		queue = lcore_params[i].queue_id;
		if (queue >= MAX_RX_QUEUE_PER_PORT) {
			printf("invalid queue number: %hhu\n", queue);
			return -1;
		}
		lcore = lcore_params[i].lcore_id;
		if (!rte_lcore_is_enabled(lcore)) {
			printf("error: lcore %hhu is not enabled in lcore "
							"mask\n", lcore);
			return -1;
		}
		socketid = rte_lcore_to_socket_id(lcore);
		if (socketid != 0 && numa_on == 0) {
			printf("warning: lcore %hhu is on socket %d with numa "
							"off\n", lcore, socketid);
		}
		if (app_mode == APP_MODE_TELEMETRY && lcore == rte_lcore_id()) {
			printf("cannot enable main core %d in config for telemetry mode\n",
				rte_lcore_id());
			return -1;
		}
	}
	return 0;
}

static int
check_port_config(void)
{
	unsigned portid;
	uint16_t i;

	for (i = 0; i < nb_lcore_params; ++i) {
		portid = lcore_params[i].port_id;
		if ((enabled_port_mask & (1 << portid)) == 0) {
			printf("port %u is not enabled in port mask\n",
								portid);
			return -1;
		}
		if (!rte_eth_dev_is_valid_port(portid)) {
			printf("port %u is not present on the board\n",
								portid);
			return -1;
		}
	}
	return 0;
}

static uint8_t
get_port_n_rx_queues(const uint16_t port)
{
	int queue = -1;
	uint16_t i;

	for (i = 0; i < nb_lcore_params; ++i) {
		if (lcore_params[i].port_id == port &&
				lcore_params[i].queue_id > queue)
			queue = lcore_params[i].queue_id;
	}
	return (uint8_t)(++queue);
}

static int
init_lcore_rx_queues(void)
{
	uint16_t i, nb_rx_queue;
	uint8_t lcore;

	for (i = 0; i < nb_lcore_params; ++i) {
		lcore = lcore_params[i].lcore_id;
		nb_rx_queue = lcore_conf[lcore].n_rx_queue;
		if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) {
			printf("error: too many queues (%u) for lcore: %u\n",
				(unsigned)nb_rx_queue + 1, (unsigned)lcore);
			return -1;
		} else {
			lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id =
				lcore_params[i].port_id;
			lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id =
				lcore_params[i].queue_id;
			lcore_conf[lcore].n_rx_queue++;
		}
	}
	return 0;
}

/* display usage */
static void
print_usage(const char *prgname)
{
	printf("%s [EAL options] -- -p PORTMASK -P"
		" [--config (port,queue,lcore)[,(port,queue,lcore]]"
		" [--high-perf-cores CORELIST"
		" [--perf-config (port,queue,hi_perf,lcore_index)[,(port,queue,hi_perf,lcore_index]]"
		" [--max-pkt-len PKTLEN]\n"
		" -p PORTMASK: hexadecimal bitmask of ports to configure\n"
		" -P: enable promiscuous mode\n"
		" --config (port,queue,lcore): rx queues configuration\n"
		" --high-perf-cores CORELIST: list of high performance cores\n"
		" --perf-config: similar to --config, cores
specified as indices" 1610 " for bins containing high or regular performance cores\n" 1611 " --no-numa: optional, disable numa awareness\n" 1612 " --max-pkt-len PKTLEN: maximum packet length in decimal (64-9600)\n" 1613 " --parse-ptype: parse packet type by software\n" 1614 " --legacy: use legacy interrupt-based scaling\n" 1615 " --empty-poll: enable empty poll detection" 1616 " follow (training_flag, high_threshold, med_threshold)\n" 1617 " --telemetry: enable telemetry mode, to update" 1618 " empty polls, full polls, and core busyness to telemetry\n" 1619 " --interrupt-only: enable interrupt-only mode\n" 1620 " --pmd-mgmt MODE: enable PMD power management mode. " 1621 "Currently supported modes: baseline, monitor, pause, scale\n", 1622 prgname); 1623 } 1624 1625 static int parse_max_pkt_len(const char *pktlen) 1626 { 1627 char *end = NULL; 1628 unsigned long len; 1629 1630 /* parse decimal string */ 1631 len = strtoul(pktlen, &end, 10); 1632 if ((pktlen[0] == '\0') || (end == NULL) || (*end != '\0')) 1633 return -1; 1634 1635 if (len == 0) 1636 return -1; 1637 1638 return len; 1639 } 1640 1641 static int 1642 parse_portmask(const char *portmask) 1643 { 1644 char *end = NULL; 1645 unsigned long pm; 1646 1647 /* parse hexadecimal string */ 1648 pm = strtoul(portmask, &end, 16); 1649 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0')) 1650 return 0; 1651 1652 return pm; 1653 } 1654 1655 static int 1656 parse_config(const char *q_arg) 1657 { 1658 char s[256]; 1659 const char *p, *p0 = q_arg; 1660 char *end; 1661 enum fieldnames { 1662 FLD_PORT = 0, 1663 FLD_QUEUE, 1664 FLD_LCORE, 1665 _NUM_FLD 1666 }; 1667 unsigned long int_fld[_NUM_FLD]; 1668 char *str_fld[_NUM_FLD]; 1669 int i; 1670 unsigned size; 1671 1672 nb_lcore_params = 0; 1673 1674 while ((p = strchr(p0,'(')) != NULL) { 1675 ++p; 1676 if((p0 = strchr(p,')')) == NULL) 1677 return -1; 1678 1679 size = p0 - p; 1680 if(size >= sizeof(s)) 1681 return -1; 1682 1683 snprintf(s, sizeof(s), "%.*s", size, p); 1684 if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') != 1685 _NUM_FLD) 1686 return -1; 1687 for (i = 0; i < _NUM_FLD; i++){ 1688 errno = 0; 1689 int_fld[i] = strtoul(str_fld[i], &end, 0); 1690 if (errno != 0 || end == str_fld[i] || int_fld[i] > 1691 255) 1692 return -1; 1693 } 1694 if (nb_lcore_params >= MAX_LCORE_PARAMS) { 1695 printf("exceeded max number of lcore params: %hu\n", 1696 nb_lcore_params); 1697 return -1; 1698 } 1699 lcore_params_array[nb_lcore_params].port_id = 1700 (uint8_t)int_fld[FLD_PORT]; 1701 lcore_params_array[nb_lcore_params].queue_id = 1702 (uint8_t)int_fld[FLD_QUEUE]; 1703 lcore_params_array[nb_lcore_params].lcore_id = 1704 (uint8_t)int_fld[FLD_LCORE]; 1705 ++nb_lcore_params; 1706 } 1707 lcore_params = lcore_params_array; 1708 1709 return 0; 1710 } 1711 1712 static int 1713 parse_pmd_mgmt_config(const char *name) 1714 { 1715 #define PMD_MGMT_MONITOR "monitor" 1716 #define PMD_MGMT_PAUSE "pause" 1717 #define PMD_MGMT_SCALE "scale" 1718 #define PMD_MGMT_BASELINE "baseline" 1719 1720 if (strncmp(PMD_MGMT_MONITOR, name, sizeof(PMD_MGMT_MONITOR)) == 0) { 1721 pmgmt_type = RTE_POWER_MGMT_TYPE_MONITOR; 1722 return 0; 1723 } 1724 1725 if (strncmp(PMD_MGMT_PAUSE, name, sizeof(PMD_MGMT_PAUSE)) == 0) { 1726 pmgmt_type = RTE_POWER_MGMT_TYPE_PAUSE; 1727 return 0; 1728 } 1729 1730 if (strncmp(PMD_MGMT_SCALE, name, sizeof(PMD_MGMT_SCALE)) == 0) { 1731 pmgmt_type = RTE_POWER_MGMT_TYPE_SCALE; 1732 return 0; 1733 } 1734 if (strncmp(PMD_MGMT_BASELINE, name, sizeof(PMD_MGMT_BASELINE)) == 0) { 1735 baseline_enabled = 
true; 1736 return 0; 1737 } 1738 /* unknown PMD power management mode */ 1739 return -1; 1740 } 1741 1742 static int 1743 parse_ep_config(const char *q_arg) 1744 { 1745 char s[256]; 1746 const char *p = q_arg; 1747 char *end; 1748 int num_arg; 1749 1750 char *str_fld[3]; 1751 1752 int training_flag; 1753 int med_edpi; 1754 int hgh_edpi; 1755 1756 ep_med_edpi = EMPTY_POLL_MED_THRESHOLD; 1757 ep_hgh_edpi = EMPTY_POLL_HGH_THRESHOLD; 1758 1759 strlcpy(s, p, sizeof(s)); 1760 1761 num_arg = rte_strsplit(s, sizeof(s), str_fld, 3, ','); 1762 1763 empty_poll_train = false; 1764 1765 if (num_arg == 0) 1766 return 0; 1767 1768 if (num_arg == 3) { 1769 1770 training_flag = strtoul(str_fld[0], &end, 0); 1771 med_edpi = strtoul(str_fld[1], &end, 0); 1772 hgh_edpi = strtoul(str_fld[2], &end, 0); 1773 1774 if (training_flag == 1) 1775 empty_poll_train = true; 1776 1777 if (med_edpi > 0) 1778 ep_med_edpi = med_edpi; 1779 1780 if (hgh_edpi > 0) 1781 ep_hgh_edpi = hgh_edpi; 1782 1783 } else { 1784 1785 return -1; 1786 } 1787 1788 return 0; 1789 1790 } 1791 #define CMD_LINE_OPT_PARSE_PTYPE "parse-ptype" 1792 #define CMD_LINE_OPT_LEGACY "legacy" 1793 #define CMD_LINE_OPT_EMPTY_POLL "empty-poll" 1794 #define CMD_LINE_OPT_INTERRUPT_ONLY "interrupt-only" 1795 #define CMD_LINE_OPT_TELEMETRY "telemetry" 1796 #define CMD_LINE_OPT_PMD_MGMT "pmd-mgmt" 1797 #define CMD_LINE_OPT_MAX_PKT_LEN "max-pkt-len" 1798 1799 /* Parse the argument given in the command line of the application */ 1800 static int 1801 parse_args(int argc, char **argv) 1802 { 1803 int opt, ret; 1804 char **argvopt; 1805 int option_index; 1806 uint32_t limit; 1807 char *prgname = argv[0]; 1808 static struct option lgopts[] = { 1809 {"config", 1, 0, 0}, 1810 {"perf-config", 1, 0, 0}, 1811 {"high-perf-cores", 1, 0, 0}, 1812 {"no-numa", 0, 0, 0}, 1813 {CMD_LINE_OPT_MAX_PKT_LEN, 1, 0, 0}, 1814 {CMD_LINE_OPT_EMPTY_POLL, 1, 0, 0}, 1815 {CMD_LINE_OPT_PARSE_PTYPE, 0, 0, 0}, 1816 {CMD_LINE_OPT_LEGACY, 0, 0, 0}, 1817 {CMD_LINE_OPT_TELEMETRY, 0, 0, 0}, 1818 {CMD_LINE_OPT_INTERRUPT_ONLY, 0, 0, 0}, 1819 {CMD_LINE_OPT_PMD_MGMT, 1, 0, 0}, 1820 {NULL, 0, 0, 0} 1821 }; 1822 1823 argvopt = argv; 1824 1825 while ((opt = getopt_long(argc, argvopt, "p:l:m:h:P", 1826 lgopts, &option_index)) != EOF) { 1827 1828 switch (opt) { 1829 /* portmask */ 1830 case 'p': 1831 enabled_port_mask = parse_portmask(optarg); 1832 if (enabled_port_mask == 0) { 1833 printf("invalid portmask\n"); 1834 print_usage(prgname); 1835 return -1; 1836 } 1837 break; 1838 case 'P': 1839 printf("Promiscuous mode selected\n"); 1840 promiscuous_on = 1; 1841 break; 1842 case 'l': 1843 limit = parse_max_pkt_len(optarg); 1844 freq_tlb[LOW] = limit; 1845 break; 1846 case 'm': 1847 limit = parse_max_pkt_len(optarg); 1848 freq_tlb[MED] = limit; 1849 break; 1850 case 'h': 1851 limit = parse_max_pkt_len(optarg); 1852 freq_tlb[HGH] = limit; 1853 break; 1854 /* long options */ 1855 case 0: 1856 if (!strncmp(lgopts[option_index].name, "config", 6)) { 1857 ret = parse_config(optarg); 1858 if (ret) { 1859 printf("invalid config\n"); 1860 print_usage(prgname); 1861 return -1; 1862 } 1863 } 1864 1865 if (!strncmp(lgopts[option_index].name, 1866 "perf-config", 11)) { 1867 ret = parse_perf_config(optarg); 1868 if (ret) { 1869 printf("invalid perf-config\n"); 1870 print_usage(prgname); 1871 return -1; 1872 } 1873 } 1874 1875 if (!strncmp(lgopts[option_index].name, 1876 "high-perf-cores", 15)) { 1877 ret = parse_perf_core_list(optarg); 1878 if (ret) { 1879 printf("invalid high-perf-cores\n"); 1880 print_usage(prgname); 1881 
return -1; 1882 } 1883 } 1884 1885 if (!strncmp(lgopts[option_index].name, 1886 "no-numa", 7)) { 1887 printf("numa is disabled \n"); 1888 numa_on = 0; 1889 } 1890 1891 if (!strncmp(lgopts[option_index].name, 1892 CMD_LINE_OPT_LEGACY, 1893 sizeof(CMD_LINE_OPT_LEGACY))) { 1894 if (app_mode != APP_MODE_DEFAULT) { 1895 printf(" legacy mode is mutually exclusive with other modes\n"); 1896 return -1; 1897 } 1898 app_mode = APP_MODE_LEGACY; 1899 printf("legacy mode is enabled\n"); 1900 } 1901 1902 if (!strncmp(lgopts[option_index].name, 1903 CMD_LINE_OPT_EMPTY_POLL, 10)) { 1904 if (app_mode != APP_MODE_DEFAULT) { 1905 printf(" empty-poll mode is mutually exclusive with other modes\n"); 1906 return -1; 1907 } 1908 app_mode = APP_MODE_EMPTY_POLL; 1909 ret = parse_ep_config(optarg); 1910 1911 if (ret) { 1912 printf("invalid empty poll config\n"); 1913 print_usage(prgname); 1914 return -1; 1915 } 1916 printf("empty-poll is enabled\n"); 1917 } 1918 1919 if (!strncmp(lgopts[option_index].name, 1920 CMD_LINE_OPT_TELEMETRY, 1921 sizeof(CMD_LINE_OPT_TELEMETRY))) { 1922 if (app_mode != APP_MODE_DEFAULT) { 1923 printf(" telemetry mode is mutually exclusive with other modes\n"); 1924 return -1; 1925 } 1926 app_mode = APP_MODE_TELEMETRY; 1927 printf("telemetry mode is enabled\n"); 1928 } 1929 1930 if (!strncmp(lgopts[option_index].name, 1931 CMD_LINE_OPT_PMD_MGMT, 1932 sizeof(CMD_LINE_OPT_PMD_MGMT))) { 1933 if (app_mode != APP_MODE_DEFAULT) { 1934 printf(" power mgmt mode is mutually exclusive with other modes\n"); 1935 return -1; 1936 } 1937 if (parse_pmd_mgmt_config(optarg) < 0) { 1938 printf(" Invalid PMD power management mode: %s\n", 1939 optarg); 1940 return -1; 1941 } 1942 app_mode = APP_MODE_PMD_MGMT; 1943 printf("PMD power mgmt mode is enabled\n"); 1944 } 1945 if (!strncmp(lgopts[option_index].name, 1946 CMD_LINE_OPT_INTERRUPT_ONLY, 1947 sizeof(CMD_LINE_OPT_INTERRUPT_ONLY))) { 1948 if (app_mode != APP_MODE_DEFAULT) { 1949 printf(" interrupt-only mode is mutually exclusive with other modes\n"); 1950 return -1; 1951 } 1952 app_mode = APP_MODE_INTERRUPT; 1953 printf("interrupt-only mode is enabled\n"); 1954 } 1955 1956 if (!strncmp(lgopts[option_index].name, 1957 CMD_LINE_OPT_MAX_PKT_LEN, 1958 sizeof(CMD_LINE_OPT_MAX_PKT_LEN))) { 1959 printf("Custom frame size is configured\n"); 1960 max_pkt_len = parse_max_pkt_len(optarg); 1961 } 1962 1963 if (!strncmp(lgopts[option_index].name, 1964 CMD_LINE_OPT_PARSE_PTYPE, 1965 sizeof(CMD_LINE_OPT_PARSE_PTYPE))) { 1966 printf("soft parse-ptype is enabled\n"); 1967 parse_ptype = 1; 1968 } 1969 1970 break; 1971 1972 default: 1973 print_usage(prgname); 1974 return -1; 1975 } 1976 } 1977 1978 if (optind >= 0) 1979 argv[optind-1] = prgname; 1980 1981 ret = optind-1; 1982 optind = 1; /* reset getopt lib */ 1983 return ret; 1984 } 1985 1986 static void 1987 print_ethaddr(const char *name, const struct rte_ether_addr *eth_addr) 1988 { 1989 char buf[RTE_ETHER_ADDR_FMT_SIZE]; 1990 rte_ether_format_addr(buf, RTE_ETHER_ADDR_FMT_SIZE, eth_addr); 1991 printf("%s%s", name, buf); 1992 } 1993 1994 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 1995 static void 1996 setup_hash(int socketid) 1997 { 1998 struct rte_hash_parameters ipv4_l3fwd_hash_params = { 1999 .name = NULL, 2000 .entries = L3FWD_HASH_ENTRIES, 2001 .key_len = sizeof(struct ipv4_5tuple), 2002 .hash_func = DEFAULT_HASH_FUNC, 2003 .hash_func_init_val = 0, 2004 }; 2005 2006 struct rte_hash_parameters ipv6_l3fwd_hash_params = { 2007 .name = NULL, 2008 .entries = L3FWD_HASH_ENTRIES, 2009 .key_len = sizeof(struct 
ipv6_5tuple), 2010 .hash_func = DEFAULT_HASH_FUNC, 2011 .hash_func_init_val = 0, 2012 }; 2013 2014 unsigned i; 2015 int ret; 2016 char s[64]; 2017 2018 /* create ipv4 hash */ 2019 snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid); 2020 ipv4_l3fwd_hash_params.name = s; 2021 ipv4_l3fwd_hash_params.socket_id = socketid; 2022 ipv4_l3fwd_lookup_struct[socketid] = 2023 rte_hash_create(&ipv4_l3fwd_hash_params); 2024 if (ipv4_l3fwd_lookup_struct[socketid] == NULL) 2025 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on " 2026 "socket %d\n", socketid); 2027 2028 /* create ipv6 hash */ 2029 snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid); 2030 ipv6_l3fwd_hash_params.name = s; 2031 ipv6_l3fwd_hash_params.socket_id = socketid; 2032 ipv6_l3fwd_lookup_struct[socketid] = 2033 rte_hash_create(&ipv6_l3fwd_hash_params); 2034 if (ipv6_l3fwd_lookup_struct[socketid] == NULL) 2035 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on " 2036 "socket %d\n", socketid); 2037 2038 2039 /* populate the ipv4 hash */ 2040 for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) { 2041 ret = rte_hash_add_key (ipv4_l3fwd_lookup_struct[socketid], 2042 (void *) &ipv4_l3fwd_route_array[i].key); 2043 if (ret < 0) { 2044 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the" 2045 "l3fwd hash on socket %d\n", i, socketid); 2046 } 2047 ipv4_l3fwd_out_if[ret] = ipv4_l3fwd_route_array[i].if_out; 2048 printf("Hash: Adding key\n"); 2049 print_ipv4_key(ipv4_l3fwd_route_array[i].key); 2050 } 2051 2052 /* populate the ipv6 hash */ 2053 for (i = 0; i < RTE_DIM(ipv6_l3fwd_route_array); i++) { 2054 ret = rte_hash_add_key (ipv6_l3fwd_lookup_struct[socketid], 2055 (void *) &ipv6_l3fwd_route_array[i].key); 2056 if (ret < 0) { 2057 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the" 2058 "l3fwd hash on socket %d\n", i, socketid); 2059 } 2060 ipv6_l3fwd_out_if[ret] = ipv6_l3fwd_route_array[i].if_out; 2061 printf("Hash: Adding key\n"); 2062 print_ipv6_key(ipv6_l3fwd_route_array[i].key); 2063 } 2064 } 2065 #endif 2066 2067 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2068 static void 2069 setup_lpm(int socketid) 2070 { 2071 unsigned i; 2072 int ret; 2073 char s[64]; 2074 2075 /* create the LPM table */ 2076 struct rte_lpm_config lpm_ipv4_config; 2077 2078 lpm_ipv4_config.max_rules = IPV4_L3FWD_LPM_MAX_RULES; 2079 lpm_ipv4_config.number_tbl8s = 256; 2080 lpm_ipv4_config.flags = 0; 2081 2082 snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid); 2083 ipv4_l3fwd_lookup_struct[socketid] = 2084 rte_lpm_create(s, socketid, &lpm_ipv4_config); 2085 if (ipv4_l3fwd_lookup_struct[socketid] == NULL) 2086 rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table" 2087 " on socket %d\n", socketid); 2088 2089 /* populate the LPM table */ 2090 for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) { 2091 ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid], 2092 ipv4_l3fwd_route_array[i].ip, 2093 ipv4_l3fwd_route_array[i].depth, 2094 ipv4_l3fwd_route_array[i].if_out); 2095 2096 if (ret < 0) { 2097 rte_exit(EXIT_FAILURE, "Unable to add entry %u to the " 2098 "l3fwd LPM table on socket %d\n", 2099 i, socketid); 2100 } 2101 2102 printf("LPM: Adding route 0x%08x / %d (%d)\n", 2103 (unsigned)ipv4_l3fwd_route_array[i].ip, 2104 ipv4_l3fwd_route_array[i].depth, 2105 ipv4_l3fwd_route_array[i].if_out); 2106 } 2107 } 2108 #endif 2109 2110 static int 2111 init_mem(unsigned nb_mbuf) 2112 { 2113 struct lcore_conf *qconf; 2114 int socketid; 2115 unsigned lcore_id; 2116 char s[64]; 2117 2118 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 
2119 if (rte_lcore_is_enabled(lcore_id) == 0) 2120 continue; 2121 2122 if (numa_on) 2123 socketid = rte_lcore_to_socket_id(lcore_id); 2124 else 2125 socketid = 0; 2126 2127 if (socketid >= NB_SOCKETS) { 2128 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is " 2129 "out of range %d\n", socketid, 2130 lcore_id, NB_SOCKETS); 2131 } 2132 if (pktmbuf_pool[socketid] == NULL) { 2133 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 2134 pktmbuf_pool[socketid] = 2135 rte_pktmbuf_pool_create(s, nb_mbuf, 2136 MEMPOOL_CACHE_SIZE, 0, 2137 RTE_MBUF_DEFAULT_BUF_SIZE, 2138 socketid); 2139 if (pktmbuf_pool[socketid] == NULL) 2140 rte_exit(EXIT_FAILURE, 2141 "Cannot init mbuf pool on socket %d\n", 2142 socketid); 2143 else 2144 printf("Allocated mbuf pool on socket %d\n", 2145 socketid); 2146 2147 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) 2148 setup_lpm(socketid); 2149 #else 2150 setup_hash(socketid); 2151 #endif 2152 } 2153 qconf = &lcore_conf[lcore_id]; 2154 qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid]; 2155 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2156 qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid]; 2157 #endif 2158 } 2159 return 0; 2160 } 2161 2162 /* Check the link status of all ports in up to 9s, and print them finally */ 2163 static void 2164 check_all_ports_link_status(uint32_t port_mask) 2165 { 2166 #define CHECK_INTERVAL 100 /* 100ms */ 2167 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 2168 uint8_t count, all_ports_up, print_flag = 0; 2169 uint16_t portid; 2170 struct rte_eth_link link; 2171 int ret; 2172 char link_status_text[RTE_ETH_LINK_MAX_STR_LEN]; 2173 2174 printf("\nChecking link status"); 2175 fflush(stdout); 2176 for (count = 0; count <= MAX_CHECK_TIME; count++) { 2177 all_ports_up = 1; 2178 RTE_ETH_FOREACH_DEV(portid) { 2179 if ((port_mask & (1 << portid)) == 0) 2180 continue; 2181 memset(&link, 0, sizeof(link)); 2182 ret = rte_eth_link_get_nowait(portid, &link); 2183 if (ret < 0) { 2184 all_ports_up = 0; 2185 if (print_flag == 1) 2186 printf("Port %u link get failed: %s\n", 2187 portid, rte_strerror(-ret)); 2188 continue; 2189 } 2190 /* print link status if flag set */ 2191 if (print_flag == 1) { 2192 rte_eth_link_to_str(link_status_text, 2193 sizeof(link_status_text), &link); 2194 printf("Port %d %s\n", portid, 2195 link_status_text); 2196 continue; 2197 } 2198 /* clear all_ports_up flag if any link down */ 2199 if (link.link_status == RTE_ETH_LINK_DOWN) { 2200 all_ports_up = 0; 2201 break; 2202 } 2203 } 2204 /* after finally printing all link status, get out */ 2205 if (print_flag == 1) 2206 break; 2207 2208 if (all_ports_up == 0) { 2209 printf("."); 2210 fflush(stdout); 2211 rte_delay_ms(CHECK_INTERVAL); 2212 } 2213 2214 /* set the print_flag if all ports up or timeout */ 2215 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 2216 print_flag = 1; 2217 printf("done\n"); 2218 } 2219 } 2220 } 2221 2222 static int check_ptype(uint16_t portid) 2223 { 2224 int i, ret; 2225 int ptype_l3_ipv4 = 0; 2226 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2227 int ptype_l3_ipv6 = 0; 2228 #endif 2229 uint32_t ptype_mask = RTE_PTYPE_L3_MASK; 2230 2231 ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, NULL, 0); 2232 if (ret <= 0) 2233 return 0; 2234 2235 uint32_t ptypes[ret]; 2236 2237 ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, ptypes, ret); 2238 for (i = 0; i < ret; ++i) { 2239 if (ptypes[i] & RTE_PTYPE_L3_IPV4) 2240 ptype_l3_ipv4 = 1; 2241 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) 2242 if (ptypes[i] & 
RTE_PTYPE_L3_IPV6)
2243 ptype_l3_ipv6 = 1;
2244 #endif
2245 }
2246
2247 if (ptype_l3_ipv4 == 0)
2248 printf("port %d cannot parse RTE_PTYPE_L3_IPV4\n", portid);
2249
2250 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
2251 if (ptype_l3_ipv6 == 0)
2252 printf("port %d cannot parse RTE_PTYPE_L3_IPV6\n", portid);
2253 #endif
2254
2255 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
2256 if (ptype_l3_ipv4)
2257 #else /* APP_LOOKUP_EXACT_MATCH */
2258 if (ptype_l3_ipv4 && ptype_l3_ipv6)
2259 #endif
2260 return 1;
2261
2262 return 0;
2263
2264 }
2265
2266 static int
2267 init_power_library(void)
2268 {
2269 enum power_management_env env;
2270 unsigned int lcore_id;
2271 int ret = 0;
2272
2273 RTE_LCORE_FOREACH(lcore_id) {
2274 /* init power management library */
2275 ret = rte_power_init(lcore_id);
2276 if (ret) {
2277 RTE_LOG(ERR, POWER,
2278 "Library initialization failed on core %u\n",
2279 lcore_id);
2280 return ret;
2281 }
2282 /* we're not supporting the VM channel mode */
2283 env = rte_power_get_env();
2284 if (env != PM_ENV_ACPI_CPUFREQ &&
2285 env != PM_ENV_PSTATE_CPUFREQ) {
2286 RTE_LOG(ERR, POWER,
2287 "Only ACPI and PSTATE modes are supported\n");
2288 return -1;
2289 }
2290 }
2291 return ret;
2292 }
2293
2294 static int
2295 deinit_power_library(void)
2296 {
2297 unsigned int lcore_id;
2298 int ret = 0;
2299
2300 RTE_LCORE_FOREACH(lcore_id) {
2301 /* deinit power management library */
2302 ret = rte_power_exit(lcore_id);
2303 if (ret) {
2304 RTE_LOG(ERR, POWER,
2305 "Library deinitialization failed on core %u\n",
2306 lcore_id);
2307 return ret;
2308 }
2309 }
2310 return ret;
2311 }
2312
2313 static void
2314 get_current_stat_values(uint64_t *values)
2315 {
2316 unsigned int lcore_id = rte_lcore_id();
2317 struct lcore_conf *qconf;
2318 uint64_t app_eps = 0, app_fps = 0, app_br = 0;
2319 uint64_t count = 0;
2320
2321 RTE_LCORE_FOREACH_WORKER(lcore_id) {
2322 qconf = &lcore_conf[lcore_id];
2323 if (qconf->n_rx_queue == 0)
2324 continue;
2325 count++;
2326 rte_spinlock_lock(&stats[lcore_id].telemetry_lock);
2327 app_eps += stats[lcore_id].ep_nep[1];
2328 app_fps += stats[lcore_id].fp_nfp[1];
2329 app_br += stats[lcore_id].br;
2330 rte_spinlock_unlock(&stats[lcore_id].telemetry_lock);
2331 }
2332
2333 if (count > 0) {
2334 values[0] = app_eps/count;
2335 values[1] = app_fps/count;
2336 values[2] = app_br/count;
2337 } else
2338 memset(values, 0, sizeof(uint64_t) * NUM_TELSTATS);
2339
2340 }
2341
2342 static void
2343 update_telemetry(__rte_unused struct rte_timer *tim,
2344 __rte_unused void *arg)
2345 {
2346 int ret;
2347 uint64_t values[NUM_TELSTATS] = {0};
2348
2349 get_current_stat_values(values);
2350 ret = rte_metrics_update_values(RTE_METRICS_GLOBAL, telstats_index,
2351 values, RTE_DIM(values));
2352 if (ret < 0)
2353 RTE_LOG(WARNING, POWER, "failed to update metrics\n");
2354 }
2355
2356 static int
2357 handle_app_stats(const char *cmd __rte_unused,
2358 const char *params __rte_unused,
2359 struct rte_tel_data *d)
2360 {
2361 uint64_t values[NUM_TELSTATS] = {0};
2362 uint32_t i;
2363
2364 rte_tel_data_start_dict(d);
2365 get_current_stat_values(values);
2366 for (i = 0; i < NUM_TELSTATS; i++)
2367 rte_tel_data_add_dict_u64(d, telstats_strings[i].name,
2368 values[i]);
2369 return 0;
2370 }
2371
2372 static void
2373 telemetry_setup_timer(void)
2374 {
2375 int lcore_id = rte_lcore_id();
2376 uint64_t hz = rte_get_timer_hz();
2377 uint64_t ticks;
2378
2379 ticks = hz / TELEMETRY_INTERVALS_PER_SEC;
2380 rte_timer_reset_sync(&telemetry_timer,
2381 ticks,
2382 PERIODICAL,
2383
lcore_id, 2384 update_telemetry, 2385 NULL); 2386 } 2387 static void 2388 empty_poll_setup_timer(void) 2389 { 2390 int lcore_id = rte_lcore_id(); 2391 uint64_t hz = rte_get_timer_hz(); 2392 2393 struct ep_params *ep_ptr = ep_params; 2394 2395 ep_ptr->interval_ticks = hz / INTERVALS_PER_SECOND; 2396 2397 rte_timer_reset_sync(&ep_ptr->timer0, 2398 ep_ptr->interval_ticks, 2399 PERIODICAL, 2400 lcore_id, 2401 rte_empty_poll_detection, 2402 (void *)ep_ptr); 2403 2404 } 2405 static int 2406 launch_timer(unsigned int lcore_id) 2407 { 2408 int64_t prev_tsc = 0, cur_tsc, diff_tsc, cycles_10ms; 2409 2410 RTE_SET_USED(lcore_id); 2411 2412 2413 if (rte_get_main_lcore() != lcore_id) { 2414 rte_panic("timer on lcore:%d which is not main core:%d\n", 2415 lcore_id, 2416 rte_get_main_lcore()); 2417 } 2418 2419 RTE_LOG(INFO, POWER, "Bring up the Timer\n"); 2420 2421 if (app_mode == APP_MODE_EMPTY_POLL) 2422 empty_poll_setup_timer(); 2423 else 2424 telemetry_setup_timer(); 2425 2426 cycles_10ms = rte_get_timer_hz() / 100; 2427 2428 while (!is_done()) { 2429 cur_tsc = rte_rdtsc(); 2430 diff_tsc = cur_tsc - prev_tsc; 2431 if (diff_tsc > cycles_10ms) { 2432 rte_timer_manage(); 2433 prev_tsc = cur_tsc; 2434 cycles_10ms = rte_get_timer_hz() / 100; 2435 } 2436 } 2437 2438 RTE_LOG(INFO, POWER, "Timer_subsystem is done\n"); 2439 2440 return 0; 2441 } 2442 2443 static int 2444 autodetect_mode(void) 2445 { 2446 RTE_LOG(NOTICE, L3FWD_POWER, "Operating mode not specified, probing frequency scaling support...\n"); 2447 2448 /* 2449 * Empty poll and telemetry modes have to be specifically requested to 2450 * be enabled, but we can auto-detect between interrupt mode with or 2451 * without frequency scaling. Both ACPI and pstate can be used. 2452 */ 2453 if (rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ)) 2454 return APP_MODE_LEGACY; 2455 if (rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ)) 2456 return APP_MODE_LEGACY; 2457 2458 RTE_LOG(NOTICE, L3FWD_POWER, "Frequency scaling not supported, selecting interrupt-only mode\n"); 2459 2460 return APP_MODE_INTERRUPT; 2461 } 2462 2463 static const char * 2464 mode_to_str(enum appmode mode) 2465 { 2466 switch (mode) { 2467 case APP_MODE_LEGACY: 2468 return "legacy"; 2469 case APP_MODE_EMPTY_POLL: 2470 return "empty poll"; 2471 case APP_MODE_TELEMETRY: 2472 return "telemetry"; 2473 case APP_MODE_INTERRUPT: 2474 return "interrupt-only"; 2475 case APP_MODE_PMD_MGMT: 2476 return "pmd mgmt"; 2477 default: 2478 return "invalid"; 2479 } 2480 } 2481 2482 static uint32_t 2483 eth_dev_get_overhead_len(uint32_t max_rx_pktlen, uint16_t max_mtu) 2484 { 2485 uint32_t overhead_len; 2486 2487 if (max_mtu != UINT16_MAX && max_rx_pktlen > max_mtu) 2488 overhead_len = max_rx_pktlen - max_mtu; 2489 else 2490 overhead_len = RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN; 2491 2492 return overhead_len; 2493 } 2494 2495 static int 2496 config_port_max_pkt_len(struct rte_eth_conf *conf, 2497 struct rte_eth_dev_info *dev_info) 2498 { 2499 uint32_t overhead_len; 2500 2501 if (max_pkt_len == 0) 2502 return 0; 2503 2504 if (max_pkt_len < RTE_ETHER_MIN_LEN || max_pkt_len > MAX_JUMBO_PKT_LEN) 2505 return -1; 2506 2507 overhead_len = eth_dev_get_overhead_len(dev_info->max_rx_pktlen, 2508 dev_info->max_mtu); 2509 conf->rxmode.mtu = max_pkt_len - overhead_len; 2510 2511 if (conf->rxmode.mtu > RTE_ETHER_MTU) 2512 conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS; 2513 2514 return 0; 2515 } 2516 2517 /* Power library initialized in the main routine. 
8< */
2518 int
2519 main(int argc, char **argv)
2520 {
2521 struct lcore_conf *qconf;
2522 struct rte_eth_dev_info dev_info;
2523 struct rte_eth_txconf *txconf;
2524 int ret;
2525 uint16_t nb_ports;
2526 uint16_t queueid;
2527 unsigned lcore_id;
2528 uint64_t hz;
2529 uint32_t n_tx_queue, nb_lcores;
2530 uint32_t dev_rxq_num, dev_txq_num;
2531 uint8_t nb_rx_queue, queue, socketid;
2532 uint16_t portid;
2533 const char *ptr_strings[NUM_TELSTATS];
2534
2535 /* catch SIGINT and restore cpufreq governor to ondemand */
2536 signal(SIGINT, signal_exit_now);
2537
2538 /* init EAL */
2539 ret = rte_eal_init(argc, argv);
2540 if (ret < 0)
2541 rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n");
2542 argc -= ret;
2543 argv += ret;
2544
2545 /* init RTE timer library to be used later */
2546 rte_timer_subsystem_init();
2547
2548 /* if we're running pmd-mgmt mode, don't default to baseline mode */
2549 baseline_enabled = false;
2550
2551 /* parse application arguments (after the EAL ones) */
2552 ret = parse_args(argc, argv);
2553 if (ret < 0)
2554 rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n");
2555
2556 if (app_mode == APP_MODE_DEFAULT)
2557 app_mode = autodetect_mode();
2558
2559 RTE_LOG(INFO, L3FWD_POWER, "Selected operation mode: %s\n",
2560 mode_to_str(app_mode));
2561
2562 /* only legacy and empty poll modes rely on the power library */
2563 if ((app_mode == APP_MODE_LEGACY || app_mode == APP_MODE_EMPTY_POLL) &&
2564 init_power_library())
2565 rte_exit(EXIT_FAILURE, "init_power_library failed\n");
2566
2567 if (update_lcore_params() < 0)
2568 rte_exit(EXIT_FAILURE, "update_lcore_params failed\n");
2569
2570 if (check_lcore_params() < 0)
2571 rte_exit(EXIT_FAILURE, "check_lcore_params failed\n");
2572
2573 ret = init_lcore_rx_queues();
2574 if (ret < 0)
2575 rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n");
2576
2577 nb_ports = rte_eth_dev_count_avail();
2578
2579 if (check_port_config() < 0)
2580 rte_exit(EXIT_FAILURE, "check_port_config failed\n");
2581
2582 nb_lcores = rte_lcore_count();
2583
2584 /* initialize all ports */
2585 RTE_ETH_FOREACH_DEV(portid) {
2586 struct rte_eth_conf local_port_conf = port_conf;
2587 /* not all app modes need interrupts */
2588 bool need_intr = app_mode == APP_MODE_LEGACY ||
2589 app_mode == APP_MODE_INTERRUPT;
2590
2591 /* skip ports that are not enabled */
2592 if ((enabled_port_mask & (1 << portid)) == 0) {
2593 printf("\nSkipping disabled port %d\n", portid);
2594 continue;
2595 }
2596
2597 /* init port */
2598 printf("Initializing port %d ... ", portid );
2599 fflush(stdout);
2600
2601 ret = rte_eth_dev_info_get(portid, &dev_info);
2602 if (ret != 0)
2603 rte_exit(EXIT_FAILURE,
2604 "Error during getting device (port %u) info: %s\n",
2605 portid, strerror(-ret));
2606
2607 dev_rxq_num = dev_info.max_rx_queues;
2608 dev_txq_num = dev_info.max_tx_queues;
2609
2610 nb_rx_queue = get_port_n_rx_queues(portid);
2611 if (nb_rx_queue > dev_rxq_num)
2612 rte_exit(EXIT_FAILURE,
2613 "Cannot configure non-existent rxq: "
2614 "port=%d\n", portid);
2615
2616 n_tx_queue = nb_lcores;
2617 if (n_tx_queue > dev_txq_num)
2618 n_tx_queue = dev_txq_num;
2619 printf("Creating queues: nb_rxq=%d nb_txq=%u... 
", 2620 nb_rx_queue, (unsigned)n_tx_queue ); 2621 /* If number of Rx queue is 0, no need to enable Rx interrupt */ 2622 if (nb_rx_queue == 0) 2623 need_intr = false; 2624 2625 if (need_intr) 2626 local_port_conf.intr_conf.rxq = 1; 2627 2628 ret = rte_eth_dev_info_get(portid, &dev_info); 2629 if (ret != 0) 2630 rte_exit(EXIT_FAILURE, 2631 "Error during getting device (port %u) info: %s\n", 2632 portid, strerror(-ret)); 2633 2634 ret = config_port_max_pkt_len(&local_port_conf, &dev_info); 2635 if (ret != 0) 2636 rte_exit(EXIT_FAILURE, 2637 "Invalid max packet length: %u (port %u)\n", 2638 max_pkt_len, portid); 2639 2640 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) 2641 local_port_conf.txmode.offloads |= 2642 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; 2643 2644 local_port_conf.rx_adv_conf.rss_conf.rss_hf &= 2645 dev_info.flow_type_rss_offloads; 2646 if (local_port_conf.rx_adv_conf.rss_conf.rss_hf != 2647 port_conf.rx_adv_conf.rss_conf.rss_hf) { 2648 printf("Port %u modified RSS hash function based on hardware support," 2649 "requested:%#"PRIx64" configured:%#"PRIx64"\n", 2650 portid, 2651 port_conf.rx_adv_conf.rss_conf.rss_hf, 2652 local_port_conf.rx_adv_conf.rss_conf.rss_hf); 2653 } 2654 2655 ret = rte_eth_dev_configure(portid, nb_rx_queue, 2656 (uint16_t)n_tx_queue, &local_port_conf); 2657 if (ret < 0) 2658 rte_exit(EXIT_FAILURE, "Cannot configure device: " 2659 "err=%d, port=%d\n", ret, portid); 2660 2661 ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd, 2662 &nb_txd); 2663 if (ret < 0) 2664 rte_exit(EXIT_FAILURE, 2665 "Cannot adjust number of descriptors: err=%d, port=%d\n", 2666 ret, portid); 2667 2668 ret = rte_eth_macaddr_get(portid, &ports_eth_addr[portid]); 2669 if (ret < 0) 2670 rte_exit(EXIT_FAILURE, 2671 "Cannot get MAC address: err=%d, port=%d\n", 2672 ret, portid); 2673 2674 print_ethaddr(" Address:", &ports_eth_addr[portid]); 2675 printf(", "); 2676 2677 /* init memory */ 2678 ret = init_mem(NB_MBUF); 2679 if (ret < 0) 2680 rte_exit(EXIT_FAILURE, "init_mem failed\n"); 2681 2682 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2683 if (rte_lcore_is_enabled(lcore_id) == 0) 2684 continue; 2685 2686 /* Initialize TX buffers */ 2687 qconf = &lcore_conf[lcore_id]; 2688 qconf->tx_buffer[portid] = rte_zmalloc_socket("tx_buffer", 2689 RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0, 2690 rte_eth_dev_socket_id(portid)); 2691 if (qconf->tx_buffer[portid] == NULL) 2692 rte_exit(EXIT_FAILURE, "Can't allocate tx buffer for port %u\n", 2693 portid); 2694 2695 rte_eth_tx_buffer_init(qconf->tx_buffer[portid], MAX_PKT_BURST); 2696 } 2697 2698 /* init one TX queue per couple (lcore,port) */ 2699 queueid = 0; 2700 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2701 if (rte_lcore_is_enabled(lcore_id) == 0) 2702 continue; 2703 2704 if (queueid >= dev_txq_num) 2705 continue; 2706 2707 if (numa_on) 2708 socketid = \ 2709 (uint8_t)rte_lcore_to_socket_id(lcore_id); 2710 else 2711 socketid = 0; 2712 2713 printf("txq=%u,%d,%d ", lcore_id, queueid, socketid); 2714 fflush(stdout); 2715 2716 txconf = &dev_info.default_txconf; 2717 txconf->offloads = local_port_conf.txmode.offloads; 2718 ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd, 2719 socketid, txconf); 2720 if (ret < 0) 2721 rte_exit(EXIT_FAILURE, 2722 "rte_eth_tx_queue_setup: err=%d, " 2723 "port=%d\n", ret, portid); 2724 2725 qconf = &lcore_conf[lcore_id]; 2726 qconf->tx_queue_id[portid] = queueid; 2727 queueid++; 2728 2729 qconf->tx_port_id[qconf->n_tx_port] = portid; 2730 qconf->n_tx_port++; 2731 } 2732 
printf("\n"); 2733 } 2734 2735 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2736 if (rte_lcore_is_enabled(lcore_id) == 0) 2737 continue; 2738 2739 if (app_mode == APP_MODE_LEGACY) { 2740 /* init timer structures for each enabled lcore */ 2741 rte_timer_init(&power_timers[lcore_id]); 2742 hz = rte_get_timer_hz(); 2743 rte_timer_reset(&power_timers[lcore_id], 2744 hz/TIMER_NUMBER_PER_SECOND, 2745 SINGLE, lcore_id, 2746 power_timer_cb, NULL); 2747 } 2748 qconf = &lcore_conf[lcore_id]; 2749 printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); 2750 fflush(stdout); 2751 2752 /* init RX queues */ 2753 for(queue = 0; queue < qconf->n_rx_queue; ++queue) { 2754 struct rte_eth_rxconf rxq_conf; 2755 2756 portid = qconf->rx_queue_list[queue].port_id; 2757 queueid = qconf->rx_queue_list[queue].queue_id; 2758 2759 if (numa_on) 2760 socketid = \ 2761 (uint8_t)rte_lcore_to_socket_id(lcore_id); 2762 else 2763 socketid = 0; 2764 2765 printf("rxq=%d,%d,%d ", portid, queueid, socketid); 2766 fflush(stdout); 2767 2768 ret = rte_eth_dev_info_get(portid, &dev_info); 2769 if (ret != 0) 2770 rte_exit(EXIT_FAILURE, 2771 "Error during getting device (port %u) info: %s\n", 2772 portid, strerror(-ret)); 2773 2774 rxq_conf = dev_info.default_rxconf; 2775 rxq_conf.offloads = port_conf.rxmode.offloads; 2776 ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd, 2777 socketid, &rxq_conf, 2778 pktmbuf_pool[socketid]); 2779 if (ret < 0) 2780 rte_exit(EXIT_FAILURE, 2781 "rte_eth_rx_queue_setup: err=%d, " 2782 "port=%d\n", ret, portid); 2783 2784 if (parse_ptype) { 2785 if (add_cb_parse_ptype(portid, queueid) < 0) 2786 rte_exit(EXIT_FAILURE, 2787 "Fail to add ptype cb\n"); 2788 } 2789 2790 if (app_mode == APP_MODE_PMD_MGMT && !baseline_enabled) { 2791 ret = rte_power_ethdev_pmgmt_queue_enable( 2792 lcore_id, portid, queueid, 2793 pmgmt_type); 2794 if (ret < 0) 2795 rte_exit(EXIT_FAILURE, 2796 "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", 2797 ret, portid); 2798 } 2799 } 2800 } 2801 /* >8 End of power library initialization. */ 2802 2803 printf("\n"); 2804 2805 /* start ports */ 2806 RTE_ETH_FOREACH_DEV(portid) { 2807 if ((enabled_port_mask & (1 << portid)) == 0) { 2808 continue; 2809 } 2810 /* Start device */ 2811 ret = rte_eth_dev_start(portid); 2812 if (ret < 0) 2813 rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, " 2814 "port=%d\n", ret, portid); 2815 /* 2816 * If enabled, put device in promiscuous mode. 2817 * This allows IO forwarding mode to forward packets 2818 * to itself through 2 cross-connected ports of the 2819 * target machine. 
2820 */ 2821 if (promiscuous_on) { 2822 ret = rte_eth_promiscuous_enable(portid); 2823 if (ret != 0) 2824 rte_exit(EXIT_FAILURE, 2825 "rte_eth_promiscuous_enable: err=%s, port=%u\n", 2826 rte_strerror(-ret), portid); 2827 } 2828 /* initialize spinlock for each port */ 2829 rte_spinlock_init(&(locks[portid])); 2830 2831 if (!parse_ptype) 2832 if (!check_ptype(portid)) 2833 rte_exit(EXIT_FAILURE, 2834 "PMD can not provide needed ptypes\n"); 2835 } 2836 2837 check_all_ports_link_status(enabled_port_mask); 2838 2839 if (app_mode == APP_MODE_EMPTY_POLL) { 2840 2841 if (empty_poll_train) { 2842 policy.state = TRAINING; 2843 } else { 2844 policy.state = MED_NORMAL; 2845 policy.med_base_edpi = ep_med_edpi; 2846 policy.hgh_base_edpi = ep_hgh_edpi; 2847 } 2848 2849 ret = rte_power_empty_poll_stat_init(&ep_params, 2850 freq_tlb, 2851 &policy); 2852 if (ret < 0) 2853 rte_exit(EXIT_FAILURE, "empty poll init failed"); 2854 } 2855 2856 2857 /* launch per-lcore init on every lcore */ 2858 if (app_mode == APP_MODE_LEGACY) { 2859 rte_eal_mp_remote_launch(main_legacy_loop, NULL, CALL_MAIN); 2860 } else if (app_mode == APP_MODE_EMPTY_POLL) { 2861 empty_poll_stop = false; 2862 rte_eal_mp_remote_launch(main_empty_poll_loop, NULL, 2863 SKIP_MAIN); 2864 } else if (app_mode == APP_MODE_TELEMETRY) { 2865 unsigned int i; 2866 2867 /* Init metrics library */ 2868 rte_metrics_init(rte_socket_id()); 2869 /** Register stats with metrics library */ 2870 for (i = 0; i < NUM_TELSTATS; i++) 2871 ptr_strings[i] = telstats_strings[i].name; 2872 2873 ret = rte_metrics_reg_names(ptr_strings, NUM_TELSTATS); 2874 if (ret >= 0) 2875 telstats_index = ret; 2876 else 2877 rte_exit(EXIT_FAILURE, "failed to register metrics names"); 2878 2879 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2880 rte_spinlock_init(&stats[lcore_id].telemetry_lock); 2881 } 2882 rte_timer_init(&telemetry_timer); 2883 rte_telemetry_register_cmd("/l3fwd-power/stats", 2884 handle_app_stats, 2885 "Returns global power stats. 
Parameters: None"); 2886 rte_eal_mp_remote_launch(main_telemetry_loop, NULL, 2887 SKIP_MAIN); 2888 } else if (app_mode == APP_MODE_INTERRUPT) { 2889 rte_eal_mp_remote_launch(main_intr_loop, NULL, CALL_MAIN); 2890 } else if (app_mode == APP_MODE_PMD_MGMT) { 2891 /* reuse telemetry loop for PMD power management mode */ 2892 rte_eal_mp_remote_launch(main_telemetry_loop, NULL, CALL_MAIN); 2893 } 2894 2895 if (app_mode == APP_MODE_EMPTY_POLL || app_mode == APP_MODE_TELEMETRY) 2896 launch_timer(rte_lcore_id()); 2897 2898 RTE_LCORE_FOREACH_WORKER(lcore_id) { 2899 if (rte_eal_wait_lcore(lcore_id) < 0) 2900 return -1; 2901 } 2902 2903 if (app_mode == APP_MODE_PMD_MGMT) { 2904 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) { 2905 if (rte_lcore_is_enabled(lcore_id) == 0) 2906 continue; 2907 qconf = &lcore_conf[lcore_id]; 2908 for (queue = 0; queue < qconf->n_rx_queue; ++queue) { 2909 portid = qconf->rx_queue_list[queue].port_id; 2910 queueid = qconf->rx_queue_list[queue].queue_id; 2911 2912 rte_power_ethdev_pmgmt_queue_disable(lcore_id, 2913 portid, queueid); 2914 } 2915 } 2916 } 2917 2918 RTE_ETH_FOREACH_DEV(portid) 2919 { 2920 if ((enabled_port_mask & (1 << portid)) == 0) 2921 continue; 2922 2923 ret = rte_eth_dev_stop(portid); 2924 if (ret != 0) 2925 RTE_LOG(ERR, L3FWD_POWER, "rte_eth_dev_stop: err=%d, port=%u\n", 2926 ret, portid); 2927 2928 rte_eth_dev_close(portid); 2929 } 2930 2931 if (app_mode == APP_MODE_EMPTY_POLL) 2932 rte_power_empty_poll_stat_free(); 2933 2934 if ((app_mode == APP_MODE_LEGACY || app_mode == APP_MODE_EMPTY_POLL) && 2935 deinit_power_library()) 2936 rte_exit(EXIT_FAILURE, "deinit_power_library failed\n"); 2937 2938 if (rte_eal_cleanup() < 0) 2939 RTE_LOG(ERR, L3FWD_POWER, "EAL cleanup failed\n"); 2940 2941 return 0; 2942 } 2943