xref: /dpdk/examples/l3fwd-power/main.c (revision 665b49c51639a10c553433bc2bcd85c7331c631e)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2018 Intel Corporation
3  */
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <stdint.h>
8 #include <inttypes.h>
9 #include <sys/types.h>
10 #include <string.h>
11 #include <sys/queue.h>
12 #include <stdarg.h>
13 #include <errno.h>
14 #include <getopt.h>
15 #include <unistd.h>
16 #include <signal.h>
17 #include <math.h>
18 
19 #include <rte_common.h>
20 #include <rte_byteorder.h>
21 #include <rte_log.h>
22 #include <rte_malloc.h>
23 #include <rte_memory.h>
24 #include <rte_memcpy.h>
25 #include <rte_eal.h>
26 #include <rte_launch.h>
27 #include <rte_cycles.h>
28 #include <rte_prefetch.h>
29 #include <rte_lcore.h>
30 #include <rte_per_lcore.h>
31 #include <rte_branch_prediction.h>
32 #include <rte_interrupts.h>
33 #include <rte_random.h>
34 #include <rte_debug.h>
35 #include <rte_ether.h>
36 #include <rte_ethdev.h>
37 #include <rte_mempool.h>
38 #include <rte_mbuf.h>
39 #include <rte_ip.h>
40 #include <rte_tcp.h>
41 #include <rte_udp.h>
42 #include <rte_string_fns.h>
43 #include <rte_timer.h>
44 #include <rte_power.h>
45 #include <rte_spinlock.h>
46 #include <rte_metrics.h>
47 #include <rte_telemetry.h>
48 #include <rte_power_pmd_mgmt.h>
49 #include <rte_power_intel_uncore.h>
50 
51 #include "perf_core.h"
52 #include "main.h"
53 
54 #define RTE_LOGTYPE_L3FWD_POWER RTE_LOGTYPE_USER1
55 
56 #define MAX_PKT_BURST 32
57 
58 #define MIN_ZERO_POLL_COUNT 10
59 
60 /* 100 ms interval */
61 #define TIMER_NUMBER_PER_SECOND           10
62 /* (10ms) */
63 #define INTERVALS_PER_SECOND             100
64 /* 100000 us */
65 #define SCALING_PERIOD                    (1000000/TIMER_NUMBER_PER_SECOND)
66 #define SCALING_DOWN_TIME_RATIO_THRESHOLD 0.25
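/*
 * Worked example (illustrative, derived from the constants above): the timer
 * fires TIMER_NUMBER_PER_SECOND (10) times a second, so each scaling period
 * is 100000 us. With SCALING_DOWN_TIME_RATIO_THRESHOLD at 0.25, an lcore that
 * accumulated more than ~25000 us of sleep during the last period is a
 * candidate for scaling its frequency down (see power_timer_cb()).
 */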
67 
68 #define APP_LOOKUP_EXACT_MATCH          0
69 #define APP_LOOKUP_LPM                  1
70 #define DO_RFC_1812_CHECKS
71 
72 #ifndef APP_LOOKUP_METHOD
73 #define APP_LOOKUP_METHOD             APP_LOOKUP_LPM
74 #endif
75 
76 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
77 #include <rte_hash.h>
78 #elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
79 #include <rte_lpm.h>
80 #else
81 #error "APP_LOOKUP_METHOD set to incorrect value"
82 #endif
83 
84 #ifndef IPv6_BYTES
85 #define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\
86                        "%02x%02x:%02x%02x:%02x%02x:%02x%02x"
87 #define IPv6_BYTES(addr) \
88 	addr[0],  addr[1], addr[2],  addr[3], \
89 	addr[4],  addr[5], addr[6],  addr[7], \
90 	addr[8],  addr[9], addr[10], addr[11],\
91 	addr[12], addr[13],addr[14], addr[15]
92 #endif
93 
94 #define MAX_JUMBO_PKT_LEN  9600
95 
96 #define IPV6_ADDR_LEN 16
97 
98 #define MEMPOOL_CACHE_SIZE 256
99 
100 /*
101  * This expression is used to calculate the number of mbufs needed depending on
102  * user input, taking into account memory for rx and tx hardware rings, cache
103  * per lcore and the per-port per-lcore burst buffers. RTE_MAX is used to ensure that
104  * NB_MBUF never goes below a minimum value of 8192.
105  */
106 
107 #define NB_MBUF RTE_MAX	( \
108 	(nb_ports*nb_rx_queue*nb_rxd + \
109 	nb_ports*nb_lcores*MAX_PKT_BURST + \
110 	nb_ports*n_tx_queue*nb_txd + \
111 	nb_lcores*MEMPOOL_CACHE_SIZE), \
112 	(unsigned)8192)
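/*
 * Illustrative sizing with hypothetical values: 2 ports, 2 Rx queues per
 * port, 1024 Rx/Tx descriptors, 2 Tx queues per port and 3 lcores give
 * 2*2*1024 + 2*3*32 + 2*2*1024 + 3*256 = 9152 mbufs, which is above the
 * 8192 floor enforced by RTE_MAX.
 */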
113 
114 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
115 
116 #define NB_SOCKETS 8
117 
118 /* Configure how many packets ahead to prefetch, when reading packets */
119 #define PREFETCH_OFFSET	3
120 
121 /*
122  * Configurable number of RX/TX ring descriptors
123  */
124 #define RX_DESC_DEFAULT 1024
125 #define TX_DESC_DEFAULT 1024
126 
127 #define NUM_TELSTATS RTE_DIM(telstats_strings)
128 
129 static uint16_t nb_rxd = RX_DESC_DEFAULT;
130 static uint16_t nb_txd = TX_DESC_DEFAULT;
131 
132 /* ethernet addresses of ports */
133 static struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
134 
135 /* per-port spinlocks guarding Rx interrupt enable/disable */
136 static rte_spinlock_t locks[RTE_MAX_ETHPORTS];
137 
138 /* mask of enabled ports */
139 static uint32_t enabled_port_mask = 0;
140 /* Promiscuous mode is off on all ports by default. */
141 static int promiscuous_on = 0;
142 /* NUMA is enabled by default. */
143 static int numa_on = 1;
144 volatile bool quit_signal;
145 /* timer to update telemetry every 500ms */
146 static struct rte_timer telemetry_timer;
147 
148 /* stats index returned by metrics lib */
149 int telstats_index;
150 
151 /* flag to check if uncore option enabled */
152 int enabled_uncore = -1;
153 
154 struct telstats_name {
155 	char name[RTE_ETH_XSTATS_NAME_SIZE];
156 };
157 
158 /* telemetry stats to be reported */
159 const struct telstats_name telstats_strings[] = {
160 	{"empty_poll"},
161 	{"full_poll"},
162 	{"busy_percent"}
163 };
164 
165 /* core busyness in percentage */
166 enum busy_rate {
167 	ZERO = 0,
168 	PARTIAL = 50,
169 	FULL = 100
170 };
171 
172 enum uncore_choice {
173 	UNCORE_MIN = 0,
174 	UNCORE_MAX = 1,
175 	UNCORE_IDX = 2
176 };
177 
178 /* reference poll count to measure core busyness */
179 #define DEFAULT_COUNT 10000
180 /*
181  * reference CYCLES to be used to
182  * measure core busyness based on poll count
183  */
184 #define MIN_CYCLES  1500000ULL
185 #define MAX_CYCLES 22000000ULL
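/*
 * These bounds are compared against the TSC delta accumulated over
 * DEFAULT_COUNT polls in main_telemetry_loop(): a delta of MAX_CYCLES or
 * more maps to FULL busyness, a delta of MIN_CYCLES or less maps to ZERO,
 * and anything in between is reported as (delta * 100) / MAX_CYCLES percent.
 */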
186 
187 /* (500ms) */
188 #define TELEMETRY_INTERVALS_PER_SEC 2
189 
190 static int parse_ptype; /**< Parse packet type using rx callback; */
191 			/**< disabled by default. */
192 
193 enum appmode {
194 	APP_MODE_DEFAULT = 0,
195 	APP_MODE_LEGACY,
196 	APP_MODE_TELEMETRY,
197 	APP_MODE_INTERRUPT,
198 	APP_MODE_PMD_MGMT
199 };
200 
201 enum appmode app_mode;
202 
203 static enum rte_power_pmd_mgmt_type pmgmt_type;
204 bool baseline_enabled;
205 
206 enum freq_scale_hint_t
207 {
208 	FREQ_LOWER    =      -1,
209 	FREQ_CURRENT  =       0,
210 	FREQ_HIGHER   =       1,
211 	FREQ_HIGHEST  =       2
212 };
213 
214 struct lcore_rx_queue {
215 	uint16_t port_id;
216 	uint8_t queue_id;
217 	enum freq_scale_hint_t freq_up_hint;
218 	uint32_t zero_rx_packet_count;
219 	uint32_t idle_hint;
220 } __rte_cache_aligned;
221 
222 #define MAX_RX_QUEUE_PER_LCORE 16
223 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
224 #define MAX_RX_QUEUE_PER_PORT 128
225 
226 #define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
227 
228 
229 struct lcore_params lcore_params_array[MAX_LCORE_PARAMS];
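/*
 * Default port/queue/lcore mapping, used when no --config option is given.
 * Each entry is (port_id, queue_id, lcore_id); e.g. {0, 0, 2} assigns Rx
 * queue 0 of port 0 to lcore 2.
 */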
230 static struct lcore_params lcore_params_array_default[] = {
231 	{0, 0, 2},
232 	{0, 1, 2},
233 	{0, 2, 2},
234 	{1, 0, 2},
235 	{1, 1, 2},
236 	{1, 2, 2},
237 	{2, 0, 2},
238 	{3, 0, 3},
239 	{3, 1, 3},
240 };
241 
242 struct lcore_params *lcore_params = lcore_params_array_default;
243 uint16_t nb_lcore_params = RTE_DIM(lcore_params_array_default);
244 
245 static struct rte_eth_conf port_conf = {
246 	.rxmode = {
247 		.mq_mode        = RTE_ETH_MQ_RX_RSS,
248 		.offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM,
249 	},
250 	.rx_adv_conf = {
251 		.rss_conf = {
252 			.rss_key = NULL,
253 			.rss_hf = RTE_ETH_RSS_UDP,
254 		},
255 	},
256 	.txmode = {
257 		.mq_mode = RTE_ETH_MQ_TX_NONE,
258 	}
259 };
260 
261 static uint32_t max_pkt_len;
262 static uint32_t max_empty_polls = 512;
263 static uint32_t pause_duration = 1;
264 static uint32_t scale_freq_min;
265 static uint32_t scale_freq_max;
266 
267 static struct rte_mempool * pktmbuf_pool[NB_SOCKETS];
268 
269 
270 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
271 
272 #ifdef RTE_ARCH_X86
273 #include <rte_hash_crc.h>
274 #define DEFAULT_HASH_FUNC       rte_hash_crc
275 #else
276 #include <rte_jhash.h>
277 #define DEFAULT_HASH_FUNC       rte_jhash
278 #endif
279 
280 struct ipv4_5tuple {
281 	uint32_t ip_dst;
282 	uint32_t ip_src;
283 	uint16_t port_dst;
284 	uint16_t port_src;
285 	uint8_t  proto;
286 } __rte_packed;
287 
288 struct ipv6_5tuple {
289 	uint8_t  ip_dst[IPV6_ADDR_LEN];
290 	uint8_t  ip_src[IPV6_ADDR_LEN];
291 	uint16_t port_dst;
292 	uint16_t port_src;
293 	uint8_t  proto;
294 } __rte_packed;
295 
296 struct ipv4_l3fwd_route {
297 	struct ipv4_5tuple key;
298 	uint8_t if_out;
299 };
300 
301 struct ipv6_l3fwd_route {
302 	struct ipv6_5tuple key;
303 	uint8_t if_out;
304 };
305 
306 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = {
307 	{{RTE_IPV4(100,10,0,1), RTE_IPV4(200,10,0,1), 101, 11, IPPROTO_TCP}, 0},
308 	{{RTE_IPV4(100,20,0,2), RTE_IPV4(200,20,0,2), 102, 12, IPPROTO_TCP}, 1},
309 	{{RTE_IPV4(100,30,0,3), RTE_IPV4(200,30,0,3), 103, 13, IPPROTO_TCP}, 2},
310 	{{RTE_IPV4(100,40,0,4), RTE_IPV4(200,40,0,4), 104, 14, IPPROTO_TCP}, 3},
311 };
312 
313 static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = {
314 	{
315 		{
316 			{0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
317 			 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05},
318 			{0xfe, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
319 			 0x02, 0x1e, 0x67, 0xff, 0xfe, 0x0d, 0xb6, 0x0a},
320 			 1, 10, IPPROTO_UDP
321 		}, 4
322 	},
323 };
324 
325 typedef struct rte_hash lookup_struct_t;
326 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];
327 static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS];
328 
329 #define L3FWD_HASH_ENTRIES	1024
330 
331 static uint16_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
332 static uint16_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
333 #endif
334 
335 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
336 struct ipv4_l3fwd_route {
337 	uint32_t ip;
338 	uint8_t  depth;
339 	uint8_t  if_out;
340 };
341 
342 static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = {
343 	{RTE_IPV4(1,1,1,0), 24, 0},
344 	{RTE_IPV4(2,1,1,0), 24, 1},
345 	{RTE_IPV4(3,1,1,0), 24, 2},
346 	{RTE_IPV4(4,1,1,0), 24, 3},
347 	{RTE_IPV4(5,1,1,0), 24, 4},
348 	{RTE_IPV4(6,1,1,0), 24, 5},
349 	{RTE_IPV4(7,1,1,0), 24, 6},
350 	{RTE_IPV4(8,1,1,0), 24, 7},
351 };
352 
353 #define IPV4_L3FWD_LPM_MAX_RULES     1024
354 
355 typedef struct rte_lpm lookup_struct_t;
356 static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];
357 #endif
358 
359 struct lcore_conf {
360 	uint16_t n_rx_queue;
361 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
362 	uint16_t n_tx_port;
363 	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
364 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
365 	struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_ETHPORTS];
366 	lookup_struct_t * ipv4_lookup_struct;
367 	lookup_struct_t * ipv6_lookup_struct;
368 } __rte_cache_aligned;
369 
370 struct lcore_stats {
371 	/* total sleep time in us since the last frequency scale-down */
372 	uint32_t sleep_time;
373 	/* number of long sleeps recently */
374 	uint32_t nb_long_sleep;
375 	/* freq. scaling up trend */
376 	uint32_t trend;
377 	/* total packets processed recently */
378 	uint64_t nb_rx_processed;
379 	/* total iterations looped recently */
380 	uint64_t nb_iteration_looped;
381 	/*
382 	 * Represents empty and non empty polls
383 	 * of rte_eth_rx_burst();
384 	 * ep_nep[0] holds non empty polls
385 	 * i.e. 0 < nb_rx <= MAX_PKT_BURST
386 	 * ep_nep[1] holds empty polls.
387 	 * i.e. nb_rx == 0
388 	 */
389 	uint64_t ep_nep[2];
390 	/*
391 	 * Represents full and empty+partial
392 	 * polls of rte_eth_rx_burst();
393 	 * fp_nfp[0] holds empty+partial polls,
394 	 * i.e. 0 <= nb_rx < MAX_PKT_BURST
395 	 * fp_nfp[1] holds full polls,
396 	 * i.e. nb_rx == MAX_PKT_BURST
397 	 */
398 	uint64_t fp_nfp[2];
399 	enum busy_rate br;
400 	rte_spinlock_t telemetry_lock;
401 } __rte_cache_aligned;
402 
403 static struct lcore_conf lcore_conf[RTE_MAX_LCORE] __rte_cache_aligned;
404 static struct lcore_stats stats[RTE_MAX_LCORE] __rte_cache_aligned;
405 static struct rte_timer power_timers[RTE_MAX_LCORE];
406 
407 static inline uint32_t power_idle_heuristic(uint32_t zero_rx_packet_count);
408 static inline enum freq_scale_hint_t power_freq_scaleup_heuristic( \
409 		unsigned int lcore_id, uint16_t port_id, uint16_t queue_id);
410 
411 static int is_done(void)
412 {
413 	return quit_signal;
414 }
415 
416 /* exit signal handler */
417 static void
418 signal_exit_now(int sigtype)
419 {
420 
421 	if (sigtype == SIGINT)
422 		quit_signal = true;
423 
424 }
425 
426 /*  Frequency scale down timer callback */
427 static void
428 power_timer_cb(__rte_unused struct rte_timer *tim,
429 			  __rte_unused void *arg)
430 {
431 	uint64_t hz;
432 	float sleep_time_ratio;
433 	unsigned lcore_id = rte_lcore_id();
434 
435 	/* compute the fraction of the last scaling period spent sleeping */
436 	sleep_time_ratio = (float)(stats[lcore_id].sleep_time) /
437 					(float)SCALING_PERIOD;
438 	/**
439 	 * Check whether the frequency needs to be scaled down a step if the lcore slept a lot.
440 	 */
441 	if (sleep_time_ratio >= SCALING_DOWN_TIME_RATIO_THRESHOLD) {
442 		if (rte_power_freq_down)
443 			rte_power_freq_down(lcore_id);
444 	}
445 	else if ( (unsigned)(stats[lcore_id].nb_rx_processed /
446 		stats[lcore_id].nb_iteration_looped) < MAX_PKT_BURST) {
447 		/**
448 		 * Scale down a step if the average number of packets per
449 		 * iteration is less than expected.
450 		 */
451 		if (rte_power_freq_down)
452 			rte_power_freq_down(lcore_id);
453 	}
454 
455 	/**
456 	 * initialize another timer according to current frequency to ensure
457 	 * timer interval is relatively fixed.
458 	 */
459 	hz = rte_get_timer_hz();
460 	rte_timer_reset(&power_timers[lcore_id], hz/TIMER_NUMBER_PER_SECOND,
461 				SINGLE, lcore_id, power_timer_cb, NULL);
462 
463 	stats[lcore_id].nb_rx_processed = 0;
464 	stats[lcore_id].nb_iteration_looped = 0;
465 
466 	stats[lcore_id].sleep_time = 0;
467 }
468 
469 /* Enqueue a single packet, and send burst if queue is filled */
470 static inline int
471 send_single_packet(struct rte_mbuf *m, uint16_t port)
472 {
473 	uint32_t lcore_id;
474 	struct lcore_conf *qconf;
475 
476 	lcore_id = rte_lcore_id();
477 	qconf = &lcore_conf[lcore_id];
478 
479 	rte_eth_tx_buffer(port, qconf->tx_queue_id[port],
480 			qconf->tx_buffer[port], m);
481 
482 	return 0;
483 }
484 
485 #ifdef DO_RFC_1812_CHECKS
486 static inline int
487 is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len)
488 {
489 	/* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */
490 	/*
491 	 * 1. The packet length reported by the Link Layer must be large
492 	 * enough to hold the minimum length legal IP datagram (20 bytes).
493 	 */
494 	if (link_len < sizeof(struct rte_ipv4_hdr))
495 		return -1;
496 
497 	/* 2. The IP checksum must be correct. */
498 	/* if this is not checked in H/W, check it. */
499 	if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) {
500 		uint16_t actual_cksum, expected_cksum;
501 		actual_cksum = pkt->hdr_checksum;
502 		pkt->hdr_checksum = 0;
503 		expected_cksum = rte_ipv4_cksum(pkt);
504 		if (actual_cksum != expected_cksum)
505 			return -2;
506 	}
507 
508 	/*
509 	 * 3. The IP version number must be 4. If the version number is not 4
510 	 * then the packet may be another version of IP, such as IPng or
511 	 * ST-II.
512 	 */
513 	if (((pkt->version_ihl) >> 4) != 4)
514 		return -3;
515 	/*
516 	 * 4. The IP header length field must be large enough to hold the
517 	 * minimum length legal IP datagram (20 bytes = 5 words).
518 	 */
519 	if ((pkt->version_ihl & 0xf) < 5)
520 		return -4;
521 
522 	/*
523 	 * 5. The IP total length field must be large enough to hold the IP
524 	 * datagram header, whose length is specified in the IP header length
525 	 * field.
526 	 */
527 	if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct rte_ipv4_hdr))
528 		return -5;
529 
530 	return 0;
531 }
532 #endif
533 
534 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
535 static void
536 print_ipv4_key(struct ipv4_5tuple key)
537 {
538 	printf("IP dst = %08x, IP src = %08x, port dst = %d, port src = %d, "
539 		"proto = %d\n", (unsigned)key.ip_dst, (unsigned)key.ip_src,
540 				key.port_dst, key.port_src, key.proto);
541 }
542 static void
543 print_ipv6_key(struct ipv6_5tuple key)
544 {
545 	printf( "IP dst = " IPv6_BYTES_FMT ", IP src = " IPv6_BYTES_FMT ", "
546 	        "port dst = %d, port src = %d, proto = %d\n",
547 	        IPv6_BYTES(key.ip_dst), IPv6_BYTES(key.ip_src),
548 	        key.port_dst, key.port_src, key.proto);
549 }
550 
551 static inline uint16_t
552 get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid,
553 		lookup_struct_t * ipv4_l3fwd_lookup_struct)
554 {
555 	struct ipv4_5tuple key;
556 	struct rte_tcp_hdr *tcp;
557 	struct rte_udp_hdr *udp;
558 	int ret = 0;
559 
560 	key.ip_dst = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
561 	key.ip_src = rte_be_to_cpu_32(ipv4_hdr->src_addr);
562 	key.proto = ipv4_hdr->next_proto_id;
563 
564 	switch (ipv4_hdr->next_proto_id) {
565 	case IPPROTO_TCP:
566 		tcp = (struct rte_tcp_hdr *)((unsigned char *)ipv4_hdr +
567 					sizeof(struct rte_ipv4_hdr));
568 		key.port_dst = rte_be_to_cpu_16(tcp->dst_port);
569 		key.port_src = rte_be_to_cpu_16(tcp->src_port);
570 		break;
571 
572 	case IPPROTO_UDP:
573 		udp = (struct rte_udp_hdr *)((unsigned char *)ipv4_hdr +
574 					sizeof(struct rte_ipv4_hdr));
575 		key.port_dst = rte_be_to_cpu_16(udp->dst_port);
576 		key.port_src = rte_be_to_cpu_16(udp->src_port);
577 		break;
578 
579 	default:
580 		key.port_dst = 0;
581 		key.port_src = 0;
582 		break;
583 	}
584 
585 	/* Find destination port */
586 	ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key);
587 	return ((ret < 0) ? portid : ipv4_l3fwd_out_if[ret]);
588 }
589 
590 static inline uint16_t
591 get_ipv6_dst_port(struct rte_ipv6_hdr *ipv6_hdr, uint16_t portid,
592 			lookup_struct_t *ipv6_l3fwd_lookup_struct)
593 {
594 	struct ipv6_5tuple key;
595 	struct rte_tcp_hdr *tcp;
596 	struct rte_udp_hdr *udp;
597 	int ret = 0;
598 
599 	memcpy(key.ip_dst, ipv6_hdr->dst_addr, IPV6_ADDR_LEN);
600 	memcpy(key.ip_src, ipv6_hdr->src_addr, IPV6_ADDR_LEN);
601 
602 	key.proto = ipv6_hdr->proto;
603 
604 	switch (ipv6_hdr->proto) {
605 	case IPPROTO_TCP:
606 		tcp = (struct rte_tcp_hdr *)((unsigned char *) ipv6_hdr +
607 					sizeof(struct rte_ipv6_hdr));
608 		key.port_dst = rte_be_to_cpu_16(tcp->dst_port);
609 		key.port_src = rte_be_to_cpu_16(tcp->src_port);
610 		break;
611 
612 	case IPPROTO_UDP:
613 		udp = (struct rte_udp_hdr *)((unsigned char *) ipv6_hdr +
614 					sizeof(struct rte_ipv6_hdr));
615 		key.port_dst = rte_be_to_cpu_16(udp->dst_port);
616 		key.port_src = rte_be_to_cpu_16(udp->src_port);
617 		break;
618 
619 	default:
620 		key.port_dst = 0;
621 		key.port_src = 0;
622 		break;
623 	}
624 
625 	/* Find destination port */
626 	ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key);
627 	return ((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
628 }
629 #endif
630 
631 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
632 static inline uint16_t
633 get_ipv4_dst_port(struct rte_ipv4_hdr *ipv4_hdr, uint16_t portid,
634 		lookup_struct_t *ipv4_l3fwd_lookup_struct)
635 {
636 	uint32_t next_hop;
637 
638 	return ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
639 			rte_be_to_cpu_32(ipv4_hdr->dst_addr), &next_hop) == 0)?
640 			next_hop : portid);
641 }
642 #endif
643 
644 static inline void
645 parse_ptype_one(struct rte_mbuf *m)
646 {
647 	struct rte_ether_hdr *eth_hdr;
648 	uint32_t packet_type = RTE_PTYPE_UNKNOWN;
649 	uint16_t ether_type;
650 
651 	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
652 	ether_type = eth_hdr->ether_type;
653 	if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4))
654 		packet_type |= RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
655 	else if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6))
656 		packet_type |= RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
657 
658 	m->packet_type = packet_type;
659 }
660 
661 static uint16_t
662 cb_parse_ptype(uint16_t port __rte_unused, uint16_t queue __rte_unused,
663 	       struct rte_mbuf *pkts[], uint16_t nb_pkts,
664 	       uint16_t max_pkts __rte_unused,
665 	       void *user_param __rte_unused)
666 {
667 	unsigned int i;
668 
669 	for (i = 0; i < nb_pkts; ++i)
670 		parse_ptype_one(pkts[i]);
671 
672 	return nb_pkts;
673 }
674 
675 static int
676 add_cb_parse_ptype(uint16_t portid, uint16_t queueid)
677 {
678 	printf("Port %d: softly parse packet type info\n", portid);
679 	if (rte_eth_add_rx_callback(portid, queueid, cb_parse_ptype, NULL))
680 		return 0;
681 
682 	printf("Failed to add rx callback: port=%d\n", portid);
683 	return -1;
684 }
685 
686 static inline void
687 l3fwd_simple_forward(struct rte_mbuf *m, uint16_t portid,
688 				struct lcore_conf *qconf)
689 {
690 	struct rte_ether_hdr *eth_hdr;
691 	struct rte_ipv4_hdr *ipv4_hdr;
692 	void *d_addr_bytes;
693 	uint16_t dst_port;
694 
695 	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
696 
697 	if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) {
698 		/* Handle IPv4 headers.*/
699 		ipv4_hdr =
700 			rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
701 						sizeof(struct rte_ether_hdr));
702 
703 #ifdef DO_RFC_1812_CHECKS
704 		/* Check to make sure the packet is valid (RFC1812) */
705 		if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) {
706 			rte_pktmbuf_free(m);
707 			return;
708 		}
709 #endif
710 
711 		dst_port = get_ipv4_dst_port(ipv4_hdr, portid,
712 					qconf->ipv4_lookup_struct);
713 		if (dst_port >= RTE_MAX_ETHPORTS ||
714 				(enabled_port_mask & 1 << dst_port) == 0)
715 			dst_port = portid;
716 
717 		/* 02:00:00:00:00:xx */
718 		d_addr_bytes = &eth_hdr->dst_addr.addr_bytes[0];
719 		*((uint64_t *)d_addr_bytes) =
720 			0x000000000002 + ((uint64_t)dst_port << 40);
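		/*
		 * Note: the 64-bit store above rewrites the 6-byte destination
		 * MAC in place. Assuming a little-endian CPU (as the constant
		 * layout implies), byte 0 becomes 0x02 and dst_port lands in
		 * byte 5, giving the 02:00:00:00:00:xx pattern; the two bytes
		 * that spill into the source MAC are overwritten just below.
		 */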
721 
722 #ifdef DO_RFC_1812_CHECKS
723 		/* Update time to live and header checksum */
724 		--(ipv4_hdr->time_to_live);
725 		++(ipv4_hdr->hdr_checksum);
726 #endif
727 
728 		/* src addr */
729 		rte_ether_addr_copy(&ports_eth_addr[dst_port],
730 				&eth_hdr->src_addr);
731 
732 		send_single_packet(m, dst_port);
733 	} else if (RTE_ETH_IS_IPV6_HDR(m->packet_type)) {
734 		/* Handle IPv6 headers.*/
735 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
736 		struct rte_ipv6_hdr *ipv6_hdr;
737 
738 		ipv6_hdr =
739 			rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
740 						sizeof(struct rte_ether_hdr));
741 
742 		dst_port = get_ipv6_dst_port(ipv6_hdr, portid,
743 					qconf->ipv6_lookup_struct);
744 
745 		if (dst_port >= RTE_MAX_ETHPORTS ||
746 				(enabled_port_mask & 1 << dst_port) == 0)
747 			dst_port = portid;
748 
749 		/* 02:00:00:00:00:xx */
750 		d_addr_bytes = &eth_hdr->dst_addr.addr_bytes[0];
751 		*((uint64_t *)d_addr_bytes) =
752 			0x000000000002 + ((uint64_t)dst_port << 40);
753 
754 		/* src addr */
755 		rte_ether_addr_copy(&ports_eth_addr[dst_port],
756 				&eth_hdr->src_addr);
757 
758 		send_single_packet(m, dst_port);
759 #else
760 		/* We don't currently handle IPv6 packets in LPM mode. */
761 		rte_pktmbuf_free(m);
762 #endif
763 	} else
764 		rte_pktmbuf_free(m);
765 
766 }
767 
768 #define MINIMUM_SLEEP_TIME         1
769 #define SUSPEND_THRESHOLD          300
770 
771 static inline uint32_t
772 power_idle_heuristic(uint32_t zero_rx_packet_count)
773 {
774 	/* If the empty-poll count is below SUSPEND_THRESHOLD, sleep for 1 us */
775 	if (zero_rx_packet_count < SUSPEND_THRESHOLD)
776 		return MINIMUM_SLEEP_TIME;
777 	/* Otherwise sleep for SUSPEND_THRESHOLD us, long enough to cover the
778 		wakeup latency from deeper C-states (C3/C6) back to C0
779 	*/
780 	else
781 		return SUSPEND_THRESHOLD;
782 }
783 
784 static inline enum freq_scale_hint_t
785 power_freq_scaleup_heuristic(unsigned lcore_id,
786 			     uint16_t port_id,
787 			     uint16_t queue_id)
788 {
789 	uint32_t rxq_count = rte_eth_rx_queue_count(port_id, queue_id);
790 /**
791  * The HW Rx descriptor ring holds up to nb_rxd entries (1024 by default here);
792  * each Rx burst reads at most MAX_PKT_BURST (32) entries per iteration.
793  */
794 #define FREQ_GEAR1_RX_PACKET_THRESHOLD             MAX_PKT_BURST
795 #define FREQ_GEAR2_RX_PACKET_THRESHOLD             (MAX_PKT_BURST*2)
796 #define FREQ_GEAR3_RX_PACKET_THRESHOLD             (MAX_PKT_BURST*3)
797 #define FREQ_UP_TREND1_ACC   1
798 #define FREQ_UP_TREND2_ACC   100
799 #define FREQ_UP_THRESHOLD    10000
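/*
 * Illustrative reading of the heuristic below: a queue backlog above three
 * bursts jumps straight to FREQ_HIGHEST; a backlog above two bursts adds 100
 * to the trend counter and one above a single burst adds 1, so roughly 100
 * consecutive "two-burst" polls (or 10000 "one-burst" polls) are needed
 * before FREQ_HIGHER is requested.
 */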
800 
801 	if (likely(rxq_count > FREQ_GEAR3_RX_PACKET_THRESHOLD)) {
802 		stats[lcore_id].trend = 0;
803 		return FREQ_HIGHEST;
804 	} else if (likely(rxq_count > FREQ_GEAR2_RX_PACKET_THRESHOLD))
805 		stats[lcore_id].trend += FREQ_UP_TREND2_ACC;
806 	else if (likely(rxq_count > FREQ_GEAR1_RX_PACKET_THRESHOLD))
807 		stats[lcore_id].trend += FREQ_UP_TREND1_ACC;
808 
809 	if (likely(stats[lcore_id].trend > FREQ_UP_THRESHOLD)) {
810 		stats[lcore_id].trend = 0;
811 		return FREQ_HIGHER;
812 	}
813 
814 	return FREQ_CURRENT;
815 }
816 
817 /**
818  * Force the polling thread to sleep until a one-shot Rx interrupt triggers.
819  * @param num
820  *  Number of Rx queues (epoll events) to wait on.
821  * @param lcore
822  *  Lcore id of the calling thread.
823  * @return
824  *  0 on success
825  */
826 static int
827 sleep_until_rx_interrupt(int num, int lcore)
828 {
829 	/*
830 	 * we want to track when we are woken up by traffic so that we can go
831 	 * back to sleep again without log spamming. Avoid cache line sharing
832 	 * to prevent threads stepping on each others' toes.
833 	 */
834 	static struct {
835 		bool wakeup;
836 	} __rte_cache_aligned status[RTE_MAX_LCORE];
837 	struct rte_epoll_event event[num];
838 	int n, i;
839 	uint16_t port_id;
840 	uint8_t queue_id;
841 	void *data;
842 
843 	if (status[lcore].wakeup) {
844 		RTE_LOG(INFO, L3FWD_POWER,
845 				"lcore %u sleeps until interrupt triggers\n",
846 				rte_lcore_id());
847 	}
848 
849 	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, num, 10);
850 	for (i = 0; i < n; i++) {
851 		data = event[i].epdata.data;
852 		port_id = ((uintptr_t)data) >> CHAR_BIT;
853 		queue_id = ((uintptr_t)data) &
854 			RTE_LEN2MASK(CHAR_BIT, uint8_t);
855 		RTE_LOG(INFO, L3FWD_POWER,
856 			"lcore %u is waked up from rx interrupt on"
857 			" port %d queue %d\n",
858 			rte_lcore_id(), port_id, queue_id);
859 	}
860 	status[lcore].wakeup = n != 0;
861 
862 	return 0;
863 }
864 
865 static void turn_on_off_intr(struct lcore_conf *qconf, bool on)
866 {
867 	int i;
868 	struct lcore_rx_queue *rx_queue;
869 	uint8_t queue_id;
870 	uint16_t port_id;
871 
872 	for (i = 0; i < qconf->n_rx_queue; ++i) {
873 		rx_queue = &(qconf->rx_queue_list[i]);
874 		port_id = rx_queue->port_id;
875 		queue_id = rx_queue->queue_id;
876 
877 		rte_spinlock_lock(&(locks[port_id]));
878 		if (on)
879 			rte_eth_dev_rx_intr_enable(port_id, queue_id);
880 		else
881 			rte_eth_dev_rx_intr_disable(port_id, queue_id);
882 		rte_spinlock_unlock(&(locks[port_id]));
883 	}
884 }
885 
886 static int event_register(struct lcore_conf *qconf)
887 {
888 	struct lcore_rx_queue *rx_queue;
889 	uint8_t queueid;
890 	uint16_t portid;
891 	uint32_t data;
892 	int ret;
893 	int i;
894 
895 	for (i = 0; i < qconf->n_rx_queue; ++i) {
896 		rx_queue = &(qconf->rx_queue_list[i]);
897 		portid = rx_queue->port_id;
898 		queueid = rx_queue->queue_id;
899 		data = portid << CHAR_BIT | queueid;
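		/*
		 * The encoding above packs the port id into the upper bits and
		 * the queue id into the low CHAR_BIT bits of the epoll user
		 * data; sleep_until_rx_interrupt() reverses it when an event
		 * fires.
		 */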
900 
901 		ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
902 						RTE_EPOLL_PER_THREAD,
903 						RTE_INTR_EVENT_ADD,
904 						(void *)((uintptr_t)data));
905 		if (ret)
906 			return ret;
907 	}
908 
909 	return 0;
910 }
911 
912 /* Main processing loop. 8< */
913 static int main_intr_loop(__rte_unused void *dummy)
914 {
915 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
916 	unsigned int lcore_id;
917 	uint64_t prev_tsc, diff_tsc, cur_tsc;
918 	int i, j, nb_rx;
919 	uint8_t queueid;
920 	uint16_t portid;
921 	struct lcore_conf *qconf;
922 	struct lcore_rx_queue *rx_queue;
923 	uint32_t lcore_rx_idle_count = 0;
924 	uint32_t lcore_idle_hint = 0;
925 	int intr_en = 0;
926 
927 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
928 				   US_PER_S * BURST_TX_DRAIN_US;
929 
930 	prev_tsc = 0;
931 
932 	lcore_id = rte_lcore_id();
933 	qconf = &lcore_conf[lcore_id];
934 
935 	if (qconf->n_rx_queue == 0) {
936 		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n",
937 				lcore_id);
938 		return 0;
939 	}
940 
941 	RTE_LOG(INFO, L3FWD_POWER, "entering main interrupt loop on lcore %u\n",
942 			lcore_id);
943 
944 	for (i = 0; i < qconf->n_rx_queue; i++) {
945 		portid = qconf->rx_queue_list[i].port_id;
946 		queueid = qconf->rx_queue_list[i].queue_id;
947 		RTE_LOG(INFO, L3FWD_POWER,
948 				" -- lcoreid=%u portid=%u rxqueueid=%hhu\n",
949 				lcore_id, portid, queueid);
950 	}
951 
952 	/* add into event wait list */
953 	if (event_register(qconf) == 0)
954 		intr_en = 1;
955 	else
956 		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");
957 
958 	while (!is_done()) {
959 		stats[lcore_id].nb_iteration_looped++;
960 
961 		cur_tsc = rte_rdtsc();
962 
963 		/*
964 		 * TX burst queue drain
965 		 */
966 		diff_tsc = cur_tsc - prev_tsc;
967 		if (unlikely(diff_tsc > drain_tsc)) {
968 			for (i = 0; i < qconf->n_tx_port; ++i) {
969 				portid = qconf->tx_port_id[i];
970 				rte_eth_tx_buffer_flush(portid,
971 						qconf->tx_queue_id[portid],
972 						qconf->tx_buffer[portid]);
973 			}
974 			prev_tsc = cur_tsc;
975 		}
976 
977 start_rx:
978 		/*
979 		 * Read packet from RX queues
980 		 */
981 		lcore_rx_idle_count = 0;
982 		for (i = 0; i < qconf->n_rx_queue; ++i) {
983 			rx_queue = &(qconf->rx_queue_list[i]);
984 			rx_queue->idle_hint = 0;
985 			portid = rx_queue->port_id;
986 			queueid = rx_queue->queue_id;
987 
988 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
989 					MAX_PKT_BURST);
990 
991 			stats[lcore_id].nb_rx_processed += nb_rx;
992 			if (unlikely(nb_rx == 0)) {
993 				/**
994 				 * No packet received from the rx queue; try to
995 				 * sleep for a while, forcing the CPU to enter
996 				 * deeper C-states.
997 				 */
998 				rx_queue->zero_rx_packet_count++;
999 
1000 				if (rx_queue->zero_rx_packet_count <=
1001 						MIN_ZERO_POLL_COUNT)
1002 					continue;
1003 
1004 				rx_queue->idle_hint = power_idle_heuristic(
1005 						rx_queue->zero_rx_packet_count);
1006 				lcore_rx_idle_count++;
1007 			} else {
1008 				rx_queue->zero_rx_packet_count = 0;
1009 			}
1010 
1011 			/* Prefetch first packets */
1012 			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1013 				rte_prefetch0(rte_pktmbuf_mtod(
1014 						pkts_burst[j], void *));
1015 			}
1016 
1017 			/* Prefetch and forward already prefetched packets */
1018 			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1019 				rte_prefetch0(rte_pktmbuf_mtod(
1020 						pkts_burst[j + PREFETCH_OFFSET],
1021 						void *));
1022 				l3fwd_simple_forward(
1023 						pkts_burst[j], portid, qconf);
1024 			}
1025 
1026 			/* Forward remaining prefetched packets */
1027 			for (; j < nb_rx; j++) {
1028 				l3fwd_simple_forward(
1029 						pkts_burst[j], portid, qconf);
1030 			}
1031 		}
1032 
1033 		if (unlikely(lcore_rx_idle_count == qconf->n_rx_queue)) {
1034 			/**
1035 			 * All Rx queues were empty in recent consecutive polls;
1036 			 * sleep in a conservative manner, meaning sleep as
1037 			 * little as possible.
1038 			 */
1039 			for (i = 1,
1040 			    lcore_idle_hint = qconf->rx_queue_list[0].idle_hint;
1041 					i < qconf->n_rx_queue; ++i) {
1042 				rx_queue = &(qconf->rx_queue_list[i]);
1043 				if (rx_queue->idle_hint < lcore_idle_hint)
1044 					lcore_idle_hint = rx_queue->idle_hint;
1045 			}
1046 
1047 			if (lcore_idle_hint < SUSPEND_THRESHOLD)
1048 				/**
1049 				 * execute "pause" instruction to avoid context
1050 				 * switch which generally take hundred of
1051 				 * microseconds for short sleep.
1052 				 */
1053 				rte_delay_us(lcore_idle_hint);
1054 			else {
1055 				/* suspend until rx interrupt triggers */
1056 				if (intr_en) {
1057 					turn_on_off_intr(qconf, 1);
1058 					sleep_until_rx_interrupt(
1059 							qconf->n_rx_queue,
1060 							lcore_id);
1061 					turn_on_off_intr(qconf, 0);
1062 					/**
1063 					 * start receiving packets immediately
1064 					 */
1065 					if (likely(!is_done()))
1066 						goto start_rx;
1067 				}
1068 			}
1069 			stats[lcore_id].sleep_time += lcore_idle_hint;
1070 		}
1071 	}
1072 
1073 	return 0;
1074 }
1075 /* >8 End of main processing loop. */
1076 
1077 /* main processing loop */
1078 static int
1079 main_telemetry_loop(__rte_unused void *dummy)
1080 {
1081 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1082 	unsigned int lcore_id;
1083 	uint64_t prev_tsc, diff_tsc, cur_tsc, prev_tel_tsc;
1084 	int i, j, nb_rx;
1085 	uint8_t queueid;
1086 	uint16_t portid;
1087 	struct lcore_conf *qconf;
1088 	struct lcore_rx_queue *rx_queue;
1089 	uint64_t ep_nep[2] = {0}, fp_nfp[2] = {0};
1090 	uint64_t poll_count;
1091 	enum busy_rate br;
1092 
1093 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
1094 					US_PER_S * BURST_TX_DRAIN_US;
1095 
1096 	poll_count = 0;
1097 	prev_tsc = 0;
1098 	prev_tel_tsc = 0;
1099 
1100 	lcore_id = rte_lcore_id();
1101 	qconf = &lcore_conf[lcore_id];
1102 
1103 	if (qconf->n_rx_queue == 0) {
1104 		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n",
1105 			lcore_id);
1106 		return 0;
1107 	}
1108 
1109 	RTE_LOG(INFO, L3FWD_POWER, "entering main telemetry loop on lcore %u\n",
1110 		lcore_id);
1111 
1112 	for (i = 0; i < qconf->n_rx_queue; i++) {
1113 		portid = qconf->rx_queue_list[i].port_id;
1114 		queueid = qconf->rx_queue_list[i].queue_id;
1115 		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u "
1116 			"rxqueueid=%hhu\n", lcore_id, portid, queueid);
1117 	}
1118 
1119 	while (!is_done()) {
1120 
1121 		cur_tsc = rte_rdtsc();
1122 		/*
1123 		 * TX burst queue drain
1124 		 */
1125 		diff_tsc = cur_tsc - prev_tsc;
1126 		if (unlikely(diff_tsc > drain_tsc)) {
1127 			for (i = 0; i < qconf->n_tx_port; ++i) {
1128 				portid = qconf->tx_port_id[i];
1129 				rte_eth_tx_buffer_flush(portid,
1130 						qconf->tx_queue_id[portid],
1131 						qconf->tx_buffer[portid]);
1132 			}
1133 			prev_tsc = cur_tsc;
1134 		}
1135 
1136 		/*
1137 		 * Read packet from RX queues
1138 		 */
1139 		for (i = 0; i < qconf->n_rx_queue; ++i) {
1140 			rx_queue = &(qconf->rx_queue_list[i]);
1141 			portid = rx_queue->port_id;
1142 			queueid = rx_queue->queue_id;
1143 
1144 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
1145 								MAX_PKT_BURST);
1146 			ep_nep[nb_rx == 0]++;
1147 			fp_nfp[nb_rx == MAX_PKT_BURST]++;
1148 			poll_count++;
1149 			if (unlikely(nb_rx == 0))
1150 				continue;
1151 
1152 			/* Prefetch first packets */
1153 			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1154 				rte_prefetch0(rte_pktmbuf_mtod(
1155 						pkts_burst[j], void *));
1156 			}
1157 
1158 			/* Prefetch and forward already prefetched packets */
1159 			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1160 				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1161 						j + PREFETCH_OFFSET], void *));
1162 				l3fwd_simple_forward(pkts_burst[j], portid,
1163 								qconf);
1164 			}
1165 
1166 			/* Forward remaining prefetched packets */
1167 			for (; j < nb_rx; j++) {
1168 				l3fwd_simple_forward(pkts_burst[j], portid,
1169 								qconf);
1170 			}
1171 		}
1172 		if (unlikely(poll_count >= DEFAULT_COUNT)) {
1173 			diff_tsc = cur_tsc - prev_tel_tsc;
1174 			if (diff_tsc >= MAX_CYCLES) {
1175 				br = FULL;
1176 			} else if (diff_tsc > MIN_CYCLES &&
1177 					diff_tsc < MAX_CYCLES) {
1178 				br = (diff_tsc * 100) / MAX_CYCLES;
1179 			} else {
1180 				br = ZERO;
1181 			}
1182 			poll_count = 0;
1183 			prev_tel_tsc = cur_tsc;
1184 			/* update stats for telemetry */
1185 			rte_spinlock_lock(&stats[lcore_id].telemetry_lock);
1186 			stats[lcore_id].ep_nep[0] = ep_nep[0];
1187 			stats[lcore_id].ep_nep[1] = ep_nep[1];
1188 			stats[lcore_id].fp_nfp[0] = fp_nfp[0];
1189 			stats[lcore_id].fp_nfp[1] = fp_nfp[1];
1190 			stats[lcore_id].br = br;
1191 			rte_spinlock_unlock(&stats[lcore_id].telemetry_lock);
1192 		}
1193 	}
1194 
1195 	return 0;
1196 }
1197 
1198 /* main processing loop */
1199 static int
1200 main_legacy_loop(__rte_unused void *dummy)
1201 {
1202 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1203 	unsigned lcore_id;
1204 	uint64_t prev_tsc, diff_tsc, cur_tsc, tim_res_tsc, hz;
1205 	uint64_t prev_tsc_power = 0, cur_tsc_power, diff_tsc_power;
1206 	int i, j, nb_rx;
1207 	uint8_t queueid;
1208 	uint16_t portid;
1209 	struct lcore_conf *qconf;
1210 	struct lcore_rx_queue *rx_queue;
1211 	enum freq_scale_hint_t lcore_scaleup_hint;
1212 	uint32_t lcore_rx_idle_count = 0;
1213 	uint32_t lcore_idle_hint = 0;
1214 	int intr_en = 0;
1215 
1216 	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1217 
1218 	prev_tsc = 0;
1219 	hz = rte_get_timer_hz();
1220 	tim_res_tsc = hz/TIMER_NUMBER_PER_SECOND;
1221 
1222 	lcore_id = rte_lcore_id();
1223 	qconf = &lcore_conf[lcore_id];
1224 
1225 	if (qconf->n_rx_queue == 0) {
1226 		RTE_LOG(INFO, L3FWD_POWER, "lcore %u has nothing to do\n", lcore_id);
1227 		return 0;
1228 	}
1229 
1230 	RTE_LOG(INFO, L3FWD_POWER, "entering main loop on lcore %u\n", lcore_id);
1231 
1232 	for (i = 0; i < qconf->n_rx_queue; i++) {
1233 		portid = qconf->rx_queue_list[i].port_id;
1234 		queueid = qconf->rx_queue_list[i].queue_id;
1235 		RTE_LOG(INFO, L3FWD_POWER, " -- lcoreid=%u portid=%u "
1236 			"rxqueueid=%hhu\n", lcore_id, portid, queueid);
1237 	}
1238 
1239 	/* add into event wait list */
1240 	if (event_register(qconf) == 0)
1241 		intr_en = 1;
1242 	else
1243 		RTE_LOG(INFO, L3FWD_POWER, "RX interrupt won't enable.\n");
1244 
1245 	while (!is_done()) {
1246 		stats[lcore_id].nb_iteration_looped++;
1247 
1248 		cur_tsc = rte_rdtsc();
1249 		cur_tsc_power = cur_tsc;
1250 
1251 		/*
1252 		 * TX burst queue drain
1253 		 */
1254 		diff_tsc = cur_tsc - prev_tsc;
1255 		if (unlikely(diff_tsc > drain_tsc)) {
1256 			for (i = 0; i < qconf->n_tx_port; ++i) {
1257 				portid = qconf->tx_port_id[i];
1258 				rte_eth_tx_buffer_flush(portid,
1259 						qconf->tx_queue_id[portid],
1260 						qconf->tx_buffer[portid]);
1261 			}
1262 			prev_tsc = cur_tsc;
1263 		}
1264 
1265 		diff_tsc_power = cur_tsc_power - prev_tsc_power;
1266 		if (diff_tsc_power > tim_res_tsc) {
1267 			rte_timer_manage();
1268 			prev_tsc_power = cur_tsc_power;
1269 		}
1270 
1271 start_rx:
1272 		/*
1273 		 * Read packet from RX queues
1274 		 */
1275 		lcore_scaleup_hint = FREQ_CURRENT;
1276 		lcore_rx_idle_count = 0;
1277 		for (i = 0; i < qconf->n_rx_queue; ++i) {
1278 			rx_queue = &(qconf->rx_queue_list[i]);
1279 			rx_queue->idle_hint = 0;
1280 			portid = rx_queue->port_id;
1281 			queueid = rx_queue->queue_id;
1282 
1283 			nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
1284 								MAX_PKT_BURST);
1285 
1286 			stats[lcore_id].nb_rx_processed += nb_rx;
1287 			if (unlikely(nb_rx == 0)) {
1288 				/**
1289 				 * No packet received from the rx queue; try to
1290 				 * sleep for a while, forcing the CPU to enter
1291 				 * deeper C-states.
1292 				 */
1293 				rx_queue->zero_rx_packet_count++;
1294 
1295 				if (rx_queue->zero_rx_packet_count <=
1296 							MIN_ZERO_POLL_COUNT)
1297 					continue;
1298 
1299 				rx_queue->idle_hint = power_idle_heuristic(\
1300 					rx_queue->zero_rx_packet_count);
1301 				lcore_rx_idle_count++;
1302 			} else {
1303 				rx_queue->zero_rx_packet_count = 0;
1304 
1305 				/**
1306 				 * Do not scale up the frequency immediately:
1307 				 * the user-space to kernel-space call is costly
1308 				 * and might impact packet I/O for the packets
1309 				 * just received.
1310 				 */
1311 				rx_queue->freq_up_hint =
1312 					power_freq_scaleup_heuristic(lcore_id,
1313 							portid, queueid);
1314 			}
1315 
1316 			/* Prefetch first packets */
1317 			for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1318 				rte_prefetch0(rte_pktmbuf_mtod(
1319 						pkts_burst[j], void *));
1320 			}
1321 
1322 			/* Prefetch and forward already prefetched packets */
1323 			for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1324 				rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1325 						j + PREFETCH_OFFSET], void *));
1326 				l3fwd_simple_forward(pkts_burst[j], portid,
1327 								qconf);
1328 			}
1329 
1330 			/* Forward remaining prefetched packets */
1331 			for (; j < nb_rx; j++) {
1332 				l3fwd_simple_forward(pkts_burst[j], portid,
1333 								qconf);
1334 			}
1335 		}
1336 
1337 		if (likely(lcore_rx_idle_count != qconf->n_rx_queue)) {
1338 			for (i = 1, lcore_scaleup_hint =
1339 				qconf->rx_queue_list[0].freq_up_hint;
1340 					i < qconf->n_rx_queue; ++i) {
1341 				rx_queue = &(qconf->rx_queue_list[i]);
1342 				if (rx_queue->freq_up_hint >
1343 						lcore_scaleup_hint)
1344 					lcore_scaleup_hint =
1345 						rx_queue->freq_up_hint;
1346 			}
1347 
1348 			if (lcore_scaleup_hint == FREQ_HIGHEST) {
1349 				if (rte_power_freq_max)
1350 					rte_power_freq_max(lcore_id);
1351 			} else if (lcore_scaleup_hint == FREQ_HIGHER) {
1352 				if (rte_power_freq_up)
1353 					rte_power_freq_up(lcore_id);
1354 			}
1355 		} else {
1356 			/**
1357 			 * All Rx queues were empty in recent consecutive polls;
1358 			 * sleep in a conservative manner, meaning sleep as
1359 			 * little as possible.
1360 			 */
1361 			for (i = 1, lcore_idle_hint =
1362 				qconf->rx_queue_list[0].idle_hint;
1363 					i < qconf->n_rx_queue; ++i) {
1364 				rx_queue = &(qconf->rx_queue_list[i]);
1365 				if (rx_queue->idle_hint < lcore_idle_hint)
1366 					lcore_idle_hint = rx_queue->idle_hint;
1367 			}
1368 
1369 			if (lcore_idle_hint < SUSPEND_THRESHOLD)
1370 				/**
1371 				 * execute "pause" instruction to avoid context
1372 				 * switch which generally take hundred of
1373 				 * microseconds for short sleep.
1374 				 */
1375 				rte_delay_us(lcore_idle_hint);
1376 			else {
1377 				/* suspend until rx interrupt triggers */
1378 				if (intr_en) {
1379 					turn_on_off_intr(qconf, 1);
1380 					sleep_until_rx_interrupt(
1381 							qconf->n_rx_queue,
1382 							lcore_id);
1383 					turn_on_off_intr(qconf, 0);
1384 					/**
1385 					 * start receiving packets immediately
1386 					 */
1387 					if (likely(!is_done()))
1388 						goto start_rx;
1389 				}
1390 			}
1391 			stats[lcore_id].sleep_time += lcore_idle_hint;
1392 		}
1393 	}
1394 
1395 	return 0;
1396 }
1397 
1398 static int
1399 check_lcore_params(void)
1400 {
1401 	uint8_t queue, lcore;
1402 	uint16_t i;
1403 	int socketid;
1404 
1405 	for (i = 0; i < nb_lcore_params; ++i) {
1406 		queue = lcore_params[i].queue_id;
1407 		if (queue >= MAX_RX_QUEUE_PER_PORT) {
1408 			printf("invalid queue number: %hhu\n", queue);
1409 			return -1;
1410 		}
1411 		lcore = lcore_params[i].lcore_id;
1412 		if (!rte_lcore_is_enabled(lcore)) {
1413 			printf("error: lcore %hhu is not enabled in lcore "
1414 							"mask\n", lcore);
1415 			return -1;
1416 		}
1417 		if ((socketid = rte_lcore_to_socket_id(lcore)) != 0 &&
1418 							(numa_on == 0)) {
1419 			printf("warning: lcore %hhu is on socket %d with numa "
1420 						"off\n", lcore, socketid);
1421 		}
1422 		if (app_mode == APP_MODE_TELEMETRY && lcore == rte_lcore_id()) {
1423 			printf("cannot enable main core %d in config for telemetry mode\n",
1424 				rte_lcore_id());
1425 			return -1;
1426 		}
1427 	}
1428 	return 0;
1429 }
1430 
1431 static int
1432 check_port_config(void)
1433 {
1434 	unsigned portid;
1435 	uint16_t i;
1436 
1437 	for (i = 0; i < nb_lcore_params; ++i) {
1438 		portid = lcore_params[i].port_id;
1439 		if ((enabled_port_mask & (1 << portid)) == 0) {
1440 			printf("port %u is not enabled in port mask\n",
1441 								portid);
1442 			return -1;
1443 		}
1444 		if (!rte_eth_dev_is_valid_port(portid)) {
1445 			printf("port %u is not present on the board\n",
1446 								portid);
1447 			return -1;
1448 		}
1449 	}
1450 	return 0;
1451 }
1452 
1453 static uint8_t
1454 get_port_n_rx_queues(const uint16_t port)
1455 {
1456 	int queue = -1;
1457 	uint16_t i;
1458 
1459 	for (i = 0; i < nb_lcore_params; ++i) {
1460 		if (lcore_params[i].port_id == port &&
1461 				lcore_params[i].queue_id > queue)
1462 			queue = lcore_params[i].queue_id;
1463 	}
1464 	return (uint8_t)(++queue);
1465 }
1466 
1467 static int
1468 init_lcore_rx_queues(void)
1469 {
1470 	uint16_t i, nb_rx_queue;
1471 	uint8_t lcore;
1472 
1473 	for (i = 0; i < nb_lcore_params; ++i) {
1474 		lcore = lcore_params[i].lcore_id;
1475 		nb_rx_queue = lcore_conf[lcore].n_rx_queue;
1476 		if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) {
1477 			printf("error: too many queues (%u) for lcore: %u\n",
1478 				(unsigned)nb_rx_queue + 1, (unsigned)lcore);
1479 			return -1;
1480 		} else {
1481 			lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id =
1482 				lcore_params[i].port_id;
1483 			lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id =
1484 				lcore_params[i].queue_id;
1485 			lcore_conf[lcore].n_rx_queue++;
1486 		}
1487 	}
1488 	return 0;
1489 }
1490 
1491 /* display usage */
1492 static void
1493 print_usage(const char *prgname)
1494 {
1495 	printf ("%s [EAL options] -- -p PORTMASK -P"
1496 		"  [--config (port,queue,lcore)[,(port,queue,lcore]]"
1497 		"  [--high-perf-cores CORELIST"
1498 		"  [--perf-config (port,queue,hi_perf,lcore_index)[,(port,queue,hi_perf,lcore_index]]"
1499 		"  [--max-pkt-len PKTLEN]\n"
1500 		"  -p PORTMASK: hexadecimal bitmask of ports to configure\n"
1501 		"  -P: enable promiscuous mode\n"
1502 		"  -u: set min/max frequency for uncore to minimum value\n"
1503 		"  -U: set min/max frequency for uncore to maximum value\n"
1504 		"  -i (frequency index): set min/max frequency for uncore to specified frequency index\n"
1505 		"  --config (port,queue,lcore): rx queues configuration\n"
1506 		"  --high-perf-cores CORELIST: list of high performance cores\n"
1507 		"  --perf-config: similar as config, cores specified as indices"
1508 		" for bins containing high or regular performance cores\n"
1509 		"  --no-numa: optional, disable numa awareness\n"
1510 		"  --max-pkt-len PKTLEN: maximum packet length in decimal (64-9600)\n"
1511 		"  --parse-ptype: parse packet type by software\n"
1512 		"  --legacy: use legacy interrupt-based scaling\n"
1513 		" --telemetry: enable telemetry mode, to update"
1514 		" empty polls, full polls, and core busyness to telemetry\n"
1515 		" --interrupt-only: enable interrupt-only mode\n"
1516 		" --pmd-mgmt MODE: enable PMD power management mode. "
1517 		"Currently supported modes: baseline, monitor, pause, scale\n"
1518 		"  --max-empty-polls MAX_EMPTY_POLLS: number of empty polls to"
1519 		" wait before entering sleep state\n"
1520 		"  --pause-duration DURATION: set the duration, in microseconds,"
1521 		" of the pause callback\n"
1522 		"  --scale-freq-min FREQ_MIN: set minimum frequency for scaling mode for"
1523 		" all application lcores (FREQ_MIN must be in kHz, in increments of 100MHz)\n"
1524 		"  --scale-freq-max FREQ_MAX: set maximum frequency for scaling mode for"
1525 		" all application lcores (FREQ_MAX must be in kHz, in increments of 100MHz)\n",
1526 		prgname);
1527 }
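/*
 * Example invocation (illustrative core list and port mask):
 *   ./dpdk-l3fwd-power -l 1-3 -n 4 -- -p 0x3 -P \
 *       --config="(0,0,2),(0,1,3),(1,0,2),(1,1,3)"
 * forwards traffic from ports 0 and 1 on lcores 2 and 3 with power
 * management enabled.
 */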
1528 
1529 static int
1530 parse_int(const char *opt)
1531 {
1532 	char *end = NULL;
1533 	unsigned long val;
1534 
1535 	/* parse integer string */
1536 	val = strtoul(opt, &end, 10);
1537 	if ((opt[0] == '\0') || (end == NULL) || (*end != '\0'))
1538 		return -1;
1539 
1540 	return val;
1541 }
1542 
1543 static int parse_max_pkt_len(const char *pktlen)
1544 {
1545 	char *end = NULL;
1546 	unsigned long len;
1547 
1548 	/* parse decimal string */
1549 	len = strtoul(pktlen, &end, 10);
1550 	if ((pktlen[0] == '\0') || (end == NULL) || (*end != '\0'))
1551 		return -1;
1552 
1553 	if (len == 0)
1554 		return -1;
1555 
1556 	return len;
1557 }
1558 
1559 static int
1560 parse_uncore_options(enum uncore_choice choice, const char *argument)
1561 {
1562 	unsigned int die, pkg, max_pkg, max_die;
1563 	int ret = 0;
1564 	max_pkg = rte_power_uncore_get_num_pkgs();
1565 	if (max_pkg == 0)
1566 		return -1;
1567 
1568 	for (pkg = 0; pkg < max_pkg; pkg++) {
1569 		max_die = rte_power_uncore_get_num_dies(pkg);
1570 		if (max_die == 0)
1571 			return -1;
1572 		for (die = 0; die < max_die; die++) {
1573 			ret = rte_power_uncore_init(pkg, die);
1574 			if (ret == -1) {
1575 				RTE_LOG(INFO, L3FWD_POWER, "Unable to initialize uncore for pkg %02u die %02u\n"
1576 				, pkg, die);
1577 				return ret;
1578 			}
1579 			if (choice == UNCORE_MIN) {
1580 				ret = rte_power_uncore_freq_min(pkg, die);
1581 				if (ret == -1) {
1582 					RTE_LOG(INFO, L3FWD_POWER,
1583 					"Unable to set the uncore min/max to minimum uncore frequency value for pkg %02u die %02u\n"
1584 					, pkg, die);
1585 					return ret;
1586 				}
1587 			} else if (choice == UNCORE_MAX) {
1588 				ret = rte_power_uncore_freq_max(pkg, die);
1589 				if (ret == -1) {
1590 					RTE_LOG(INFO, L3FWD_POWER,
1591 					"Unable to set uncore min/max to maximum uncore frequency value for pkg %02u die %02u\n"
1592 					, pkg, die);
1593 					return ret;
1594 				}
1595 			} else if (choice == UNCORE_IDX) {
1596 				char *ptr = NULL;
1597 				int frequency_index = strtol(argument, &ptr, 10);
1598 				if (argument == ptr) {
1599 					RTE_LOG(INFO, L3FWD_POWER, "Index given is not a valid number.");
1600 					return -1;
1601 				}
1602 				int freq_array_len = rte_power_uncore_get_num_freqs(pkg, die);
1603 				if (frequency_index > freq_array_len - 1) {
1604 					RTE_LOG(INFO, L3FWD_POWER,
1605 					"Frequency index given out of range, please choose a value from 0 to %d.\n",
1606 					freq_array_len);
1607 					return -1;
1608 				}
1609 				ret = rte_power_set_uncore_freq(pkg, die, frequency_index);
1610 				if (ret == -1) {
1611 					RTE_LOG(INFO, L3FWD_POWER,
1612 					"Unable to set min/max uncore index value for pkg %02u die %02u\n",
1613 					pkg, die);
1614 					return ret;
1615 				}
1616 			} else {
1617 				RTE_LOG(INFO, L3FWD_POWER, "Uncore choice provided invalid\n");
1618 				return -1;
1619 			}
1620 		}
1621 	}
1622 
1623 	RTE_LOG(INFO, L3FWD_POWER, "Successfully set max/min/index uncore frequency.\n");
1624 	return ret;
1625 }
1626 
1627 static int
1628 parse_portmask(const char *portmask)
1629 {
1630 	char *end = NULL;
1631 	unsigned long pm;
1632 
1633 	/* parse hexadecimal string */
1634 	pm = strtoul(portmask, &end, 16);
1635 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0'))
1636 		return 0;
1637 
1638 	return pm;
1639 }
1640 
1641 static int
1642 parse_config(const char *q_arg)
1643 {
1644 	char s[256];
1645 	const char *p, *p0 = q_arg;
1646 	char *end;
1647 	enum fieldnames {
1648 		FLD_PORT = 0,
1649 		FLD_QUEUE,
1650 		FLD_LCORE,
1651 		_NUM_FLD
1652 	};
1653 	unsigned long int_fld[_NUM_FLD];
1654 	char *str_fld[_NUM_FLD];
1655 	int i;
1656 	unsigned size;
1657 
1658 	nb_lcore_params = 0;
1659 
1660 	while ((p = strchr(p0,'(')) != NULL) {
1661 		++p;
1662 		if((p0 = strchr(p,')')) == NULL)
1663 			return -1;
1664 
1665 		size = p0 - p;
1666 		if(size >= sizeof(s))
1667 			return -1;
1668 
1669 		snprintf(s, sizeof(s), "%.*s", size, p);
1670 		if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') !=
1671 								_NUM_FLD)
1672 			return -1;
1673 		for (i = 0; i < _NUM_FLD; i++){
1674 			errno = 0;
1675 			int_fld[i] = strtoul(str_fld[i], &end, 0);
1676 			if (errno != 0 || end == str_fld[i] || int_fld[i] >
1677 									255)
1678 				return -1;
1679 		}
1680 		if (nb_lcore_params >= MAX_LCORE_PARAMS) {
1681 			printf("exceeded max number of lcore params: %hu\n",
1682 				nb_lcore_params);
1683 			return -1;
1684 		}
1685 		lcore_params_array[nb_lcore_params].port_id =
1686 				(uint8_t)int_fld[FLD_PORT];
1687 		lcore_params_array[nb_lcore_params].queue_id =
1688 				(uint8_t)int_fld[FLD_QUEUE];
1689 		lcore_params_array[nb_lcore_params].lcore_id =
1690 				(uint8_t)int_fld[FLD_LCORE];
1691 		++nb_lcore_params;
1692 	}
1693 	lcore_params = lcore_params_array;
1694 
1695 	return 0;
1696 }
1697 
1698 static int
1699 parse_pmd_mgmt_config(const char *name)
1700 {
1701 #define PMD_MGMT_MONITOR "monitor"
1702 #define PMD_MGMT_PAUSE   "pause"
1703 #define PMD_MGMT_SCALE   "scale"
1704 #define PMD_MGMT_BASELINE  "baseline"
1705 
1706 	if (strncmp(PMD_MGMT_MONITOR, name, sizeof(PMD_MGMT_MONITOR)) == 0) {
1707 		pmgmt_type = RTE_POWER_MGMT_TYPE_MONITOR;
1708 		return 0;
1709 	}
1710 
1711 	if (strncmp(PMD_MGMT_PAUSE, name, sizeof(PMD_MGMT_PAUSE)) == 0) {
1712 		pmgmt_type = RTE_POWER_MGMT_TYPE_PAUSE;
1713 		return 0;
1714 	}
1715 
1716 	if (strncmp(PMD_MGMT_SCALE, name, sizeof(PMD_MGMT_SCALE)) == 0) {
1717 		pmgmt_type = RTE_POWER_MGMT_TYPE_SCALE;
1718 		return 0;
1719 	}
1720 	if (strncmp(PMD_MGMT_BASELINE, name, sizeof(PMD_MGMT_BASELINE)) == 0) {
1721 		baseline_enabled = true;
1722 		return 0;
1723 	}
1724 	/* unknown PMD power management mode */
1725 	return -1;
1726 }
1727 
1728 #define CMD_LINE_OPT_PARSE_PTYPE "parse-ptype"
1729 #define CMD_LINE_OPT_LEGACY "legacy"
1730 #define CMD_LINE_OPT_INTERRUPT_ONLY "interrupt-only"
1731 #define CMD_LINE_OPT_TELEMETRY "telemetry"
1732 #define CMD_LINE_OPT_PMD_MGMT "pmd-mgmt"
1733 #define CMD_LINE_OPT_MAX_PKT_LEN "max-pkt-len"
1734 #define CMD_LINE_OPT_MAX_EMPTY_POLLS "max-empty-polls"
1735 #define CMD_LINE_OPT_PAUSE_DURATION "pause-duration"
1736 #define CMD_LINE_OPT_SCALE_FREQ_MIN "scale-freq-min"
1737 #define CMD_LINE_OPT_SCALE_FREQ_MAX "scale-freq-max"
1738 
1739 /* Parse the argument given in the command line of the application */
1740 static int
1741 parse_args(int argc, char **argv)
1742 {
1743 	int opt, ret;
1744 	char **argvopt;
1745 	int option_index;
1746 	char *prgname = argv[0];
1747 	static struct option lgopts[] = {
1748 		{"config", 1, 0, 0},
1749 		{"perf-config", 1, 0, 0},
1750 		{"high-perf-cores", 1, 0, 0},
1751 		{"no-numa", 0, 0, 0},
1752 		{CMD_LINE_OPT_MAX_PKT_LEN, 1, 0, 0},
1753 		{CMD_LINE_OPT_PARSE_PTYPE, 0, 0, 0},
1754 		{CMD_LINE_OPT_LEGACY, 0, 0, 0},
1755 		{CMD_LINE_OPT_TELEMETRY, 0, 0, 0},
1756 		{CMD_LINE_OPT_INTERRUPT_ONLY, 0, 0, 0},
1757 		{CMD_LINE_OPT_PMD_MGMT, 1, 0, 0},
1758 		{CMD_LINE_OPT_MAX_EMPTY_POLLS, 1, 0, 0},
1759 		{CMD_LINE_OPT_PAUSE_DURATION, 1, 0, 0},
1760 		{CMD_LINE_OPT_SCALE_FREQ_MIN, 1, 0, 0},
1761 		{CMD_LINE_OPT_SCALE_FREQ_MAX, 1, 0, 0},
1762 		{NULL, 0, 0, 0}
1763 	};
1764 
1765 	argvopt = argv;
1766 
1767 	while ((opt = getopt_long(argc, argvopt, "p:PuUi:",
1768 				lgopts, &option_index)) != EOF) {
1769 
1770 		switch (opt) {
1771 		/* portmask */
1772 		case 'p':
1773 			enabled_port_mask = parse_portmask(optarg);
1774 			if (enabled_port_mask == 0) {
1775 				printf("invalid portmask\n");
1776 				print_usage(prgname);
1777 				return -1;
1778 			}
1779 			break;
1780 		case 'P':
1781 			printf("Promiscuous mode selected\n");
1782 			promiscuous_on = 1;
1783 			break;
1784 		case 'u':
1785 			enabled_uncore = parse_uncore_options(UNCORE_MIN, NULL);
1786 			if (enabled_uncore < 0) {
1787 				print_usage(prgname);
1788 				return -1;
1789 			}
1790 			break;
1791 		case 'U':
1792 			enabled_uncore = parse_uncore_options(UNCORE_MAX, NULL);
1793 			if (enabled_uncore < 0) {
1794 				print_usage(prgname);
1795 				return -1;
1796 			}
1797 			break;
1798 		case 'i':
1799 			enabled_uncore = parse_uncore_options(UNCORE_IDX, optarg);
1800 			if (enabled_uncore < 0) {
1801 				print_usage(prgname);
1802 				return -1;
1803 			}
1804 			break;
1805 		/* long options */
1806 		case 0:
1807 			if (!strncmp(lgopts[option_index].name, "config", 6)) {
1808 				ret = parse_config(optarg);
1809 				if (ret) {
1810 					printf("invalid config\n");
1811 					print_usage(prgname);
1812 					return -1;
1813 				}
1814 			}
1815 
1816 			if (!strncmp(lgopts[option_index].name,
1817 					"perf-config", 11)) {
1818 				ret = parse_perf_config(optarg);
1819 				if (ret) {
1820 					printf("invalid perf-config\n");
1821 					print_usage(prgname);
1822 					return -1;
1823 				}
1824 			}
1825 
1826 			if (!strncmp(lgopts[option_index].name,
1827 					"high-perf-cores", 15)) {
1828 				ret = parse_perf_core_list(optarg);
1829 				if (ret) {
1830 					printf("invalid high-perf-cores\n");
1831 					print_usage(prgname);
1832 					return -1;
1833 				}
1834 			}
1835 
1836 			if (!strncmp(lgopts[option_index].name,
1837 						"no-numa", 7)) {
1838 				printf("numa is disabled \n");
1839 				numa_on = 0;
1840 			}
1841 
1842 			if (!strncmp(lgopts[option_index].name,
1843 					CMD_LINE_OPT_LEGACY,
1844 					sizeof(CMD_LINE_OPT_LEGACY))) {
1845 				if (app_mode != APP_MODE_DEFAULT) {
1846 					printf(" legacy mode is mutually exclusive with other modes\n");
1847 					return -1;
1848 				}
1849 				app_mode = APP_MODE_LEGACY;
1850 				printf("legacy mode is enabled\n");
1851 			}
1852 
1853 			if (!strncmp(lgopts[option_index].name,
1854 					CMD_LINE_OPT_TELEMETRY,
1855 					sizeof(CMD_LINE_OPT_TELEMETRY))) {
1856 				if (app_mode != APP_MODE_DEFAULT) {
1857 					printf(" telemetry mode is mutually exclusive with other modes\n");
1858 					return -1;
1859 				}
1860 				app_mode = APP_MODE_TELEMETRY;
1861 				printf("telemetry mode is enabled\n");
1862 			}
1863 
1864 			if (!strncmp(lgopts[option_index].name,
1865 					CMD_LINE_OPT_PMD_MGMT,
1866 					sizeof(CMD_LINE_OPT_PMD_MGMT))) {
1867 				if (app_mode != APP_MODE_DEFAULT) {
1868 					printf(" power mgmt mode is mutually exclusive with other modes\n");
1869 					return -1;
1870 				}
1871 				if (parse_pmd_mgmt_config(optarg) < 0) {
1872 					printf(" Invalid PMD power management mode: %s\n",
1873 							optarg);
1874 					return -1;
1875 				}
1876 				app_mode = APP_MODE_PMD_MGMT;
1877 				printf("PMD power mgmt mode is enabled\n");
1878 			}
1879 			if (!strncmp(lgopts[option_index].name,
1880 					CMD_LINE_OPT_INTERRUPT_ONLY,
1881 					sizeof(CMD_LINE_OPT_INTERRUPT_ONLY))) {
1882 				if (app_mode != APP_MODE_DEFAULT) {
1883 					printf(" interrupt-only mode is mutually exclusive with other modes\n");
1884 					return -1;
1885 				}
1886 				app_mode = APP_MODE_INTERRUPT;
1887 				printf("interrupt-only mode is enabled\n");
1888 			}
1889 
1890 			if (!strncmp(lgopts[option_index].name,
1891 					CMD_LINE_OPT_MAX_PKT_LEN,
1892 					sizeof(CMD_LINE_OPT_MAX_PKT_LEN))) {
1893 				printf("Custom frame size is configured\n");
1894 				max_pkt_len = parse_max_pkt_len(optarg);
1895 			}
1896 
1897 			if (!strncmp(lgopts[option_index].name,
1898 				     CMD_LINE_OPT_PARSE_PTYPE,
1899 				     sizeof(CMD_LINE_OPT_PARSE_PTYPE))) {
1900 				printf("soft parse-ptype is enabled\n");
1901 				parse_ptype = 1;
1902 			}
1903 
1904 			if (!strncmp(lgopts[option_index].name,
1905 					CMD_LINE_OPT_MAX_EMPTY_POLLS,
1906 					sizeof(CMD_LINE_OPT_MAX_EMPTY_POLLS))) {
1907 				printf("Maximum empty polls configured\n");
1908 				max_empty_polls = parse_int(optarg);
1909 			}
1910 
1911 			if (!strncmp(lgopts[option_index].name,
1912 					CMD_LINE_OPT_PAUSE_DURATION,
1913 					sizeof(CMD_LINE_OPT_PAUSE_DURATION))) {
1914 				printf("Pause duration configured\n");
1915 				pause_duration = parse_int(optarg);
1916 			}
1917 
1918 			if (!strncmp(lgopts[option_index].name,
1919 					CMD_LINE_OPT_SCALE_FREQ_MIN,
1920 					sizeof(CMD_LINE_OPT_SCALE_FREQ_MIN))) {
1921 				printf("Scaling frequency minimum configured\n");
1922 				scale_freq_min = parse_int(optarg);
1923 			}
1924 
1925 			if (!strncmp(lgopts[option_index].name,
1926 					CMD_LINE_OPT_SCALE_FREQ_MAX,
1927 					sizeof(CMD_LINE_OPT_SCALE_FREQ_MAX))) {
1928 				printf("Scaling frequency maximum configured\n");
1929 				scale_freq_max = parse_int(optarg);
1930 			}
1931 
1932 			break;
1933 
1934 		default:
1935 			print_usage(prgname);
1936 			return -1;
1937 		}
1938 	}
1939 
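	/* put the program name back in front of the remaining arguments so they form a valid argv */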
1940 	if (optind >= 0)
1941 		argv[optind-1] = prgname;
1942 
1943 	ret = optind-1;
1944 	optind = 1; /* reset getopt lib */
1945 	return ret;
1946 }
1947 
1948 static void
1949 print_ethaddr(const char *name, const struct rte_ether_addr *eth_addr)
1950 {
1951 	char buf[RTE_ETHER_ADDR_FMT_SIZE];
1952 	rte_ether_format_addr(buf, RTE_ETHER_ADDR_FMT_SIZE, eth_addr);
1953 	printf("%s%s", name, buf);
1954 }
1955 
1956 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
1957 static void
1958 setup_hash(int socketid)
1959 {
1960 	struct rte_hash_parameters ipv4_l3fwd_hash_params = {
1961 		.name = NULL,
1962 		.entries = L3FWD_HASH_ENTRIES,
1963 		.key_len = sizeof(struct ipv4_5tuple),
1964 		.hash_func = DEFAULT_HASH_FUNC,
1965 		.hash_func_init_val = 0,
1966 	};
1967 
1968 	struct rte_hash_parameters ipv6_l3fwd_hash_params = {
1969 		.name = NULL,
1970 		.entries = L3FWD_HASH_ENTRIES,
1971 		.key_len = sizeof(struct ipv6_5tuple),
1972 		.hash_func = DEFAULT_HASH_FUNC,
1973 		.hash_func_init_val = 0,
1974 	};
1975 
1976 	unsigned i;
1977 	int ret;
1978 	char s[64];
1979 
1980 	/* create ipv4 hash */
1981 	snprintf(s, sizeof(s), "ipv4_l3fwd_hash_%d", socketid);
1982 	ipv4_l3fwd_hash_params.name = s;
1983 	ipv4_l3fwd_hash_params.socket_id = socketid;
1984 	ipv4_l3fwd_lookup_struct[socketid] =
1985 		rte_hash_create(&ipv4_l3fwd_hash_params);
1986 	if (ipv4_l3fwd_lookup_struct[socketid] == NULL)
1987 		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on "
1988 				"socket %d\n", socketid);
1989 
1990 	/* create ipv6 hash */
1991 	snprintf(s, sizeof(s), "ipv6_l3fwd_hash_%d", socketid);
1992 	ipv6_l3fwd_hash_params.name = s;
1993 	ipv6_l3fwd_hash_params.socket_id = socketid;
1994 	ipv6_l3fwd_lookup_struct[socketid] =
1995 		rte_hash_create(&ipv6_l3fwd_hash_params);
1996 	if (ipv6_l3fwd_lookup_struct[socketid] == NULL)
1997 		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd hash on "
1998 				"socket %d\n", socketid);
1999 
2000 
2001 	/* populate the ipv4 hash */
2002 	for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) {
2003 		ret = rte_hash_add_key(ipv4_l3fwd_lookup_struct[socketid],
2004 				(void *) &ipv4_l3fwd_route_array[i].key);
2005 		if (ret < 0) {
2006 			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
2007 				"l3fwd hash on socket %d\n", i, socketid);
2008 		}
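		/* rte_hash_add_key() returns the key index on success; use it to record the output interface */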
2009 		ipv4_l3fwd_out_if[ret] = ipv4_l3fwd_route_array[i].if_out;
2010 		printf("Hash: Adding key\n");
2011 		print_ipv4_key(ipv4_l3fwd_route_array[i].key);
2012 	}
2013 
2014 	/* populate the ipv6 hash */
2015 	for (i = 0; i < RTE_DIM(ipv6_l3fwd_route_array); i++) {
2016 		ret = rte_hash_add_key(ipv6_l3fwd_lookup_struct[socketid],
2017 				(void *) &ipv6_l3fwd_route_array[i].key);
2018 		if (ret < 0) {
2019 			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
2020 				"l3fwd hash on socket %d\n", i, socketid);
2021 		}
2022 		ipv6_l3fwd_out_if[ret] = ipv6_l3fwd_route_array[i].if_out;
2023 		printf("Hash: Adding key\n");
2024 		print_ipv6_key(ipv6_l3fwd_route_array[i].key);
2025 	}
2026 }
2027 #endif
2028 
2029 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
2030 static void
2031 setup_lpm(int socketid)
2032 {
2033 	unsigned i;
2034 	int ret;
2035 	char s[64];
2036 
2037 	/* create the LPM table */
2038 	struct rte_lpm_config lpm_ipv4_config;
2039 
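	/* fixed-size LPM configuration; the example route table easily fits in 256 tbl8 groups */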
2040 	lpm_ipv4_config.max_rules = IPV4_L3FWD_LPM_MAX_RULES;
2041 	lpm_ipv4_config.number_tbl8s = 256;
2042 	lpm_ipv4_config.flags = 0;
2043 
2044 	snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid);
2045 	ipv4_l3fwd_lookup_struct[socketid] =
2046 			rte_lpm_create(s, socketid, &lpm_ipv4_config);
2047 	if (ipv4_l3fwd_lookup_struct[socketid] == NULL)
2048 		rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table"
2049 				" on socket %d\n", socketid);
2050 
2051 	/* populate the LPM table */
2052 	for (i = 0; i < RTE_DIM(ipv4_l3fwd_route_array); i++) {
2053 		ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid],
2054 			ipv4_l3fwd_route_array[i].ip,
2055 			ipv4_l3fwd_route_array[i].depth,
2056 			ipv4_l3fwd_route_array[i].if_out);
2057 
2058 		if (ret < 0) {
2059 			rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
2060 				"l3fwd LPM table on socket %d\n",
2061 				i, socketid);
2062 		}
2063 
2064 		printf("LPM: Adding route 0x%08x / %d (%d)\n",
2065 			(unsigned)ipv4_l3fwd_route_array[i].ip,
2066 			ipv4_l3fwd_route_array[i].depth,
2067 			ipv4_l3fwd_route_array[i].if_out);
2068 	}
2069 }
2070 #endif
2071 
2072 static int
2073 init_mem(unsigned nb_mbuf)
2074 {
2075 	struct lcore_conf *qconf;
2076 	int socketid;
2077 	unsigned lcore_id;
2078 	char s[64];
2079 
2080 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
2081 		if (rte_lcore_is_enabled(lcore_id) == 0)
2082 			continue;
2083 
2084 		if (numa_on)
2085 			socketid = rte_lcore_to_socket_id(lcore_id);
2086 		else
2087 			socketid = 0;
2088 
2089 		if (socketid >= NB_SOCKETS) {
2090 			rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is "
2091 					"out of range %d\n", socketid,
2092 						lcore_id, NB_SOCKETS);
2093 		}
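		/* one mbuf pool and one lookup table are created per NUMA socket and shared by all lcores on that socket */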
2094 		if (pktmbuf_pool[socketid] == NULL) {
2095 			snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
2096 			pktmbuf_pool[socketid] =
2097 				rte_pktmbuf_pool_create(s, nb_mbuf,
2098 					MEMPOOL_CACHE_SIZE, 0,
2099 					RTE_MBUF_DEFAULT_BUF_SIZE,
2100 					socketid);
2101 			if (pktmbuf_pool[socketid] == NULL)
2102 				rte_exit(EXIT_FAILURE,
2103 					"Cannot init mbuf pool on socket %d\n",
2104 								socketid);
2105 			else
2106 				printf("Allocated mbuf pool on socket %d\n",
2107 								socketid);
2108 
2109 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
2110 			setup_lpm(socketid);
2111 #else
2112 			setup_hash(socketid);
2113 #endif
2114 		}
2115 		qconf = &lcore_conf[lcore_id];
2116 		qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid];
2117 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
2118 		qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid];
2119 #endif
2120 	}
2121 	return 0;
2122 }
2123 
2124 /* Check the link status of all enabled ports; wait up to 9 s and print the final status */
2125 static void
2126 check_all_ports_link_status(uint32_t port_mask)
2127 {
2128 #define CHECK_INTERVAL 100 /* 100ms */
2129 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */
2130 	uint8_t count, all_ports_up, print_flag = 0;
2131 	uint16_t portid;
2132 	struct rte_eth_link link;
2133 	int ret;
2134 	char link_status_text[RTE_ETH_LINK_MAX_STR_LEN];
2135 
2136 	printf("\nChecking link status");
2137 	fflush(stdout);
2138 	for (count = 0; count <= MAX_CHECK_TIME; count++) {
2139 		all_ports_up = 1;
2140 		RTE_ETH_FOREACH_DEV(portid) {
2141 			if ((port_mask & (1 << portid)) == 0)
2142 				continue;
2143 			memset(&link, 0, sizeof(link));
2144 			ret = rte_eth_link_get_nowait(portid, &link);
2145 			if (ret < 0) {
2146 				all_ports_up = 0;
2147 				if (print_flag == 1)
2148 					printf("Port %u link get failed: %s\n",
2149 						portid, rte_strerror(-ret));
2150 				continue;
2151 			}
2152 			/* print link status if flag set */
2153 			if (print_flag == 1) {
2154 				rte_eth_link_to_str(link_status_text,
2155 					sizeof(link_status_text), &link);
2156 				printf("Port %d %s\n", portid,
2157 				       link_status_text);
2158 				continue;
2159 			}
2160 			/* clear all_ports_up flag if any link down */
2161 			if (link.link_status == RTE_ETH_LINK_DOWN) {
2162 				all_ports_up = 0;
2163 				break;
2164 			}
2165 		}
2166 		/* once the final link status has been printed, exit the loop */
2167 		if (print_flag == 1)
2168 			break;
2169 
2170 		if (all_ports_up == 0) {
2171 			printf(".");
2172 			fflush(stdout);
2173 			rte_delay_ms(CHECK_INTERVAL);
2174 		}
2175 
2176 		/* set the print_flag if all ports up or timeout */
2177 		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
2178 			print_flag = 1;
2179 			printf("done\n");
2180 		}
2181 	}
2182 }
2183 
2184 static int check_ptype(uint16_t portid)
2185 {
2186 	int i, ret;
2187 	int ptype_l3_ipv4 = 0;
2188 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
2189 	int ptype_l3_ipv6 = 0;
2190 #endif
2191 	uint32_t ptype_mask = RTE_PTYPE_L3_MASK;
2192 
2193 	ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, NULL, 0);
2194 	if (ret <= 0)
2195 		return 0;
2196 
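	/* the first call returned the number of supported ptypes; fetch them and check for L3 IPv4/IPv6 support */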
2197 	uint32_t ptypes[ret];
2198 
2199 	ret = rte_eth_dev_get_supported_ptypes(portid, ptype_mask, ptypes, ret);
2200 	for (i = 0; i < ret; ++i) {
2201 		if (ptypes[i] & RTE_PTYPE_L3_IPV4)
2202 			ptype_l3_ipv4 = 1;
2203 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
2204 		if (ptypes[i] & RTE_PTYPE_L3_IPV6)
2205 			ptype_l3_ipv6 = 1;
2206 #endif
2207 	}
2208 
2209 	if (ptype_l3_ipv4 == 0)
2210 		printf("port %d cannot parse RTE_PTYPE_L3_IPV4\n", portid);
2211 
2212 #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
2213 	if (ptype_l3_ipv6 == 0)
2214 		printf("port %d cannot parse RTE_PTYPE_L3_IPV6\n", portid);
2215 #endif
2216 
2217 #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
2218 	if (ptype_l3_ipv4)
2219 #else /* APP_LOOKUP_EXACT_MATCH */
2220 	if (ptype_l3_ipv4 && ptype_l3_ipv6)
2221 #endif
2222 		return 1;
2223 
2224 	return 0;
2225 
2226 }
2227 
2228 static int
2229 init_power_library(void)
2230 {
2231 	enum power_management_env env;
2232 	unsigned int lcore_id;
2233 	int ret = 0;
2234 
2235 	RTE_LCORE_FOREACH(lcore_id) {
2236 		/* init power management library */
2237 		ret = rte_power_init(lcore_id);
2238 		if (ret) {
2239 			RTE_LOG(ERR, POWER,
2240 				"Library initialization failed on core %u\n",
2241 				lcore_id);
2242 			return ret;
2243 		}
2244 		/* we're not supporting the VM channel mode */
2245 		env = rte_power_get_env();
2246 		if (env != PM_ENV_ACPI_CPUFREQ &&
2247 				env != PM_ENV_PSTATE_CPUFREQ) {
2248 			RTE_LOG(ERR, POWER,
2249 				"Only ACPI and PSTATE modes are supported\n");
2250 			return -1;
2251 		}
2252 	}
2253 	return ret;
2254 }
2255 
2256 static int
2257 deinit_power_library(void)
2258 {
2259 	unsigned int lcore_id, max_pkg, max_die, die, pkg;
2260 	int ret = 0;
2261 
2262 	RTE_LCORE_FOREACH(lcore_id) {
2263 		/* deinit power management library */
2264 		ret = rte_power_exit(lcore_id);
2265 		if (ret) {
2266 			RTE_LOG(ERR, POWER,
2267 				"Library deinitialization failed on core %u\n",
2268 				lcore_id);
2269 			return ret;
2270 		}
2271 	}
2272 
2273 	/* if an uncore option was enabled, also release the uncore frequency settings */
2274 	if (enabled_uncore == 0) {
2275 		max_pkg = rte_power_uncore_get_num_pkgs();
2276 		if (max_pkg == 0)
2277 			return -1;
2278 		for (pkg = 0; pkg < max_pkg; pkg++) {
2279 			max_die = rte_power_uncore_get_num_dies(pkg);
2280 			if (max_die == 0)
2281 				return -1;
2282 			for (die = 0; die < max_die; die++) {
2283 				ret = rte_power_uncore_exit(pkg, die);
2284 				if (ret < 0) {
2285 					RTE_LOG(ERR, L3FWD_POWER, "Failed to de-initialize uncore for pkg %02u die %02u\n",
2286 						pkg, die);
2287 					return -1;
2288 				}
2289 			}
2290 		}
2291 	}
2292 	return ret;
2293 }
2294 
2295 static void
2296 get_current_stat_values(uint64_t *values)
2297 {
2298 	unsigned int lcore_id = rte_lcore_id();
2299 	struct lcore_conf *qconf;
2300 	uint64_t app_eps = 0, app_fps = 0, app_br = 0;
2301 	uint64_t count = 0;
2302 
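	/* accumulate per-lcore telemetry counters from every worker that owns Rx queues, under each lcore's telemetry lock, then average them */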
2303 	RTE_LCORE_FOREACH_WORKER(lcore_id) {
2304 		qconf = &lcore_conf[lcore_id];
2305 		if (qconf->n_rx_queue == 0)
2306 			continue;
2307 		count++;
2308 		rte_spinlock_lock(&stats[lcore_id].telemetry_lock);
2309 		app_eps += stats[lcore_id].ep_nep[1];
2310 		app_fps += stats[lcore_id].fp_nfp[1];
2311 		app_br += stats[lcore_id].br;
2312 		rte_spinlock_unlock(&stats[lcore_id].telemetry_lock);
2313 	}
2314 
2315 	if (count > 0) {
2316 		values[0] = app_eps/count;
2317 		values[1] = app_fps/count;
2318 		values[2] = app_br/count;
2319 	} else
2320 		memset(values, 0, sizeof(uint64_t) * NUM_TELSTATS);
2321 
2322 }
2323 
2324 static void
2325 update_telemetry(__rte_unused struct rte_timer *tim,
2326 		__rte_unused void *arg)
2327 {
2328 	int ret;
2329 	uint64_t values[NUM_TELSTATS] = {0};
2330 
2331 	get_current_stat_values(values);
2332 	ret = rte_metrics_update_values(RTE_METRICS_GLOBAL, telstats_index,
2333 					values, RTE_DIM(values));
2334 	if (ret < 0)
2335 		RTE_LOG(WARNING, POWER, "failed to update metrics\n");
2336 }
2337 
2338 static int
2339 handle_app_stats(const char *cmd __rte_unused,
2340 		const char *params __rte_unused,
2341 		struct rte_tel_data *d)
2342 {
2343 	uint64_t values[NUM_TELSTATS] = {0};
2344 	uint32_t i;
2345 
2346 	rte_tel_data_start_dict(d);
2347 	get_current_stat_values(values);
2348 	for (i = 0; i < NUM_TELSTATS; i++)
2349 		rte_tel_data_add_dict_uint(d, telstats_strings[i].name,
2350 					   values[i]);
2351 	return 0;
2352 }
2353 
2354 static void
2355 telemetry_setup_timer(void)
2356 {
2357 	int lcore_id = rte_lcore_id();
2358 	uint64_t hz = rte_get_timer_hz();
2359 	uint64_t ticks;
2360 
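	/* fire update_telemetry() TELEMETRY_INTERVALS_PER_SEC times per second on this lcore */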
2361 	ticks = hz / TELEMETRY_INTERVALS_PER_SEC;
2362 	rte_timer_reset_sync(&telemetry_timer,
2363 			ticks,
2364 			PERIODICAL,
2365 			lcore_id,
2366 			update_telemetry,
2367 			NULL);
2368 }
2369 
2370 static int
2371 launch_timer(unsigned int lcore_id)
2372 {
2373 	int64_t prev_tsc = 0, cur_tsc, diff_tsc, cycles_10ms;
2374 
2375 	RTE_SET_USED(lcore_id);
2376 
2377 
2378 	if (rte_get_main_lcore() != lcore_id) {
2379 		rte_panic("timer launched on lcore %u, which is not the main core %u\n",
2380 				lcore_id,
2381 				rte_get_main_lcore());
2382 	}
2383 
2384 	RTE_LOG(INFO, POWER, "Bring up the Timer\n");
2385 
2386 	telemetry_setup_timer();
2387 
2388 	cycles_10ms = rte_get_timer_hz() / 100;
2389 
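	/* run rte_timer_manage() roughly every 10 ms until the application is signalled to stop */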
2390 	while (!is_done()) {
2391 		cur_tsc = rte_rdtsc();
2392 		diff_tsc = cur_tsc - prev_tsc;
2393 		if (diff_tsc > cycles_10ms) {
2394 			rte_timer_manage();
2395 			prev_tsc = cur_tsc;
2396 			cycles_10ms = rte_get_timer_hz() / 100;
2397 		}
2398 	}
2399 
2400 	RTE_LOG(INFO, POWER, "Timer subsystem is done\n");
2401 
2402 	return 0;
2403 }
2404 
2405 static int
2406 autodetect_mode(void)
2407 {
2408 	RTE_LOG(NOTICE, L3FWD_POWER, "Operating mode not specified, probing frequency scaling support...\n");
2409 
2410 	/*
2411 	 * Telemetry and PMD management modes have to be specifically requested to
2412 	 * be enabled, but we can auto-detect between interrupt mode with or
2413 	 * without frequency scaling. Both ACPI and pstate can be used.
2414 	 */
2415 	if (rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ))
2416 		return APP_MODE_LEGACY;
2417 	if (rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ))
2418 		return APP_MODE_LEGACY;
2419 
2420 	RTE_LOG(NOTICE, L3FWD_POWER, "Frequency scaling not supported, selecting interrupt-only mode\n");
2421 
2422 	return APP_MODE_INTERRUPT;
2423 }
2424 
2425 static const char *
2426 mode_to_str(enum appmode mode)
2427 {
2428 	switch (mode) {
2429 	case APP_MODE_LEGACY:
2430 		return "legacy";
2431 	case APP_MODE_TELEMETRY:
2432 		return "telemetry";
2433 	case APP_MODE_INTERRUPT:
2434 		return "interrupt-only";
2435 	case APP_MODE_PMD_MGMT:
2436 		return "pmd mgmt";
2437 	default:
2438 		return "invalid";
2439 	}
2440 }
2441 
2442 static uint32_t
2443 eth_dev_get_overhead_len(uint32_t max_rx_pktlen, uint16_t max_mtu)
2444 {
2445 	uint32_t overhead_len;
2446 
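	/* derive the L2 overhead from the gap between the device's max Rx packet length and max MTU when reported; otherwise assume Ethernet header plus CRC */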
2447 	if (max_mtu != UINT16_MAX && max_rx_pktlen > max_mtu)
2448 		overhead_len = max_rx_pktlen - max_mtu;
2449 	else
2450 		overhead_len = RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;
2451 
2452 	return overhead_len;
2453 }
2454 
2455 static int
2456 config_port_max_pkt_len(struct rte_eth_conf *conf,
2457 		struct rte_eth_dev_info *dev_info)
2458 {
2459 	uint32_t overhead_len;
2460 
2461 	if (max_pkt_len == 0)
2462 		return 0;
2463 
2464 	if (max_pkt_len < RTE_ETHER_MIN_LEN || max_pkt_len > MAX_JUMBO_PKT_LEN)
2465 		return -1;
2466 
2467 	overhead_len = eth_dev_get_overhead_len(dev_info->max_rx_pktlen,
2468 			dev_info->max_mtu);
2469 	conf->rxmode.mtu = max_pkt_len - overhead_len;
2470 
2471 	if (conf->rxmode.mtu > RTE_ETHER_MTU)
2472 		conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS;
2473 
2474 	return 0;
2475 }
2476 
2477 /* Power library initialized in the main routine. 8< */
2478 int
2479 main(int argc, char **argv)
2480 {
2481 	struct lcore_conf *qconf;
2482 	struct rte_eth_dev_info dev_info;
2483 	struct rte_eth_txconf *txconf;
2484 	int ret;
2485 	uint16_t nb_ports;
2486 	uint16_t queueid;
2487 	unsigned lcore_id;
2488 	uint64_t hz;
2489 	uint32_t n_tx_queue, nb_lcores;
2490 	uint32_t dev_rxq_num, dev_txq_num;
2491 	uint8_t nb_rx_queue, queue, socketid;
2492 	uint16_t portid;
2493 	const char *ptr_strings[NUM_TELSTATS];
2494 
2495 	/* init EAL */
2496 	ret = rte_eal_init(argc, argv);
2497 	if (ret < 0)
2498 		rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n");
2499 	argc -= ret;
2500 	argv += ret;
2501 
2502 	/* catch SIGINT so the application can exit cleanly and restore the original cpufreq governor */
2503 	signal(SIGINT, signal_exit_now);
2504 
2505 	/* init RTE timer library, to be used later */
2506 	rte_timer_subsystem_init();
2507 
2508 	/* if we're running pmd-mgmt mode, don't default to baseline mode */
2509 	baseline_enabled = false;
2510 
2511 	/* parse application arguments (after the EAL ones) */
2512 	ret = parse_args(argc, argv);
2513 	if (ret < 0)
2514 		rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n");
2515 
2516 	if (app_mode == APP_MODE_DEFAULT)
2517 		app_mode = autodetect_mode();
2518 
2519 	RTE_LOG(INFO, L3FWD_POWER, "Selected operation mode: %s\n",
2520 			mode_to_str(app_mode));
2521 
2522 	/* only legacy mode relies on power library */
2523 	if ((app_mode == APP_MODE_LEGACY) && init_power_library())
2524 		rte_exit(EXIT_FAILURE, "init_power_library failed\n");
2525 
2526 	if (update_lcore_params() < 0)
2527 		rte_exit(EXIT_FAILURE, "update_lcore_params failed\n");
2528 
2529 	if (check_lcore_params() < 0)
2530 		rte_exit(EXIT_FAILURE, "check_lcore_params failed\n");
2531 
2532 	ret = init_lcore_rx_queues();
2533 	if (ret < 0)
2534 		rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n");
2535 
2536 	nb_ports = rte_eth_dev_count_avail();
2537 
2538 	if (check_port_config() < 0)
2539 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
2540 
2541 	nb_lcores = rte_lcore_count();
2542 
2543 	/* initialize all ports */
2544 	RTE_ETH_FOREACH_DEV(portid) {
2545 		struct rte_eth_conf local_port_conf = port_conf;
2546 		/* not all app modes need interrupts */
2547 		bool need_intr = app_mode == APP_MODE_LEGACY ||
2548 				app_mode == APP_MODE_INTERRUPT;
2549 
2550 		/* skip ports that are not enabled */
2551 		if ((enabled_port_mask & (1 << portid)) == 0) {
2552 			printf("\nSkipping disabled port %d\n", portid);
2553 			continue;
2554 		}
2555 
2556 		/* init port */
2557 		printf("Initializing port %d ... ", portid);
2558 		fflush(stdout);
2559 
2560 		ret = rte_eth_dev_info_get(portid, &dev_info);
2561 		if (ret != 0)
2562 			rte_exit(EXIT_FAILURE,
2563 				"Error during getting device (port %u) info: %s\n",
2564 				portid, strerror(-ret));
2565 
2566 		dev_rxq_num = dev_info.max_rx_queues;
2567 		dev_txq_num = dev_info.max_tx_queues;
2568 
2569 		nb_rx_queue = get_port_n_rx_queues(portid);
2570 		if (nb_rx_queue > dev_rxq_num)
2571 			rte_exit(EXIT_FAILURE,
2572 				"Cannot configure non-existent rxq: "
2573 				"port=%d\n", portid);
2574 
2575 		n_tx_queue = nb_lcores;
2576 		if (n_tx_queue > dev_txq_num)
2577 			n_tx_queue = dev_txq_num;
2578 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
2579 			nb_rx_queue, (unsigned)n_tx_queue);
2580 		/* If the number of Rx queues is 0, there is no need to enable Rx interrupts */
2581 		if (nb_rx_queue == 0)
2582 			need_intr = false;
2583 
2584 		if (need_intr)
2585 			local_port_conf.intr_conf.rxq = 1;
2586 
2587 		ret = rte_eth_dev_info_get(portid, &dev_info);
2588 		if (ret != 0)
2589 			rte_exit(EXIT_FAILURE,
2590 				"Error during getting device (port %u) info: %s\n",
2591 				portid, strerror(-ret));
2592 
2593 		ret = config_port_max_pkt_len(&local_port_conf, &dev_info);
2594 		if (ret != 0)
2595 			rte_exit(EXIT_FAILURE,
2596 				"Invalid max packet length: %u (port %u)\n",
2597 				max_pkt_len, portid);
2598 
2599 		if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
2600 			local_port_conf.txmode.offloads |=
2601 				RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
2602 
2603 		local_port_conf.rx_adv_conf.rss_conf.rss_hf &=
2604 			dev_info.flow_type_rss_offloads;
2605 		if (local_port_conf.rx_adv_conf.rss_conf.rss_hf !=
2606 				port_conf.rx_adv_conf.rss_conf.rss_hf) {
2607 			printf("Port %u modified RSS hash function based on hardware support, "
2608 				"requested:%#"PRIx64" configured:%#"PRIx64"\n",
2609 				portid,
2610 				port_conf.rx_adv_conf.rss_conf.rss_hf,
2611 				local_port_conf.rx_adv_conf.rss_conf.rss_hf);
2612 		}
2613 
2614 		if (local_port_conf.rx_adv_conf.rss_conf.rss_hf == 0)
2615 			local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE;
2616 		local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa;
2617 		port_conf.rxmode.offloads = local_port_conf.rxmode.offloads;
2618 
2619 		ret = rte_eth_dev_configure(portid, nb_rx_queue,
2620 					(uint16_t)n_tx_queue, &local_port_conf);
2621 		if (ret < 0)
2622 			rte_exit(EXIT_FAILURE, "Cannot configure device: "
2623 					"err=%d, port=%d\n", ret, portid);
2624 
2625 		ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd,
2626 						       &nb_txd);
2627 		if (ret < 0)
2628 			rte_exit(EXIT_FAILURE,
2629 				 "Cannot adjust number of descriptors: err=%d, port=%d\n",
2630 				 ret, portid);
2631 
2632 		ret = rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
2633 		if (ret < 0)
2634 			rte_exit(EXIT_FAILURE,
2635 				 "Cannot get MAC address: err=%d, port=%d\n",
2636 				 ret, portid);
2637 
2638 		print_ethaddr(" Address:", &ports_eth_addr[portid]);
2639 		printf(", ");
2640 
2641 		/* init memory */
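		/* safe to call once per port: pools already created for a socket are reused */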
2642 		ret = init_mem(NB_MBUF);
2643 		if (ret < 0)
2644 			rte_exit(EXIT_FAILURE, "init_mem failed\n");
2645 
2646 		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
2647 			if (rte_lcore_is_enabled(lcore_id) == 0)
2648 				continue;
2649 
2650 			/* Initialize TX buffers */
2651 			qconf = &lcore_conf[lcore_id];
2652 			qconf->tx_buffer[portid] = rte_zmalloc_socket("tx_buffer",
2653 				RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
2654 				rte_eth_dev_socket_id(portid));
2655 			if (qconf->tx_buffer[portid] == NULL)
2656 				rte_exit(EXIT_FAILURE, "Can't allocate tx buffer for port %u\n",
2657 						 portid);
2658 
2659 			rte_eth_tx_buffer_init(qconf->tx_buffer[portid], MAX_PKT_BURST);
2660 		}
2661 
2662 		/* init one TX queue per (lcore, port) pair */
2663 		queueid = 0;
2664 		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
2665 			if (rte_lcore_is_enabled(lcore_id) == 0)
2666 				continue;
2667 
2668 			if (queueid >= dev_txq_num)
2669 				continue;
2670 
2671 			if (numa_on)
2672 				socketid =
2673 					(uint8_t)rte_lcore_to_socket_id(lcore_id);
2674 			else
2675 				socketid = 0;
2676 
2677 			printf("txq=%u,%d,%d ", lcore_id, queueid, socketid);
2678 			fflush(stdout);
2679 
2680 			txconf = &dev_info.default_txconf;
2681 			txconf->offloads = local_port_conf.txmode.offloads;
2682 			ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd,
2683 						     socketid, txconf);
2684 			if (ret < 0)
2685 				rte_exit(EXIT_FAILURE,
2686 					"rte_eth_tx_queue_setup: err=%d, "
2687 						"port=%d\n", ret, portid);
2688 
2689 			qconf = &lcore_conf[lcore_id];
2690 			qconf->tx_queue_id[portid] = queueid;
2691 			queueid++;
2692 
2693 			qconf->tx_port_id[qconf->n_tx_port] = portid;
2694 			qconf->n_tx_port++;
2695 		}
2696 		printf("\n");
2697 	}
2698 
2699 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
2700 		if (rte_lcore_is_enabled(lcore_id) == 0)
2701 			continue;
2702 
2703 		if (app_mode == APP_MODE_LEGACY) {
2704 			/* init timer structures for each enabled lcore */
2705 			rte_timer_init(&power_timers[lcore_id]);
2706 			hz = rte_get_timer_hz();
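			/* hz / TIMER_NUMBER_PER_SECOND gives a 100 ms period for the frequency-scaling timer */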
2707 			rte_timer_reset(&power_timers[lcore_id],
2708 					hz/TIMER_NUMBER_PER_SECOND,
2709 					SINGLE, lcore_id,
2710 					power_timer_cb, NULL);
2711 		}
2712 		qconf = &lcore_conf[lcore_id];
2713 		printf("\nInitializing rx queues on lcore %u ... ", lcore_id);
2714 		fflush(stdout);
2715 
2716 		/* init RX queues */
2717 		for (queue = 0; queue < qconf->n_rx_queue; ++queue) {
2718 			struct rte_eth_rxconf rxq_conf;
2719 
2720 			portid = qconf->rx_queue_list[queue].port_id;
2721 			queueid = qconf->rx_queue_list[queue].queue_id;
2722 
2723 			if (numa_on)
2724 				socketid =
2725 					(uint8_t)rte_lcore_to_socket_id(lcore_id);
2726 			else
2727 				socketid = 0;
2728 
2729 			printf("rxq=%d,%d,%d ", portid, queueid, socketid);
2730 			fflush(stdout);
2731 
2732 			ret = rte_eth_dev_info_get(portid, &dev_info);
2733 			if (ret != 0)
2734 				rte_exit(EXIT_FAILURE,
2735 					"Error during getting device (port %u) info: %s\n",
2736 					portid, strerror(-ret));
2737 
2738 			rxq_conf = dev_info.default_rxconf;
2739 			rxq_conf.offloads = port_conf.rxmode.offloads;
2740 			ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd,
2741 				socketid, &rxq_conf,
2742 				pktmbuf_pool[socketid]);
2743 			if (ret < 0)
2744 				rte_exit(EXIT_FAILURE,
2745 					"rte_eth_rx_queue_setup: err=%d, "
2746 						"port=%d\n", ret, portid);
2747 
2748 			if (parse_ptype) {
2749 				if (add_cb_parse_ptype(portid, queueid) < 0)
2750 					rte_exit(EXIT_FAILURE,
2751 						 "Fail to add ptype cb\n");
2752 			}
2753 
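			/* when baseline mode was requested, skip per-queue power management so Rx queues keep plain polling */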
2754 			if (app_mode == APP_MODE_PMD_MGMT && !baseline_enabled) {
2755 				/* Set power_pmd_mgmt configs passed by user */
2756 				rte_power_pmd_mgmt_set_emptypoll_max(max_empty_polls);
2757 				ret = rte_power_pmd_mgmt_set_pause_duration(pause_duration);
2758 				if (ret < 0)
2759 					rte_exit(EXIT_FAILURE,
2760 						"Error setting pause_duration: err=%d, lcore=%d\n",
2761 							ret, lcore_id);
2762 
2763 				ret = rte_power_pmd_mgmt_set_scaling_freq_min(lcore_id,
2764 						scale_freq_min);
2765 				if (ret < 0)
2766 					rte_exit(EXIT_FAILURE,
2767 						"Error setting scaling freq min: err=%d, lcore=%d\n",
2768 							ret, lcore_id);
2769 
2770 				ret = rte_power_pmd_mgmt_set_scaling_freq_max(lcore_id,
2771 						scale_freq_max);
2772 				if (ret < 0)
2773 					rte_exit(EXIT_FAILURE,
2774 						"Error setting scaling freq max: err=%d, lcore %d\n",
2775 							ret, lcore_id);
2776 
2777 				ret = rte_power_ethdev_pmgmt_queue_enable(
2778 						lcore_id, portid, queueid,
2779 						pmgmt_type);
2780 				if (ret < 0)
2781 					rte_exit(EXIT_FAILURE,
2782 						"rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n",
2783 							ret, portid);
2784 			}
2785 		}
2786 	}
2787 	/* >8 End of power library initialization. */
2788 
2789 	printf("\n");
2790 
2791 	/* start ports */
2792 	RTE_ETH_FOREACH_DEV(portid) {
2793 		if ((enabled_port_mask & (1 << portid)) == 0) {
2794 			continue;
2795 		}
2796 		/* Start device */
2797 		ret = rte_eth_dev_start(portid);
2798 		if (ret < 0)
2799 			rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, "
2800 						"port=%d\n", ret, portid);
2801 		/*
2802 		 * If enabled, put device in promiscuous mode.
2803 		 * This allows IO forwarding mode to forward packets
2804 		 * to itself through 2 cross-connected ports of the
2805 		 * target machine.
2806 		 */
2807 		if (promiscuous_on) {
2808 			ret = rte_eth_promiscuous_enable(portid);
2809 			if (ret != 0)
2810 				rte_exit(EXIT_FAILURE,
2811 					"rte_eth_promiscuous_enable: err=%s, port=%u\n",
2812 					rte_strerror(-ret), portid);
2813 		}
2814 		/* initialize spinlock for each port */
2815 		rte_spinlock_init(&(locks[portid]));
2816 
2817 		if (!parse_ptype)
2818 			if (!check_ptype(portid))
2819 				rte_exit(EXIT_FAILURE,
2820 					"PMD cannot provide needed ptypes\n");
2821 	}
2822 
2823 	check_all_ports_link_status(enabled_port_mask);
2824 
2825 	/* launch per-lcore init on every lcore */
2826 	if (app_mode == APP_MODE_LEGACY) {
2827 		rte_eal_mp_remote_launch(main_legacy_loop, NULL, CALL_MAIN);
2828 	} else if (app_mode == APP_MODE_TELEMETRY) {
2829 		unsigned int i;
2830 
2831 		/* Init metrics library */
2832 		rte_metrics_init(rte_socket_id());
2833 		/* Register stats with the metrics library */
2834 		for (i = 0; i < NUM_TELSTATS; i++)
2835 			ptr_strings[i] = telstats_strings[i].name;
2836 
2837 		ret = rte_metrics_reg_names(ptr_strings, NUM_TELSTATS);
2838 		if (ret >= 0)
2839 			telstats_index = ret;
2840 		else
2841 			rte_exit(EXIT_FAILURE, "failed to register metrics names\n");
2842 
2843 		RTE_LCORE_FOREACH_WORKER(lcore_id) {
2844 			rte_spinlock_init(&stats[lcore_id].telemetry_lock);
2845 		}
2846 		rte_timer_init(&telemetry_timer);
2847 		rte_telemetry_register_cmd("/l3fwd-power/stats",
2848 				handle_app_stats,
2849 				"Returns global power stats. Parameters: None");
2850 		rte_eal_mp_remote_launch(main_telemetry_loop, NULL,
2851 						SKIP_MAIN);
2852 	} else if (app_mode == APP_MODE_INTERRUPT) {
2853 		rte_eal_mp_remote_launch(main_intr_loop, NULL, CALL_MAIN);
2854 	} else if (app_mode == APP_MODE_PMD_MGMT) {
2855 		/* reuse telemetry loop for PMD power management mode */
2856 		rte_eal_mp_remote_launch(main_telemetry_loop, NULL, CALL_MAIN);
2857 	}
2858 
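	/* in telemetry mode the workers were launched with SKIP_MAIN, so the main lcore runs the periodic telemetry timer here */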
2859 	if (app_mode == APP_MODE_TELEMETRY)
2860 		launch_timer(rte_lcore_id());
2861 
2862 	RTE_LCORE_FOREACH_WORKER(lcore_id) {
2863 		if (rte_eal_wait_lcore(lcore_id) < 0)
2864 			return -1;
2865 	}
2866 
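	/* disable per-queue PMD power management on every Rx queue before stopping the ports */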
2867 	if (app_mode == APP_MODE_PMD_MGMT) {
2868 		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
2869 			if (rte_lcore_is_enabled(lcore_id) == 0)
2870 				continue;
2871 			qconf = &lcore_conf[lcore_id];
2872 			for (queue = 0; queue < qconf->n_rx_queue; ++queue) {
2873 				portid = qconf->rx_queue_list[queue].port_id;
2874 				queueid = qconf->rx_queue_list[queue].queue_id;
2875 
2876 				rte_power_ethdev_pmgmt_queue_disable(lcore_id,
2877 						portid, queueid);
2878 			}
2879 		}
2880 	}
2881 
2882 	RTE_ETH_FOREACH_DEV(portid)
2883 	{
2884 		if ((enabled_port_mask & (1 << portid)) == 0)
2885 			continue;
2886 
2887 		ret = rte_eth_dev_stop(portid);
2888 		if (ret != 0)
2889 			RTE_LOG(ERR, L3FWD_POWER, "rte_eth_dev_stop: err=%d, port=%u\n",
2890 				ret, portid);
2891 
2892 		rte_eth_dev_close(portid);
2893 	}
2894 
2895 	if ((app_mode == APP_MODE_LEGACY) && deinit_power_library())
2896 		rte_exit(EXIT_FAILURE, "deinit_power_library failed\n");
2897 
2898 	if (rte_eal_cleanup() < 0)
2899 		RTE_LOG(ERR, L3FWD_POWER, "EAL cleanup failed\n");
2900 
2901 	return 0;
2902 }
2903