xref: /dpdk/drivers/power/intel_pstate/intel_pstate_cpufreq.c (revision 6f987b594fa6751b49769755fe1d1bf9f9d15ac4)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <fcntl.h>
8 #include <string.h>
9 #include <unistd.h>
10 #include <limits.h>
11 #include <errno.h>
12 #include <inttypes.h>
13 
14 #include <rte_memcpy.h>
15 #include <rte_stdatomic.h>
16 
17 #include "rte_power_pmd_mgmt.h"
18 #include "intel_pstate_cpufreq.h"
19 #include "power_common.h"
20 
/*
 * Rounding helpers: adding FREQ_ROUNDING_DELTA and then dividing and
 * multiplying by ROUND_FREQ_TO_N_100000 rounds a frequency to the nearest
 * multiple of 100000.
 */
#define FREQ_ROUNDING_DELTA 50000
#define ROUND_FREQ_TO_N_100000 100000

/*
 * Spacing between two adjacent frequency buckets; also the divisor used to
 * turn a frequency read from sysfs into a ratio.
 */
#define BUS_FREQ 100000

/* Governor name requested while this driver manages an lcore. */
#define POWER_GOVERNOR_PERF "performance"

/* Per-CPU cpufreq attributes; "%u" is replaced by the CPU id. */
#define POWER_SYSFILE_MAX_FREQ "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_max_freq"
#define POWER_SYSFILE_MIN_FREQ "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_min_freq"
#define POWER_SYSFILE_CUR_FREQ "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq"
#define POWER_SYSFILE_BASE_MAX_FREQ "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_max_freq"
#define POWER_SYSFILE_BASE_MIN_FREQ "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
#define POWER_SYSFILE_BASE_FREQ "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"

/* System-wide intel_pstate attribute holding the turbo percentage. */
#define POWER_SYSFILE_TURBO_PCT "/sys/devices/system/cpu/intel_pstate/turbo_pct"

/* Scaling driver name this implementation checks for. */
#define POWER_PSTATE_DRIVER "intel_pstate"
43 
44 
/*
 * Per-lcore guard state stored in pstate_power_info::state.
 * init moves IDLE -> ONGOING -> USED, exit moves USED -> ONGOING -> IDLE;
 * either one parks the lcore in UNKNOWN when it fails part-way through.
 */
enum power_state {
	POWER_IDLE    = 0, /* not managed; init may claim the lcore */
	POWER_ONGOING = 1, /* init or exit currently in progress */
	POWER_USED    = 2, /* initialised and managed */
	POWER_UNKNOWN = 3  /* a previous init/exit failed */
};
51 
52 struct __rte_cache_aligned pstate_power_info {
53 	unsigned int lcore_id;               /**< Logical core id */
54 	uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */
55 	uint32_t nb_freqs;                   /**< number of available freqs */
56 	FILE *f_cur_min;                     /**< FD of scaling_min */
57 	FILE *f_cur_max;                     /**< FD of scaling_max */
58 	char governor_ori[32];               /**< Original governor name */
59 	uint32_t curr_idx;                   /**< Freq index in freqs array */
60 	uint32_t non_turbo_max_ratio;        /**< Non Turbo Max ratio  */
61 	uint32_t sys_max_freq;               /**< system wide max freq  */
62 	uint32_t core_base_freq;             /**< core base freq  */
63 	RTE_ATOMIC(uint32_t) state;          /**< Power in use state */
64 	uint16_t turbo_available;            /**< Turbo Boost available */
65 	uint16_t turbo_enable;               /**< Turbo Boost enable/disable */
66 	uint16_t priority_core;              /**< High Performance core */
67 };
68 
69 
70 static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
71 
72 /**
73  * It is to read the turbo mode percentage from sysfs
74  */
75 static int32_t
76 power_read_turbo_pct(uint64_t *outVal)
77 {
78 	int fd, ret;
79 	char val[4] = {0};
80 	char *endptr;
81 
82 	fd = open(POWER_SYSFILE_TURBO_PCT, O_RDONLY);
83 
84 	if (fd < 0) {
85 		POWER_LOG(ERR, "Error opening '%s': %s", POWER_SYSFILE_TURBO_PCT,
86 				 strerror(errno));
87 		return fd;
88 	}
89 
90 	ret = read(fd, val, sizeof(val));
91 
92 	if (ret < 0) {
93 		POWER_LOG(ERR, "Error reading '%s': %s", POWER_SYSFILE_TURBO_PCT,
94 				 strerror(errno));
95 		goto out;
96 	}
97 
98 	errno = 0;
99 	*outVal = (uint64_t) strtol(val, &endptr, 10);
100 	if (errno != 0 || (*endptr != 0 && *endptr != '\n')) {
101 		POWER_LOG(ERR, "Error converting str to digits, read from %s: %s",
102 				 POWER_SYSFILE_TURBO_PCT, strerror(errno));
103 		ret = -1;
104 		goto out;
105 	}
106 
107 	POWER_DEBUG_LOG("power turbo pct: %"PRIu64, *outVal);
108 
109 out:	close(fd);
110 	return ret;
111 }
112 
113 /**
114  * It is to fopen the sys file for the future setting the lcore frequency.
115  */
116 static int
117 power_init_for_setting_freq(struct pstate_power_info *pi)
118 {
119 	FILE *f_base = NULL, *f_base_min = NULL, *f_base_max = NULL,
120 	     *f_min = NULL, *f_max = NULL;
121 	uint32_t base_ratio, base_min_ratio, base_max_ratio;
122 	uint64_t max_non_turbo;
123 	int ret;
124 
125 	/* open all files we expect to have open */
126 	open_core_sysfs_file(&f_base_max, "r", POWER_SYSFILE_BASE_MAX_FREQ,
127 			pi->lcore_id);
128 	if (f_base_max == NULL) {
129 		POWER_LOG(ERR, "failed to open %s",
130 				POWER_SYSFILE_BASE_MAX_FREQ);
131 		goto err;
132 	}
133 
134 	open_core_sysfs_file(&f_base_min, "r", POWER_SYSFILE_BASE_MIN_FREQ,
135 			pi->lcore_id);
136 	if (f_base_min == NULL) {
137 		POWER_LOG(ERR, "failed to open %s",
138 				POWER_SYSFILE_BASE_MIN_FREQ);
139 		goto err;
140 	}
141 
142 	open_core_sysfs_file(&f_min, "rw+", POWER_SYSFILE_MIN_FREQ,
143 			pi->lcore_id);
144 	if (f_min == NULL) {
145 		POWER_LOG(ERR, "failed to open %s",
146 				POWER_SYSFILE_MIN_FREQ);
147 		goto err;
148 	}
149 
150 	open_core_sysfs_file(&f_max, "rw+", POWER_SYSFILE_MAX_FREQ,
151 			pi->lcore_id);
152 	if (f_max == NULL) {
153 		POWER_LOG(ERR, "failed to open %s",
154 				POWER_SYSFILE_MAX_FREQ);
155 		goto err;
156 	}
157 
158 	open_core_sysfs_file(&f_base, "r", POWER_SYSFILE_BASE_FREQ,
159 			pi->lcore_id);
160 	/* base ratio file may not exist in some kernels, so no error check */
161 
162 	/* read base max ratio */
163 	ret = read_core_sysfs_u32(f_base_max, &base_max_ratio);
164 	if (ret < 0) {
165 		POWER_LOG(ERR, "Failed to read %s",
166 				POWER_SYSFILE_BASE_MAX_FREQ);
167 		goto err;
168 	}
169 
170 	/* read base min ratio */
171 	ret = read_core_sysfs_u32(f_base_min, &base_min_ratio);
172 	if (ret < 0) {
173 		POWER_LOG(ERR, "Failed to read %s",
174 				POWER_SYSFILE_BASE_MIN_FREQ);
175 		goto err;
176 	}
177 
178 	/* base ratio may not exist */
179 	if (f_base != NULL) {
180 		ret = read_core_sysfs_u32(f_base, &base_ratio);
181 		if (ret < 0) {
182 			POWER_LOG(ERR, "Failed to read %s",
183 					POWER_SYSFILE_BASE_FREQ);
184 			goto err;
185 		}
186 	} else {
187 		base_ratio = 0;
188 	}
189 
190 	/* convert ratios to bins */
191 	base_max_ratio /= BUS_FREQ;
192 	base_min_ratio /= BUS_FREQ;
193 	base_ratio /= BUS_FREQ;
194 
195 	/* assign file handles */
196 	pi->f_cur_min = f_min;
197 	pi->f_cur_max = f_max;
198 
199 	/* try to get turbo from global sysfs entry for less privileges than from MSR */
200 	if (power_read_turbo_pct(&max_non_turbo) < 0)
201 		goto err;
202 	/* no errors after this point */
203 
204 	max_non_turbo = base_min_ratio
205 		      + (100 - max_non_turbo) * (base_max_ratio - base_min_ratio) / 100;
206 
207 	POWER_DEBUG_LOG("no turbo perf %"PRIu64, max_non_turbo);
208 
209 	pi->non_turbo_max_ratio = (uint32_t)max_non_turbo;
210 
211 	/*
212 	 * If base_frequency is reported as greater than the maximum
213 	 * turbo frequency, that's a known issue with some kernels.
214 	 * Set base_frequency to max_non_turbo as a workaround.
215 	 */
216 	if (base_ratio > base_max_ratio) {
217 		/* base_ratio is greater than max turbo. Kernel bug. */
218 		pi->priority_core = 0;
219 		goto out;
220 	}
221 
222 	/*
223 	 * If base_frequency is reported as greater than the maximum
224 	 * non-turbo frequency, then mark it as a high priority core.
225 	 */
226 	if (base_ratio > max_non_turbo)
227 		pi->priority_core = 1;
228 	else
229 		pi->priority_core = 0;
230 	pi->core_base_freq = base_ratio * BUS_FREQ;
231 
232 out:
233 	if (f_base != NULL)
234 		fclose(f_base);
235 	fclose(f_base_max);
236 	fclose(f_base_min);
237 	/* f_min and f_max are stored, no need to close */
238 	return 0;
239 
240 err:
241 	if (f_base != NULL)
242 		fclose(f_base);
243 	if (f_base_min != NULL)
244 		fclose(f_base_min);
245 	if (f_base_max != NULL)
246 		fclose(f_base_max);
247 	if (f_min != NULL)
248 		fclose(f_min);
249 	if (f_max != NULL)
250 		fclose(f_max);
251 	return -1;
252 }
253 
254 static int
255 set_freq_internal(struct pstate_power_info *pi, uint32_t idx)
256 {
257 	uint32_t target_freq = 0;
258 
259 	if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) {
260 		POWER_LOG(ERR, "Invalid frequency index %u, which "
261 				"should be less than %u", idx, pi->nb_freqs);
262 		return -1;
263 	}
264 
265 	/* Check if it is the same as current */
266 	if (idx == pi->curr_idx)
267 		return 0;
268 
269 	/* Because Intel Pstate Driver only allow user change min/max hint
270 	 * User need change the min/max as same value.
271 	 */
272 	if (fseek(pi->f_cur_min, 0, SEEK_SET) < 0) {
273 		POWER_LOG(ERR, "Fail to set file position indicator to 0 "
274 				"for setting frequency for lcore %u",
275 				pi->lcore_id);
276 		return -1;
277 	}
278 
279 	if (fseek(pi->f_cur_max, 0, SEEK_SET) < 0) {
280 		POWER_LOG(ERR, "Fail to set file position indicator to 0 "
281 				"for setting frequency for lcore %u",
282 				pi->lcore_id);
283 		return -1;
284 	}
285 
286 	/* Turbo is available and enabled, first freq bucket is sys max freq */
287 	if (pi->turbo_available && idx == 0) {
288 		if (pi->turbo_enable)
289 			target_freq = pi->sys_max_freq;
290 		else {
291 			POWER_LOG(ERR, "Turbo is off, frequency can't be scaled up more %u",
292 					pi->lcore_id);
293 			return -1;
294 		}
295 	} else
296 		target_freq = pi->freqs[idx];
297 
298 	/* Decrease freq, the min freq should be updated first */
299 	if (idx  >  pi->curr_idx) {
300 
301 		if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
302 			POWER_LOG(ERR, "Fail to write new frequency for "
303 					"lcore %u", pi->lcore_id);
304 			return -1;
305 		}
306 
307 		if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
308 			POWER_LOG(ERR, "Fail to write new frequency for "
309 					"lcore %u", pi->lcore_id);
310 			return -1;
311 		}
312 
313 		POWER_DEBUG_LOG("Frequency '%u' to be set for lcore %u",
314 				  target_freq, pi->lcore_id);
315 
316 		fflush(pi->f_cur_min);
317 		fflush(pi->f_cur_max);
318 
319 	}
320 
321 	/* Increase freq, the max freq should be updated first */
322 	if (idx  <  pi->curr_idx) {
323 
324 		if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
325 			POWER_LOG(ERR, "Fail to write new frequency for "
326 					"lcore %u", pi->lcore_id);
327 			return -1;
328 		}
329 
330 		if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
331 			POWER_LOG(ERR, "Fail to write new frequency for "
332 					"lcore %u", pi->lcore_id);
333 			return -1;
334 		}
335 
336 		POWER_DEBUG_LOG("Frequency '%u' to be set for lcore %u",
337 				  target_freq, pi->lcore_id);
338 
339 		fflush(pi->f_cur_max);
340 		fflush(pi->f_cur_min);
341 	}
342 
343 	pi->curr_idx = idx;
344 
345 	return 1;
346 }
347 
348 /**
349  * It is to check the current scaling governor by reading sys file, and then
350  * set it into 'performance' if it is not by writing the sys file. The original
351  * governor will be saved for rolling back.
352  */
353 static int
354 power_set_governor_performance(struct pstate_power_info *pi)
355 {
356 	return power_set_governor(pi->lcore_id, POWER_GOVERNOR_PERF,
357 			pi->governor_ori, sizeof(pi->governor_ori));
358 }
359 
360 /**
361  * It is to check the governor and then set the original governor back if
362  * needed by writing the sys file.
363  */
364 static int
365 power_set_governor_original(struct pstate_power_info *pi)
366 {
367 	return power_set_governor(pi->lcore_id, pi->governor_ori, NULL, 0);
368 }
369 
370 /**
371  * It is to get the available frequencies of the specific lcore by reading the
372  * sys file.
373  */
374 static int
375 power_get_available_freqs(struct pstate_power_info *pi)
376 {
377 	FILE *f_min = NULL, *f_max = NULL;
378 	int ret = -1;
379 	uint32_t sys_min_freq = 0, sys_max_freq = 0, base_max_freq = 0;
380 	int config_min_freq, config_max_freq;
381 	uint32_t i, num_freqs = 0;
382 
383 	/* open all files */
384 	open_core_sysfs_file(&f_max, "r", POWER_SYSFILE_BASE_MAX_FREQ,
385 			pi->lcore_id);
386 	if (f_max == NULL) {
387 		POWER_LOG(ERR, "failed to open %s",
388 				POWER_SYSFILE_BASE_MAX_FREQ);
389 		goto out;
390 	}
391 
392 	open_core_sysfs_file(&f_min, "r", POWER_SYSFILE_BASE_MIN_FREQ,
393 			pi->lcore_id);
394 	if (f_min == NULL) {
395 		POWER_LOG(ERR, "failed to open %s",
396 				POWER_SYSFILE_BASE_MIN_FREQ);
397 		goto out;
398 	}
399 
400 	/* read base ratios */
401 	ret = read_core_sysfs_u32(f_max, &sys_max_freq);
402 	if (ret < 0) {
403 		POWER_LOG(ERR, "Failed to read %s",
404 				POWER_SYSFILE_BASE_MAX_FREQ);
405 		goto out;
406 	}
407 
408 	ret = read_core_sysfs_u32(f_min, &sys_min_freq);
409 	if (ret < 0) {
410 		POWER_LOG(ERR, "Failed to read %s",
411 				POWER_SYSFILE_BASE_MIN_FREQ);
412 		goto out;
413 	}
414 
415 	/* check for config set by user or application to limit frequency range */
416 	config_min_freq = rte_power_pmd_mgmt_get_scaling_freq_min(pi->lcore_id);
417 	if (config_min_freq < 0)
418 		goto out;
419 	config_max_freq = rte_power_pmd_mgmt_get_scaling_freq_max(pi->lcore_id);
420 	if (config_max_freq < 0)
421 		goto out;
422 
423 	sys_min_freq = RTE_MAX(sys_min_freq, (uint32_t)config_min_freq);
424 	if (config_max_freq > 0) /* Only use config_max_freq if a value has been set */
425 		sys_max_freq = RTE_MIN(sys_max_freq, (uint32_t)config_max_freq);
426 
427 	if (sys_max_freq < sys_min_freq)
428 		goto out;
429 
430 	pi->sys_max_freq = sys_max_freq;
431 
432 	if (pi->priority_core == 1)
433 		base_max_freq = pi->core_base_freq;
434 	else
435 		base_max_freq = pi->non_turbo_max_ratio * BUS_FREQ;
436 
437 	POWER_DEBUG_LOG("sys min %u, sys max %u, base_max %u",
438 			sys_min_freq,
439 			sys_max_freq,
440 			base_max_freq);
441 
442 	if (base_max_freq < sys_max_freq)
443 		pi->turbo_available = 1;
444 	else
445 		pi->turbo_available = 0;
446 
447 	/* If turbo is available then there is one extra freq bucket
448 	 * to store the sys max freq which value is base_max +1
449 	 */
450 	num_freqs = (RTE_MIN(base_max_freq, sys_max_freq) - sys_min_freq) / BUS_FREQ
451 			+ 1 + pi->turbo_available;
452 	if (num_freqs >= RTE_MAX_LCORE_FREQS) {
453 		POWER_LOG(ERR, "Too many available frequencies: %d",
454 				num_freqs);
455 		goto out;
456 	}
457 
458 	/* Generate the freq bucket array.
459 	 * If turbo is available the freq bucket[0] value is base_max +1
460 	 * the bucket[1] is base_max, bucket[2] is base_max - BUS_FREQ
461 	 * and so on.
462 	 * If turbo is not available bucket[0] is base_max and so on
463 	 */
464 	for (i = 0, pi->nb_freqs = 0; i < num_freqs; i++) {
465 		if ((i == 0) && pi->turbo_available)
466 			pi->freqs[pi->nb_freqs++] = RTE_MIN(base_max_freq, sys_max_freq) + 1;
467 		else
468 			pi->freqs[pi->nb_freqs++] = RTE_MIN(base_max_freq, sys_max_freq) -
469 					(i - pi->turbo_available) * BUS_FREQ;
470 	}
471 
472 	ret = 0;
473 
474 	POWER_DEBUG_LOG("%d frequency(s) of lcore %u are available",
475 			num_freqs, pi->lcore_id);
476 
477 out:
478 	if (f_min != NULL)
479 		fclose(f_min);
480 	if (f_max != NULL)
481 		fclose(f_max);
482 
483 	return ret;
484 }
485 
486 static int
487 power_get_cur_idx(struct pstate_power_info *pi)
488 {
489 	FILE *f_cur;
490 	int ret = -1;
491 	uint32_t sys_cur_freq = 0;
492 	unsigned int i;
493 
494 	open_core_sysfs_file(&f_cur, "r", POWER_SYSFILE_CUR_FREQ,
495 			pi->lcore_id);
496 	if (f_cur == NULL) {
497 		POWER_LOG(ERR, "failed to open %s",
498 				POWER_SYSFILE_CUR_FREQ);
499 		goto fail;
500 	}
501 
502 	ret = read_core_sysfs_u32(f_cur, &sys_cur_freq);
503 	if (ret < 0) {
504 		POWER_LOG(ERR, "Failed to read %s",
505 				POWER_SYSFILE_CUR_FREQ);
506 		goto fail;
507 	}
508 
509 	/* convert the frequency to nearest 100000 value
510 	 * Ex: if sys_cur_freq=1396789 then freq_conv=1400000
511 	 * Ex: if sys_cur_freq=800030 then freq_conv=800000
512 	 * Ex: if sys_cur_freq=800030 then freq_conv=800000
513 	 */
514 	unsigned int freq_conv = 0;
515 	freq_conv = (sys_cur_freq + FREQ_ROUNDING_DELTA)
516 				/ ROUND_FREQ_TO_N_100000;
517 	freq_conv = freq_conv * ROUND_FREQ_TO_N_100000;
518 
519 	for (i = 0; i < pi->nb_freqs; i++) {
520 		if (freq_conv == pi->freqs[i]) {
521 			pi->curr_idx = i;
522 			break;
523 		}
524 	}
525 
526 	ret = 0;
527 fail:
528 	if (f_cur != NULL)
529 		fclose(f_cur);
530 	return ret;
531 }
532 
533 int
534 power_pstate_cpufreq_check_supported(void)
535 {
536 	return cpufreq_check_scaling_driver(POWER_PSTATE_DRIVER);
537 }
538 
539 int
540 power_pstate_cpufreq_init(unsigned int lcore_id)
541 {
542 	struct pstate_power_info *pi;
543 	uint32_t exp_state;
544 
545 	if (!power_pstate_cpufreq_check_supported()) {
546 		POWER_LOG(ERR, "%s driver is not supported",
547 				POWER_PSTATE_DRIVER);
548 		return -1;
549 	}
550 
551 	if (lcore_id >= RTE_MAX_LCORE) {
552 		POWER_LOG(ERR, "Lcore id %u can not exceed %u",
553 				lcore_id, RTE_MAX_LCORE - 1U);
554 		return -1;
555 	}
556 
557 	pi = &lcore_power_info[lcore_id];
558 	exp_state = POWER_IDLE;
559 	/* The power in use state works as a guard variable between
560 	 * the CPU frequency control initialization and exit process.
561 	 * The ACQUIRE memory ordering here pairs with the RELEASE
562 	 * ordering below as lock to make sure the frequency operations
563 	 * in the critical section are done under the correct state.
564 	 */
565 	if (!rte_atomic_compare_exchange_strong_explicit(&(pi->state), &exp_state,
566 					POWER_ONGOING,
567 					rte_memory_order_acquire, rte_memory_order_relaxed)) {
568 		POWER_LOG(INFO, "Power management of lcore %u is "
569 				"in use", lcore_id);
570 		return -1;
571 	}
572 
573 	if (power_get_lcore_mapped_cpu_id(lcore_id, &pi->lcore_id) < 0) {
574 		POWER_LOG(ERR, "Cannot get CPU ID mapped for lcore %u", lcore_id);
575 		return -1;
576 	}
577 
578 	/* Check and set the governor */
579 	if (power_set_governor_performance(pi) < 0) {
580 		POWER_LOG(ERR, "Cannot set governor of lcore %u to "
581 				"performance", lcore_id);
582 		goto fail;
583 	}
584 	/* Init for setting lcore frequency */
585 	if (power_init_for_setting_freq(pi) < 0) {
586 		POWER_LOG(ERR, "Cannot init for setting frequency for "
587 				"lcore %u", lcore_id);
588 		goto fail;
589 	}
590 
591 	/* Get the available frequencies */
592 	if (power_get_available_freqs(pi) < 0) {
593 		POWER_LOG(ERR, "Cannot get available frequencies of "
594 				"lcore %u", lcore_id);
595 		goto fail;
596 	}
597 
598 	if (power_get_cur_idx(pi) < 0) {
599 		POWER_LOG(ERR, "Cannot get current frequency "
600 				"index of lcore %u", lcore_id);
601 		goto fail;
602 	}
603 
604 	/* Set freq to max by default */
605 	if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
606 		POWER_LOG(ERR, "Cannot set frequency of lcore %u "
607 				"to max", lcore_id);
608 		goto fail;
609 	}
610 
611 	POWER_LOG(INFO, "Initialized successfully for lcore %u "
612 			"power management", lcore_id);
613 	exp_state = POWER_ONGOING;
614 	rte_atomic_compare_exchange_strong_explicit(&(pi->state), &exp_state, POWER_USED,
615 				    rte_memory_order_release, rte_memory_order_relaxed);
616 
617 	return 0;
618 
619 fail:
620 	exp_state = POWER_ONGOING;
621 	rte_atomic_compare_exchange_strong_explicit(&(pi->state), &exp_state, POWER_UNKNOWN,
622 				    rte_memory_order_release, rte_memory_order_relaxed);
623 
624 	return -1;
625 }
626 
627 int
628 power_pstate_cpufreq_exit(unsigned int lcore_id)
629 {
630 	struct pstate_power_info *pi;
631 	uint32_t exp_state;
632 
633 	if (lcore_id >= RTE_MAX_LCORE) {
634 		POWER_LOG(ERR, "Lcore id %u can not exceeds %u",
635 				lcore_id, RTE_MAX_LCORE - 1U);
636 		return -1;
637 	}
638 	pi = &lcore_power_info[lcore_id];
639 
640 	exp_state = POWER_USED;
641 	/* The power in use state works as a guard variable between
642 	 * the CPU frequency control initialization and exit process.
643 	 * The ACQUIRE memory ordering here pairs with the RELEASE
644 	 * ordering below as lock to make sure the frequency operations
645 	 * in the critical section are under done the correct state.
646 	 */
647 	if (!rte_atomic_compare_exchange_strong_explicit(&(pi->state), &exp_state,
648 					POWER_ONGOING,
649 					rte_memory_order_acquire, rte_memory_order_relaxed)) {
650 		POWER_LOG(INFO, "Power management of lcore %u is "
651 				"not used", lcore_id);
652 		return -1;
653 	}
654 
655 	/* Close FD of setting freq */
656 	fclose(pi->f_cur_min);
657 	fclose(pi->f_cur_max);
658 	pi->f_cur_min = NULL;
659 	pi->f_cur_max = NULL;
660 
661 	/* Set the governor back to the original */
662 	if (power_set_governor_original(pi) < 0) {
663 		POWER_LOG(ERR, "Cannot set the governor of %u back "
664 				"to the original", lcore_id);
665 		goto fail;
666 	}
667 
668 	POWER_LOG(INFO, "Power management of lcore %u has exited from "
669 			"'performance' mode and been set back to the "
670 			"original", lcore_id);
671 	exp_state = POWER_ONGOING;
672 	rte_atomic_compare_exchange_strong_explicit(&(pi->state), &exp_state, POWER_IDLE,
673 				    rte_memory_order_release, rte_memory_order_relaxed);
674 
675 	return 0;
676 
677 fail:
678 	exp_state = POWER_ONGOING;
679 	rte_atomic_compare_exchange_strong_explicit(&(pi->state), &exp_state, POWER_UNKNOWN,
680 				    rte_memory_order_release, rte_memory_order_relaxed);
681 
682 	return -1;
683 }
684 
685 
686 uint32_t
687 power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num)
688 {
689 	struct pstate_power_info *pi;
690 
691 	if (lcore_id >= RTE_MAX_LCORE) {
692 		POWER_LOG(ERR, "Invalid lcore ID");
693 		return 0;
694 	}
695 
696 	if (freqs == NULL) {
697 		POWER_LOG(ERR, "NULL buffer supplied");
698 		return 0;
699 	}
700 
701 	pi = &lcore_power_info[lcore_id];
702 	if (num < pi->nb_freqs) {
703 		POWER_LOG(ERR, "Buffer size is not enough");
704 		return 0;
705 	}
706 	rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t));
707 
708 	return pi->nb_freqs;
709 }
710 
711 uint32_t
712 power_pstate_cpufreq_get_freq(unsigned int lcore_id)
713 {
714 	if (lcore_id >= RTE_MAX_LCORE) {
715 		POWER_LOG(ERR, "Invalid lcore ID");
716 		return RTE_POWER_INVALID_FREQ_INDEX;
717 	}
718 
719 	return lcore_power_info[lcore_id].curr_idx;
720 }
721 
722 
723 int
724 power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index)
725 {
726 	if (lcore_id >= RTE_MAX_LCORE) {
727 		POWER_LOG(ERR, "Invalid lcore ID");
728 		return -1;
729 	}
730 
731 	return set_freq_internal(&(lcore_power_info[lcore_id]), index);
732 }
733 
734 int
735 power_pstate_cpufreq_freq_up(unsigned int lcore_id)
736 {
737 	struct pstate_power_info *pi;
738 
739 	if (lcore_id >= RTE_MAX_LCORE) {
740 		POWER_LOG(ERR, "Invalid lcore ID");
741 		return -1;
742 	}
743 
744 	pi = &lcore_power_info[lcore_id];
745 	if (pi->curr_idx == 0 ||
746 	    (pi->curr_idx == 1 && pi->turbo_available && !pi->turbo_enable))
747 		return 0;
748 
749 	/* Frequencies in the array are from high to low. */
750 	return set_freq_internal(pi, pi->curr_idx - 1);
751 }
752 
753 int
754 power_pstate_cpufreq_freq_down(unsigned int lcore_id)
755 {
756 	struct pstate_power_info *pi;
757 
758 	if (lcore_id >= RTE_MAX_LCORE) {
759 		POWER_LOG(ERR, "Invalid lcore ID");
760 		return -1;
761 	}
762 
763 	pi = &lcore_power_info[lcore_id];
764 	if (pi->curr_idx + 1 == pi->nb_freqs)
765 		return 0;
766 
767 	/* Frequencies in the array are from high to low. */
768 	return set_freq_internal(pi, pi->curr_idx + 1);
769 }
770 
771 int
772 power_pstate_cpufreq_freq_max(unsigned int lcore_id)
773 {
774 	if (lcore_id >= RTE_MAX_LCORE) {
775 		POWER_LOG(ERR, "Invalid lcore ID");
776 		return -1;
777 	}
778 
779 	/* Frequencies in the array are from high to low. */
780 	if (lcore_power_info[lcore_id].turbo_available) {
781 		if (lcore_power_info[lcore_id].turbo_enable)
782 			/* Set to Turbo */
783 			return set_freq_internal(
784 					&lcore_power_info[lcore_id], 0);
785 		else
786 			/* Set to max non-turbo */
787 			return set_freq_internal(
788 					&lcore_power_info[lcore_id], 1);
789 	} else
790 		return set_freq_internal(&lcore_power_info[lcore_id], 0);
791 }
792 
793 
794 int
795 power_pstate_cpufreq_freq_min(unsigned int lcore_id)
796 {
797 	struct pstate_power_info *pi;
798 
799 	if (lcore_id >= RTE_MAX_LCORE) {
800 		POWER_LOG(ERR, "Invalid lcore ID");
801 		return -1;
802 	}
803 
804 	pi = &lcore_power_info[lcore_id];
805 
806 	/* Frequencies in the array are from high to low. */
807 	return set_freq_internal(pi, pi->nb_freqs - 1);
808 }
809 
810 
811 int
812 power_pstate_turbo_status(unsigned int lcore_id)
813 {
814 	struct pstate_power_info *pi;
815 
816 	if (lcore_id >= RTE_MAX_LCORE) {
817 		POWER_LOG(ERR, "Invalid lcore ID");
818 		return -1;
819 	}
820 
821 	pi = &lcore_power_info[lcore_id];
822 
823 	return pi->turbo_enable;
824 }
825 
826 int
827 power_pstate_enable_turbo(unsigned int lcore_id)
828 {
829 	struct pstate_power_info *pi;
830 
831 	if (lcore_id >= RTE_MAX_LCORE) {
832 		POWER_LOG(ERR, "Invalid lcore ID");
833 		return -1;
834 	}
835 
836 	pi = &lcore_power_info[lcore_id];
837 
838 	if (pi->turbo_available)
839 		pi->turbo_enable = 1;
840 	else {
841 		pi->turbo_enable = 0;
842 		POWER_LOG(ERR,
843 			"Failed to enable turbo on lcore %u",
844 			lcore_id);
845 			return -1;
846 	}
847 
848 	return 0;
849 }
850 
851 
852 int
853 power_pstate_disable_turbo(unsigned int lcore_id)
854 {
855 	struct pstate_power_info *pi;
856 
857 	if (lcore_id >= RTE_MAX_LCORE) {
858 		POWER_LOG(ERR, "Invalid lcore ID");
859 		return -1;
860 	}
861 
862 	pi = &lcore_power_info[lcore_id];
863 
864 	pi->turbo_enable = 0;
865 
866 	if (pi->turbo_available && pi->curr_idx <= 1) {
867 		/* Try to set freq to max by default coming out of turbo */
868 		if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
869 			POWER_LOG(ERR,
870 				"Failed to set frequency of lcore %u to max",
871 				lcore_id);
872 			return -1;
873 		}
874 	}
875 
876 	return 0;
877 }
878 
879 
880 int power_pstate_get_capabilities(unsigned int lcore_id,
881 		struct rte_power_core_capabilities *caps)
882 {
883 	struct pstate_power_info *pi;
884 
885 	if (lcore_id >= RTE_MAX_LCORE) {
886 		POWER_LOG(ERR, "Invalid lcore ID");
887 		return -1;
888 	}
889 	if (caps == NULL) {
890 		POWER_LOG(ERR, "Invalid argument");
891 		return -1;
892 	}
893 
894 	pi = &lcore_power_info[lcore_id];
895 	caps->capabilities = 0;
896 	caps->turbo = !!(pi->turbo_available);
897 	caps->priority = pi->priority_core;
898 
899 	return 0;
900 }
901 
902 static struct rte_power_cpufreq_ops pstate_ops = {
903 	.name = "intel-pstate",
904 	.init = power_pstate_cpufreq_init,
905 	.exit = power_pstate_cpufreq_exit,
906 	.check_env_support = power_pstate_cpufreq_check_supported,
907 	.get_avail_freqs = power_pstate_cpufreq_freqs,
908 	.get_freq = power_pstate_cpufreq_get_freq,
909 	.set_freq = power_pstate_cpufreq_set_freq,
910 	.freq_down = power_pstate_cpufreq_freq_down,
911 	.freq_up = power_pstate_cpufreq_freq_up,
912 	.freq_max = power_pstate_cpufreq_freq_max,
913 	.freq_min = power_pstate_cpufreq_freq_min,
914 	.turbo_status = power_pstate_turbo_status,
915 	.enable_turbo = power_pstate_enable_turbo,
916 	.disable_turbo = power_pstate_disable_turbo,
917 	.get_caps = power_pstate_get_capabilities
918 };
919 
920 RTE_POWER_REGISTER_CPUFREQ_OPS(pstate_ops);
921