xref: /dpdk/lib/power/rte_power_pmd_mgmt.c (revision b7fe612ac1de393f869c9818d5503633c8e96b36)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_lcore.h>
#include <rte_cycles.h>
#include <rte_cpuflags.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_power_intrinsics.h>

#include "rte_power_pmd_mgmt.h"

#define EMPTYPOLL_MAX  512

/* internal state shared by all power-managed queues */
static struct pmd_conf_data {
	/** what do we support? */
	struct rte_cpu_intrinsics intrinsics_support;
	/** pre-calculated tsc diff for 1us */
	uint64_t tsc_per_us;
	/** how many rte_pause can we fit in a microsecond? */
	uint64_t pause_per_us;
} global_data;

/**
 * Possible power management states of an ethdev port.
 */
enum pmd_mgmt_state {
	/** Device power management is disabled. */
	PMD_MGMT_DISABLED = 0,
	/** Device power management is enabled. */
	PMD_MGMT_ENABLED
};

struct pmd_queue_cfg {
	volatile enum pmd_mgmt_state pwr_mgmt_state;
	/**< State of power management for this queue */
	enum rte_power_pmd_mgmt_type cb_mode;
	/**< Callback mode for this queue */
	const struct rte_eth_rxtx_callback *cur_cb;
	/**< Callback instance */
	volatile bool umwait_in_progress;
	/**< are we currently sleeping? */
	uint64_t empty_poll_stats;
	/**< Number of empty polls */
} __rte_cache_aligned;

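/* per-queue power management state, indexed by port id and queue id */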
static struct pmd_queue_cfg port_cfg[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];

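/*
 * Pre-compute the timing data used by the PAUSE callback: TSC ticks per
 * microsecond and, when TPAUSE is not available, how many rte_pause() calls
 * fit into one microsecond.
 */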
static void
calc_tsc(void)
{
	const uint64_t hz = rte_get_timer_hz();
	const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */

	global_data.tsc_per_us = tsc_per_us;

	/* only do this if we don't have tpause */
	if (!global_data.intrinsics_support.power_pause) {
		const uint64_t start = rte_rdtsc_precise();
		const uint32_t n_pauses = 10000;
		double us, us_per_pause;
		uint64_t end;
		unsigned int i;

		/* estimate number of rte_pause() calls per us */
		for (i = 0; i < n_pauses; i++)
			rte_pause();

		end = rte_rdtsc_precise();
		us = (end - start) / (double)tsc_per_us;
		us_per_pause = us / n_pauses;

		global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
	}
}

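/*
 * RX callback for RTE_POWER_MGMT_TYPE_MONITOR: once a queue has seen more
 * than EMPTYPOLL_MAX consecutive empty polls, fetch the device's monitor
 * condition and sleep via rte_power_monitor() until traffic arrives or a
 * wakeup is issued from the disable path.
 */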
static uint16_t
clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
		uint16_t nb_rx, uint16_t max_pkts __rte_unused,
		void *addr __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
			struct rte_power_monitor_cond pmc;
			uint16_t ret;

			/*
			 * we might get a cancellation request while being
			 * inside the callback, in which case the wakeup
			 * wouldn't work because it would've arrived too early.
			 *
			 * to get around this, we notify the other thread that
			 * we're sleeping, so that it can spin until we're done.
			 * unsolicited wakeups are perfectly safe.
			 */
			q_conf->umwait_in_progress = true;

			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

			/* check if we need to cancel sleep */
			if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
				/* use monitoring condition to sleep */
				ret = rte_eth_get_monitor_addr(port_id, qidx,
						&pmc);
				if (ret == 0)
					rte_power_monitor(&pmc, UINT64_MAX);
			}
			q_conf->umwait_in_progress = false;

			rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
		}
	} else
		q_conf->empty_poll_stats = 0;

	return nb_rx;
}

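/*
 * RX callback for RTE_POWER_MGMT_TYPE_PAUSE: once a queue has seen more than
 * EMPTYPOLL_MAX consecutive empty polls, sleep for roughly one microsecond,
 * using TPAUSE when the CPU supports it and a calibrated rte_pause() loop
 * otherwise.
 */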
static uint16_t
clb_pause(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
		uint16_t nb_rx, uint16_t max_pkts __rte_unused,
		void *addr __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		/* sleep for 1 microsecond */
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
			/* use tpause if we have it */
			if (global_data.intrinsics_support.power_pause) {
				const uint64_t cur = rte_rdtsc();
				const uint64_t wait_tsc =
						cur + global_data.tsc_per_us;
				rte_power_pause(wait_tsc);
			} else {
				uint64_t i;
				for (i = 0; i < global_data.pause_per_us; i++)
					rte_pause();
			}
		}
	} else
		q_conf->empty_poll_stats = 0;

	return nb_rx;
}

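/*
 * RX callback for RTE_POWER_MGMT_TYPE_SCALE: scale the polling lcore's
 * frequency down after a streak of empty polls, and back up to maximum as
 * soon as traffic is seen again.
 */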
static uint16_t
clb_scale_freq(uint16_t port_id, uint16_t qidx,
		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
{
	struct pmd_queue_cfg *q_conf;

	q_conf = &port_cfg[port_id][qidx];

	if (unlikely(nb_rx == 0)) {
		q_conf->empty_poll_stats++;
		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX))
			/* scale down freq */
			rte_power_freq_min(rte_lcore_id());
	} else {
		q_conf->empty_poll_stats = 0;
		/* scale up freq */
		rte_power_freq_max(rte_lcore_id());
	}

	return nb_rx;
}

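/*
 * Enable power management on a given RX queue. The power saving itself is
 * implemented entirely by the RX callback installed here; this function only
 * validates the request and picks the callback matching the requested mode.
 *
 * Illustrative usage sketch (not part of this file): lcore_id is the lcore
 * that will poll the queue, port_id/queue_id are placeholders, and the mode
 * and error handling are application-specific.
 *
 *	ret = rte_power_ethdev_pmgmt_queue_enable(lcore_id, port_id,
 *			queue_id, RTE_POWER_MGMT_TYPE_MONITOR);
 *	if (ret < 0)
 *		rte_exit(EXIT_FAILURE, "Cannot enable power management\n");
 *	...
 *	rte_power_ethdev_pmgmt_queue_disable(lcore_id, port_id, queue_id);
 */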
int
rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
		uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
{
	struct pmd_queue_cfg *queue_cfg;
	struct rte_eth_dev_info info;
	int ret;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
		ret = -EINVAL;
		goto end;
	}

	if (rte_eth_dev_info_get(port_id, &info) < 0) {
		ret = -EINVAL;
		goto end;
	}

	/* check if queue id is valid */
	if (queue_id >= info.nb_rx_queues) {
		ret = -EINVAL;
		goto end;
	}

	queue_cfg = &port_cfg[port_id][queue_id];

	if (queue_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED) {
		ret = -EINVAL;
		goto end;
	}

	/* intrinsics support data is used by several modes below */
	rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);

	switch (mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR:
	{
		struct rte_power_monitor_cond dummy;

		/* check if rte_power_monitor is supported */
		if (!global_data.intrinsics_support.power_monitor) {
			RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
			ret = -ENOTSUP;
			goto end;
		}

		/* check if the device supports the necessary PMD API */
		if (rte_eth_get_monitor_addr(port_id, queue_id,
				&dummy) == -ENOTSUP) {
			RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->umwait_in_progress = false;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/* ensure we update our state before callback starts */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
				clb_umwait, NULL);
		break;
	}
	case RTE_POWER_MGMT_TYPE_SCALE:
	{
		enum power_management_env env;
		/* only PSTATE and ACPI modes are supported */
		if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
				!rte_power_check_env_supported(
					PM_ENV_PSTATE_CPUFREQ)) {
			RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* make sure we can initialize the power library */
		if (rte_power_init(lcore_id)) {
			ret = -EINVAL;
			goto end;
		}
		/* ensure we initialized the correct env */
		env = rte_power_get_env();
		if (env != PM_ENV_ACPI_CPUFREQ &&
				env != PM_ENV_PSTATE_CPUFREQ) {
			RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
			ret = -ENOTSUP;
			goto end;
		}
		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/* not strictly necessary here, but do it for consistency */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
				queue_id, clb_scale_freq, NULL);
		break;
	}
	case RTE_POWER_MGMT_TYPE_PAUSE:
		/* figure out various time-to-tsc conversions */
		if (global_data.tsc_per_us == 0)
			calc_tsc();

		/* initialize data before enabling the callback */
		queue_cfg->empty_poll_stats = 0;
		queue_cfg->cb_mode = mode;
		queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;

		/* not strictly necessary here, but do it for consistency */
		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
				clb_pause, NULL);
		break;
	}
	ret = 0;
end:
	return ret;
}

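/*
 * Disable power management on a given RX queue. For the MONITOR mode, spin
 * until any in-progress sleep on the target lcore has been woken up before
 * removing the RX callback; for the SCALE mode, restore the maximum
 * frequency and release the per-lcore power library state.
 */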
int
rte_power_ethdev_pmgmt_queue_disable(unsigned int lcore_id,
		uint16_t port_id, uint16_t queue_id)
{
	struct pmd_queue_cfg *queue_cfg;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);

	if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
		return -EINVAL;

	/* no need to check queue id: an invalid queue could not be enabled */
	queue_cfg = &port_cfg[port_id][queue_id];

	if (queue_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
		return -EINVAL;

	/* stop any callbacks from progressing */
	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;

	/* ensure we update our state before continuing */
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);

	switch (queue_cfg->cb_mode) {
	case RTE_POWER_MGMT_TYPE_MONITOR:
	{
		bool exit = false;
		do {
			/*
			 * we may request cancellation while the other thread
			 * has just entered the callback but hasn't started
			 * sleeping yet, so keep waking it up until we know it's
			 * done sleeping.
			 */
			if (queue_cfg->umwait_in_progress)
				rte_power_monitor_wakeup(lcore_id);
			else
				exit = true;
		} while (!exit);
	}
	/* fall-through */
	case RTE_POWER_MGMT_TYPE_PAUSE:
		rte_eth_remove_rx_callback(port_id, queue_id,
				queue_cfg->cur_cb);
		break;
	case RTE_POWER_MGMT_TYPE_SCALE:
		rte_power_freq_max(lcore_id);
		rte_eth_remove_rx_callback(port_id, queue_id,
				queue_cfg->cur_cb);
		rte_power_exit(lcore_id);
		break;
	}
	/*
	 * we don't free the RX callback here because it is unsafe to do so
	 * unless we know for a fact that all data plane threads have stopped.
	 */
	queue_cfg->cur_cb = NULL;

	return 0;
}