/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_common.h>
#include <rte_lcore.h>
#include <rte_rtm.h>
#include <rte_spinlock.h>

#include "rte_power_intrinsics.h"

/*
 * Per-lcore structure holding current status of C0.2 sleeps.
 */
static struct power_wait_status {
	rte_spinlock_t lock;
	volatile void *monitor_addr; /**< NULL if not currently sleeping */
} __rte_cache_aligned wait_status[RTE_MAX_LCORE];

/*
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
static void intel_umonitor(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	/* cast away "volatile" when using the intrinsic */
	_umonitor((void *)(uintptr_t)addr);
#else
	/*
	 * we're using raw byte codes for compiler versions which
	 * don't support this instruction natively.
	 */
	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
			:
			: "D"(addr));
#endif
}

static void intel_umwait(const uint64_t timeout)
{
	const uint32_t tsc_l = (uint32_t)timeout;
	const uint32_t tsc_h = (uint32_t)(timeout >> 32);

#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_umwait(tsc_l, tsc_h);
#else
	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));
#endif
}

/*
 * This function uses MONITORX/MWAITX instructions and will enter C1 state.
 * For more information about usage of these instructions, please refer to
 * AMD64 Architecture Programmer's Manual.
 */
static void amd_monitorx(volatile void *addr)
{
#if defined(__MWAITX__)
	/* cast away "volatile" when using the intrinsic */
	_mm_monitorx((void *)(uintptr_t)addr, 0, 0);
#else
	asm volatile(".byte 0x0f, 0x01, 0xfa;"
			:
			: "a"(addr),
			"c"(0),  /* no extensions */
			"d"(0)); /* no hints */
#endif
}

static void amd_mwaitx(const uint64_t timeout)
{
	RTE_SET_USED(timeout);
#if defined(__MWAITX__)
	_mm_mwaitx(0, 0, 0);
#else
	asm volatile(".byte 0x0f, 0x01, 0xfb;"
			: /* ignore rflags */
			: "a"(0), /* enter C1 */
			"c"(0)); /* no time-out */
#endif
}

static struct {
	void (*mmonitor)(volatile void *addr);
	void (*mwait)(const uint64_t timeout);
} __rte_cache_aligned power_monitor_ops;

static inline void
__umwait_wakeup(volatile void *addr)
{
	uint64_t val;

	/* trigger a write but don't change the value */
	val = rte_atomic_load_explicit((volatile __rte_atomic uint64_t *)addr,
			rte_memory_order_relaxed);
	rte_atomic_compare_exchange_strong_explicit((volatile __rte_atomic uint64_t *)addr,
			&val, val, rte_memory_order_relaxed, rte_memory_order_relaxed);
}

static bool wait_supported;
static bool wait_multi_supported;
static bool monitor_supported;

static inline uint64_t
__get_umwait_val(const volatile void *p, const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):
		return *(const volatile uint8_t *)p;
	case sizeof(uint16_t):
		return *(const volatile uint16_t *)p;
	case sizeof(uint32_t):
		return *(const volatile uint32_t *)p;
	case sizeof(uint64_t):
		return *(const volatile uint64_t *)p;
	default:
		/* shouldn't happen */
		RTE_ASSERT(0);
		return 0;
	}
}

static inline int
__check_val_size(const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):  /* fall-through */
	case sizeof(uint16_t): /* fall-through */
	case sizeof(uint32_t): /* fall-through */
	case sizeof(uint64_t): /* fall-through */
		return 0;
	default:
		/* unexpected size */
		return -1;
	}
}

/**
 * This function uses UMONITOR/UMWAIT (Intel) or MONITORX/MWAITX (AMD)
 * instructions, selected at init time, and will enter the C0.2 or C1 state
 * respectively. For more information about usage of these instructions,
 * please refer to the respective vendor's Software Developer's Manual.
 */
int
rte_power_monitor(const struct rte_power_monitor_cond *pmc,
		const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint64_t cur_value;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL)
		return -EINVAL;

	if (__check_val_size(pmc->size) < 0)
		return -EINVAL;

	if (pmc->fn == NULL)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* update sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = pmc->addr;

	/* set address for memory monitor */
	power_monitor_ops.mmonitor(pmc->addr);

	/* now that we've put this address into monitor, we can unlock */
	rte_spinlock_unlock(&s->lock);

	cur_value = __get_umwait_val(pmc->addr, pmc->size);

	/* check if callback indicates we should abort */
	if (pmc->fn(cur_value, pmc->opaque) != 0)
		goto end;

	/* execute mwait */
	power_monitor_ops.mwait(tsc_timestamp);

end:
	/* erase sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = NULL;
	rte_spinlock_unlock(&s->lock);

	return 0;
}

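#include <rte_cycles.h> /* rte_get_tsc_cycles(), used by the sketches below */

/*
 * A minimal usage sketch for rte_power_monitor(), not part of the EAL itself:
 * the flag and helper names below are illustrative. The caller describes a
 * 1/2/4/8-byte location to monitor plus a callback; a nonzero callback return
 * aborts the sleep before it starts. The timestamp is an absolute TSC value,
 * so a relative delay is added to the current TSC count.
 */
static uint64_t example_wake_flag; /* hypothetical application flag */

static int
example_monitor_clb(const uint64_t val, const uint64_t expected)
{
	/* abort the sleep if the flag no longer holds the expected value */
	return val != expected;
}

static __rte_unused void
example_monitor_sleep(void)
{
	const struct rte_power_monitor_cond pmc = {
		.addr = &example_wake_flag,
		.fn = example_monitor_clb,
		.opaque = 0, /* value the flag is expected to still hold */
		.size = sizeof(example_wake_flag),
	};

	/*
	 * sleep until the monitored flag is written to, another thread calls
	 * rte_power_monitor_wakeup() for this lcore, or ~10k cycles elapse
	 */
	(void)rte_power_monitor(&pmc, rte_get_tsc_cycles() + 10000);
}
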
/**
 * This function uses the TPAUSE instruction and will enter C0.2 state. For
 * more information about usage of this instruction, please refer to Intel(R)
 * 64 and IA-32 Architectures Software Developer's Manual.
 */
int
rte_power_pause(const uint64_t tsc_timestamp)
{
	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);

	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* execute TPAUSE */
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_tpause(tsc_l, tsc_h);
#else
	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			"a"(tsc_l), "d"(tsc_h));
#endif

	return 0;
}

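/*
 * A minimal usage sketch for rte_power_pause(), with an illustrative delay:
 * as with rte_power_monitor(), the argument is an absolute TSC timestamp,
 * not a relative cycle count.
 */
static __rte_unused void
example_pause(void)
{
	/*
	 * power-optimized pause of roughly 10k TSC cycles; returns -ENOTSUP
	 * when WAITPKG is unavailable on this CPU
	 */
	(void)rte_power_pause(rte_get_tsc_cycles() + 10000);
}
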
RTE_INIT(rte_power_intrinsics_init) {
	struct rte_cpu_intrinsics i;

	rte_cpu_get_intrinsics_support(&i);

	if (i.power_monitor && i.power_pause)
		wait_supported = 1;
	if (i.power_monitor_multi)
		wait_multi_supported = 1;
	if (i.power_monitor)
		monitor_supported = 1;

	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MONITORX)) {
		power_monitor_ops.mmonitor = &amd_monitorx;
		power_monitor_ops.mwait = &amd_mwaitx;
	} else {
		power_monitor_ops.mmonitor = &intel_umonitor;
		power_monitor_ops.mwait = &intel_umwait;
	}
}

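/*
 * A minimal sketch of the application-side counterpart to the constructor
 * above: rte_cpu_get_intrinsics_support() exposes the same capability bits,
 * letting callers pick a power-optimized or busy-polling path up front.
 */
static __rte_unused bool
example_power_monitor_usable(void)
{
	struct rte_cpu_intrinsics intr;

	rte_cpu_get_intrinsics_support(&intr);
	return intr.power_monitor != 0;
}
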
int
rte_power_monitor_wakeup(const unsigned int lcore_id)
{
	struct power_wait_status *s;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent buffer overrun */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/*
	 * There is a race condition between sleep, wakeup and locking, but we
	 * don't need to handle it.
	 *
	 * Possible situations:
	 *
	 * 1. T1 locks, sets address, unlocks
	 * 2. T2 locks, triggers wakeup, unlocks
	 * 3. T1 sleeps
	 *
	 * In this case, because T1 has already set the address for monitoring,
	 * we will wake up immediately even if T2 triggers wakeup before T1
	 * goes to sleep.
	 *
	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
	 * 2. T2 locks, triggers wakeup, and unlocks
	 * 3. T1 locks, erases address, and unlocks
	 *
	 * In this case, since we've already woken up, the "wakeup" was
	 * unneeded, and since T1 is still waiting for T2 to release the lock,
	 * the wakeup address is still valid, so it's perfectly safe to write
	 * to it.
	 *
	 * In the multi-monitor case, the act of locking will itself trigger
	 * the wakeup, so no additional writes are necessary.
	 */
	rte_spinlock_lock(&s->lock);
	if (s->monitor_addr != NULL)
		__umwait_wakeup(s->monitor_addr);
	rte_spinlock_unlock(&s->lock);

	return 0;
}

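/*
 * A minimal usage sketch for rte_power_monitor_wakeup(): a control thread can
 * prematurely wake lcores sleeping in rte_power_monitor(). Waking an lcore
 * that is not currently sleeping is harmless, so it is safe to simply wake
 * every worker.
 */
static __rte_unused void
example_wakeup_workers(void)
{
	unsigned int lcore_id;

	/* trigger a dummy write to each sleeping worker's monitored address */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		(void)rte_power_monitor_wakeup(lcore_id);
}
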
int
rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
		const uint32_t num, const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint32_t i;
	int rc;

	/* check if supported */
	if (!wait_multi_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from overrunning the per-lcore status array */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL || num == 0)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* we are already inside transaction region, return */
	if (rte_xtest() != 0)
		return 0;

	/*
	 * start new transaction region; an abort means a possible write to one
	 * of the wait addresses, so there is nothing left to wait for
	 */
	if (rte_xbegin() != RTE_XBEGIN_STARTED)
		return 0;

	/*
	 * the mere act of reading the lock status here adds the lock to
	 * the read set. This means that when we trigger a wakeup from another
	 * thread, even if we don't have a defined wakeup address and thus don't
	 * actually cause any writes, the act of locking our lock will itself
	 * trigger the wakeup and abort the transaction.
	 */
	rte_spinlock_is_locked(&s->lock);

	/*
	 * add all addresses to wait on into the transaction read-set and check
	 * if any of the wakeup conditions are already met.
	 */
	rc = 0;
	for (i = 0; i < num; i++) {
		const struct rte_power_monitor_cond *c = &pmc[i];

		/* cannot be NULL */
		if (c->fn == NULL) {
			rc = -EINVAL;
			break;
		}

		const uint64_t val = __get_umwait_val(c->addr, c->size);

		/* abort if callback indicates that we need to stop */
		if (c->fn(val, c->opaque) != 0)
			break;
	}

	/* none of the conditions were met, sleep until timeout */
	if (i == num)
		rte_power_pause(tsc_timestamp);

	/* end transaction region */
	rte_xend();

	return rc;
}

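/*
 * A minimal usage sketch for rte_power_monitor_multi(), with two illustrative
 * flags sharing the callback from the earlier sketch: the sleep ends when any
 * monitored location is written to, when any condition callback reports a hit
 * up front, or when the timeout expires. Note that this API additionally
 * requires RTM (TSX) support and returns -ENOTSUP without it.
 */
static uint64_t example_wake_flags[2]; /* hypothetical application flags */

static __rte_unused void
example_monitor_multi_sleep(void)
{
	const struct rte_power_monitor_cond pmc[2] = {
		{
			.addr = &example_wake_flags[0],
			.fn = example_monitor_clb,
			.opaque = 0,
			.size = sizeof(example_wake_flags[0]),
		},
		{
			.addr = &example_wake_flags[1],
			.fn = example_monitor_clb,
			.opaque = 0,
			.size = sizeof(example_wake_flags[1]),
		},
	};

	(void)rte_power_monitor_multi(pmc, RTE_DIM(pmc),
			rte_get_tsc_cycles() + 10000);
}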