/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <stdalign.h>

#include <rte_common.h>
#include <rte_lcore.h>
#include <rte_lcore_var.h>
#include <rte_rtm.h>
#include <rte_spinlock.h>

#include "rte_power_intrinsics.h"

/*
 * Per-lcore structure holding the current status of monitor-based sleeps
 * (C0.2 on Intel, C1 on AMD).
 */
struct power_wait_status {
	rte_spinlock_t lock;
	volatile void *monitor_addr; /**< NULL if not currently sleeping */
};

RTE_LCORE_VAR_HANDLE(struct power_wait_status, wait_status);

RTE_LCORE_VAR_INIT(wait_status);

/*
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
static void intel_umonitor(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	/* cast away "volatile" when using the intrinsic */
	_umonitor((void *)(uintptr_t)addr);
#else
	/*
	 * we're using raw byte codes for compiler versions which
	 * don't support this instruction natively.
	 */
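	/* these bytes encode "umonitor %rdi"; the "D" constraint places addr in rdi */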
	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
			:
			: "D"(addr));
#endif
}

static void intel_umwait(const uint64_t timeout)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_umwait(0, timeout);
#else
	const uint32_t tsc_l = (uint32_t)timeout;
	const uint32_t tsc_h = (uint32_t)(timeout >> 32);

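	/*
	 * these bytes encode "umwait %edi": edi holds the control word
	 * (0 selects C0.2) and edx:eax hold the 64-bit TSC deadline
	 */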
	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));
#endif
}

/*
 * This function uses MONITORX/MWAITX instructions and will enter C1 state.
 * For more information about usage of these instructions, please refer to
 * AMD64 Architecture Programmer's Manual.
 */
static void amd_monitorx(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__MWAITX__)
	/* cast away "volatile" when using the intrinsic */
	_mm_monitorx((void *)(uintptr_t)addr, 0, 0);
#else
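	/*
	 * raw encoding of MONITORX for toolchains without the intrinsic;
	 * rax holds the address to monitor, ecx/edx pass extensions/hints
	 */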
	asm volatile(".byte 0x0f, 0x01, 0xfa;"
			:
			: "a"(addr),
			"c"(0),  /* no extensions */
			"d"(0)); /* no hints */
#endif
}

static void amd_mwaitx(const uint64_t timeout)
{
	RTE_SET_USED(timeout);
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__MWAITX__)
	_mm_mwaitx(0, 0, 0);
#else
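	/*
	 * raw encoding of MWAITX for toolchains without the intrinsic;
	 * eax holds the hints, ecx the extensions (bit 0 would enable the
	 * ebx timeout, left clear here)
	 */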
	asm volatile(".byte 0x0f, 0x01, 0xfb;"
			: /* ignore rflags */
			: "a"(0), /* enter C1 */
			"c"(0)); /* no time-out */
#endif
}

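/*
 * Dispatch table for the vendor-specific monitor/wait implementations,
 * filled in at init time based on the detected CPU.
 */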
static alignas(RTE_CACHE_LINE_SIZE) struct {
	void (*mmonitor)(volatile void *addr);
	void (*mwait)(const uint64_t timeout);
} power_monitor_ops;

static inline void
__umwait_wakeup(volatile void *addr)
{
	uint64_t val;

	/*
	 * trigger a write to the monitored cache line without changing the
	 * value; any store to the armed address wakes the sleeping core
	 */
	val = rte_atomic_load_explicit((volatile __rte_atomic uint64_t *)addr,
			rte_memory_order_relaxed);
	rte_atomic_compare_exchange_strong_explicit((volatile __rte_atomic uint64_t *)addr,
			&val, val, rte_memory_order_relaxed, rte_memory_order_relaxed);
}

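/* feature availability, probed once at init time */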
static bool wait_supported;
static bool wait_multi_supported;
static bool monitor_supported;

static inline uint64_t
__get_umwait_val(const volatile void *p, const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):
		return *(const volatile uint8_t *)p;
	case sizeof(uint16_t):
		return *(const volatile uint16_t *)p;
	case sizeof(uint32_t):
		return *(const volatile uint32_t *)p;
	case sizeof(uint64_t):
		return *(const volatile uint64_t *)p;
	default:
		/* shouldn't happen */
		RTE_ASSERT(0);
		return 0;
	}
}

static inline int
__check_val_size(const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):  /* fall-through */
	case sizeof(uint16_t): /* fall-through */
	case sizeof(uint32_t): /* fall-through */
	case sizeof(uint64_t): /* fall-through */
		return 0;
	default:
		/* unexpected size */
		return -1;
	}
}

/**
 * Arm the monitor and sleep: UMONITOR/UMWAIT (C0.2) on Intel, or
 * MONITORX/MWAITX (C1) on AMD, as selected at init time. For details on
 * these instructions, refer to the Intel(R) 64 and IA-32 Architectures
 * Software Developer's Manual and the AMD64 Architecture Programmer's Manual.
 */
int
rte_power_monitor(const struct rte_power_monitor_cond *pmc,
		const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint64_t cur_value;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL)
		return -EINVAL;

	if (__check_val_size(pmc->size) < 0)
		return -EINVAL;

	if (pmc->fn == NULL)
		return -EINVAL;

	s = RTE_LCORE_VAR_LCORE(lcore_id, wait_status);

	/* update sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = pmc->addr;

	/* set address for memory monitor */
	power_monitor_ops.mmonitor(pmc->addr);

	/* now that we've put this address into monitor, we can unlock */
	rte_spinlock_unlock(&s->lock);

	cur_value = __get_umwait_val(pmc->addr, pmc->size);

	/* check if callback indicates we should abort */
	if (pmc->fn(cur_value, pmc->opaque) != 0)
		goto end;

	/* execute mwait */
	power_monitor_ops.mwait(tsc_timestamp);

end:
	/* erase sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = NULL;
	rte_spinlock_unlock(&s->lock);

	return 0;
}

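/*
 * Minimal usage sketch for the API above (illustrative only; wake_cb,
 * wake_flag, expected_val and max_wait_cycles are hypothetical):
 *
 *	static int wake_cb(const uint64_t val, const uint64_t opaque)
 *	{
 *		// a non-zero return value aborts the sleep
 *		return val != opaque ? -1 : 0;
 *	}
 *
 *	const struct rte_power_monitor_cond pmc = {
 *		.addr = &wake_flag,
 *		.fn = wake_cb,
 *		.opaque = expected_val,
 *		.size = sizeof(uint64_t),
 *	};
 *	// sleep until wake_flag is written, or until the TSC deadline passes
 *	rte_power_monitor(&pmc, rte_get_tsc_cycles() + max_wait_cycles);
 */
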
/**
 * This function uses the TPAUSE instruction and will enter C0.2 state. For
 * more information about usage of this instruction, please refer to the
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
int
rte_power_pause(const uint64_t tsc_timestamp)
{
	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* execute TPAUSE */
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_tpause(0, tsc_timestamp);
#else
	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);

	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			"a"(tsc_l), "d"(tsc_h));
#endif

	return 0;
}

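/*
 * Constructor: probe CPU support once and bind the dispatch table to the
 * AMD (MONITORX/MWAITX) or Intel (UMONITOR/UMWAIT) implementation.
 */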
RTE_INIT(rte_power_intrinsics_init) {
	struct rte_cpu_intrinsics i;

	rte_cpu_get_intrinsics_support(&i);

	if (i.power_monitor && i.power_pause)
		wait_supported = 1;
	if (i.power_monitor_multi)
		wait_multi_supported = 1;
	if (i.power_monitor)
		monitor_supported = 1;

	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MONITORX)) {
		power_monitor_ops.mmonitor = &amd_monitorx;
		power_monitor_ops.mwait = &amd_mwaitx;
	} else {
		power_monitor_ops.mmonitor = &intel_umonitor;
		power_monitor_ops.mwait = &intel_umwait;
	}
}

int
rte_power_monitor_wakeup(const unsigned int lcore_id)
{
	struct power_wait_status *s;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent buffer overrun */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	s = RTE_LCORE_VAR_LCORE(lcore_id, wait_status);

	/*
	 * There is a race condition between sleep, wakeup and locking, but we
	 * don't need to handle it.
	 *
	 * Possible situations:
	 *
	 * 1. T1 locks, sets address, unlocks
	 * 2. T2 locks, triggers wakeup, unlocks
	 * 3. T1 sleeps
	 *
	 * In this case, because T1 has already set the address for monitoring,
	 * we will wake up immediately even if T2 triggers wakeup before T1
	 * goes to sleep.
	 *
	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
	 * 2. T2 locks, triggers wakeup, and unlocks
	 * 3. T1 locks, erases address, and unlocks
	 *
	 * In this case, since we've already woken up, the "wakeup" was
	 * unneeded, and since T1 is still waiting for T2 to release the lock,
	 * the wakeup address is still valid, so it's perfectly safe to write
	 * to it.
	 *
	 * For the multi-monitor case, the act of locking will in itself
	 * trigger the wakeup, so no additional writes are necessary.
	 */
	rte_spinlock_lock(&s->lock);
	if (s->monitor_addr != NULL)
		__umwait_wakeup(s->monitor_addr);
	rte_spinlock_unlock(&s->lock);

	return 0;
}

int
rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
		const uint32_t num, const uint64_t tsc_timestamp)
{
	struct power_wait_status *s = RTE_LCORE_VAR(wait_status);
	uint32_t i;
	int rc;

	/* check if supported */
	if (!wait_multi_supported)
		return -ENOTSUP;

	if (pmc == NULL || num == 0)
		return -EINVAL;

	/* we are already inside transaction region, return */
	if (rte_xtest() != 0)
		return 0;

	/* start new transaction region */
	const unsigned int status = rte_xbegin();

	/* transaction abort, possible write to one of wait addresses */
	if (status != RTE_XBEGIN_STARTED)
		return 0;

	/*
	 * the mere act of reading the lock status here adds the lock to
	 * the read set. This means that when we trigger a wakeup from another
	 * thread, even if we don't have a defined wakeup address and thus don't
	 * actually cause any writes, the act of locking our lock will itself
	 * trigger the wakeup and abort the transaction.
	 */
	rte_spinlock_is_locked(&s->lock);

	/*
	 * add all addresses to wait on into the transaction read-set and check
	 * if any of the wakeup conditions are already met.
	 */
	rc = 0;
	for (i = 0; i < num; i++) {
		const struct rte_power_monitor_cond *c = &pmc[i];

		/* cannot be NULL */
		if (c->fn == NULL) {
			rc = -EINVAL;
			break;
		}

		const uint64_t val = __get_umwait_val(c->addr, c->size);

		/* abort if callback indicates that we need to stop */
		if (c->fn(val, c->opaque) != 0)
			break;
	}

	/* none of the conditions were met, sleep until timeout */
	if (i == num)
		rte_power_pause(tsc_timestamp);

	/* end transaction region */
	rte_xend();

	return rc;
}
371