xref: /dpdk/lib/eal/x86/rte_power_intrinsics.c (revision e9fd1ebf981f361844aea9ec94e17f4bda5e1479)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4 
5 #include <stdalign.h>
6 
7 #include <rte_common.h>
8 #include <rte_lcore.h>
9 #include <rte_rtm.h>
10 #include <rte_spinlock.h>
11 
12 #include "rte_power_intrinsics.h"
13 
/*
 * Per-lcore structure holding current status of C0.2 sleeps.
 */
static alignas(RTE_CACHE_LINE_SIZE) struct power_wait_status {
	rte_spinlock_t lock; /**< serializes sleep vs. wakeup on this lcore */
	volatile void *monitor_addr; /**< NULL if not currently sleeping */
} wait_status[RTE_MAX_LCORE];
21 
/*
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
static void intel_umonitor(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	/* cast away "volatile" when using the intrinsic */
	_umonitor((void *)(uintptr_t)addr);
#else
	/*
	 * we're using raw byte codes for compiler versions which
	 * don't support this instruction natively.
	 * f3 0f ae f7 == UMONITOR with the address in rdi ("D" constraint).
	 */
	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
			:
			: "D"(addr));
#endif
}
42 
/* wait in C0.2 until the monitored address is written or TSC >= timeout */
static void intel_umwait(const uint64_t timeout)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_umwait(0, timeout);
#else
	/* UMWAIT takes the 64-bit TSC deadline split across edx:eax */
	const uint32_t tsc_l = (uint32_t)timeout;
	const uint32_t tsc_h = (uint32_t)(timeout >> 32);

	/* f2 0f ae f7 == UMWAIT with the state selector in edi */
	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));
#endif
}
57 
/*
 * This function uses MONITORX/MWAITX instructions and will enter C1 state.
 * For more information about usage of these instructions, please refer to
 * AMD64 Architecture Programmer’s Manual.
 */
static void amd_monitorx(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__MWAITX__)
	/* cast away "volatile" when using the intrinsic */
	_mm_monitorx((void *)(uintptr_t)addr, 0, 0);
#else
	/* 0f 01 fa == MONITORX; address in rax, ecx/edx carry ext/hints */
	asm volatile(".byte 0x0f, 0x01, 0xfa;"
			:
			: "a"(addr),
			"c"(0),  /* no extensions */
			"d"(0)); /* no hints */
#endif
}
76 
/*
 * Wait in C1 until the monitored address is written.
 * NOTE: the timeout argument is deliberately ignored here — MWAITX is
 * issued with ecx = 0 ("no time-out"), so only a write (or interrupt)
 * ends the wait.
 */
static void amd_mwaitx(const uint64_t timeout)
{
	RTE_SET_USED(timeout);
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__MWAITX__)
	_mm_mwaitx(0, 0, 0);
#else
	/* 0f 01 fb == MWAITX; eax selects hints, ecx selects extensions */
	asm volatile(".byte 0x0f, 0x01, 0xfb;"
			: /* ignore rflags */
			: "a"(0), /* enter C1 */
			"c"(0)); /* no time-out */
#endif
}
89 
/*
 * Vendor-specific monitor/wait dispatch table, populated once at startup
 * by rte_power_intrinsics_init() based on CPU flags.
 */
static alignas(RTE_CACHE_LINE_SIZE) struct {
	void (*mmonitor)(volatile void *addr); /**< arm monitor on address */
	void (*mwait)(const uint64_t timeout); /**< enter power-optimized wait */
} power_monitor_ops;
94 
/*
 * Wake a core sleeping on 'addr' by performing a dummy store to it: the
 * CAS writes back the value just read, so the monitored data is unchanged
 * but the monitor still observes a write and aborts the wait.
 *
 * NOTE(review): this always accesses 8 bytes at 'addr' regardless of the
 * size originally monitored — assumes the location is safe to read/write
 * as a uint64_t; confirm against rte_power_monitor_cond requirements.
 */
static inline void
__umwait_wakeup(volatile void *addr)
{
	uint64_t val;

	/* trigger a write but don't change the value */
	val = rte_atomic_load_explicit((volatile __rte_atomic uint64_t *)addr,
			rte_memory_order_relaxed);
	rte_atomic_compare_exchange_strong_explicit((volatile __rte_atomic uint64_t *)addr,
			&val, val, rte_memory_order_relaxed, rte_memory_order_relaxed);
}
106 
/* set once at init from rte_cpu_get_intrinsics_support() */
static bool wait_supported;       /* power_monitor && power_pause */
static bool wait_multi_supported; /* power_monitor_multi */
static bool monitor_supported;    /* power_monitor */
110 
/* Read the monitored location with a load width matching 'sz',
 * zero-extending the result to 64 bits.
 */
static inline uint64_t
__get_umwait_val(const volatile void *p, const uint8_t sz)
{
	if (sz == sizeof(uint8_t))
		return *(const volatile uint8_t *)p;
	if (sz == sizeof(uint16_t))
		return *(const volatile uint16_t *)p;
	if (sz == sizeof(uint32_t))
		return *(const volatile uint32_t *)p;
	if (sz == sizeof(uint64_t))
		return *(const volatile uint64_t *)p;

	/* shouldn't happen — sizes are validated by __check_val_size() */
	RTE_ASSERT(0);
	return 0;
}
129 
/* Validate a monitor-condition size: only 1, 2, 4 or 8-byte loads are
 * supported. Returns 0 when valid, -1 otherwise.
 */
static inline int
__check_val_size(const uint8_t sz)
{
	const bool valid = sz == sizeof(uint8_t) || sz == sizeof(uint16_t) ||
			sz == sizeof(uint32_t) || sz == sizeof(uint64_t);

	return valid ? 0 : -1;
}
144 
/**
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 *
 * Returns 0 on wakeup/abort, -ENOTSUP if monitoring is unsupported, or
 * -EINVAL on invalid parameters or when called from a non-EAL thread.
 */
int
rte_power_monitor(const struct rte_power_monitor_cond *pmc,
		const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint64_t cur_value;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL)
		return -EINVAL;

	/* only 1/2/4/8-byte monitored values are supported */
	if (__check_val_size(pmc->size) < 0)
		return -EINVAL;

	if (pmc->fn == NULL)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* update sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = pmc->addr;

	/* set address for memory monitor */
	power_monitor_ops.mmonitor(pmc->addr);

	/* now that we've put this address into monitor, we can unlock */
	rte_spinlock_unlock(&s->lock);

	/* sample the value only after arming the monitor, so that a write
	 * racing with this read is still caught by the mwait below */
	cur_value = __get_umwait_val(pmc->addr, pmc->size);

	/* check if callback indicates we should abort */
	if (pmc->fn(cur_value, pmc->opaque) != 0)
		goto end;

	/* execute mwait */
	power_monitor_ops.mwait(tsc_timestamp);

end:
	/* erase sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = NULL;
	rte_spinlock_unlock(&s->lock);

	return 0;
}
204 
/**
 * This function uses TPAUSE instruction  and will enter C0.2 state. For more
 * information about usage of this instruction, please refer to Intel(R) 64 and
 * IA-32 Architectures Software Developer's Manual.
 *
 * Returns 0 on completion, or -ENOTSUP if TPAUSE is not supported.
 */
int
rte_power_pause(const uint64_t tsc_timestamp)
{
	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* execute TPAUSE */
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_tpause(0, tsc_timestamp);
#else
	/* TPAUSE takes the 64-bit TSC deadline split across edx:eax */
	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);

	/* 66 0f ae f7 == TPAUSE with the state selector in edi */
	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			"a"(tsc_l), "d"(tsc_h));
#endif

	return 0;
}
232 
233 RTE_INIT(rte_power_intrinsics_init) {
234 	struct rte_cpu_intrinsics i;
235 
236 	rte_cpu_get_intrinsics_support(&i);
237 
238 	if (i.power_monitor && i.power_pause)
239 		wait_supported = 1;
240 	if (i.power_monitor_multi)
241 		wait_multi_supported = 1;
242 	if (i.power_monitor)
243 		monitor_supported = 1;
244 
245 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MONITORX)) {
246 		power_monitor_ops.mmonitor = &amd_monitorx;
247 		power_monitor_ops.mwait = &amd_mwaitx;
248 	} else {
249 		power_monitor_ops.mmonitor = &intel_umonitor;
250 		power_monitor_ops.mwait = &intel_umwait;
251 	}
252 }
253 
/**
 * Wake up the given lcore if it is sleeping in rte_power_monitor(), by
 * issuing a dummy write to its currently-monitored address (if any).
 *
 * Returns 0 on success, -ENOTSUP if monitoring is unsupported, or -EINVAL
 * for an out-of-range lcore_id.
 */
int
rte_power_monitor_wakeup(const unsigned int lcore_id)
{
	struct power_wait_status *s;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent buffer overrun */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/*
	 * There is a race condition between sleep, wakeup and locking, but we
	 * don't need to handle it.
	 *
	 * Possible situations:
	 *
	 * 1. T1 locks, sets address, unlocks
	 * 2. T2 locks, triggers wakeup, unlocks
	 * 3. T1 sleeps
	 *
	 * In this case, because T1 has already set the address for monitoring,
	 * we will wake up immediately even if T2 triggers wakeup before T1
	 * goes to sleep.
	 *
	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
	 * 2. T2 locks, triggers wakeup, and unlocks
	 * 3. T1 locks, erases address, and unlocks
	 *
	 * In this case, since we've already woken up, the "wakeup" was
	 * unneeded, and since T1 is still waiting on T2 releasing the lock, the
	 * wakeup address is still valid so it's perfectly safe to write it.
	 *
	 * For multi-monitor case, the act of locking will in itself trigger the
	 * wakeup, so no additional writes necessary.
	 */
	rte_spinlock_lock(&s->lock);
	if (s->monitor_addr != NULL)
		__umwait_wakeup(s->monitor_addr);
	rte_spinlock_unlock(&s->lock);

	return 0;
}
301 
302 int
303 rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
304 		const uint32_t num, const uint64_t tsc_timestamp)
305 {
306 	const unsigned int lcore_id = rte_lcore_id();
307 	struct power_wait_status *s = &wait_status[lcore_id];
308 	uint32_t i, rc;
309 
310 	/* check if supported */
311 	if (!wait_multi_supported)
312 		return -ENOTSUP;
313 
314 	if (pmc == NULL || num == 0)
315 		return -EINVAL;
316 
317 	/* we are already inside transaction region, return */
318 	if (rte_xtest() != 0)
319 		return 0;
320 
321 	/* start new transaction region */
322 	rc = rte_xbegin();
323 
324 	/* transaction abort, possible write to one of wait addresses */
325 	if (rc != RTE_XBEGIN_STARTED)
326 		return 0;
327 
328 	/*
329 	 * the mere act of reading the lock status here adds the lock to
330 	 * the read set. This means that when we trigger a wakeup from another
331 	 * thread, even if we don't have a defined wakeup address and thus don't
332 	 * actually cause any writes, the act of locking our lock will itself
333 	 * trigger the wakeup and abort the transaction.
334 	 */
335 	rte_spinlock_is_locked(&s->lock);
336 
337 	/*
338 	 * add all addresses to wait on into transaction read-set and check if
339 	 * any of wakeup conditions are already met.
340 	 */
341 	rc = 0;
342 	for (i = 0; i < num; i++) {
343 		const struct rte_power_monitor_cond *c = &pmc[i];
344 
345 		/* cannot be NULL */
346 		if (c->fn == NULL) {
347 			rc = -EINVAL;
348 			break;
349 		}
350 
351 		const uint64_t val = __get_umwait_val(c->addr, c->size);
352 
353 		/* abort if callback indicates that we need to stop */
354 		if (c->fn(val, c->opaque) != 0)
355 			break;
356 	}
357 
358 	/* none of the conditions were met, sleep until timeout */
359 	if (i == num)
360 		rte_power_pause(tsc_timestamp);
361 
362 	/* end transaction region */
363 	rte_xend();
364 
365 	return rc;
366 }
367