/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_common.h>
#include <rte_lcore.h>
#include <rte_rtm.h>
#include <rte_spinlock.h>

#include "rte_power_intrinsics.h"

/*
 * Per-lcore structure holding current status of C0.2 sleeps.
 */
static struct power_wait_status {
	rte_spinlock_t lock;
	volatile void *monitor_addr; /**< NULL if not currently sleeping */
} __rte_cache_aligned wait_status[RTE_MAX_LCORE];

/*
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
static void intel_umonitor(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	/* cast away "volatile" when using the intrinsic */
	_umonitor((void *)(uintptr_t)addr);
#else
	/*
	 * we're using raw byte codes for compiler versions which
	 * don't support this instruction natively.
	 */
	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
			:
			: "D"(addr));
#endif
}

static void intel_umwait(const uint64_t timeout)
{
	const uint32_t tsc_l = (uint32_t)timeout;
	const uint32_t tsc_h = (uint32_t)(timeout >> 32);

#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_umwait(tsc_l, tsc_h);
#else
	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));
#endif
}

/*
 * This function uses MONITORX/MWAITX instructions and will enter C1 state.
 * For more information about usage of these instructions, please refer to
 * AMD64 Architecture Programmer's Manual.
 */
static void amd_monitorx(volatile void *addr)
{
#if defined(__MWAITX__)
	/* cast away "volatile" when using the intrinsic */
	_mm_monitorx((void *)(uintptr_t)addr, 0, 0);
#else
	asm volatile(".byte 0x0f, 0x01, 0xfa;"
			:
			: "a"(addr),
			  "c"(0),  /* no extensions */
			  "d"(0)); /* no hints */
#endif
}

static void amd_mwaitx(const uint64_t timeout)
{
	RTE_SET_USED(timeout);
#if defined(__MWAITX__)
	_mm_mwaitx(0, 0, 0);
#else
	asm volatile(".byte 0x0f, 0x01, 0xfb;"
			: /* ignore rflags */
			: "a"(0),  /* enter C1 */
			  "c"(0)); /* no time-out */
#endif
}

/* vendor-specific monitor/wait implementations, selected at init time */
static struct {
	void (*mmonitor)(volatile void *addr);
	void (*mwait)(const uint64_t timeout);
} __rte_cache_aligned power_monitor_ops;

static inline void
__umwait_wakeup(volatile void *addr)
{
	uint64_t val;

	/* trigger a write but don't change the value */
	val = rte_atomic_load_explicit((volatile __rte_atomic uint64_t *)addr,
			rte_memory_order_relaxed);
	rte_atomic_compare_exchange_strong_explicit((volatile __rte_atomic uint64_t *)addr,
			&val, val, rte_memory_order_relaxed, rte_memory_order_relaxed);
}

static bool wait_supported;
static bool wait_multi_supported;
static bool monitor_supported;

static inline uint64_t
__get_umwait_val(const volatile void *p, const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):
		return *(const volatile uint8_t *)p;
	case sizeof(uint16_t):
		return *(const volatile uint16_t *)p;
	case sizeof(uint32_t):
		return *(const volatile uint32_t *)p;
	case sizeof(uint64_t):
		return *(const volatile uint64_t *)p;
	default:
		/* shouldn't happen */
		RTE_ASSERT(0);
		return 0;
	}
}

static inline int
__check_val_size(const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):  /* fall-through */
	case sizeof(uint16_t): /* fall-through */
	case sizeof(uint32_t): /* fall-through */
	case sizeof(uint64_t):
		return 0;
	default:
		/* unexpected size */
		return -1;
	}
}

/**
 * This function uses the monitor/wait instructions selected at startup:
 * UMONITOR/UMWAIT on Intel (entering C0.2 state) or MONITORX/MWAITX on AMD
 * (entering C1 state). For more information about usage of these
 * instructions, please refer to the Intel(R) 64 and IA-32 Architectures
 * Software Developer's Manual or the AMD64 Architecture Programmer's Manual.
 */
int
rte_power_monitor(const struct rte_power_monitor_cond *pmc,
		const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint64_t cur_value;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL)
		return -EINVAL;

	if (__check_val_size(pmc->size) < 0)
		return -EINVAL;

	if (pmc->fn == NULL)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* update sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = pmc->addr;

	/* set address for memory monitor */
	power_monitor_ops.mmonitor(pmc->addr);

	/* now that we've put this address into monitor, we can unlock */
	rte_spinlock_unlock(&s->lock);

	cur_value = __get_umwait_val(pmc->addr, pmc->size);

	/* check if callback indicates we should abort */
	if (pmc->fn(cur_value, pmc->opaque) != 0)
		goto end;

	/* execute mwait */
	power_monitor_ops.mwait(tsc_timestamp);

end:
	/* erase sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = NULL;
	rte_spinlock_unlock(&s->lock);

	return 0;
}

/**
 * This function uses TPAUSE instruction and will enter C0.2 state. For more
 * information about usage of this instruction, please refer to Intel(R) 64 and
 * IA-32 Architectures Software Developer's Manual.
 */
int
rte_power_pause(const uint64_t tsc_timestamp)
{
	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);

	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* execute TPAUSE */
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_tpause(tsc_l, tsc_h);
#else
	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));
#endif

	return 0;
}

RTE_INIT(rte_power_intrinsics_init) {
	struct rte_cpu_intrinsics i;

	rte_cpu_get_intrinsics_support(&i);

	if (i.power_monitor && i.power_pause)
		wait_supported = 1;
	if (i.power_monitor_multi)
		wait_multi_supported = 1;
	if (i.power_monitor)
		monitor_supported = 1;

	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MONITORX)) {
		power_monitor_ops.mmonitor = &amd_monitorx;
		power_monitor_ops.mwait = &amd_mwaitx;
	} else {
		power_monitor_ops.mmonitor = &intel_umonitor;
		power_monitor_ops.mwait = &intel_umwait;
	}
}

int
rte_power_monitor_wakeup(const unsigned int lcore_id)
{
	struct power_wait_status *s;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent buffer overrun */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/*
	 * There is a race condition between sleep, wakeup and locking, but we
	 * don't need to handle it.
	 *
	 * Possible situations:
	 *
	 * 1. T1 locks, sets address, unlocks
	 * 2. T2 locks, triggers wakeup, unlocks
	 * 3. T1 sleeps
	 *
	 * In this case, because T1 has already set the address for monitoring,
	 * we will wake up immediately even if T2 triggers wakeup before T1
	 * goes to sleep.
	 *
	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
	 * 2. T2 locks, triggers wakeup, and unlocks
	 * 3. T1 locks, erases address, and unlocks
	 *
	 * In this case, since we've already woken up, the "wakeup" was
	 * unneeded, and since T1 is still waiting for T2 to release the lock,
	 * the wakeup address is still valid, so it's perfectly safe to write
	 * to it.
	 *
	 * In the multi-monitor case, the act of locking will in itself trigger
	 * the wakeup, so no additional writes are necessary.
	 */
	rte_spinlock_lock(&s->lock);
	if (s->monitor_addr != NULL)
		__umwait_wakeup(s->monitor_addr);
	rte_spinlock_unlock(&s->lock);

	return 0;
}

int
rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
		const uint32_t num, const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint32_t i, rc;

	/* check if supported */
	if (!wait_multi_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL || num == 0)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* we are already inside a transaction region, return */
	if (rte_xtest() != 0)
		return 0;

	/* start new transaction region */
	rc = rte_xbegin();

	/* transaction aborted, possibly due to a write to one of the wait addresses */
	if (rc != RTE_XBEGIN_STARTED)
		return 0;

	/*
	 * the mere act of reading the lock status here adds the lock to the
	 * read set. This means that when we trigger a wakeup from another
	 * thread, even if we don't have a defined wakeup address and thus
	 * don't actually cause any writes, the act of locking our lock will
	 * itself trigger the wakeup and abort the transaction.
	 */
	rte_spinlock_is_locked(&s->lock);

	/*
	 * add all addresses to wait on into the transaction read-set and check
	 * if any of the wakeup conditions are already met.
	 */
	rc = 0;
	for (i = 0; i < num; i++) {
		const struct rte_power_monitor_cond *c = &pmc[i];

		/* cannot be NULL */
		if (c->fn == NULL) {
			rc = -EINVAL;
			break;
		}

		const uint64_t val = __get_umwait_val(c->addr, c->size);

		/* abort if callback indicates that we need to stop */
		if (c->fn(val, c->opaque) != 0)
			break;
	}

	/* none of the conditions were met, sleep until timeout */
	if (i == num)
		rte_power_pause(tsc_timestamp);

	/* end transaction region */
	rte_xend();

	return rc;
}
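
/*
 * Illustrative usage sketch (editor's addition, guarded out with "#if 0" so
 * it is never compiled as part of this file). It shows how a caller might
 * fill in struct rte_power_monitor_cond and sleep until a 32-bit flag is set
 * or a TSC deadline passes. The callback convention (nonzero return aborts
 * the sleep) and the field names come from this file and from
 * rte_power_intrinsics.h; the helper names flag_set_cb and wait_for_flag are
 * made up for the example, and the one-millisecond deadline is arbitrary.
 */
#if 0
#include <rte_cycles.h>

/* abort the sleep once the monitored 32-bit flag becomes nonzero */
static int
flag_set_cb(const uint64_t val, const uint64_t *opaque)
{
	RTE_SET_USED(opaque);
	return val != 0 ? -1 : 0;
}

/* block on *flag for at most roughly one millisecond of TSC time */
static void
wait_for_flag(volatile uint32_t *flag)
{
	struct rte_power_monitor_cond pmc = {
		.addr = flag,
		.fn = flag_set_cb,
		.size = sizeof(*flag),
	};
	const uint64_t deadline = rte_rdtsc() + rte_get_tsc_hz() / 1000;

	/* returns -ENOTSUP when the CPU lacks monitor/wait support */
	rte_power_monitor(&pmc, deadline);
}
#endif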