/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <stdalign.h>

#include <rte_common.h>
#include <rte_lcore.h>
#include <rte_lcore_var.h>
#include <rte_rtm.h>
#include <rte_spinlock.h>

#include "rte_power_intrinsics.h"

/*
 * Per-lcore structure holding current status of C0.2 sleeps.
 */
struct power_wait_status {
	rte_spinlock_t lock;
	volatile void *monitor_addr; /**< NULL if not currently sleeping */
};

RTE_LCORE_VAR_HANDLE(struct power_wait_status, wait_status);

RTE_LCORE_VAR_INIT(wait_status);

/*
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
static void intel_umonitor(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	/* cast away "volatile" when using the intrinsic */
	_umonitor((void *)(uintptr_t)addr);
#else
	/*
	 * we're using raw byte codes for compiler versions which
	 * don't support this instruction natively.
	 */
	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
			:
			: "D"(addr));
#endif
}

static void intel_umwait(const uint64_t timeout)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_umwait(0, timeout);
#else
	const uint32_t tsc_l = (uint32_t)timeout;
	const uint32_t tsc_h = (uint32_t)(timeout >> 32);

	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			"a"(tsc_l), "d"(tsc_h));
#endif
}
/*
 * This function uses MONITORX/MWAITX instructions and will enter C1 state.
 * For more information about usage of these instructions, please refer to
 * AMD64 Architecture Programmer's Manual.
 */
static void amd_monitorx(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__MWAITX__)
	/* cast away "volatile" when using the intrinsic */
	_mm_monitorx((void *)(uintptr_t)addr, 0, 0);
#else
	asm volatile(".byte 0x0f, 0x01, 0xfa;"
			:
			: "a"(addr),
			"c"(0),  /* no extensions */
			"d"(0)); /* no hints */
#endif
}

static void amd_mwaitx(const uint64_t timeout)
{
	RTE_SET_USED(timeout);
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__MWAITX__)
	_mm_mwaitx(0, 0, 0);
#else
	asm volatile(".byte 0x0f, 0x01, 0xfb;"
			: /* ignore rflags */
			: "a"(0), /* enter C1 */
			"c"(0)); /* no time-out */
#endif
}

static alignas(RTE_CACHE_LINE_SIZE) struct {
	void (*mmonitor)(volatile void *addr);
	void (*mwait)(const uint64_t timeout);
} power_monitor_ops;

static inline void
__umwait_wakeup(volatile void *addr)
{
	uint64_t val;

	/* trigger a write but don't change the value */
	val = rte_atomic_load_explicit((volatile __rte_atomic uint64_t *)addr,
			rte_memory_order_relaxed);
	rte_atomic_compare_exchange_strong_explicit((volatile __rte_atomic uint64_t *)addr,
			&val, val, rte_memory_order_relaxed, rte_memory_order_relaxed);
}

static bool wait_supported;
static bool wait_multi_supported;
static bool monitor_supported;

static inline uint64_t
__get_umwait_val(const volatile void *p, const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):
		return *(const volatile uint8_t *)p;
	case sizeof(uint16_t):
		return *(const volatile uint16_t *)p;
	case sizeof(uint32_t):
		return *(const volatile uint32_t *)p;
	case sizeof(uint64_t):
		return *(const volatile uint64_t *)p;
	default:
		/* shouldn't happen */
		RTE_ASSERT(0);
		return 0;
	}
}

static inline int
__check_val_size(const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):  /* fall-through */
	case sizeof(uint16_t): /* fall-through */
	case sizeof(uint32_t): /* fall-through */
	case sizeof(uint64_t): /* fall-through */
		return 0;
	default:
		/* unexpected size */
		return -1;
	}
}
/**
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
int
rte_power_monitor(const struct rte_power_monitor_cond *pmc,
		const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint64_t cur_value;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL)
		return -EINVAL;

	if (__check_val_size(pmc->size) < 0)
		return -EINVAL;

	if (pmc->fn == NULL)
		return -EINVAL;

	s = RTE_LCORE_VAR_LCORE(lcore_id, wait_status);

	/* update sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = pmc->addr;

	/* set address for memory monitor */
	power_monitor_ops.mmonitor(pmc->addr);

	/* now that we've put this address into monitor, we can unlock */
	rte_spinlock_unlock(&s->lock);

	cur_value = __get_umwait_val(pmc->addr, pmc->size);

	/* check if callback indicates we should abort */
	if (pmc->fn(cur_value, pmc->opaque) != 0)
		goto end;

	/* execute mwait */
	power_monitor_ops.mwait(tsc_timestamp);

end:
	/* erase sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = NULL;
	rte_spinlock_unlock(&s->lock);

	return 0;
}
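/*
 * Usage sketch for rte_power_monitor() above. This is an illustrative,
 * non-compiled example only: the queue, its producer-updated "tail" word and
 * the callback are hypothetical application-side names, and the exact
 * rte_power_monitor_clb_t signature is defined in rte_power_intrinsics.h.
 * Only the condition fields (addr, size, fn, opaque), the abort-on-non-zero
 * callback contract and the TSC-based timeout are taken from the code above.
 *
 *	struct rte_power_monitor_cond pmc = {
 *		.addr = &q->tail,             // word written by the producer
 *		.size = sizeof(q->tail),      // must be 1, 2, 4 or 8 bytes
 *		.fn = q_has_work_cb,          // returns non-zero to abort the sleep
 *		.opaque = { last_seen_tail }, // callback-specific data
 *	};
 *
 *	// sleep until the monitored word is written, the callback aborts,
 *	// or roughly one millisecond worth of TSC cycles has passed
 *	ret = rte_power_monitor(&pmc, rte_rdtsc() + rte_get_tsc_hz() / 1000);
 *
 * Another thread can cut such a sleep short at any time with
 * rte_power_monitor_wakeup(lcore_id), defined further below.
 */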
/**
 * This function uses TPAUSE instruction and will enter C0.2 state. For more
 * information about usage of this instruction, please refer to Intel(R) 64 and
 * IA-32 Architectures Software Developer's Manual.
 */
int
rte_power_pause(const uint64_t tsc_timestamp)
{
	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* execute TPAUSE */
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_tpause(0, tsc_timestamp);
#else
	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);

	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			"a"(tsc_l), "d"(tsc_h));
#endif

	return 0;
}

RTE_INIT(rte_power_intrinsics_init)
{
	struct rte_cpu_intrinsics i;

	rte_cpu_get_intrinsics_support(&i);

	if (i.power_monitor && i.power_pause)
		wait_supported = 1;
	if (i.power_monitor_multi)
		wait_multi_supported = 1;
	if (i.power_monitor)
		monitor_supported = 1;

	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MONITORX)) {
		power_monitor_ops.mmonitor = &amd_monitorx;
		power_monitor_ops.mwait = &amd_mwaitx;
	} else {
		power_monitor_ops.mmonitor = &intel_umonitor;
		power_monitor_ops.mwait = &intel_umwait;
	}
}

int
rte_power_monitor_wakeup(const unsigned int lcore_id)
{
	struct power_wait_status *s;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent buffer overrun */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	s = RTE_LCORE_VAR_LCORE(lcore_id, wait_status);

	/*
	 * There is a race condition between sleep, wakeup and locking, but we
	 * don't need to handle it.
	 *
	 * Possible situations:
	 *
	 * 1. T1 locks, sets address, unlocks
	 * 2. T2 locks, triggers wakeup, unlocks
	 * 3. T1 sleeps
	 *
	 * In this case, because T1 has already set the address for monitoring,
	 * we will wake up immediately even if T2 triggers wakeup before T1
	 * goes to sleep.
	 *
	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
	 * 2. T2 locks, triggers wakeup, and unlocks
	 * 3. T1 locks, erases address, and unlocks
	 *
	 * In this case, since we've already woken up, the "wakeup" was
	 * unneeded, and since T1 is still waiting on T2 to release the lock,
	 * the wakeup address is still valid, so it's perfectly safe to write
	 * to it.
	 *
	 * For the multi-monitor case, the act of locking will itself trigger
	 * the wakeup, so no additional writes are necessary.
	 */
	rte_spinlock_lock(&s->lock);
	if (s->monitor_addr != NULL)
		__umwait_wakeup(s->monitor_addr);
	rte_spinlock_unlock(&s->lock);

	return 0;
}

int
rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
		const uint32_t num, const uint64_t tsc_timestamp)
{
	struct power_wait_status *s = RTE_LCORE_VAR(wait_status);
	uint32_t i, rc;

	/* check if supported */
	if (!wait_multi_supported)
		return -ENOTSUP;

	if (pmc == NULL || num == 0)
		return -EINVAL;

	/* we are already inside a transaction region, return */
	if (rte_xtest() != 0)
		return 0;

	/* start new transaction region */
	rc = rte_xbegin();

	/* transaction abort, possible write to one of the wait addresses */
	if (rc != RTE_XBEGIN_STARTED)
		return 0;

	/*
	 * The mere act of reading the lock status here adds the lock to the
	 * transaction read set. This means that when we trigger a wakeup from
	 * another thread, even if we don't have a defined wakeup address and
	 * thus don't actually cause any writes, the act of locking our lock
	 * will itself trigger the wakeup and abort the transaction.
	 */
	rte_spinlock_is_locked(&s->lock);

	/*
	 * add all addresses to wait on into the transaction read set and check
	 * if any of the wakeup conditions are already met.
	 */
	rc = 0;
	for (i = 0; i < num; i++) {
		const struct rte_power_monitor_cond *c = &pmc[i];

		/* cannot be NULL */
		if (c->fn == NULL) {
			rc = -EINVAL;
			break;
		}

		const uint64_t val = __get_umwait_val(c->addr, c->size);

		/* abort if callback indicates that we need to stop */
		if (c->fn(val, c->opaque) != 0)
			break;
	}

	/* none of the conditions were met, sleep until timeout */
	if (i == num)
		rte_power_pause(tsc_timestamp);

	/* end transaction region */
	rte_xend();

	return rc;
}
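/*
 * Usage sketch for rte_power_monitor_multi() above. Illustrative, non-compiled
 * example only; the two receive queues and the callback are hypothetical
 * application-side names. As implemented above, the call returns -ENOTSUP when
 * multi-monitoring is not supported, -EINVAL on a NULL condition array, zero
 * length or NULL callback, and otherwise returns once any monitored address is
 * written, a callback reports work, or the TSC deadline passes.
 *
 *	struct rte_power_monitor_cond pmc[2] = {
 *		{ .addr = &rxq0->tail, .size = sizeof(rxq0->tail), .fn = rxq_has_work_cb },
 *		{ .addr = &rxq1->tail, .size = sizeof(rxq1->tail), .fn = rxq_has_work_cb },
 *	};
 *
 *	ret = rte_power_monitor_multi(pmc, RTE_DIM(pmc),
 *			rte_rdtsc() + rte_get_tsc_hz() / 1000);
 */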