/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <stdalign.h>

#include <rte_common.h>
#include <rte_lcore.h>
#include <rte_rtm.h>
#include <rte_spinlock.h>

#include "rte_power_intrinsics.h"

/*
 * Per-lcore structure holding current status of C0.2/C1 sleeps.
 */
static alignas(RTE_CACHE_LINE_SIZE) struct power_wait_status {
	rte_spinlock_t lock;
	volatile void *monitor_addr; /**< NULL if not currently sleeping */
} wait_status[RTE_MAX_LCORE];

/*
 * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
 * For more information about usage of these instructions, please refer to
 * the Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
static void intel_umonitor(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	/* cast away "volatile" when using the intrinsic */
	_umonitor((void *)(uintptr_t)addr);
#else
	/*
	 * we're using raw byte codes for compiler versions which
	 * don't support this instruction natively.
	 */
	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
			:
			: "D"(addr));
#endif
}

static void intel_umwait(const uint64_t timeout)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_umwait(0, timeout);
#else
	const uint32_t tsc_l = (uint32_t)timeout;
	const uint32_t tsc_h = (uint32_t)(timeout >> 32);

	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));
#endif
}
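/*
 * Illustrative sketch, not part of the EAL API: the helpers above are meant
 * to be used as "arm the monitor, re-check the condition, then wait".
 * Re-checking after UMONITOR closes the lost-wakeup window: a write that
 * lands in between disarms the monitor, so the subsequent UMWAIT returns
 * immediately instead of sleeping through the event. The function name and
 * parameters below are invented for the example.
 */
static __rte_unused void
example_umonitor_wait(volatile uint64_t *flag, const uint64_t tsc_deadline)
{
	intel_umonitor(flag); /* arm address monitoring first */
	if (*flag == 0) /* condition may already have changed */
		intel_umwait(tsc_deadline); /* sleep until write or timeout */
}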
/*
 * This function uses MONITORX/MWAITX instructions and will enter C1 state.
 * For more information about usage of these instructions, please refer to
 * the AMD64 Architecture Programmer's Manual.
 */
static void amd_monitorx(volatile void *addr)
{
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__MWAITX__)
	/* cast away "volatile" when using the intrinsic */
	_mm_monitorx((void *)(uintptr_t)addr, 0, 0);
#else
	asm volatile(".byte 0x0f, 0x01, 0xfa;"
			:
			: "a"(addr),
			  "c"(0), /* no extensions */
			  "d"(0)); /* no hints */
#endif
}

static void amd_mwaitx(const uint64_t timeout)
{
	/* the TSC-based timeout is deliberately unused by this implementation */
	RTE_SET_USED(timeout);
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__MWAITX__)
	_mm_mwaitx(0, 0, 0);
#else
	asm volatile(".byte 0x0f, 0x01, 0xfb;"
			: /* ignore rflags */
			: "a"(0), /* enter C1 */
			  "c"(0)); /* no time-out */
#endif
}

static alignas(RTE_CACHE_LINE_SIZE) struct {
	void (*mmonitor)(volatile void *addr);
	void (*mwait)(const uint64_t timeout);
} power_monitor_ops;

static inline void
__umwait_wakeup(volatile void *addr)
{
	uint64_t val;

	/* trigger a write but don't change the value */
	val = rte_atomic_load_explicit((volatile __rte_atomic uint64_t *)addr,
			rte_memory_order_relaxed);
	rte_atomic_compare_exchange_strong_explicit(
			(volatile __rte_atomic uint64_t *)addr,
			&val, val, rte_memory_order_relaxed,
			rte_memory_order_relaxed);
}

static bool wait_supported;
static bool wait_multi_supported;
static bool monitor_supported;

static inline uint64_t
__get_umwait_val(const volatile void *p, const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t):
		return *(const volatile uint8_t *)p;
	case sizeof(uint16_t):
		return *(const volatile uint16_t *)p;
	case sizeof(uint32_t):
		return *(const volatile uint32_t *)p;
	case sizeof(uint64_t):
		return *(const volatile uint64_t *)p;
	default:
		/* shouldn't happen */
		RTE_ASSERT(0);
		return 0;
	}
}

static inline int
__check_val_size(const uint8_t sz)
{
	switch (sz) {
	case sizeof(uint8_t): /* fall-through */
	case sizeof(uint16_t): /* fall-through */
	case sizeof(uint32_t): /* fall-through */
	case sizeof(uint64_t): /* fall-through */
		return 0;
	default:
		/* unexpected size */
		return -1;
	}
}
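/*
 * Illustrative usage sketch, not part of the EAL API: a typical caller of
 * rte_power_monitor() (defined below) sleeps until a monitored 64-bit word
 * changes from a previously observed value, or until a TSC deadline passes.
 * The names here are invented for the example.
 */
static int
example_abort_fn(const uint64_t cur_value, const uint64_t opaque)
{
	/* nonzero return skips the sleep: the value has already changed */
	return cur_value != opaque;
}

static __rte_unused int
example_wait_for_update(volatile uint64_t *addr, const uint64_t last_seen,
		const uint64_t tsc_deadline)
{
	const struct rte_power_monitor_cond pmc = {
		.addr = addr,
		.fn = example_abort_fn,
		.opaque = last_seen,
		.size = sizeof(*addr),
	};

	return rte_power_monitor(&pmc, tsc_deadline);
}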
/**
 * This function enters a power-optimized sleep via the monitor/wait pair
 * selected at init time: UMONITOR/UMWAIT (C0.2 state) on Intel, or
 * MONITORX/MWAITX (C1 state) on AMD. For more information about usage of
 * these instructions, please refer to the respective architecture manuals.
 */
int
rte_power_monitor(const struct rte_power_monitor_cond *pmc,
		const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint64_t cur_value;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL)
		return -EINVAL;

	if (__check_val_size(pmc->size) < 0)
		return -EINVAL;

	if (pmc->fn == NULL)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* update sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = pmc->addr;

	/* set address for memory monitor */
	power_monitor_ops.mmonitor(pmc->addr);

	/* now that we've put this address into monitor, we can unlock */
	rte_spinlock_unlock(&s->lock);

	cur_value = __get_umwait_val(pmc->addr, pmc->size);

	/* check if callback indicates we should abort */
	if (pmc->fn(cur_value, pmc->opaque) != 0)
		goto end;

	/* execute mwait */
	power_monitor_ops.mwait(tsc_timestamp);

end:
	/* erase sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = NULL;
	rte_spinlock_unlock(&s->lock);

	return 0;
}

/**
 * This function uses the TPAUSE instruction and will enter C0.2 state. For
 * more information about usage of this instruction, please refer to the
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
 */
int
rte_power_pause(const uint64_t tsc_timestamp)
{
	/* prevent user from running this instruction if it's not supported */
	if (!wait_supported)
		return -ENOTSUP;

	/* execute TPAUSE */
#if defined(RTE_TOOLCHAIN_MSVC) || defined(__WAITPKG__)
	_tpause(0, tsc_timestamp);
#else
	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);

	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));
#endif

	return 0;
}

RTE_INIT(rte_power_intrinsics_init)
{
	struct rte_cpu_intrinsics i;

	rte_cpu_get_intrinsics_support(&i);

	if (i.power_monitor && i.power_pause)
		wait_supported = true;
	if (i.power_monitor_multi)
		wait_multi_supported = true;
	if (i.power_monitor)
		monitor_supported = true;

	/* pick the vendor-specific monitor/wait implementation */
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MONITORX)) {
		power_monitor_ops.mmonitor = &amd_monitorx;
		power_monitor_ops.mwait = &amd_mwaitx;
	} else {
		power_monitor_ops.mmonitor = &intel_umonitor;
		power_monitor_ops.mwait = &intel_umwait;
	}
}
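/*
 * Illustrative sketch, not part of the EAL API: to stop worker lcores that
 * are parked in rte_power_monitor(), publish the new state first, then kick
 * each sleeper via rte_power_monitor_wakeup() (defined below). The function
 * name and parameters here are invented for the example.
 */
static __rte_unused void
example_wake_workers(const unsigned int *worker_ids, const unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++)
		rte_power_monitor_wakeup(worker_ids[i]);
}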
int
rte_power_monitor_wakeup(const unsigned int lcore_id)
{
	struct power_wait_status *s;

	/* prevent user from running this instruction if it's not supported */
	if (!monitor_supported)
		return -ENOTSUP;

	/* prevent buffer overrun */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/*
	 * There is a race condition between sleep, wakeup and locking, but we
	 * don't need to handle it.
	 *
	 * Possible situations:
	 *
	 * 1. T1 locks, sets address, unlocks
	 * 2. T2 locks, triggers wakeup, unlocks
	 * 3. T1 sleeps
	 *
	 * In this case, because T1 has already set the address for monitoring,
	 * we will wake up immediately even if T2 triggers wakeup before T1
	 * goes to sleep.
	 *
	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
	 * 2. T2 locks, triggers wakeup, and unlocks
	 * 3. T1 locks, erases address, and unlocks
	 *
	 * In this case, since we've already woken up, the "wakeup" was
	 * unneeded, and since T1 is blocked waiting for T2 to release the
	 * lock, the wakeup address is still valid, so it's perfectly safe to
	 * write to it.
	 *
	 * For the multi-monitor case, the act of locking will in itself
	 * trigger the wakeup, so no additional writes are necessary.
	 */
	rte_spinlock_lock(&s->lock);
	if (s->monitor_addr != NULL)
		__umwait_wakeup(s->monitor_addr);
	rte_spinlock_unlock(&s->lock);

	return 0;
}

int
rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
		const uint32_t num, const uint64_t tsc_timestamp)
{
	const unsigned int lcore_id = rte_lcore_id();
	struct power_wait_status *s;
	uint32_t i, rc;
	int ret = 0;

	/* check if supported */
	if (!wait_multi_supported)
		return -ENOTSUP;

	/* prevent non-EAL thread from using this API */
	if (lcore_id >= RTE_MAX_LCORE)
		return -EINVAL;

	if (pmc == NULL || num == 0)
		return -EINVAL;

	s = &wait_status[lcore_id];

	/* we are already inside transaction region, return */
	if (rte_xtest() != 0)
		return 0;

	/* start new transaction region */
	rc = rte_xbegin();

	/* transaction abort, possible write to one of wait addresses */
	if (rc != RTE_XBEGIN_STARTED)
		return 0;

	/*
	 * The mere act of reading the lock status here adds the lock to the
	 * read set. This means that when we trigger a wakeup from another
	 * thread, even if we don't have a defined wakeup address and thus
	 * don't actually cause any writes, the act of locking our lock will
	 * itself trigger the wakeup and abort the transaction.
	 */
	rte_spinlock_is_locked(&s->lock);

	/*
	 * add all addresses to wait on into transaction read-set and check if
	 * any of the wakeup conditions are already met.
	 */
	for (i = 0; i < num; i++) {
		const struct rte_power_monitor_cond *c = &pmc[i];

		/* cannot be NULL */
		if (c->fn == NULL) {
			ret = -EINVAL;
			break;
		}

		const uint64_t val = __get_umwait_val(c->addr, c->size);

		/* abort if callback indicates that we need to stop */
		if (c->fn(val, c->opaque) != 0)
			break;
	}

	/* none of the conditions were met, sleep until timeout */
	if (i == num)
		rte_power_pause(tsc_timestamp);

	/* end transaction region */
	rte_xend();

	return ret;
}
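/*
 * Illustrative usage sketch, not part of the EAL API: wait on several
 * monitored words at once, reusing the invented example_abort_fn() above.
 * Everything here except rte_power_monitor_multi() itself is made up for
 * the example.
 */
static __rte_unused int
example_wait_multi(volatile uint64_t *addrs[], const uint64_t last_seen[],
		const uint32_t n, const uint64_t tsc_deadline)
{
	struct rte_power_monitor_cond pmc[8];
	uint32_t i;

	if (n == 0 || n > RTE_DIM(pmc))
		return -EINVAL;

	for (i = 0; i < n; i++) {
		pmc[i].addr = addrs[i];
		pmc[i].fn = example_abort_fn;
		pmc[i].opaque = last_seen[i];
		pmc[i].size = sizeof(uint64_t);
	}

	/* returns as soon as any monitored word changes, or on timeout */
	return rte_power_monitor_multi(pmc, n, tsc_deadline);
}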