/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    Mask(const Mask &other) = delete;
    Mask &operator=(const Mask &other) = delete;
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology,
                                   mask, HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support &&
        topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>.
   They must be the same on all systems of the same arch where they are
   defined, and they cannot change; they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |=
            convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ?
                        0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const
        KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  int original_idx;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }

  // Insert a new topology layer after allocation
  void insert_layer(kmp_hw_t type, const int *ids);

  // Check if the hardware ids are unique, if they are
  // return true, otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
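    // Illustrative example (added for clarity, values hypothetical): if
    // equivalent[KMP_HW_TILE] had earlier been set to KMP_HW_L2, and
    // set_equivalent_type(KMP_HW_L2, KMP_HW_CORE) is now called, the loop
    // below also redirects KMP_HW_TILE to KMP_HW_CORE so that no entry is
    // left pointing at the old type1 mapping.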
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max
  // macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("    type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("      num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature.
      When the machine is oversubscribed we add levels to duplicate the
      hierarchy, doubling the thread capacity of the hierarchy each time we
      add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads = 0;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
                                   // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel = nullptr;
  kmp_uint32 *skipPerLevel = nullptr;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when the static library is re-initialized multiple
       times (e.g., when a non-OpenMP thread repeatedly launches/joins a
       thread that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Derive per-level counts from the topology if it is available
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
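        // Worked example (illustrative, not from the original source): with
        // numPerLevel = {8, 2, 1, ...} and branch = 4, level 0 is halved
        // (8 -> 4) and level 1 doubled (2 -> 4); odd counts are rounded up
        // before halving. The product of the levels, i.e. the leaf capacity
        // of the hierarchy, is preserved while the fan-out converges toward
        // 'branch'.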
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H