/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif
#include <ctype.h>

// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
// KMP_HW_SUBSET environment variable
kmp_hw_subset_t *__kmp_hw_subset = nullptr;

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_LLC:
    return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}

const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? "sockets" : "socket");
  case KMP_HW_DIE:
    return ((plural) ? "dice" : "die");
  case KMP_HW_MODULE:
    return ((plural) ? "modules" : "module");
  case KMP_HW_TILE:
    return ((plural) ? "tiles" : "tile");
  case KMP_HW_NUMA:
    return ((plural) ? "numa_domains" : "numa_domain");
  case KMP_HW_L3:
    return ((plural) ? "l3_caches" : "l3_cache");
  case KMP_HW_L2:
    return ((plural) ? "l2_caches" : "l2_cache");
  case KMP_HW_L1:
    return ((plural) ? "l1_caches" : "l1_cache");
  case KMP_HW_LLC:
    return ((plural) ? "ll_caches" : "ll_cache");
  case KMP_HW_CORE:
    return ((plural) ? "cores" : "core");
  case KMP_HW_THREAD:
    return ((plural) ? "threads" : "thread");
  case KMP_HW_PROC_GROUP:
    return ((plural) ? "proc_groups" : "proc_group");
  }
  return ((plural) ? "unknowns" : "unknown");
}

const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
  switch (type) {
  case KMP_HW_CORE_TYPE_UNKNOWN:
    return "unknown";
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  case KMP_HW_CORE_TYPE_ATOM:
    return "Intel Atom(R) processor";
  case KMP_HW_CORE_TYPE_CORE:
    return "Intel(R) Core(TM) processor";
#endif
  }
  return "unknown";
}

////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  for (int level = 0; level < depth; ++level) {
    if (ahwthread->ids[level] < bhwthread->ids[level])
      return -1;
    else if (ahwthread->ids[level] > bhwthread->ids[level])
      return 1;
  }
  if (ahwthread->os_id < bhwthread->os_id)
    return -1;
  else if (ahwthread->os_id > bhwthread->os_id)
    return 1;
  return 0;
}

#if KMP_AFFINITY_SUPPORTED
int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
  int i;
  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
  for (i = 0; i < __kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  return 0;
}
#endif

void kmp_hw_thread_t::print() const {
  int depth = __kmp_topology->get_depth();
  printf("%4d ", os_id);
  for (int i = 0; i < depth; ++i) {
    printf("%4d ", ids[i]);
  }
  if (core_type != KMP_HW_CORE_TYPE_UNKNOWN) {
    printf(" (%s)", __kmp_hw_get_core_type_string(core_type));
  }
  printf("\n");
}

////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods

// Add a layer to the topology based on the ids. Assume the topology
// is perfectly nested (i.e., no object has more than one parent)
void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
  // Figure out where the layer should go by comparing the ids of the current
  // layers with the new ids
  int target_layer;
  int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
  int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;

  // Start from the highest layer and work down to find target layer
  // If new layer is equal to another layer then put the new layer above
  for (target_layer = 0; target_layer < depth; ++target_layer) {
    bool layers_equal = true;
    bool strictly_above_target_layer = false;
    for (int i = 0; i < num_hw_threads; ++i) {
      int id = hw_threads[i].ids[target_layer];
      int new_id = ids[i];
      if (id != previous_id && new_id == previous_new_id) {
        // Found the layer we are strictly above
        strictly_above_target_layer = true;
        layers_equal = false;
        break;
      } else if (id == previous_id && new_id != previous_new_id) {
        // Found a layer we are below. Move to next layer and check.
        layers_equal = false;
        break;
      }
      previous_id = id;
      previous_new_id = new_id;
    }
    if (strictly_above_target_layer || layers_equal)
      break;
  }

  // Found the layer we are above. Now move everything to accommodate the new
  // layer. And put the new ids and type into the topology.
  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
    types[j] = types[i];
  types[target_layer] = type;
  for (int k = 0; k < num_hw_threads; ++k) {
    for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
      hw_threads[k].ids[j] = hw_threads[k].ids[i];
    hw_threads[k].ids[target_layer] = ids[k];
  }
  equivalent[type] = type;
  depth++;
}

#if KMP_GROUP_AFFINITY
// Insert the Windows Processor Group structure into the topology
void kmp_topology_t::_insert_windows_proc_groups() {
  // Do not insert the processor group structure for a single group
  if (__kmp_num_proc_groups == 1)
    return;
  kmp_affin_mask_t *mask;
  int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
  KMP_CPU_ALLOC(mask);
  for (int i = 0; i < num_hw_threads; ++i) {
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(hw_threads[i].os_id, mask);
    ids[i] = __kmp_get_proc_group(mask);
  }
  KMP_CPU_FREE(mask);
  _insert_layer(KMP_HW_PROC_GROUP, ids);
  __kmp_free(ids);
}
#endif

// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  preference[KMP_HW_SOCKET] = 110;
  preference[KMP_HW_PROC_GROUP] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_NUMA] = 85;
  preference[KMP_HW_DIE] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  preference[KMP_HW_LLC] = 5;
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    KMP_ASSERT_VALID_HW_TYPE(type1);
    KMP_ASSERT_VALID_HW_TYPE(type2);
    // Do not allow the three main topology levels (sockets, cores, threads) to
    // be compacted down
    if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
         type1 == KMP_HW_SOCKET) &&
        (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
         type2 == KMP_HW_SOCKET)) {
      top_index1 = top_index2++;
      continue;
    }
    bool radix1 = true;
    bool all_same = true;
    int id1 = hw_threads[0].ids[top_index1];
    int id2 = hw_threads[0].ids[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
      if (hw_threads[hwidx].ids[top_index1] == id1 &&
          hw_threads[hwidx].ids[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (hw_threads[hwidx].ids[top_index2] != id2)
        all_same = false;
      id1 = hw_threads[hwidx].ids[top_index1];
      id2 = hw_threads[hwidx].ids[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same.
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      set_equivalent_type(remove_type, keep_type);
      for (int idx = 0; idx < num_hw_threads; ++idx) {
        kmp_hw_thread_t &hw_thread = hw_threads[idx];
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.ids[d] = hw_thread.ids[d + 1];
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
}

void kmp_topology_t::_set_last_level_cache() {
  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
#if KMP_MIC_SUPPORTED
  else if (__kmp_mic_type == mic3) {
    if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
    else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
    // L2/Tile wasn't detected so just say L1
    else
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  }
#endif
  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  // Fallback is to set last level cache to socket or core
  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
    if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
    else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
  }
  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
}

// Gather the count of each topology layer and the ratio
void kmp_topology_t::_gather_enumeration_information() {
  int previous_id[KMP_HW_LAST];
  int max[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
    max[i] = 0;
    count[i] = 0;
    ratio[i] = 0;
  }
  if (__kmp_is_hybrid_cpu()) {
    for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
      core_types_count[i] = 0;
      core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
    }
  }
  int core_level = get_level(KMP_HW_CORE);
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    for (int layer = 0; layer < depth; ++layer) {
      int id = hw_thread.ids[layer];
      if (id != previous_id[layer]) {
        // Add an additional increment to each count
        for (int l = layer; l < depth; ++l)
          count[l]++;
        // Keep track of topology layer ratio statistics
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l];
          max[l] = 1;
        }
        // Figure out the number of each core type for hybrid CPUs
        if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level)
          _increment_core_type(hw_thread.core_type);
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer) {
      previous_id[layer] = hw_thread.ids[layer];
    }
  }
  for (int layer = 0; layer < depth; ++layer) {
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  }
}

// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {
  int num = 1;
  for (int level = 0; level < depth; ++level)
    num *= ratio[level];
  flags.uniform = (num == count[depth - 1]);
}

// Set all the sub_ids for each hardware thread
void kmp_topology_t::_set_sub_ids() {
  int previous_id[KMP_HW_LAST];
  int sub_id[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = -1;
    sub_id[i] = -1;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Setup the sub_id
    for (int j = 0; j < depth; ++j) {
      if (hw_thread.ids[j] != previous_id[j]) {
        sub_id[j]++;
        for (int k = j + 1; k < depth; ++k) {
          sub_id[k] = 0;
        }
        break;
      }
    }
    // Set previous_id
    for (int j = 0; j < depth; ++j) {
      previous_id[j] = hw_thread.ids[j];
    }
    // Set the sub_ids field
    for (int j = 0; j < depth; ++j) {
      hw_thread.sub_ids[j] = sub_id[j];
    }
  }
}

void kmp_topology_t::_set_globals() {
  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
  int core_level, thread_level, package_level;
  package_level = get_level(KMP_HW_SOCKET);
#if KMP_GROUP_AFFINITY
  if (package_level == -1)
    package_level = get_level(KMP_HW_PROC_GROUP);
#endif
  core_level = get_level(KMP_HW_CORE);
  thread_level = get_level(KMP_HW_THREAD);

  KMP_ASSERT(core_level != -1);
  KMP_ASSERT(thread_level != -1);

  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
  if (package_level != -1) {
    nCoresPerPkg = calculate_ratio(core_level, package_level);
    nPackages = get_count(package_level);
  } else {
    // assume one socket
    nCoresPerPkg = get_count(core_level);
    nPackages = 1;
  }
#ifndef KMP_DFLT_NTH_CORES
  __kmp_ncores = get_count(core_level);
#endif
}

kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
                                         const kmp_hw_t *types) {
  kmp_topology_t *retval;
  // Allocate all data in one large allocation
  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
                sizeof(int) * (size_t)KMP_HW_LAST * 3;
  char *bytes = (char *)__kmp_allocate(size);
  retval = (kmp_topology_t *)bytes;
  if (nproc > 0) {
    retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
  } else {
    retval->hw_threads = nullptr;
  }
  retval->num_hw_threads = nproc;
  retval->depth = ndepth;
  int *arr =
      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
  retval->types = (kmp_hw_t *)arr;
  retval->ratio = arr + (size_t)KMP_HW_LAST;
  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < ndepth; ++i) {
    retval->types[i] = types[i];
    retval->equivalent[types[i]] = types[i];
  }
  return retval;
}

void kmp_topology_t::deallocate(kmp_topology_t *topology) {
  if (topology)
    __kmp_free(topology);
}

bool kmp_topology_t::check_ids() const {
  // Assume ids have been sorted
  if (num_hw_threads == 0)
    return true;
  for (int i = 1; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &current_thread = hw_threads[i];
    kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
    bool unique = false;
    for (int j = 0; j < depth; ++j) {
      if (previous_thread.ids[j] != current_thread.ids[j]) {
        unique = true;
        break;
      }
    }
    if (unique)
      continue;
    return false;
  }
  return true;
}

void kmp_topology_t::dump() const {
  printf("***********************\n");
  printf("*** __kmp_topology: ***\n");
  printf("***********************\n");
  printf("* depth: %d\n", depth);

  printf("* types: ");
  for (int i = 0; i < depth; ++i)
    printf("%15s ", __kmp_hw_get_keyword(types[i]));
  printf("\n");

  printf("* ratio: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", ratio[i]);
  }
  printf("\n");

  printf("* count: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", count[i]);
  }
  printf("\n");

  printf("* core_types:\n");
  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
    if (core_types[i] != KMP_HW_CORE_TYPE_UNKNOWN) {
      printf(" %d %s core%c\n", core_types_count[i],
             __kmp_hw_get_core_type_string(core_types[i]),
             ((core_types_count[i] > 1) ? 's' : ' '));
    } else {
      if (i == 0)
        printf("No hybrid information available\n");
      break;
    }
  }

  printf("* equivalent map:\n");
  KMP_FOREACH_HW_TYPE(i) {
    const char *key = __kmp_hw_get_keyword(i);
    const char *value = __kmp_hw_get_keyword(equivalent[i]);
    printf("%-15s -> %-15s\n", key, value);
  }

  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));

  printf("* num_hw_threads: %d\n", num_hw_threads);
  printf("* hw_threads:\n");
  for (int i = 0; i < num_hw_threads; ++i) {
    hw_threads[i].print();
  }
  printf("***********************\n");
}

void kmp_topology_t::print(const char *env_var) const {
  kmp_str_buf_t buf;
  int print_types_depth;
  __kmp_str_buf_init(&buf);
  kmp_hw_t print_types[KMP_HW_LAST + 2];

  // Num Available Threads
  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);

  // Uniform or not
  if (is_uniform()) {
    KMP_INFORM(Uniform, env_var);
  } else {
    KMP_INFORM(NonUniform, env_var);
  }

  // Equivalent types
  KMP_FOREACH_HW_TYPE(type) {
    kmp_hw_t eq_type = equivalent[type];
    if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
      KMP_INFORM(AffEqualTopologyTypes, env_var,
                 __kmp_hw_get_catalog_string(type),
                 __kmp_hw_get_catalog_string(eq_type));
    }
  }

  // Quick topology
  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
  // Create a print types array that always guarantees printing
  // the core and thread level
  print_types_depth = 0;
  for (int level = 0; level < depth; ++level)
    print_types[print_types_depth++] = types[level];
  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
    // Force in the core level for quick topology
    if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
      // Force core before thread e.g., 1 socket X 2 threads/socket
      // becomes 1 socket X 1 core/socket X 2 threads/socket
      print_types[print_types_depth - 1] = KMP_HW_CORE;
      print_types[print_types_depth++] = KMP_HW_THREAD;
    } else {
      print_types[print_types_depth++] = KMP_HW_CORE;
    }
  }
  // Always put threads at very end of quick topology
  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
    print_types[print_types_depth++] = KMP_HW_THREAD;

  __kmp_str_buf_clear(&buf);
  kmp_hw_t numerator_type;
  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
  int core_level = get_level(KMP_HW_CORE);
  int ncores = get_count(core_level);

  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
    int c;
    bool plural;
    numerator_type = print_types[plevel];
    KMP_ASSERT_VALID_HW_TYPE(numerator_type);
    if (equivalent[numerator_type] != numerator_type)
      c = 1;
    else
      c = get_ratio(level++);
    plural = (c > 1);
    if (plevel == 0) {
      __kmp_str_buf_print(&buf, "%d %s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural));
    } else {
      __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural),
                          __kmp_hw_get_catalog_string(denominator_type));
    }
    denominator_type = numerator_type;
  }
  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);

  if (__kmp_is_hybrid_cpu()) {
    for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
      if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN)
        break;
      KMP_INFORM(TopologyHybrid, env_var, core_types_count[i],
                 __kmp_hw_get_core_type_string(core_types[i]));
    }
  }

  if (num_hw_threads <= 0) {
    __kmp_str_buf_free(&buf);
    return;
  }

  // Full OS proc to hardware thread map
  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
  for (int i = 0; i < num_hw_threads; i++) {
    __kmp_str_buf_clear(&buf);
    for (int level = 0; level < depth; ++level) {
      kmp_hw_t type = types[level];
      __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
      __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
    }
    if (__kmp_is_hybrid_cpu())
      __kmp_str_buf_print(
          &buf, "(%s)", __kmp_hw_get_core_type_string(hw_threads[i].core_type));
    KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
  }

  __kmp_str_buf_free(&buf);
}

void kmp_topology_t::canonicalize() {
#if KMP_GROUP_AFFINITY
  _insert_windows_proc_groups();
#endif
  _remove_radix1_layers();
  _gather_enumeration_information();
  _discover_uniformity();
  _set_sub_ids();
  _set_globals();
  _set_last_level_cache();

#if KMP_MIC_SUPPORTED
  // Manually Add L2 = Tile equivalence
  if (__kmp_mic_type == mic3) {
    if (get_level(KMP_HW_L2) != -1)
      set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
    else if (get_level(KMP_HW_TILE) != -1)
      set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
  }
#endif

  // Perform post canonicalization checking
  KMP_ASSERT(depth > 0);
  for (int level = 0; level < depth; ++level) {
    // All counts, ratios, and types must be valid
    KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
    KMP_ASSERT_VALID_HW_TYPE(types[level]);
    // Detected types must point to themselves
    KMP_ASSERT(equivalent[types[level]] == types[level]);
  }

#if KMP_AFFINITY_SUPPORTED
  // Set the number of affinity granularity levels
  if (__kmp_affinity_gran_levels < 0) {
    kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
    // Check if user's granularity request is valid
    if (gran_type == KMP_HW_UNKNOWN) {
      // First try core, then thread, then package
      kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
      for (auto g : gran_types) {
        if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
          gran_type = g;
          break;
        }
      }
      KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
      // Warn user what granularity setting will be used instead
      KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
                  __kmp_hw_get_catalog_string(__kmp_affinity_gran),
                  __kmp_hw_get_catalog_string(gran_type));
      __kmp_affinity_gran = gran_type;
    }
#if KMP_GROUP_AFFINITY
    // If more than one processor group exists, and the level of
    // granularity specified by the user is too coarse, then the
    // granularity must be adjusted "down" to processor group affinity
    // because threads can only exist within one processor group.
    // For example, if a user sets granularity=socket and there are two
    // processor groups that cover a socket, then the runtime must
    // restrict the granularity down to the processor group level.
    if (__kmp_num_proc_groups > 1) {
      int gran_depth = __kmp_topology->get_level(gran_type);
      int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP);
      if (gran_depth >= 0 && proc_group_depth >= 0 &&
          gran_depth < proc_group_depth) {
        KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY",
                    __kmp_hw_get_catalog_string(__kmp_affinity_gran));
        __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP;
      }
    }
#endif
    __kmp_affinity_gran_levels = 0;
    for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
      __kmp_affinity_gran_levels++;
  }
#endif // KMP_AFFINITY_SUPPORTED
}

// Canonicalize an explicit packages X cores/pkg X threads/core topology
void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
                                  int nthreads_per_core, int ncores) {
  int ndepth = 3;
  depth = ndepth;
  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
  for (int level = 0; level < depth; ++level) {
    count[level] = 0;
    ratio[level] = 0;
  }
  count[0] = npackages;
  count[1] = ncores;
  count[2] = __kmp_xproc;
  ratio[0] = npackages;
  ratio[1] = ncores_per_pkg;
  ratio[2] = nthreads_per_core;
  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
  types[0] = KMP_HW_SOCKET;
  types[1] = KMP_HW_CORE;
  types[2] = KMP_HW_THREAD;
  //__kmp_avail_proc = __kmp_xproc;
  _discover_uniformity();
}

// Apply the KMP_HW_SUBSET environment variable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
bool kmp_topology_t::filter_hw_subset() {
  // If KMP_HW_SUBSET wasn't requested, then do nothing.
  if (!__kmp_hw_subset)
    return false;

  // First, sort the KMP_HW_SUBSET items by the machine topology
  __kmp_hw_subset->sort();

  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
  int hw_subset_depth = __kmp_hw_subset->get_depth();
  kmp_hw_t specified[KMP_HW_LAST];
  KMP_ASSERT(hw_subset_depth > 0);
  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < hw_subset_depth; ++i) {
    int max_count;
    int num = __kmp_hw_subset->at(i).num;
    int offset = __kmp_hw_subset->at(i).offset;
    kmp_hw_t type = __kmp_hw_subset->at(i).type;
    kmp_hw_t equivalent_type = equivalent[type];
    int level = get_level(type);

    // Check to see if current layer is in detected machine topology
    if (equivalent_type != KMP_HW_UNKNOWN) {
      __kmp_hw_subset->at(i).type = equivalent_type;
    } else {
      KMP_WARNING(AffHWSubsetNotExistGeneric,
                  __kmp_hw_get_catalog_string(type));
      return false;
    }

    // Check to see if current layer has already been specified
    // either directly or through an equivalent type
    if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
      KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
                  __kmp_hw_get_catalog_string(specified[equivalent_type]));
      return false;
    }
    specified[equivalent_type] = type;

    // Check to see if each layer's num & offset parameters are valid
    max_count = get_ratio(level);
    if (max_count < 0 || num + offset > max_count) {
      bool plural = (num > 1);
      KMP_WARNING(AffHWSubsetManyGeneric,
                  __kmp_hw_get_catalog_string(type, plural));
      return false;
    }
  }

  // Apply the filtered hardware subset
  int new_index = 0;
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Check to see if this hardware thread should be filtered
    bool should_be_filtered = false;
    for (int level = 0, hw_subset_index = 0;
         level < depth && hw_subset_index < hw_subset_depth; ++level) {
      kmp_hw_t topology_type = types[level];
      auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
      kmp_hw_t hw_subset_type = hw_subset_item.type;
      if (topology_type != hw_subset_type)
        continue;
      int num = hw_subset_item.num;
      int offset = hw_subset_item.offset;
      hw_subset_index++;
      if (hw_thread.sub_ids[level] < offset ||
          hw_thread.sub_ids[level] >= offset + num) {
        should_be_filtered = true;
        break;
      }
    }
    if (!should_be_filtered) {
      if (i != new_index)
        hw_threads[new_index] = hw_thread;
      new_index++;
    } else {
#if KMP_AFFINITY_SUPPORTED
      KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
#endif
      __kmp_avail_proc--;
    }
  }
  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
  num_hw_threads = new_index;

  // Post hardware subset canonicalization
  _gather_enumeration_information();
  _discover_uniformity();
  _set_globals();
  _set_last_level_cache();
  return true;
}

bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
  if (hw_level >= depth)
    return true;
  bool retval = true;
  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
  for (int i = 0; i < (depth - hw_level); ++i) {
    if (t1.ids[i] != t2.ids[i])
      return false;
  }
  return retval;
}

////////////////////////////////////////////////////////////////////////////////

#if KMP_AFFINITY_SUPPORTED
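// RAII helper: saves the calling thread's system affinity mask when it is
// constructed and restores that mask either on an explicit restore() call or,
// if restore() was never called, when the object goes out of scope.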
class kmp_affinity_raii_t {
  kmp_affin_mask_t *mask;
  bool restored;

public:
  kmp_affinity_raii_t() : restored(false) {
    KMP_CPU_ALLOC(mask);
    KMP_ASSERT(mask != NULL);
    __kmp_get_system_affinity(mask, TRUE);
  }
  void restore() {
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE(mask);
    restored = true;
  }
  ~kmp_affinity_raii_t() {
    if (!restored) {
      __kmp_set_system_affinity(mask, TRUE);
      KMP_CPU_FREE(mask);
    }
  }
};

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%u", start);
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%u", previous);
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

// Return (possibly empty) affinity mask representing the offline CPUs
// Caller must free the mask
kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
  kmp_affin_mask_t *offline;
  KMP_CPU_ALLOC(offline);
  KMP_CPU_ZERO(offline);
#if KMP_OS_LINUX
  int n, begin_cpu, end_cpu;
  kmp_safe_raii_file_t offline_file;
  auto skip_ws = [](FILE *f) {
    int c;
    do {
      c = fgetc(f);
    } while (isspace(c));
    if (c != EOF)
      ungetc(c, f);
  };
  // File contains CSV of integer ranges representing the offline CPUs
  // e.g., 1,2,4-7,9,11-15
  int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
  if (status != 0)
    return offline;
  while (!feof(offline_file)) {
    skip_ws(offline_file);
    n = fscanf(offline_file, "%d", &begin_cpu);
    if (n != 1)
      break;
    skip_ws(offline_file);
    int c = fgetc(offline_file);
    if (c == EOF || c == ',') {
      // Just single CPU
      end_cpu = begin_cpu;
    } else if (c == '-') {
      // Range of CPUs
      skip_ws(offline_file);
      n = fscanf(offline_file, "%d", &end_cpu);
      if (n != 1)
        break;
      skip_ws(offline_file);
      c = fgetc(offline_file); // skip ','
    } else {
      // Syntax problem
      break;
    }
    // Ensure a valid range of CPUs
    if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
        end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
      continue;
    }
    // Insert [begin_cpu, end_cpu] into offline mask
    for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
      KMP_CPU_SET(cpu, offline);
    }
  }
#endif
  return offline;
}

// Return the number of available procs
int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  int avail_proc = 0;
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
        avail_proc++;
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
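    // No processor groups: build the mask from the flat OS proc numbering,
    // skipping any CPUs the OS reports as offline.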
    int proc;
    kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
    for (proc = 0; proc < __kmp_xproc; proc++) {
      // Skip offline CPUs
      if (KMP_CPU_ISSET(proc, offline_cpus))
        continue;
      KMP_CPU_SET(proc, mask);
      avail_proc++;
    }
    KMP_CPU_FREE(offline_cpus);
  }

  return avail_proc;
}

// All of the __kmp_affinity_create_*_map() routines should allocate the
// internal topology object and set the layer ids for it. Each routine
// returns a boolean on whether it was successful at doing so.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

#if KMP_USE_HWLOC
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
      return KMP_HW_TILE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
      return KMP_HW_MODULE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
      return KMP_HW_PROC_GROUP;
    return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // Store sub_id + 1 so that 0 can be distinguished from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}

static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
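    // Only the global counts (nPackages, nCoresPerPkg, __kmp_nThreadsPerCore,
    // __kmp_ncores) are estimated in this path; no kmp_topology_t object is
    // built when affinity is not capable.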
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  // Handle multiple types of cores if they exist on the system
  int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);

  typedef struct kmp_hwloc_cpukinds_info_t {
    int efficiency;
    kmp_hw_core_type_t core_type;
    hwloc_bitmap_t mask;
  } kmp_hwloc_cpukinds_info_t;
  kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;

  if (nr_cpu_kinds > 0) {
    unsigned nr_infos;
    struct hwloc_info_s *infos;
    cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
        sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
    for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
      cpukinds[idx].efficiency = -1;
      cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      cpukinds[idx].mask = hwloc_bitmap_alloc();
      if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
                                  &cpukinds[idx].efficiency, &nr_infos, &infos,
                                  0) == 0) {
        for (unsigned i = 0; i < nr_infos; ++i) {
          if (__kmp_str_match("CoreType", 8, infos[i].name)) {
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
            if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
              cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
              break;
            } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
              cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
              break;
            }
#endif
          }
        }
      }
    }
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  KMP_ASSERT(pu);
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  hwloc_types[depth] = obj->type;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      // If multiple core types, then set that attribute for the hardware thread
      if (cpukinds) {
        int cpukind_index = -1;
        for (int i = 0; i < nr_cpu_kinds; ++i) {
          if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
            cpukind_index = i;
            break;
          }
        }
        if (cpukind_index >= 0)
          hw_thread.core_type = cpukinds[cpukind_index].core_type;
      }
      index--;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included)
      hw_thread_index++;
  }

  // Free the core types information
  if (cpukinds) {
    for (int idx = 0; idx < nr_cpu_kinds; ++idx)
      hwloc_bitmap_free(cpukinds[idx].mask);
    __kmp_free(cpukinds);
  }
  __kmp_topology->sort_ids();
  return true;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

  if (__kmp_affinity_verbose) {
    KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
  }

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    return true;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i;
    hw_thread.ids[1] = 0;
    hw_thread.ids[2] = 0;
    avail_ct++;
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }
  return true;
}

#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
  }

  // If we aren't affinity capable, then use flat topology
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    nPackages = __kmp_num_proc_groups;
    __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = nPackages / __kmp_ncores;
    return true;
  }

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
1605 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 1606 continue; 1607 } 1608 kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++); 1609 hw_thread.clear(); 1610 hw_thread.os_id = i; 1611 hw_thread.ids[0] = i / BITS_PER_GROUP; 1612 hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP; 1613 } 1614 return true; 1615 } 1616 #endif /* KMP_GROUP_AFFINITY */ 1617 1618 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1619 1620 template <kmp_uint32 LSB, kmp_uint32 MSB> 1621 static inline unsigned __kmp_extract_bits(kmp_uint32 v) { 1622 const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB; 1623 const kmp_uint32 SHIFT_RIGHT = LSB; 1624 kmp_uint32 retval = v; 1625 retval <<= SHIFT_LEFT; 1626 retval >>= (SHIFT_LEFT + SHIFT_RIGHT); 1627 return retval; 1628 } 1629 1630 static int __kmp_cpuid_mask_width(int count) { 1631 int r = 0; 1632 1633 while ((1 << r) < count) 1634 ++r; 1635 return r; 1636 } 1637 1638 class apicThreadInfo { 1639 public: 1640 unsigned osId; // param to __kmp_affinity_bind_thread 1641 unsigned apicId; // from cpuid after binding 1642 unsigned maxCoresPerPkg; // "" 1643 unsigned maxThreadsPerPkg; // "" 1644 unsigned pkgId; // inferred from above values 1645 unsigned coreId; // "" 1646 unsigned threadId; // "" 1647 }; 1648 1649 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, 1650 const void *b) { 1651 const apicThreadInfo *aa = (const apicThreadInfo *)a; 1652 const apicThreadInfo *bb = (const apicThreadInfo *)b; 1653 if (aa->pkgId < bb->pkgId) 1654 return -1; 1655 if (aa->pkgId > bb->pkgId) 1656 return 1; 1657 if (aa->coreId < bb->coreId) 1658 return -1; 1659 if (aa->coreId > bb->coreId) 1660 return 1; 1661 if (aa->threadId < bb->threadId) 1662 return -1; 1663 if (aa->threadId > bb->threadId) 1664 return 1; 1665 return 0; 1666 } 1667 1668 class kmp_cache_info_t { 1669 public: 1670 struct info_t { 1671 unsigned level, mask; 1672 }; 1673 kmp_cache_info_t() : depth(0) { get_leaf4_levels(); } 1674 size_t get_depth() const { return depth; } 1675 info_t &operator[](size_t index) { return table[index]; } 1676 const info_t &operator[](size_t index) const { return table[index]; } 1677 1678 static kmp_hw_t get_topology_type(unsigned level) { 1679 KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL); 1680 switch (level) { 1681 case 1: 1682 return KMP_HW_L1; 1683 case 2: 1684 return KMP_HW_L2; 1685 case 3: 1686 return KMP_HW_L3; 1687 } 1688 return KMP_HW_UNKNOWN; 1689 } 1690 1691 private: 1692 static const int MAX_CACHE_LEVEL = 3; 1693 1694 size_t depth; 1695 info_t table[MAX_CACHE_LEVEL]; 1696 1697 void get_leaf4_levels() { 1698 unsigned level = 0; 1699 while (depth < MAX_CACHE_LEVEL) { 1700 unsigned cache_type, max_threads_sharing; 1701 unsigned cache_level, cache_mask_width; 1702 kmp_cpuid buf2; 1703 __kmp_x86_cpuid(4, level, &buf2); 1704 cache_type = __kmp_extract_bits<0, 4>(buf2.eax); 1705 if (!cache_type) 1706 break; 1707 // Skip instruction caches 1708 if (cache_type == 2) { 1709 level++; 1710 continue; 1711 } 1712 max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1; 1713 cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing); 1714 cache_level = __kmp_extract_bits<5, 7>(buf2.eax); 1715 table[depth].level = cache_level; 1716 table[depth].mask = ((-1) << cache_mask_width); 1717 depth++; 1718 level++; 1719 } 1720 } 1721 }; 1722 1723 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use 1724 // an algorithm which cycles through the available os threads, setting 1725 // the current thread's affinity mask to that thread, and 
then retrieving
1726 // the Apic Id for each thread context using the cpuid instruction.
1727 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
1728 kmp_cpuid buf;
1729 *msg_id = kmp_i18n_null;
1730
1731 if (__kmp_affinity_verbose) {
1732 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
1733 }
1734
1735 // Check if cpuid leaf 4 is supported.
1736 __kmp_x86_cpuid(0, 0, &buf);
1737 if (buf.eax < 4) {
1738 *msg_id = kmp_i18n_str_NoLeaf4Support;
1739 return false;
1740 }
1741
1742 // The algorithm used starts by setting the affinity to each available thread
1743 // and retrieving info from the cpuid instruction, so if we are not capable of
1744 // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
1745 // need to do something else - use the defaults that we calculated from
1746 // issuing cpuid without binding to each proc.
1747 if (!KMP_AFFINITY_CAPABLE()) {
1748 // Hack to try and infer the machine topology using only the data
1749 // available from cpuid on the current thread, and __kmp_xproc.
1750 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1751
1752 // Get an upper bound on the number of threads per package using cpuid(1).
1753 // On some OS/chip combinations where HT is supported by the chip but is
1754 // disabled, this value will be 2 on a single core chip. Usually, it will be
1755 // 2 if HT is enabled and 1 if HT is disabled.
1756 __kmp_x86_cpuid(1, 0, &buf);
1757 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1758 if (maxThreadsPerPkg == 0) {
1759 maxThreadsPerPkg = 1;
1760 }
1761
1762 // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
1763 // value.
1764 //
1765 // The author of cpu_count.cpp treated this as only an upper bound on the
1766 // number of cores, but I haven't seen any cases where it was greater than
1767 // the actual number of cores, so we will treat it as exact in this block of
1768 // code.
1769 //
1770 // First, we need to check if cpuid(4) is supported on this chip. To see if
1771 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
1772 // greater.
1773 __kmp_x86_cpuid(0, 0, &buf);
1774 if (buf.eax >= 4) {
1775 __kmp_x86_cpuid(4, 0, &buf);
1776 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1777 } else {
1778 nCoresPerPkg = 1;
1779 }
1780
1781 // There is no way to reliably tell if HT is enabled without issuing the
1782 // cpuid instruction from every thread, and correlating the cpuid info, so
1783 // if the machine is not affinity capable, we assume that HT is off. We have
1784 // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
1785 // does not support HT.
1786 //
1787 // - Older OSes are usually found on machines with older chips, which do not
1788 // support HT.
1789 // - The performance penalty for mistakenly identifying a machine as HT when
1790 // it isn't (which results in blocktime being incorrectly set to 0) is
1791 // greater than the penalty for mistakenly identifying a machine as
1792 // being 1 thread/core when it is really HT enabled (which results in
1793 // blocktime being incorrectly set to a positive value).
1794 __kmp_ncores = __kmp_xproc;
1795 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1796 __kmp_nThreadsPerCore = 1;
1797 return true;
1798 }
1799
1800 // From here on, we can assume that it is safe to call
1801 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1802 // __kmp_affinity_type = affinity_none.
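  // The loop below follows a save / bind / query / restore pattern. As a
  // rough, self-contained sketch of the same idea (assumptions: Linux with
  // _GNU_SOURCE, GCC or Clang on x86, using sched_{get,set}affinity and
  // <cpuid.h> directly rather than the library's __kmp_* wrappers):
#if 0
  #define _GNU_SOURCE
  #include <sched.h>
  #include <cpuid.h>
  #include <stdio.h>

  static void sketch_print_apic_ids(void) {
    cpu_set_t saved, one;
    if (sched_getaffinity(0, sizeof(saved), &saved) != 0) // save current mask
      return;
    for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
      if (!CPU_ISSET(cpu, &saved)) // skip procs outside the original mask
        continue;
      CPU_ZERO(&one);
      CPU_SET(cpu, &one);
      if (sched_setaffinity(0, sizeof(one), &one) != 0) // bind to this proc
        continue;
      unsigned eax, ebx, ecx, edx;
      if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) // leaf 1: APIC id in EBX[31:24]
        printf("os proc %d -> apic id %u\n", cpu, (ebx >> 24) & 0xffu);
    }
    sched_setaffinity(0, sizeof(saved), &saved); // restore the original mask
  }
#endif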
1803 1804 // Save the affinity mask for the current thread. 1805 kmp_affinity_raii_t previous_affinity; 1806 1807 // Run through each of the available contexts, binding the current thread 1808 // to it, and obtaining the pertinent information using the cpuid instr. 1809 // 1810 // The relevant information is: 1811 // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context 1812 // has a uniqie Apic Id, which is of the form pkg# : core# : thread#. 1813 // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value 1814 // of this field determines the width of the core# + thread# fields in the 1815 // Apic Id. It is also an upper bound on the number of threads per 1816 // package, but it has been verified that situations happen were it is not 1817 // exact. In particular, on certain OS/chip combinations where Intel(R) 1818 // Hyper-Threading Technology is supported by the chip but has been 1819 // disabled, the value of this field will be 2 (for a single core chip). 1820 // On other OS/chip combinations supporting Intel(R) Hyper-Threading 1821 // Technology, the value of this field will be 1 when Intel(R) 1822 // Hyper-Threading Technology is disabled and 2 when it is enabled. 1823 // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value 1824 // of this field (+1) determines the width of the core# field in the Apic 1825 // Id. The comments in "cpucount.cpp" say that this value is an upper 1826 // bound, but the IA-32 architecture manual says that it is exactly the 1827 // number of cores per package, and I haven't seen any case where it 1828 // wasn't. 1829 // 1830 // From this information, deduce the package Id, core Id, and thread Id, 1831 // and set the corresponding fields in the apicThreadInfo struct. 1832 unsigned i; 1833 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate( 1834 __kmp_avail_proc * sizeof(apicThreadInfo)); 1835 unsigned nApics = 0; 1836 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 1837 // Skip this proc if it is not included in the machine model. 1838 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 1839 continue; 1840 } 1841 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); 1842 1843 __kmp_affinity_dispatch->bind_thread(i); 1844 threadInfo[nApics].osId = i; 1845 1846 // The apic id and max threads per pkg come from cpuid(1). 1847 __kmp_x86_cpuid(1, 0, &buf); 1848 if (((buf.edx >> 9) & 1) == 0) { 1849 __kmp_free(threadInfo); 1850 *msg_id = kmp_i18n_str_ApicNotPresent; 1851 return false; 1852 } 1853 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 1854 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 1855 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 1856 threadInfo[nApics].maxThreadsPerPkg = 1; 1857 } 1858 1859 // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded 1860 // value. 1861 // 1862 // First, we need to check if cpuid(4) is supported on this chip. To see if 1863 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n 1864 // or greater. 1865 __kmp_x86_cpuid(0, 0, &buf); 1866 if (buf.eax >= 4) { 1867 __kmp_x86_cpuid(4, 0, &buf); 1868 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 1869 } else { 1870 threadInfo[nApics].maxCoresPerPkg = 1; 1871 } 1872 1873 // Infer the pkgId / coreId / threadId using only the info obtained locally. 
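  // For example (hypothetical values): if cpuid(1) reports maxThreadsPerPkg =
  // 16 and cpuid(4) reports maxCoresPerPkg = 8, then widthCT = 4, widthC = 3
  // and widthT = 1, so an Apic Id of 0x2b decodes to pkgId = 0x2b >> 4 = 2,
  // coreId = (0x2b >> 1) & 0x7 = 5 and threadId = 0x2b & 0x1 = 1, which is
  // exactly the arithmetic performed below.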
1874 int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg); 1875 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 1876 1877 int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg); 1878 int widthT = widthCT - widthC; 1879 if (widthT < 0) { 1880 // I've never seen this one happen, but I suppose it could, if the cpuid 1881 // instruction on a chip was really screwed up. Make sure to restore the 1882 // affinity mask before the tail call. 1883 __kmp_free(threadInfo); 1884 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1885 return false; 1886 } 1887 1888 int maskC = (1 << widthC) - 1; 1889 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC; 1890 1891 int maskT = (1 << widthT) - 1; 1892 threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT; 1893 1894 nApics++; 1895 } 1896 1897 // We've collected all the info we need. 1898 // Restore the old affinity mask for this thread. 1899 previous_affinity.restore(); 1900 1901 // Sort the threadInfo table by physical Id. 1902 qsort(threadInfo, nApics, sizeof(*threadInfo), 1903 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1904 1905 // The table is now sorted by pkgId / coreId / threadId, but we really don't 1906 // know the radix of any of the fields. pkgId's may be sparsely assigned among 1907 // the chips on a system. Although coreId's are usually assigned 1908 // [0 .. coresPerPkg-1] and threadId's are usually assigned 1909 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1910 // 1911 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 1912 // total # packages) are at this point - we want to determine that now. We 1913 // only have an upper bound on the first two figures. 1914 // 1915 // We also perform a consistency check at this point: the values returned by 1916 // the cpuid instruction for any thread bound to a given package had better 1917 // return the same info for maxThreadsPerPkg and maxCoresPerPkg. 1918 nPackages = 1; 1919 nCoresPerPkg = 1; 1920 __kmp_nThreadsPerCore = 1; 1921 unsigned nCores = 1; 1922 1923 unsigned pkgCt = 1; // to determine radii 1924 unsigned lastPkgId = threadInfo[0].pkgId; 1925 unsigned coreCt = 1; 1926 unsigned lastCoreId = threadInfo[0].coreId; 1927 unsigned threadCt = 1; 1928 unsigned lastThreadId = threadInfo[0].threadId; 1929 1930 // intra-pkg consist checks 1931 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1932 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1933 1934 for (i = 1; i < nApics; i++) { 1935 if (threadInfo[i].pkgId != lastPkgId) { 1936 nCores++; 1937 pkgCt++; 1938 lastPkgId = threadInfo[i].pkgId; 1939 if ((int)coreCt > nCoresPerPkg) 1940 nCoresPerPkg = coreCt; 1941 coreCt = 1; 1942 lastCoreId = threadInfo[i].coreId; 1943 if ((int)threadCt > __kmp_nThreadsPerCore) 1944 __kmp_nThreadsPerCore = threadCt; 1945 threadCt = 1; 1946 lastThreadId = threadInfo[i].threadId; 1947 1948 // This is a different package, so go on to the next iteration without 1949 // doing any consistency checks. Reset the consistency check vars, though. 
1950 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1951 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1952 continue; 1953 } 1954 1955 if (threadInfo[i].coreId != lastCoreId) { 1956 nCores++; 1957 coreCt++; 1958 lastCoreId = threadInfo[i].coreId; 1959 if ((int)threadCt > __kmp_nThreadsPerCore) 1960 __kmp_nThreadsPerCore = threadCt; 1961 threadCt = 1; 1962 lastThreadId = threadInfo[i].threadId; 1963 } else if (threadInfo[i].threadId != lastThreadId) { 1964 threadCt++; 1965 lastThreadId = threadInfo[i].threadId; 1966 } else { 1967 __kmp_free(threadInfo); 1968 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1969 return false; 1970 } 1971 1972 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1973 // fields agree between all the threads bounds to a given package. 1974 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || 1975 (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1976 __kmp_free(threadInfo); 1977 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1978 return false; 1979 } 1980 } 1981 // When affinity is off, this routine will still be called to set 1982 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1983 // Make sure all these vars are set correctly 1984 nPackages = pkgCt; 1985 if ((int)coreCt > nCoresPerPkg) 1986 nCoresPerPkg = coreCt; 1987 if ((int)threadCt > __kmp_nThreadsPerCore) 1988 __kmp_nThreadsPerCore = threadCt; 1989 __kmp_ncores = nCores; 1990 KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc); 1991 1992 // Now that we've determined the number of packages, the number of cores per 1993 // package, and the number of threads per core, we can construct the data 1994 // structure that is to be returned. 1995 int idx = 0; 1996 int pkgLevel = 0; 1997 int coreLevel = 1; 1998 int threadLevel = 2; 1999 //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); 2000 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 2001 kmp_hw_t types[3]; 2002 if (pkgLevel >= 0) 2003 types[idx++] = KMP_HW_SOCKET; 2004 if (coreLevel >= 0) 2005 types[idx++] = KMP_HW_CORE; 2006 if (threadLevel >= 0) 2007 types[idx++] = KMP_HW_THREAD; 2008 2009 KMP_ASSERT(depth > 0); 2010 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); 2011 2012 for (i = 0; i < nApics; ++i) { 2013 idx = 0; 2014 unsigned os = threadInfo[i].osId; 2015 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2016 hw_thread.clear(); 2017 2018 if (pkgLevel >= 0) { 2019 hw_thread.ids[idx++] = threadInfo[i].pkgId; 2020 } 2021 if (coreLevel >= 0) { 2022 hw_thread.ids[idx++] = threadInfo[i].coreId; 2023 } 2024 if (threadLevel >= 0) { 2025 hw_thread.ids[idx++] = threadInfo[i].threadId; 2026 } 2027 hw_thread.os_id = os; 2028 } 2029 2030 __kmp_free(threadInfo); 2031 __kmp_topology->sort_ids(); 2032 if (!__kmp_topology->check_ids()) { 2033 kmp_topology_t::deallocate(__kmp_topology); 2034 __kmp_topology = nullptr; 2035 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 2036 return false; 2037 } 2038 return true; 2039 } 2040 2041 // Hybrid cpu detection using CPUID.1A 2042 // Thread should be pinned to processor already 2043 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, 2044 unsigned *native_model_id) { 2045 kmp_cpuid buf; 2046 __kmp_x86_cpuid(0x1a, 0, &buf); 2047 *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax); 2048 *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); 2049 } 2050 2051 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 2052 // architectures support a newer interface for specifying the x2APIC Ids, 2053 // based on CPUID.B or CPUID.1F 2054 /* 2055 * CPUID.B or 1F, Input ECX (sub leaf # aka level number) 2056 Bits Bits Bits Bits 2057 31-16 15-8 7-4 4-0 2058 ---+-----------+--------------+-------------+-----------------+ 2059 EAX| reserved | reserved | reserved | Bits to Shift | 2060 ---+-----------|--------------+-------------+-----------------| 2061 EBX| reserved | Num logical processors at level (16 bits) | 2062 ---+-----------|--------------+-------------------------------| 2063 ECX| reserved | Level Type | Level Number (8 bits) | 2064 ---+-----------+--------------+-------------------------------| 2065 EDX| X2APIC ID (32 bits) | 2066 ---+----------------------------------------------------------+ 2067 */ 2068 2069 enum { 2070 INTEL_LEVEL_TYPE_INVALID = 0, // Package level 2071 INTEL_LEVEL_TYPE_SMT = 1, 2072 INTEL_LEVEL_TYPE_CORE = 2, 2073 INTEL_LEVEL_TYPE_TILE = 3, 2074 INTEL_LEVEL_TYPE_MODULE = 4, 2075 INTEL_LEVEL_TYPE_DIE = 5, 2076 INTEL_LEVEL_TYPE_LAST = 6, 2077 }; 2078 2079 struct cpuid_level_info_t { 2080 unsigned level_type, mask, mask_width, nitems, cache_mask; 2081 }; 2082 2083 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { 2084 switch (intel_type) { 2085 case INTEL_LEVEL_TYPE_INVALID: 2086 return KMP_HW_SOCKET; 2087 case INTEL_LEVEL_TYPE_SMT: 2088 return KMP_HW_THREAD; 2089 case INTEL_LEVEL_TYPE_CORE: 2090 return KMP_HW_CORE; 2091 case INTEL_LEVEL_TYPE_TILE: 2092 return KMP_HW_TILE; 2093 case INTEL_LEVEL_TYPE_MODULE: 2094 return KMP_HW_MODULE; 2095 case INTEL_LEVEL_TYPE_DIE: 2096 return KMP_HW_DIE; 2097 } 2098 return KMP_HW_UNKNOWN; 2099 } 2100 2101 // This function takes the topology leaf, a levels array to store the levels 2102 // detected and a bitmap of the known levels. 
2103 // Returns the number of levels in the topology 2104 static unsigned 2105 __kmp_x2apicid_get_levels(int leaf, 2106 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], 2107 kmp_uint64 known_levels) { 2108 unsigned level, levels_index; 2109 unsigned level_type, mask_width, nitems; 2110 kmp_cpuid buf; 2111 2112 // New algorithm has known topology layers act as highest unknown topology 2113 // layers when unknown topology layers exist. 2114 // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z> 2115 // are unknown topology layers, Then SMT will take the characteristics of 2116 // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>). 2117 // This eliminates unknown portions of the topology while still keeping the 2118 // correct structure. 2119 level = levels_index = 0; 2120 do { 2121 __kmp_x86_cpuid(leaf, level, &buf); 2122 level_type = __kmp_extract_bits<8, 15>(buf.ecx); 2123 mask_width = __kmp_extract_bits<0, 4>(buf.eax); 2124 nitems = __kmp_extract_bits<0, 15>(buf.ebx); 2125 if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) 2126 return 0; 2127 2128 if (known_levels & (1ull << level_type)) { 2129 // Add a new level to the topology 2130 KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); 2131 levels[levels_index].level_type = level_type; 2132 levels[levels_index].mask_width = mask_width; 2133 levels[levels_index].nitems = nitems; 2134 levels_index++; 2135 } else { 2136 // If it is an unknown level, then logically move the previous layer up 2137 if (levels_index > 0) { 2138 levels[levels_index - 1].mask_width = mask_width; 2139 levels[levels_index - 1].nitems = nitems; 2140 } 2141 } 2142 level++; 2143 } while (level_type != INTEL_LEVEL_TYPE_INVALID); 2144 2145 // Set the masks to & with apicid 2146 for (unsigned i = 0; i < levels_index; ++i) { 2147 if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { 2148 levels[i].mask = ~((-1) << levels[i].mask_width); 2149 levels[i].cache_mask = (-1) << levels[i].mask_width; 2150 for (unsigned j = 0; j < i; ++j) 2151 levels[i].mask ^= levels[j].mask; 2152 } else { 2153 KMP_DEBUG_ASSERT(levels_index > 0); 2154 levels[i].mask = (-1) << levels[i - 1].mask_width; 2155 levels[i].cache_mask = 0; 2156 } 2157 } 2158 return levels_index; 2159 } 2160 2161 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { 2162 2163 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; 2164 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; 2165 unsigned levels_index; 2166 kmp_cpuid buf; 2167 kmp_uint64 known_levels; 2168 int topology_leaf, highest_leaf, apic_id; 2169 int num_leaves; 2170 static int leaves[] = {0, 0}; 2171 2172 kmp_i18n_id_t leaf_message_id; 2173 2174 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 2175 2176 *msg_id = kmp_i18n_null; 2177 if (__kmp_affinity_verbose) { 2178 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 2179 } 2180 2181 // Figure out the known topology levels 2182 known_levels = 0ull; 2183 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 2184 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 2185 known_levels |= (1ull << i); 2186 } 2187 } 2188 2189 // Get the highest cpuid leaf supported 2190 __kmp_x86_cpuid(0, 0, &buf); 2191 highest_leaf = buf.eax; 2192 2193 // If a specific topology method was requested, only allow that specific leaf 2194 // otherwise, try both leaves 31 and 11 in that order 2195 num_leaves = 0; 2196 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 2197 num_leaves = 1; 2198 leaves[0] = 11; 2199 
leaf_message_id = kmp_i18n_str_NoLeaf11Support; 2200 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 2201 num_leaves = 1; 2202 leaves[0] = 31; 2203 leaf_message_id = kmp_i18n_str_NoLeaf31Support; 2204 } else { 2205 num_leaves = 2; 2206 leaves[0] = 31; 2207 leaves[1] = 11; 2208 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 2209 } 2210 2211 // Check to see if cpuid leaf 31 or 11 is supported. 2212 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2213 topology_leaf = -1; 2214 for (int i = 0; i < num_leaves; ++i) { 2215 int leaf = leaves[i]; 2216 if (highest_leaf < leaf) 2217 continue; 2218 __kmp_x86_cpuid(leaf, 0, &buf); 2219 if (buf.ebx == 0) 2220 continue; 2221 topology_leaf = leaf; 2222 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); 2223 if (levels_index == 0) 2224 continue; 2225 break; 2226 } 2227 if (topology_leaf == -1 || levels_index == 0) { 2228 *msg_id = leaf_message_id; 2229 return false; 2230 } 2231 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); 2232 2233 // The algorithm used starts by setting the affinity to each available thread 2234 // and retrieving info from the cpuid instruction, so if we are not capable of 2235 // calling __kmp_get_system_affinity() and __kmp_get_system_affinity(), then 2236 // we need to do something else - use the defaults that we calculated from 2237 // issuing cpuid without binding to each proc. 2238 if (!KMP_AFFINITY_CAPABLE()) { 2239 // Hack to try and infer the machine topology using only the data 2240 // available from cpuid on the current thread, and __kmp_xproc. 2241 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2242 for (unsigned i = 0; i < levels_index; ++i) { 2243 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 2244 __kmp_nThreadsPerCore = levels[i].nitems; 2245 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 2246 nCoresPerPkg = levels[i].nitems; 2247 } 2248 } 2249 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 2250 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 2251 return true; 2252 } 2253 2254 // Allocate the data structure to be returned. 2255 int depth = levels_index; 2256 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 2257 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 2258 __kmp_topology = 2259 kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); 2260 2261 // Insert equivalent cache types if they exist 2262 kmp_cache_info_t cache_info; 2263 for (size_t i = 0; i < cache_info.get_depth(); ++i) { 2264 const kmp_cache_info_t::info_t &info = cache_info[i]; 2265 unsigned cache_mask = info.mask; 2266 unsigned cache_level = info.level; 2267 for (unsigned j = 0; j < levels_index; ++j) { 2268 unsigned hw_cache_mask = levels[j].cache_mask; 2269 kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); 2270 if (hw_cache_mask == cache_mask && j < levels_index - 1) { 2271 kmp_hw_t type = 2272 __kmp_intel_type_2_topology_type(levels[j + 1].level_type); 2273 __kmp_topology->set_equivalent_type(cache_type, type); 2274 } 2275 } 2276 } 2277 2278 // From here on, we can assume that it is safe to call 2279 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 2280 // __kmp_affinity_type = affinity_none. 2281 2282 // Save the affinity mask for the current thread. 2283 kmp_affinity_raii_t previous_affinity; 2284 2285 // Run through each of the available contexts, binding the current thread 2286 // to it, and obtaining the pertinent information using the cpuid instr. 
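  // For illustration, consider a hypothetical part whose topology leaf reports
  // an SMT level of mask width 1 and a CORE level of total mask width 6:
  // __kmp_x2apicid_get_levels() then yields mask 0x1 for SMT, 0x3e for CORE
  // and ~0x3f for the package, and the loop below decodes each x2APIC id with
  // those masks (thread id in the last ids[] slot, package id in slot 0).
#if 0
  // Same decode with plain integers (sketch only, using the hypothetical
  // widths above; not part of the implementation):
  unsigned apic = 0x53;
  unsigned smt_id = apic & 0x1;          // -> 1
  unsigned core_id = (apic & 0x3e) >> 1; // -> 9
  unsigned pkg_id = apic >> 6;           // -> 1
#endif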
2287 unsigned int proc; 2288 int hw_thread_index = 0; 2289 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 2290 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 2291 unsigned my_levels_index; 2292 2293 // Skip this proc if it is not included in the machine model. 2294 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 2295 continue; 2296 } 2297 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); 2298 2299 __kmp_affinity_dispatch->bind_thread(proc); 2300 2301 // New algorithm 2302 __kmp_x86_cpuid(topology_leaf, 0, &buf); 2303 apic_id = buf.edx; 2304 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 2305 my_levels_index = 2306 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 2307 if (my_levels_index == 0 || my_levels_index != levels_index) { 2308 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2309 return false; 2310 } 2311 hw_thread.clear(); 2312 hw_thread.os_id = proc; 2313 // Put in topology information 2314 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 2315 hw_thread.ids[idx] = apic_id & my_levels[j].mask; 2316 if (j > 0) { 2317 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; 2318 } 2319 } 2320 // Hybrid information 2321 if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { 2322 kmp_hw_core_type_t type; 2323 unsigned native_model_id; 2324 __kmp_get_hybrid_info(&type, &native_model_id); 2325 hw_thread.core_type = type; 2326 } 2327 hw_thread_index++; 2328 } 2329 KMP_ASSERT(hw_thread_index > 0); 2330 __kmp_topology->sort_ids(); 2331 if (!__kmp_topology->check_ids()) { 2332 kmp_topology_t::deallocate(__kmp_topology); 2333 __kmp_topology = nullptr; 2334 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 2335 return false; 2336 } 2337 return true; 2338 } 2339 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2340 2341 #define osIdIndex 0 2342 #define threadIdIndex 1 2343 #define coreIdIndex 2 2344 #define pkgIdIndex 3 2345 #define nodeIdIndex 4 2346 2347 typedef unsigned *ProcCpuInfo; 2348 static unsigned maxIndex = pkgIdIndex; 2349 2350 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2351 const void *b) { 2352 unsigned i; 2353 const unsigned *aa = *(unsigned *const *)a; 2354 const unsigned *bb = *(unsigned *const *)b; 2355 for (i = maxIndex;; i--) { 2356 if (aa[i] < bb[i]) 2357 return -1; 2358 if (aa[i] > bb[i]) 2359 return 1; 2360 if (i == osIdIndex) 2361 break; 2362 } 2363 return 0; 2364 } 2365 2366 #if KMP_USE_HIER_SCHED 2367 // Set the array sizes for the hierarchy layers 2368 static void __kmp_dispatch_set_hierarchy_values() { 2369 // Set the maximum number of L1's to number of cores 2370 // Set the maximum number of L2's to to either number of cores / 2 for 2371 // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing 2372 // Or the number of cores for Intel(R) Xeon(R) processors 2373 // Set the maximum number of NUMA nodes and L3's to number of packages 2374 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2375 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2376 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2377 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2378 KMP_MIC_SUPPORTED 2379 if (__kmp_mic_type >= mic3) 2380 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2381 else 2382 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2383 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2384 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2385 
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2386 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2387 // Set the number of threads per unit 2388 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2389 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2390 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2391 __kmp_nThreadsPerCore; 2392 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2393 KMP_MIC_SUPPORTED 2394 if (__kmp_mic_type >= mic3) 2395 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2396 2 * __kmp_nThreadsPerCore; 2397 else 2398 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2399 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2400 __kmp_nThreadsPerCore; 2401 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2402 nCoresPerPkg * __kmp_nThreadsPerCore; 2403 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2404 nCoresPerPkg * __kmp_nThreadsPerCore; 2405 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2406 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2407 } 2408 2409 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2410 // i.e., this thread's L1 or this thread's L2, etc. 2411 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2412 int index = type + 1; 2413 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2414 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2415 if (type == kmp_hier_layer_e::LAYER_THREAD) 2416 return tid; 2417 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2418 return 0; 2419 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2420 if (tid >= num_hw_threads) 2421 tid = tid % num_hw_threads; 2422 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2423 } 2424 2425 // Return the number of t1's per t2 2426 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2427 int i1 = t1 + 1; 2428 int i2 = t2 + 1; 2429 KMP_DEBUG_ASSERT(i1 <= i2); 2430 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2431 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2432 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2433 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2434 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2435 } 2436 #endif // KMP_USE_HIER_SCHED 2437 2438 static inline const char *__kmp_cpuinfo_get_filename() { 2439 const char *filename; 2440 if (__kmp_cpuinfo_file != nullptr) 2441 filename = __kmp_cpuinfo_file; 2442 else 2443 filename = "/proc/cpuinfo"; 2444 return filename; 2445 } 2446 2447 static inline const char *__kmp_cpuinfo_get_envvar() { 2448 const char *envvar = nullptr; 2449 if (__kmp_cpuinfo_file != nullptr) 2450 envvar = "KMP_CPUINFO_FILE"; 2451 return envvar; 2452 } 2453 2454 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2455 // affinity map. 2456 static bool __kmp_affinity_create_cpuinfo_map(int *line, 2457 kmp_i18n_id_t *const msg_id) { 2458 const char *filename = __kmp_cpuinfo_get_filename(); 2459 const char *envvar = __kmp_cpuinfo_get_envvar(); 2460 *msg_id = kmp_i18n_null; 2461 2462 if (__kmp_affinity_verbose) { 2463 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 2464 } 2465 2466 kmp_safe_raii_file_t f(filename, "r", envvar); 2467 2468 // Scan of the file, and count the number of "processor" (osId) fields, 2469 // and find the highest value of <n> for a node_<n> field. 
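  // For reference, a typical x86 /proc/cpuinfo record looks like:
  //
  //   processor       : 0
  //   physical id     : 0
  //   core id         : 0
  //   ...
  //
  // Records are separated by blank lines; only the "processor", "physical id",
  // "core id", and (when present) "thread id" and "node_<n> id" fields are
  // consumed by this routine, everything else is skipped.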
2470 char buf[256]; 2471 unsigned num_records = 0; 2472 while (!feof(f)) { 2473 buf[sizeof(buf) - 1] = 1; 2474 if (!fgets(buf, sizeof(buf), f)) { 2475 // Read errors presumably because of EOF 2476 break; 2477 } 2478 2479 char s1[] = "processor"; 2480 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2481 num_records++; 2482 continue; 2483 } 2484 2485 // FIXME - this will match "node_<n> <garbage>" 2486 unsigned level; 2487 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2488 // validate the input fisrt: 2489 if (level > (unsigned)__kmp_xproc) { // level is too big 2490 level = __kmp_xproc; 2491 } 2492 if (nodeIdIndex + level >= maxIndex) { 2493 maxIndex = nodeIdIndex + level; 2494 } 2495 continue; 2496 } 2497 } 2498 2499 // Check for empty file / no valid processor records, or too many. The number 2500 // of records can't exceed the number of valid bits in the affinity mask. 2501 if (num_records == 0) { 2502 *msg_id = kmp_i18n_str_NoProcRecords; 2503 return false; 2504 } 2505 if (num_records > (unsigned)__kmp_xproc) { 2506 *msg_id = kmp_i18n_str_TooManyProcRecords; 2507 return false; 2508 } 2509 2510 // Set the file pointer back to the beginning, so that we can scan the file 2511 // again, this time performing a full parse of the data. Allocate a vector of 2512 // ProcCpuInfo object, where we will place the data. Adding an extra element 2513 // at the end allows us to remove a lot of extra checks for termination 2514 // conditions. 2515 if (fseek(f, 0, SEEK_SET) != 0) { 2516 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2517 return false; 2518 } 2519 2520 // Allocate the array of records to store the proc info in. The dummy 2521 // element at the end makes the logic in filling them out easier to code. 2522 unsigned **threadInfo = 2523 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2524 unsigned i; 2525 for (i = 0; i <= num_records; i++) { 2526 threadInfo[i] = 2527 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2528 } 2529 2530 #define CLEANUP_THREAD_INFO \ 2531 for (i = 0; i <= num_records; i++) { \ 2532 __kmp_free(threadInfo[i]); \ 2533 } \ 2534 __kmp_free(threadInfo); 2535 2536 // A value of UINT_MAX means that we didn't find the field 2537 unsigned __index; 2538 2539 #define INIT_PROC_INFO(p) \ 2540 for (__index = 0; __index <= maxIndex; __index++) { \ 2541 (p)[__index] = UINT_MAX; \ 2542 } 2543 2544 for (i = 0; i <= num_records; i++) { 2545 INIT_PROC_INFO(threadInfo[i]); 2546 } 2547 2548 unsigned num_avail = 0; 2549 *line = 0; 2550 while (!feof(f)) { 2551 // Create an inner scoping level, so that all the goto targets at the end of 2552 // the loop appear in an outer scoping level. This avoids warnings about 2553 // jumping past an initialization to a target in the same block. 2554 { 2555 buf[sizeof(buf) - 1] = 1; 2556 bool long_line = false; 2557 if (!fgets(buf, sizeof(buf), f)) { 2558 // Read errors presumably because of EOF 2559 // If there is valid data in threadInfo[num_avail], then fake 2560 // a blank line in ensure that the last address gets parsed. 2561 bool valid = false; 2562 for (i = 0; i <= maxIndex; i++) { 2563 if (threadInfo[num_avail][i] != UINT_MAX) { 2564 valid = true; 2565 } 2566 } 2567 if (!valid) { 2568 break; 2569 } 2570 buf[0] = 0; 2571 } else if (!buf[sizeof(buf) - 1]) { 2572 // The line is longer than the buffer. Set a flag and don't 2573 // emit an error if we were going to ignore the line, anyway. 
2574 long_line = true; 2575 2576 #define CHECK_LINE \ 2577 if (long_line) { \ 2578 CLEANUP_THREAD_INFO; \ 2579 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2580 return false; \ 2581 } 2582 } 2583 (*line)++; 2584 2585 char s1[] = "processor"; 2586 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2587 CHECK_LINE; 2588 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2589 unsigned val; 2590 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2591 goto no_val; 2592 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2593 #if KMP_ARCH_AARCH64 2594 // Handle the old AArch64 /proc/cpuinfo layout differently, 2595 // it contains all of the 'processor' entries listed in a 2596 // single 'Processor' section, therefore the normal looking 2597 // for duplicates in that section will always fail. 2598 num_avail++; 2599 #else 2600 goto dup_field; 2601 #endif 2602 threadInfo[num_avail][osIdIndex] = val; 2603 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2604 char path[256]; 2605 KMP_SNPRINTF( 2606 path, sizeof(path), 2607 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2608 threadInfo[num_avail][osIdIndex]); 2609 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2610 2611 KMP_SNPRINTF(path, sizeof(path), 2612 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2613 threadInfo[num_avail][osIdIndex]); 2614 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2615 continue; 2616 #else 2617 } 2618 char s2[] = "physical id"; 2619 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2620 CHECK_LINE; 2621 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2622 unsigned val; 2623 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2624 goto no_val; 2625 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2626 goto dup_field; 2627 threadInfo[num_avail][pkgIdIndex] = val; 2628 continue; 2629 } 2630 char s3[] = "core id"; 2631 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2632 CHECK_LINE; 2633 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2634 unsigned val; 2635 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2636 goto no_val; 2637 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2638 goto dup_field; 2639 threadInfo[num_avail][coreIdIndex] = val; 2640 continue; 2641 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2642 } 2643 char s4[] = "thread id"; 2644 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2645 CHECK_LINE; 2646 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2647 unsigned val; 2648 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2649 goto no_val; 2650 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2651 goto dup_field; 2652 threadInfo[num_avail][threadIdIndex] = val; 2653 continue; 2654 } 2655 unsigned level; 2656 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2657 CHECK_LINE; 2658 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2659 unsigned val; 2660 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2661 goto no_val; 2662 // validate the input before using level: 2663 if (level > (unsigned)__kmp_xproc) { // level is too big 2664 level = __kmp_xproc; 2665 } 2666 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2667 goto dup_field; 2668 threadInfo[num_avail][nodeIdIndex + level] = val; 2669 continue; 2670 } 2671 2672 // We didn't recognize the leading token on the line. There are lots of 2673 // leading tokens that we don't recognize - if the line isn't empty, go on 2674 // to the next line. 
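      // Typical examples on x86 are "vendor_id", "model name", "cache size"
      // and "flags" lines, which are simply ignored.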
2675 if ((*buf != 0) && (*buf != '\n')) { 2676 // If the line is longer than the buffer, read characters 2677 // until we find a newline. 2678 if (long_line) { 2679 int ch; 2680 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2681 ; 2682 } 2683 continue; 2684 } 2685 2686 // A newline has signalled the end of the processor record. 2687 // Check that there aren't too many procs specified. 2688 if ((int)num_avail == __kmp_xproc) { 2689 CLEANUP_THREAD_INFO; 2690 *msg_id = kmp_i18n_str_TooManyEntries; 2691 return false; 2692 } 2693 2694 // Check for missing fields. The osId field must be there, and we 2695 // currently require that the physical id field is specified, also. 2696 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2697 CLEANUP_THREAD_INFO; 2698 *msg_id = kmp_i18n_str_MissingProcField; 2699 return false; 2700 } 2701 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2702 CLEANUP_THREAD_INFO; 2703 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2704 return false; 2705 } 2706 2707 // Skip this proc if it is not included in the machine model. 2708 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2709 __kmp_affin_fullMask)) { 2710 INIT_PROC_INFO(threadInfo[num_avail]); 2711 continue; 2712 } 2713 2714 // We have a successful parse of this proc's info. 2715 // Increment the counter, and prepare for the next proc. 2716 num_avail++; 2717 KMP_ASSERT(num_avail <= num_records); 2718 INIT_PROC_INFO(threadInfo[num_avail]); 2719 } 2720 continue; 2721 2722 no_val: 2723 CLEANUP_THREAD_INFO; 2724 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2725 return false; 2726 2727 dup_field: 2728 CLEANUP_THREAD_INFO; 2729 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2730 return false; 2731 } 2732 *line = 0; 2733 2734 #if KMP_MIC && REDUCE_TEAM_SIZE 2735 unsigned teamSize = 0; 2736 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2737 2738 // check for num_records == __kmp_xproc ??? 2739 2740 // If it is configured to omit the package level when there is only a single 2741 // package, the logic at the end of this routine won't work if there is only a 2742 // single thread 2743 KMP_ASSERT(num_avail > 0); 2744 KMP_ASSERT(num_avail <= num_records); 2745 2746 // Sort the threadInfo table by physical Id. 2747 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2748 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2749 2750 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2751 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2752 // the chips on a system. Although coreId's are usually assigned 2753 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2754 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2755 // 2756 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2757 // total # packages) are at this point - we want to determine that now. We 2758 // only have an upper bound on the first two figures. 2759 unsigned *counts = 2760 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2761 unsigned *maxCt = 2762 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2763 unsigned *totals = 2764 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2765 unsigned *lastId = 2766 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2767 2768 bool assign_thread_ids = false; 2769 unsigned threadIdCt; 2770 unsigned index; 2771 2772 restart_radix_check: 2773 threadIdCt = 0; 2774 2775 // Initialize the counter arrays with data from threadInfo[0]. 
2776 if (assign_thread_ids) { 2777 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2778 threadInfo[0][threadIdIndex] = threadIdCt++; 2779 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2780 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2781 } 2782 } 2783 for (index = 0; index <= maxIndex; index++) { 2784 counts[index] = 1; 2785 maxCt[index] = 1; 2786 totals[index] = 1; 2787 lastId[index] = threadInfo[0][index]; 2788 ; 2789 } 2790 2791 // Run through the rest of the OS procs. 2792 for (i = 1; i < num_avail; i++) { 2793 // Find the most significant index whose id differs from the id for the 2794 // previous OS proc. 2795 for (index = maxIndex; index >= threadIdIndex; index--) { 2796 if (assign_thread_ids && (index == threadIdIndex)) { 2797 // Auto-assign the thread id field if it wasn't specified. 2798 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2799 threadInfo[i][threadIdIndex] = threadIdCt++; 2800 } 2801 // Apparently the thread id field was specified for some entries and not 2802 // others. Start the thread id counter off at the next higher thread id. 2803 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2804 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2805 } 2806 } 2807 if (threadInfo[i][index] != lastId[index]) { 2808 // Run through all indices which are less significant, and reset the 2809 // counts to 1. At all levels up to and including index, we need to 2810 // increment the totals and record the last id. 2811 unsigned index2; 2812 for (index2 = threadIdIndex; index2 < index; index2++) { 2813 totals[index2]++; 2814 if (counts[index2] > maxCt[index2]) { 2815 maxCt[index2] = counts[index2]; 2816 } 2817 counts[index2] = 1; 2818 lastId[index2] = threadInfo[i][index2]; 2819 } 2820 counts[index]++; 2821 totals[index]++; 2822 lastId[index] = threadInfo[i][index]; 2823 2824 if (assign_thread_ids && (index > threadIdIndex)) { 2825 2826 #if KMP_MIC && REDUCE_TEAM_SIZE 2827 // The default team size is the total #threads in the machine 2828 // minus 1 thread for every core that has 3 or more threads. 2829 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2830 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2831 2832 // Restart the thread counter, as we are on a new core. 2833 threadIdCt = 0; 2834 2835 // Auto-assign the thread id field if it wasn't specified. 2836 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2837 threadInfo[i][threadIdIndex] = threadIdCt++; 2838 } 2839 2840 // Apparently the thread id field was specified for some entries and 2841 // not others. Start the thread id counter off at the next higher 2842 // thread id. 2843 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2844 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2845 } 2846 } 2847 break; 2848 } 2849 } 2850 if (index < threadIdIndex) { 2851 // If thread ids were specified, it is an error if they are not unique. 2852 // Also, check that we waven't already restarted the loop (to be safe - 2853 // shouldn't need to). 2854 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2855 __kmp_free(lastId); 2856 __kmp_free(totals); 2857 __kmp_free(maxCt); 2858 __kmp_free(counts); 2859 CLEANUP_THREAD_INFO; 2860 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2861 return false; 2862 } 2863 2864 // If the thread ids were not specified and we see entries entries that 2865 // are duplicates, start the loop over and assign the thread ids manually. 
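      // For example, if a hypothetical cpuinfo file contains no "thread id"
      // lines at all, the second record of the first core lands here; the
      // re-run below then numbers the threads on each core 0, 1, ... in the
      // order their records appear.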
2866 assign_thread_ids = true; 2867 goto restart_radix_check; 2868 } 2869 } 2870 2871 #if KMP_MIC && REDUCE_TEAM_SIZE 2872 // The default team size is the total #threads in the machine 2873 // minus 1 thread for every core that has 3 or more threads. 2874 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2875 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2876 2877 for (index = threadIdIndex; index <= maxIndex; index++) { 2878 if (counts[index] > maxCt[index]) { 2879 maxCt[index] = counts[index]; 2880 } 2881 } 2882 2883 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2884 nCoresPerPkg = maxCt[coreIdIndex]; 2885 nPackages = totals[pkgIdIndex]; 2886 2887 // When affinity is off, this routine will still be called to set 2888 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2889 // Make sure all these vars are set correctly, and return now if affinity is 2890 // not enabled. 2891 __kmp_ncores = totals[coreIdIndex]; 2892 if (!KMP_AFFINITY_CAPABLE()) { 2893 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2894 return true; 2895 } 2896 2897 #if KMP_MIC && REDUCE_TEAM_SIZE 2898 // Set the default team size. 2899 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2900 __kmp_dflt_team_nth = teamSize; 2901 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2902 "__kmp_dflt_team_nth = %d\n", 2903 __kmp_dflt_team_nth)); 2904 } 2905 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2906 2907 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2908 2909 // Count the number of levels which have more nodes at that level than at the 2910 // parent's level (with there being an implicit root node of the top level). 2911 // This is equivalent to saying that there is at least one node at this level 2912 // which has a sibling. These levels are in the map, and the package level is 2913 // always in the map. 2914 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2915 for (index = threadIdIndex; index < maxIndex; index++) { 2916 KMP_ASSERT(totals[index] >= totals[index + 1]); 2917 inMap[index] = (totals[index] > totals[index + 1]); 2918 } 2919 inMap[maxIndex] = (totals[maxIndex] > 1); 2920 inMap[pkgIdIndex] = true; 2921 inMap[coreIdIndex] = true; 2922 inMap[threadIdIndex] = true; 2923 2924 int depth = 0; 2925 int idx = 0; 2926 kmp_hw_t types[KMP_HW_LAST]; 2927 int pkgLevel = -1; 2928 int coreLevel = -1; 2929 int threadLevel = -1; 2930 for (index = threadIdIndex; index <= maxIndex; index++) { 2931 if (inMap[index]) { 2932 depth++; 2933 } 2934 } 2935 if (inMap[pkgIdIndex]) { 2936 pkgLevel = idx; 2937 types[idx++] = KMP_HW_SOCKET; 2938 } 2939 if (inMap[coreIdIndex]) { 2940 coreLevel = idx; 2941 types[idx++] = KMP_HW_CORE; 2942 } 2943 if (inMap[threadIdIndex]) { 2944 threadLevel = idx; 2945 types[idx++] = KMP_HW_THREAD; 2946 } 2947 KMP_ASSERT(depth > 0); 2948 2949 // Construct the data structure that is to be returned. 
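  // For instance, on a hypothetical 2-socket machine with 8 cores per socket,
  // 2 threads per core and no node_<n> fields, the levels selected above are
  // SOCKET, CORE and THREAD, so depth == 3 and each hw_thread below is given
  // a 3-entry id of the form {pkg, core, thread}.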
2950 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); 2951 2952 for (i = 0; i < num_avail; ++i) { 2953 unsigned os = threadInfo[i][osIdIndex]; 2954 int src_index; 2955 int dst_index = 0; 2956 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2957 hw_thread.clear(); 2958 hw_thread.os_id = os; 2959 2960 idx = 0; 2961 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2962 if (!inMap[src_index]) { 2963 continue; 2964 } 2965 if (src_index == pkgIdIndex) { 2966 hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; 2967 } else if (src_index == coreIdIndex) { 2968 hw_thread.ids[coreLevel] = threadInfo[i][src_index]; 2969 } else if (src_index == threadIdIndex) { 2970 hw_thread.ids[threadLevel] = threadInfo[i][src_index]; 2971 } 2972 dst_index++; 2973 } 2974 } 2975 2976 __kmp_free(inMap); 2977 __kmp_free(lastId); 2978 __kmp_free(totals); 2979 __kmp_free(maxCt); 2980 __kmp_free(counts); 2981 CLEANUP_THREAD_INFO; 2982 __kmp_topology->sort_ids(); 2983 if (!__kmp_topology->check_ids()) { 2984 kmp_topology_t::deallocate(__kmp_topology); 2985 __kmp_topology = nullptr; 2986 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2987 return false; 2988 } 2989 return true; 2990 } 2991 2992 // Create and return a table of affinity masks, indexed by OS thread ID. 2993 // This routine handles OR'ing together all the affinity masks of threads 2994 // that are sufficiently close, if granularity > fine. 2995 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2996 unsigned *numUnique) { 2997 // First form a table of affinity masks in order of OS thread id. 2998 int maxOsId; 2999 int i; 3000 int numAddrs = __kmp_topology->get_num_hw_threads(); 3001 int depth = __kmp_topology->get_depth(); 3002 KMP_ASSERT(numAddrs); 3003 KMP_ASSERT(depth); 3004 3005 maxOsId = 0; 3006 for (i = numAddrs - 1;; --i) { 3007 int osId = __kmp_topology->at(i).os_id; 3008 if (osId > maxOsId) { 3009 maxOsId = osId; 3010 } 3011 if (i == 0) 3012 break; 3013 } 3014 kmp_affin_mask_t *osId2Mask; 3015 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 3016 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 3017 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 3018 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 3019 } 3020 if (__kmp_affinity_gran_levels >= (int)depth) { 3021 if (__kmp_affinity_verbose || 3022 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3023 KMP_WARNING(AffThreadsMayMigrate); 3024 } 3025 } 3026 3027 // Run through the table, forming the masks for all threads on each core. 3028 // Threads on the same core will have identical kmp_hw_thread_t objects, not 3029 // considering the last level, which must be the thread id. All threads on a 3030 // core will appear consecutively. 3031 int unique = 0; 3032 int j = 0; // index of 1st thread on core 3033 int leader = 0; 3034 kmp_affin_mask_t *sum; 3035 KMP_CPU_ALLOC_ON_STACK(sum); 3036 KMP_CPU_ZERO(sum); 3037 KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); 3038 for (i = 1; i < numAddrs; i++) { 3039 // If this thread is sufficiently close to the leader (within the 3040 // granularity setting), then set the bit for this os thread in the 3041 // affinity mask for this group, and go on to the next thread. 3042 if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { 3043 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 3044 continue; 3045 } 3046 3047 // For every thread in this group, copy the mask to the thread's entry in 3048 // the osId2Mask table. Mark the first address as a leader. 
3049 for (; j < i; j++) { 3050 int osId = __kmp_topology->at(j).os_id; 3051 KMP_DEBUG_ASSERT(osId <= maxOsId); 3052 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 3053 KMP_CPU_COPY(mask, sum); 3054 __kmp_topology->at(j).leader = (j == leader); 3055 } 3056 unique++; 3057 3058 // Start a new mask. 3059 leader = i; 3060 KMP_CPU_ZERO(sum); 3061 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 3062 } 3063 3064 // For every thread in last group, copy the mask to the thread's 3065 // entry in the osId2Mask table. 3066 for (; j < i; j++) { 3067 int osId = __kmp_topology->at(j).os_id; 3068 KMP_DEBUG_ASSERT(osId <= maxOsId); 3069 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 3070 KMP_CPU_COPY(mask, sum); 3071 __kmp_topology->at(j).leader = (j == leader); 3072 } 3073 unique++; 3074 KMP_CPU_FREE_FROM_STACK(sum); 3075 3076 *maxIndex = maxOsId; 3077 *numUnique = unique; 3078 return osId2Mask; 3079 } 3080 3081 // Stuff for the affinity proclist parsers. It's easier to declare these vars 3082 // as file-static than to try and pass them through the calling sequence of 3083 // the recursive-descent OMP_PLACES parser. 3084 static kmp_affin_mask_t *newMasks; 3085 static int numNewMasks; 3086 static int nextNewMask; 3087 3088 #define ADD_MASK(_mask) \ 3089 { \ 3090 if (nextNewMask >= numNewMasks) { \ 3091 int i; \ 3092 numNewMasks *= 2; \ 3093 kmp_affin_mask_t *temp; \ 3094 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 3095 for (i = 0; i < numNewMasks / 2; i++) { \ 3096 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 3097 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 3098 KMP_CPU_COPY(dest, src); \ 3099 } \ 3100 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 3101 newMasks = temp; \ 3102 } \ 3103 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 3104 nextNewMask++; \ 3105 } 3106 3107 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 3108 { \ 3109 if (((_osId) > _maxOsId) || \ 3110 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 3111 if (__kmp_affinity_verbose || \ 3112 (__kmp_affinity_warnings && \ 3113 (__kmp_affinity_type != affinity_none))) { \ 3114 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 3115 } \ 3116 } else { \ 3117 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 3118 } \ 3119 } 3120 3121 // Re-parse the proclist (for the explicit affinity type), and form the list 3122 // of affinity newMasks indexed by gtid. 3123 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 3124 unsigned int *out_numMasks, 3125 const char *proclist, 3126 kmp_affin_mask_t *osId2Mask, 3127 int maxOsId) { 3128 int i; 3129 const char *scan = proclist; 3130 const char *next = proclist; 3131 3132 // We use malloc() for the temporary mask vector, so that we can use 3133 // realloc() to extend it. 3134 numNewMasks = 2; 3135 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3136 nextNewMask = 0; 3137 kmp_affin_mask_t *sumMask; 3138 KMP_CPU_ALLOC(sumMask); 3139 int setSize = 0; 3140 3141 for (;;) { 3142 int start, end, stride; 3143 3144 SKIP_WS(scan); 3145 next = scan; 3146 if (*next == '\0') { 3147 break; 3148 } 3149 3150 if (*next == '{') { 3151 int num; 3152 setSize = 0; 3153 next++; // skip '{' 3154 SKIP_WS(next); 3155 scan = next; 3156 3157 // Read the first integer in the set. 
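      // For example, given a hypothetical
      // KMP_AFFINITY="explicit,proclist=[0,2,{4,5,6},8-10]", this branch
      // handles the {4,5,6} element: the masks for OS procs 4, 5 and 6 are
      // OR'ed into sumMask and then added as a single mask, i.e. one entry
      // spanning all three procs.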
3158 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 3159 SKIP_DIGITS(next); 3160 num = __kmp_str_to_int(scan, *next); 3161 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3162 3163 // Copy the mask for that osId to the sum (union) mask. 3164 if ((num > maxOsId) || 3165 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3166 if (__kmp_affinity_verbose || 3167 (__kmp_affinity_warnings && 3168 (__kmp_affinity_type != affinity_none))) { 3169 KMP_WARNING(AffIgnoreInvalidProcID, num); 3170 } 3171 KMP_CPU_ZERO(sumMask); 3172 } else { 3173 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3174 setSize = 1; 3175 } 3176 3177 for (;;) { 3178 // Check for end of set. 3179 SKIP_WS(next); 3180 if (*next == '}') { 3181 next++; // skip '}' 3182 break; 3183 } 3184 3185 // Skip optional comma. 3186 if (*next == ',') { 3187 next++; 3188 } 3189 SKIP_WS(next); 3190 3191 // Read the next integer in the set. 3192 scan = next; 3193 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3194 3195 SKIP_DIGITS(next); 3196 num = __kmp_str_to_int(scan, *next); 3197 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3198 3199 // Add the mask for that osId to the sum mask. 3200 if ((num > maxOsId) || 3201 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3202 if (__kmp_affinity_verbose || 3203 (__kmp_affinity_warnings && 3204 (__kmp_affinity_type != affinity_none))) { 3205 KMP_WARNING(AffIgnoreInvalidProcID, num); 3206 } 3207 } else { 3208 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3209 setSize++; 3210 } 3211 } 3212 if (setSize > 0) { 3213 ADD_MASK(sumMask); 3214 } 3215 3216 SKIP_WS(next); 3217 if (*next == ',') { 3218 next++; 3219 } 3220 scan = next; 3221 continue; 3222 } 3223 3224 // Read the first integer. 3225 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3226 SKIP_DIGITS(next); 3227 start = __kmp_str_to_int(scan, *next); 3228 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 3229 SKIP_WS(next); 3230 3231 // If this isn't a range, then add a mask to the list and go on. 3232 if (*next != '-') { 3233 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3234 3235 // Skip optional comma. 3236 if (*next == ',') { 3237 next++; 3238 } 3239 scan = next; 3240 continue; 3241 } 3242 3243 // This is a range. Skip over the '-' and read in the 2nd int. 3244 next++; // skip '-' 3245 SKIP_WS(next); 3246 scan = next; 3247 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3248 SKIP_DIGITS(next); 3249 end = __kmp_str_to_int(scan, *next); 3250 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 3251 3252 // Check for a stride parameter 3253 stride = 1; 3254 SKIP_WS(next); 3255 if (*next == ':') { 3256 // A stride is specified. Skip over the ':" and read the 3rd int. 3257 int sign = +1; 3258 next++; // skip ':' 3259 SKIP_WS(next); 3260 scan = next; 3261 if (*next == '-') { 3262 sign = -1; 3263 next++; 3264 SKIP_WS(next); 3265 scan = next; 3266 } 3267 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3268 SKIP_DIGITS(next); 3269 stride = __kmp_str_to_int(scan, *next); 3270 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 3271 stride *= sign; 3272 } 3273 3274 // Do some range checks. 
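    // For example, a range of "4-12:4" arrives here as start = 4, end = 12,
    // stride = 4 and expands to OS procs 4, 8 and 12; a negative stride such
    // as "7-1:-3" yields 7, 4 and 1.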
3275 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 3276 if (stride > 0) { 3277 KMP_ASSERT2(start <= end, "bad explicit proc list"); 3278 } else { 3279 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3280 } 3281 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3282 3283 // Add the mask for each OS proc # to the list. 3284 if (stride > 0) { 3285 do { 3286 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3287 start += stride; 3288 } while (start <= end); 3289 } else { 3290 do { 3291 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3292 start += stride; 3293 } while (start >= end); 3294 } 3295 3296 // Skip optional comma. 3297 SKIP_WS(next); 3298 if (*next == ',') { 3299 next++; 3300 } 3301 scan = next; 3302 } 3303 3304 *out_numMasks = nextNewMask; 3305 if (nextNewMask == 0) { 3306 *out_masks = NULL; 3307 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3308 return; 3309 } 3310 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3311 for (i = 0; i < nextNewMask; i++) { 3312 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3313 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3314 KMP_CPU_COPY(dest, src); 3315 } 3316 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3317 KMP_CPU_FREE(sumMask); 3318 } 3319 3320 /*----------------------------------------------------------------------------- 3321 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3322 places. Again, Here is the grammar: 3323 3324 place_list := place 3325 place_list := place , place_list 3326 place := num 3327 place := place : num 3328 place := place : num : signed 3329 place := { subplacelist } 3330 place := ! place // (lowest priority) 3331 subplace_list := subplace 3332 subplace_list := subplace , subplace_list 3333 subplace := num 3334 subplace := num : num 3335 subplace := num : num : signed 3336 signed := num 3337 signed := + signed 3338 signed := - signed 3339 -----------------------------------------------------------------------------*/ 3340 static void __kmp_process_subplace_list(const char **scan, 3341 kmp_affin_mask_t *osId2Mask, 3342 int maxOsId, kmp_affin_mask_t *tempMask, 3343 int *setSize) { 3344 const char *next; 3345 3346 for (;;) { 3347 int start, count, stride, i; 3348 3349 // Read in the starting proc id 3350 SKIP_WS(*scan); 3351 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3352 next = *scan; 3353 SKIP_DIGITS(next); 3354 start = __kmp_str_to_int(*scan, *next); 3355 KMP_ASSERT(start >= 0); 3356 *scan = next; 3357 3358 // valid follow sets are ',' ':' and '}' 3359 SKIP_WS(*scan); 3360 if (**scan == '}' || **scan == ',') { 3361 if ((start > maxOsId) || 3362 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3363 if (__kmp_affinity_verbose || 3364 (__kmp_affinity_warnings && 3365 (__kmp_affinity_type != affinity_none))) { 3366 KMP_WARNING(AffIgnoreInvalidProcID, start); 3367 } 3368 } else { 3369 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3370 (*setSize)++; 3371 } 3372 if (**scan == '}') { 3373 break; 3374 } 3375 (*scan)++; // skip ',' 3376 continue; 3377 } 3378 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3379 (*scan)++; // skip ':' 3380 3381 // Read count parameter 3382 SKIP_WS(*scan); 3383 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3384 next = *scan; 3385 SKIP_DIGITS(next); 3386 count = __kmp_str_to_int(*scan, *next); 3387 KMP_ASSERT(count >= 0); 3388 *scan = next; 3389 3390 // valid follow sets are ',' ':' and '}' 3391 SKIP_WS(*scan); 3392 if (**scan == 
'}' || **scan == ',') { 3393 for (i = 0; i < count; i++) { 3394 if ((start > maxOsId) || 3395 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3396 if (__kmp_affinity_verbose || 3397 (__kmp_affinity_warnings && 3398 (__kmp_affinity_type != affinity_none))) { 3399 KMP_WARNING(AffIgnoreInvalidProcID, start); 3400 } 3401 break; // don't proliferate warnings for large count 3402 } else { 3403 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3404 start++; 3405 (*setSize)++; 3406 } 3407 } 3408 if (**scan == '}') { 3409 break; 3410 } 3411 (*scan)++; // skip ',' 3412 continue; 3413 } 3414 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3415 (*scan)++; // skip ':' 3416 3417 // Read stride parameter 3418 int sign = +1; 3419 for (;;) { 3420 SKIP_WS(*scan); 3421 if (**scan == '+') { 3422 (*scan)++; // skip '+' 3423 continue; 3424 } 3425 if (**scan == '-') { 3426 sign *= -1; 3427 (*scan)++; // skip '-' 3428 continue; 3429 } 3430 break; 3431 } 3432 SKIP_WS(*scan); 3433 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3434 next = *scan; 3435 SKIP_DIGITS(next); 3436 stride = __kmp_str_to_int(*scan, *next); 3437 KMP_ASSERT(stride >= 0); 3438 *scan = next; 3439 stride *= sign; 3440 3441 // valid follow sets are ',' and '}' 3442 SKIP_WS(*scan); 3443 if (**scan == '}' || **scan == ',') { 3444 for (i = 0; i < count; i++) { 3445 if ((start > maxOsId) || 3446 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3447 if (__kmp_affinity_verbose || 3448 (__kmp_affinity_warnings && 3449 (__kmp_affinity_type != affinity_none))) { 3450 KMP_WARNING(AffIgnoreInvalidProcID, start); 3451 } 3452 break; // don't proliferate warnings for large count 3453 } else { 3454 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3455 start += stride; 3456 (*setSize)++; 3457 } 3458 } 3459 if (**scan == '}') { 3460 break; 3461 } 3462 (*scan)++; // skip ',' 3463 continue; 3464 } 3465 3466 KMP_ASSERT2(0, "bad explicit places list"); 3467 } 3468 } 3469 3470 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3471 int maxOsId, kmp_affin_mask_t *tempMask, 3472 int *setSize) { 3473 const char *next; 3474 3475 // valid follow sets are '{' '!' and num 3476 SKIP_WS(*scan); 3477 if (**scan == '{') { 3478 (*scan)++; // skip '{' 3479 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3480 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3481 (*scan)++; // skip '}' 3482 } else if (**scan == '!') { 3483 (*scan)++; // skip '!' 
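// ('!' negates the place that follows: the operand is first parsed into
// tempMask and then complemented over OS procs 0..maxOsId, so a place such
// as "!{0,1}" (illustrative values) selects every known proc except 0 and 1.)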
3484 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3485 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3486 } else if ((**scan >= '0') && (**scan <= '9')) { 3487 next = *scan; 3488 SKIP_DIGITS(next); 3489 int num = __kmp_str_to_int(*scan, *next); 3490 KMP_ASSERT(num >= 0); 3491 if ((num > maxOsId) || 3492 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3493 if (__kmp_affinity_verbose || 3494 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3495 KMP_WARNING(AffIgnoreInvalidProcID, num); 3496 } 3497 } else { 3498 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3499 (*setSize)++; 3500 } 3501 *scan = next; // skip num 3502 } else { 3503 KMP_ASSERT2(0, "bad explicit places list"); 3504 } 3505 } 3506 3507 // static void 3508 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3509 unsigned int *out_numMasks, 3510 const char *placelist, 3511 kmp_affin_mask_t *osId2Mask, 3512 int maxOsId) { 3513 int i, j, count, stride, sign; 3514 const char *scan = placelist; 3515 const char *next = placelist; 3516 3517 numNewMasks = 2; 3518 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3519 nextNewMask = 0; 3520 3521 // tempMask is modified based on the previous or initial 3522 // place to form the current place 3523 // previousMask contains the previous place 3524 kmp_affin_mask_t *tempMask; 3525 kmp_affin_mask_t *previousMask; 3526 KMP_CPU_ALLOC(tempMask); 3527 KMP_CPU_ZERO(tempMask); 3528 KMP_CPU_ALLOC(previousMask); 3529 KMP_CPU_ZERO(previousMask); 3530 int setSize = 0; 3531 3532 for (;;) { 3533 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3534 3535 // valid follow sets are ',' ':' and EOL 3536 SKIP_WS(scan); 3537 if (*scan == '\0' || *scan == ',') { 3538 if (setSize > 0) { 3539 ADD_MASK(tempMask); 3540 } 3541 KMP_CPU_ZERO(tempMask); 3542 setSize = 0; 3543 if (*scan == '\0') { 3544 break; 3545 } 3546 scan++; // skip ',' 3547 continue; 3548 } 3549 3550 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3551 scan++; // skip ':' 3552 3553 // Read count parameter 3554 SKIP_WS(scan); 3555 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3556 next = scan; 3557 SKIP_DIGITS(next); 3558 count = __kmp_str_to_int(scan, *next); 3559 KMP_ASSERT(count >= 0); 3560 scan = next; 3561 3562 // valid follow sets are ',' ':' and EOL 3563 SKIP_WS(scan); 3564 if (*scan == '\0' || *scan == ',') { 3565 stride = +1; 3566 } else { 3567 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3568 scan++; // skip ':' 3569 3570 // Read stride parameter 3571 sign = +1; 3572 for (;;) { 3573 SKIP_WS(scan); 3574 if (*scan == '+') { 3575 scan++; // skip '+' 3576 continue; 3577 } 3578 if (*scan == '-') { 3579 sign *= -1; 3580 scan++; // skip '-' 3581 continue; 3582 } 3583 break; 3584 } 3585 SKIP_WS(scan); 3586 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3587 next = scan; 3588 SKIP_DIGITS(next); 3589 stride = __kmp_str_to_int(scan, *next); 3590 KMP_DEBUG_ASSERT(stride >= 0); 3591 scan = next; 3592 stride *= sign; 3593 } 3594 3595 // Add places determined by initial_place : count : stride 3596 for (i = 0; i < count; i++) { 3597 if (setSize == 0) { 3598 break; 3599 } 3600 // Add the current place, then build the next place (tempMask) from that 3601 KMP_CPU_COPY(previousMask, tempMask); 3602 ADD_MASK(previousMask); 3603 KMP_CPU_ZERO(tempMask); 3604 setSize = 0; 3605 KMP_CPU_SET_ITERATE(j, previousMask) { 3606 if (!KMP_CPU_ISSET(j, previousMask)) { 3607 continue; 3608 } 3609 if ((j + stride > 
maxOsId) || (j + stride < 0) || 3610 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3611 (!KMP_CPU_ISSET(j + stride, 3612 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3613 if ((__kmp_affinity_verbose || 3614 (__kmp_affinity_warnings && 3615 (__kmp_affinity_type != affinity_none))) && 3616 i < count - 1) { 3617 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3618 } 3619 continue; 3620 } 3621 KMP_CPU_SET(j + stride, tempMask); 3622 setSize++; 3623 } 3624 } 3625 KMP_CPU_ZERO(tempMask); 3626 setSize = 0; 3627 3628 // valid follow sets are ',' and EOL 3629 SKIP_WS(scan); 3630 if (*scan == '\0') { 3631 break; 3632 } 3633 if (*scan == ',') { 3634 scan++; // skip ',' 3635 continue; 3636 } 3637 3638 KMP_ASSERT2(0, "bad explicit places list"); 3639 } 3640 3641 *out_numMasks = nextNewMask; 3642 if (nextNewMask == 0) { 3643 *out_masks = NULL; 3644 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3645 return; 3646 } 3647 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3648 KMP_CPU_FREE(tempMask); 3649 KMP_CPU_FREE(previousMask); 3650 for (i = 0; i < nextNewMask; i++) { 3651 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3652 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3653 KMP_CPU_COPY(dest, src); 3654 } 3655 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3656 } 3657 3658 #undef ADD_MASK 3659 #undef ADD_MASK_OSID 3660 3661 // This function figures out the deepest level at which there is at least one 3662 // cluster/core with more than one processing unit bound to it. 3663 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { 3664 int core_level = 0; 3665 3666 for (int i = 0; i < nprocs; i++) { 3667 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 3668 for (int j = bottom_level; j > 0; j--) { 3669 if (hw_thread.ids[j] > 0) { 3670 if (core_level < (j - 1)) { 3671 core_level = j - 1; 3672 } 3673 } 3674 } 3675 } 3676 return core_level; 3677 } 3678 3679 // This function counts number of clusters/cores at given level. 3680 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, 3681 int core_level) { 3682 return __kmp_topology->get_count(core_level); 3683 } 3684 // This function finds to which cluster/core given processing unit is bound. 3685 static int __kmp_affinity_find_core(int proc, int bottom_level, 3686 int core_level) { 3687 int core = 0; 3688 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); 3689 for (int i = 0; i <= proc; ++i) { 3690 if (i + 1 <= proc) { 3691 for (int j = 0; j <= core_level; ++j) { 3692 if (__kmp_topology->at(i + 1).sub_ids[j] != 3693 __kmp_topology->at(i).sub_ids[j]) { 3694 core++; 3695 break; 3696 } 3697 } 3698 } 3699 } 3700 return core; 3701 } 3702 3703 // This function finds maximal number of processing units bound to a 3704 // cluster/core at given level. 
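// (Illustrative example: on a uniform machine with two hardware threads per
// core, with core_level at the core layer and bottom_level at the thread
// layer, this returns 2; when core_level >= bottom_level it degenerates to 1,
// as coded below.)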
3705 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, 3706 int core_level) { 3707 if (core_level >= bottom_level) 3708 return 1; 3709 int thread_level = __kmp_topology->get_level(KMP_HW_THREAD); 3710 return __kmp_topology->calculate_ratio(thread_level, core_level); 3711 } 3712 3713 static int *procarr = NULL; 3714 static int __kmp_aff_depth = 0; 3715 3716 // Create a one element mask array (set of places) which only contains the 3717 // initial process's affinity mask 3718 static void __kmp_create_affinity_none_places() { 3719 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3720 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3721 __kmp_affinity_num_masks = 1; 3722 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3723 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); 3724 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 3725 } 3726 3727 static void __kmp_aux_affinity_initialize(void) { 3728 if (__kmp_affinity_masks != NULL) { 3729 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3730 return; 3731 } 3732 3733 // Create the "full" mask - this defines all of the processors that we 3734 // consider to be in the machine model. If respect is set, then it is the 3735 // initialization thread's affinity mask. Otherwise, it is all processors that 3736 // we know about on the machine. 3737 if (__kmp_affin_fullMask == NULL) { 3738 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3739 } 3740 if (KMP_AFFINITY_CAPABLE()) { 3741 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3742 if (__kmp_affinity_respect_mask) { 3743 // Count the number of available processors. 3744 unsigned i; 3745 __kmp_avail_proc = 0; 3746 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 3747 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 3748 continue; 3749 } 3750 __kmp_avail_proc++; 3751 } 3752 if (__kmp_avail_proc > __kmp_xproc) { 3753 if (__kmp_affinity_verbose || 3754 (__kmp_affinity_warnings && 3755 (__kmp_affinity_type != affinity_none))) { 3756 KMP_WARNING(ErrorInitializeAffinity); 3757 } 3758 __kmp_affinity_type = affinity_none; 3759 KMP_AFFINITY_DISABLE(); 3760 return; 3761 } 3762 3763 if (__kmp_affinity_verbose) { 3764 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3765 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3766 __kmp_affin_fullMask); 3767 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 3768 } 3769 } else { 3770 if (__kmp_affinity_verbose) { 3771 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3772 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3773 __kmp_affin_fullMask); 3774 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 3775 } 3776 __kmp_avail_proc = 3777 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 3778 #if KMP_OS_WINDOWS 3779 // Set the process affinity mask since threads' affinity 3780 // masks must be subset of process mask in Windows* OS 3781 __kmp_affin_fullMask->set_process_affinity(true); 3782 #endif 3783 } 3784 } 3785 3786 kmp_i18n_id_t msg_id = kmp_i18n_null; 3787 3788 // For backward compatibility, setting KMP_CPUINFO_FILE => 3789 // KMP_TOPOLOGY_METHOD=cpuinfo 3790 if ((__kmp_cpuinfo_file != NULL) && 3791 (__kmp_affinity_top_method == affinity_top_method_all)) { 3792 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3793 } 3794 3795 bool success = false; 3796 if (__kmp_affinity_top_method == affinity_top_method_all) { 3797 // In the default code path, errors are not fatal - we just try using 3798 // another method. 
We only emit a warning message if affinity is on, or the 3799 // verbose flag is set, and the nowarnings flag was not set.
3800 #if KMP_USE_HWLOC 3801 if (!success && 3802 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3803 if (!__kmp_hwloc_error) { 3804 success = __kmp_affinity_create_hwloc_map(&msg_id); 3805 if (!success && __kmp_affinity_verbose) { 3806 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3807 } 3808 } else if (__kmp_affinity_verbose) { 3809 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3810 } 3811 } 3812 #endif 3813
3814 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 3815 if (!success) { 3816 success = __kmp_affinity_create_x2apicid_map(&msg_id); 3817 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3818 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3819 } 3820 } 3821 if (!success) { 3822 success = __kmp_affinity_create_apicid_map(&msg_id); 3823 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3824 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3825 } 3826 } 3827 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3828
3829 #if KMP_OS_LINUX 3830 if (!success) { 3831 int line = 0; 3832 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 3833 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3834 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3835 } 3836 } 3837 #endif /* KMP_OS_LINUX */ 3838
3839 #if KMP_GROUP_AFFINITY 3840 if (!success && (__kmp_num_proc_groups > 1)) { 3841 success = __kmp_affinity_create_proc_group_map(&msg_id); 3842 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3843 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3844 } 3845 } 3846 #endif /* KMP_GROUP_AFFINITY */ 3847
3848 if (!success) { 3849 success = __kmp_affinity_create_flat_map(&msg_id); 3850 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3851 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3852 } 3853 KMP_ASSERT(success); 3854 } 3855 } 3856
3857 // If the user has specified that a particular topology discovery method is to be 3858 // used, then we abort if that method fails. The exception is group affinity, 3859 // which might have been implicitly set.
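// (Illustrative note: a specific method is normally requested with
// KMP_TOPOLOGY_METHOD, e.g. KMP_TOPOLOGY_METHOD=cpuinfo or
// KMP_TOPOLOGY_METHOD=hwloc; in that case a failure of the corresponding
// __kmp_affinity_create_*_map() call below is fatal rather than falling
// through to another method.)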
3860 #if KMP_USE_HWLOC 3861 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 3862 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 3863 success = __kmp_affinity_create_hwloc_map(&msg_id); 3864 if (!success) { 3865 KMP_ASSERT(msg_id != kmp_i18n_null); 3866 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3867 } 3868 } 3869 #endif // KMP_USE_HWLOC 3870 3871 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 3872 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || 3873 __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 3874 success = __kmp_affinity_create_x2apicid_map(&msg_id); 3875 if (!success) { 3876 KMP_ASSERT(msg_id != kmp_i18n_null); 3877 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3878 } 3879 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3880 success = __kmp_affinity_create_apicid_map(&msg_id); 3881 if (!success) { 3882 KMP_ASSERT(msg_id != kmp_i18n_null); 3883 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3884 } 3885 } 3886 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3887 3888 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3889 int line = 0; 3890 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 3891 if (!success) { 3892 KMP_ASSERT(msg_id != kmp_i18n_null); 3893 const char *filename = __kmp_cpuinfo_get_filename(); 3894 if (line > 0) { 3895 KMP_FATAL(FileLineMsgExiting, filename, line, 3896 __kmp_i18n_catgets(msg_id)); 3897 } else { 3898 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3899 } 3900 } 3901 } 3902 3903 #if KMP_GROUP_AFFINITY 3904 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3905 success = __kmp_affinity_create_proc_group_map(&msg_id); 3906 KMP_ASSERT(success); 3907 if (!success) { 3908 KMP_ASSERT(msg_id != kmp_i18n_null); 3909 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3910 } 3911 } 3912 #endif /* KMP_GROUP_AFFINITY */ 3913 3914 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3915 success = __kmp_affinity_create_flat_map(&msg_id); 3916 // should not fail 3917 KMP_ASSERT(success); 3918 } 3919 3920 // Early exit if topology could not be created 3921 if (!__kmp_topology) { 3922 if (KMP_AFFINITY_CAPABLE() && 3923 (__kmp_affinity_verbose || 3924 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 3925 KMP_WARNING(ErrorInitializeAffinity); 3926 } 3927 if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && 3928 __kmp_ncores > 0) { 3929 __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); 3930 __kmp_topology->canonicalize(nPackages, nCoresPerPkg, 3931 __kmp_nThreadsPerCore, __kmp_ncores); 3932 if (__kmp_affinity_verbose) { 3933 __kmp_topology->print("KMP_AFFINITY"); 3934 } 3935 } 3936 __kmp_affinity_type = affinity_none; 3937 __kmp_create_affinity_none_places(); 3938 #if KMP_USE_HIER_SCHED 3939 __kmp_dispatch_set_hierarchy_values(); 3940 #endif 3941 KMP_AFFINITY_DISABLE(); 3942 return; 3943 } 3944 3945 // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and 3946 // initialize other data structures which depend on the topology 3947 __kmp_topology->canonicalize(); 3948 if (__kmp_affinity_verbose) 3949 __kmp_topology->print("KMP_AFFINITY"); 3950 bool filtered = __kmp_topology->filter_hw_subset(); 3951 if (filtered && __kmp_affinity_verbose) 3952 __kmp_topology->print("KMP_HW_SUBSET"); 3953 machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); 3954 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); 3955 // If 
KMP_AFFINITY=none, then only create the single "none" place 3956 // which is the process's initial affinity mask or the number of 3957 // hardware threads depending on respect,norespect 3958 if (__kmp_affinity_type == affinity_none) { 3959 __kmp_create_affinity_none_places(); 3960 #if KMP_USE_HIER_SCHED 3961 __kmp_dispatch_set_hierarchy_values(); 3962 #endif 3963 return; 3964 } 3965 int depth = __kmp_topology->get_depth(); 3966 3967 // Create the table of masks, indexed by thread Id. 3968 unsigned maxIndex; 3969 unsigned numUnique; 3970 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique); 3971 if (__kmp_affinity_gran_levels == 0) { 3972 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3973 } 3974 3975 switch (__kmp_affinity_type) { 3976 3977 case affinity_explicit: 3978 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3979 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { 3980 __kmp_affinity_process_proclist( 3981 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 3982 __kmp_affinity_proclist, osId2Mask, maxIndex); 3983 } else { 3984 __kmp_affinity_process_placelist( 3985 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 3986 __kmp_affinity_proclist, osId2Mask, maxIndex); 3987 } 3988 if (__kmp_affinity_num_masks == 0) { 3989 if (__kmp_affinity_verbose || 3990 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3991 KMP_WARNING(AffNoValidProcID); 3992 } 3993 __kmp_affinity_type = affinity_none; 3994 __kmp_create_affinity_none_places(); 3995 return; 3996 } 3997 break; 3998 3999 // The other affinity types rely on sorting the hardware threads according to 4000 // some permutation of the machine topology tree. Set __kmp_affinity_compact 4001 // and __kmp_affinity_offset appropriately, then jump to a common code 4002 // fragment to do the sort and create the array of affinity masks. 
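// (Illustrative note: affinity_scatter inverts the permutation, setting
// __kmp_affinity_compact to depth - 1 - compact so that consecutive places
// are spread across the outermost topology levels first, while
// affinity_compact only clamps the value to depth - 1 so that consecutive
// places stay on neighboring hardware threads.)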
4003 case affinity_logical: 4004 __kmp_affinity_compact = 0; 4005 if (__kmp_affinity_offset) { 4006 __kmp_affinity_offset = 4007 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4008 } 4009 goto sortTopology; 4010 4011 case affinity_physical: 4012 if (__kmp_nThreadsPerCore > 1) { 4013 __kmp_affinity_compact = 1; 4014 if (__kmp_affinity_compact >= depth) { 4015 __kmp_affinity_compact = 0; 4016 } 4017 } else { 4018 __kmp_affinity_compact = 0; 4019 } 4020 if (__kmp_affinity_offset) { 4021 __kmp_affinity_offset = 4022 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4023 } 4024 goto sortTopology; 4025 4026 case affinity_scatter: 4027 if (__kmp_affinity_compact >= depth) { 4028 __kmp_affinity_compact = 0; 4029 } else { 4030 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4031 } 4032 goto sortTopology; 4033 4034 case affinity_compact: 4035 if (__kmp_affinity_compact >= depth) { 4036 __kmp_affinity_compact = depth - 1; 4037 } 4038 goto sortTopology; 4039 4040 case affinity_balanced: 4041 if (depth <= 1) { 4042 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4043 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4044 } 4045 __kmp_affinity_type = affinity_none; 4046 __kmp_create_affinity_none_places(); 4047 return; 4048 } else if (!__kmp_topology->is_uniform()) { 4049 // Save the depth for further usage 4050 __kmp_aff_depth = depth; 4051 4052 int core_level = 4053 __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1); 4054 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1, 4055 core_level); 4056 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4057 __kmp_avail_proc, depth - 1, core_level); 4058 4059 int nproc = ncores * maxprocpercore; 4060 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4061 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4062 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4063 } 4064 __kmp_affinity_type = affinity_none; 4065 return; 4066 } 4067 4068 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4069 for (int i = 0; i < nproc; i++) { 4070 procarr[i] = -1; 4071 } 4072 4073 int lastcore = -1; 4074 int inlastcore = 0; 4075 for (int i = 0; i < __kmp_avail_proc; i++) { 4076 int proc = __kmp_topology->at(i).os_id; 4077 int core = __kmp_affinity_find_core(i, depth - 1, core_level); 4078 4079 if (core == lastcore) { 4080 inlastcore++; 4081 } else { 4082 inlastcore = 0; 4083 } 4084 lastcore = core; 4085 4086 procarr[core * maxprocpercore + inlastcore] = proc; 4087 } 4088 } 4089 if (__kmp_affinity_compact >= depth) { 4090 __kmp_affinity_compact = depth - 1; 4091 } 4092 4093 sortTopology: 4094 // Allocate the gtid->affinity mask table. 4095 if (__kmp_affinity_dups) { 4096 __kmp_affinity_num_masks = __kmp_avail_proc; 4097 } else { 4098 __kmp_affinity_num_masks = numUnique; 4099 } 4100 4101 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4102 (__kmp_affinity_num_places > 0) && 4103 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4104 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4105 } 4106 4107 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4108 4109 // Sort the topology table according to the current setting of 4110 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
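// (After sort_compact() the hardware threads appear in place order; when
// __kmp_affinity_dups is off, only the "leader" thread of each granularity
// group contributes a mask, and the loop below copies one osId2Mask entry
// per place until __kmp_affinity_num_masks masks have been filled.)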
4111 __kmp_topology->sort_compact(); 4112 { 4113 int i; 4114 unsigned j; 4115 int num_hw_threads = __kmp_topology->get_num_hw_threads(); 4116 for (i = 0, j = 0; i < num_hw_threads; i++) { 4117 if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) { 4118 continue; 4119 } 4120 int osId = __kmp_topology->at(i).os_id; 4121 4122 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4123 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4124 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4125 KMP_CPU_COPY(dest, src); 4126 if (++j >= __kmp_affinity_num_masks) { 4127 break; 4128 } 4129 } 4130 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4131 } 4132 // Sort the topology back using ids 4133 __kmp_topology->sort_ids(); 4134 break; 4135 4136 default: 4137 KMP_ASSERT2(0, "Unexpected affinity setting"); 4138 } 4139 4140 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 4141 } 4142 4143 void __kmp_affinity_initialize(void) { 4144 // Much of the code above was written assuming that if a machine was not 4145 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4146 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4147 // There are too many checks for __kmp_affinity_type == affinity_none 4148 // in this code. Instead of trying to change them all, check if 4149 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4150 // affinity_none, call the real initialization routine, then restore 4151 // __kmp_affinity_type to affinity_disabled. 4152 int disabled = (__kmp_affinity_type == affinity_disabled); 4153 if (!KMP_AFFINITY_CAPABLE()) { 4154 KMP_ASSERT(disabled); 4155 } 4156 if (disabled) { 4157 __kmp_affinity_type = affinity_none; 4158 } 4159 __kmp_aux_affinity_initialize(); 4160 if (disabled) { 4161 __kmp_affinity_type = affinity_disabled; 4162 } 4163 } 4164 4165 void __kmp_affinity_uninitialize(void) { 4166 if (__kmp_affinity_masks != NULL) { 4167 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4168 __kmp_affinity_masks = NULL; 4169 } 4170 if (__kmp_affin_fullMask != NULL) { 4171 KMP_CPU_FREE(__kmp_affin_fullMask); 4172 __kmp_affin_fullMask = NULL; 4173 } 4174 __kmp_affinity_num_masks = 0; 4175 __kmp_affinity_type = affinity_default; 4176 __kmp_affinity_num_places = 0; 4177 if (__kmp_affinity_proclist != NULL) { 4178 __kmp_free(__kmp_affinity_proclist); 4179 __kmp_affinity_proclist = NULL; 4180 } 4181 if (procarr != NULL) { 4182 __kmp_free(procarr); 4183 procarr = NULL; 4184 } 4185 #if KMP_USE_HWLOC 4186 if (__kmp_hwloc_topology != NULL) { 4187 hwloc_topology_destroy(__kmp_hwloc_topology); 4188 __kmp_hwloc_topology = NULL; 4189 } 4190 #endif 4191 if (__kmp_hw_subset) { 4192 kmp_hw_subset_t::deallocate(__kmp_hw_subset); 4193 __kmp_hw_subset = nullptr; 4194 } 4195 if (__kmp_topology) { 4196 kmp_topology_t::deallocate(__kmp_topology); 4197 __kmp_topology = nullptr; 4198 } 4199 KMPAffinity::destroy_api(); 4200 } 4201 4202 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4203 if (!KMP_AFFINITY_CAPABLE()) { 4204 return; 4205 } 4206 4207 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4208 if (th->th.th_affin_mask == NULL) { 4209 KMP_CPU_ALLOC(th->th.th_affin_mask); 4210 } else { 4211 KMP_CPU_ZERO(th->th.th_affin_mask); 4212 } 4213 4214 // Copy the thread mask to the kmp_info_t structure. If 4215 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. 
one that 4216 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 4217 // then the full mask is the same as the mask of the initialization thread. 4218 kmp_affin_mask_t *mask; 4219 int i; 4220 4221 if (KMP_AFFINITY_NON_PROC_BIND) { 4222 if ((__kmp_affinity_type == affinity_none) || 4223 (__kmp_affinity_type == affinity_balanced) || 4224 KMP_HIDDEN_HELPER_THREAD(gtid)) { 4225 #if KMP_GROUP_AFFINITY 4226 if (__kmp_num_proc_groups > 1) { 4227 return; 4228 } 4229 #endif 4230 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4231 i = 0; 4232 mask = __kmp_affin_fullMask; 4233 } else { 4234 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 4235 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4236 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4237 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4238 } 4239 } else { 4240 if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) || 4241 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4242 #if KMP_GROUP_AFFINITY 4243 if (__kmp_num_proc_groups > 1) { 4244 return; 4245 } 4246 #endif 4247 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4248 i = KMP_PLACE_ALL; 4249 mask = __kmp_affin_fullMask; 4250 } else { 4251 // int i = some hash function or just a counter that doesn't 4252 // always start at 0. Use adjusted gtid for now. 4253 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 4254 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4255 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4256 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4257 } 4258 } 4259 4260 th->th.th_current_place = i; 4261 if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) { 4262 th->th.th_new_place = i; 4263 th->th.th_first_place = 0; 4264 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4265 } else if (KMP_AFFINITY_NON_PROC_BIND) { 4266 // When using a Non-OMP_PROC_BIND affinity method, 4267 // set all threads' place-partition-var to the entire place list 4268 th->th.th_first_place = 0; 4269 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4270 } 4271 4272 if (i == KMP_PLACE_ALL) { 4273 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4274 gtid)); 4275 } else { 4276 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4277 gtid, i)); 4278 } 4279 4280 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4281 4282 if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid) 4283 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4284 && (__kmp_affinity_type == affinity_none || 4285 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { 4286 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4287 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4288 th->th.th_affin_mask); 4289 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4290 __kmp_gettid(), gtid, buf); 4291 } 4292 4293 #if KMP_DEBUG 4294 // Hidden helper thread affinity only printed for debug builds 4295 if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) { 4296 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4297 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4298 th->th.th_affin_mask); 4299 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)", 4300 (kmp_int32)getpid(), __kmp_gettid(), gtid, buf); 4301 } 4302 #endif 4303 4304 #if KMP_OS_WINDOWS 4305 // On Windows* OS, the process affinity mask might have changed. If the user 4306 // didn't request affinity and this call fails, just continue silently. 4307 // See CQ171393. 
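// (The second argument of __kmp_set_system_affinity() selects whether a
// failure is treated as fatal: FALSE for the affinity_none case, where the
// call is best-effort, TRUE otherwise.)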
4308 if (__kmp_affinity_type == affinity_none) { 4309 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4310 } else 4311 #endif 4312 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4313 } 4314 4315 void __kmp_affinity_set_place(int gtid) { 4316 if (!KMP_AFFINITY_CAPABLE()) { 4317 return; 4318 } 4319 4320 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4321 4322 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4323 "place = %d)\n", 4324 gtid, th->th.th_new_place, th->th.th_current_place)); 4325 4326 // Check that the new place is within this thread's partition. 4327 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4328 KMP_ASSERT(th->th.th_new_place >= 0); 4329 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4330 if (th->th.th_first_place <= th->th.th_last_place) { 4331 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && 4332 (th->th.th_new_place <= th->th.th_last_place)); 4333 } else { 4334 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || 4335 (th->th.th_new_place >= th->th.th_last_place)); 4336 } 4337 4338 // Copy the thread mask to the kmp_info_t structure, 4339 // and set this thread's affinity. 4340 kmp_affin_mask_t *mask = 4341 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); 4342 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4343 th->th.th_current_place = th->th.th_new_place; 4344 4345 if (__kmp_affinity_verbose) { 4346 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4347 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4348 th->th.th_affin_mask); 4349 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 4350 __kmp_gettid(), gtid, buf); 4351 } 4352 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4353 } 4354 4355 int __kmp_aux_set_affinity(void **mask) { 4356 int gtid; 4357 kmp_info_t *th; 4358 int retval; 4359 4360 if (!KMP_AFFINITY_CAPABLE()) { 4361 return -1; 4362 } 4363 4364 gtid = __kmp_entry_gtid(); 4365 KA_TRACE( 4366 1000, (""); { 4367 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4368 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4369 (kmp_affin_mask_t *)(*mask)); 4370 __kmp_debug_printf( 4371 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", 4372 gtid, buf); 4373 }); 4374 4375 if (__kmp_env_consistency_check) { 4376 if ((mask == NULL) || (*mask == NULL)) { 4377 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4378 } else { 4379 unsigned proc; 4380 int num_procs = 0; 4381 4382 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { 4383 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4384 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4385 } 4386 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4387 continue; 4388 } 4389 num_procs++; 4390 } 4391 if (num_procs == 0) { 4392 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4393 } 4394 4395 #if KMP_GROUP_AFFINITY 4396 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 4397 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4398 } 4399 #endif /* KMP_GROUP_AFFINITY */ 4400 } 4401 } 4402 4403 th = __kmp_threads[gtid]; 4404 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4405 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4406 if (retval == 0) { 4407 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 4408 } 4409 4410 th->th.th_current_place = KMP_PLACE_UNDEFINED; 4411 th->th.th_new_place = KMP_PLACE_UNDEFINED; 4412 th->th.th_first_place = 0; 4413 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4414 4415 // Turn off 4.0 
affinity for the current tread at this parallel level. 4416 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4417 4418 return retval; 4419 } 4420 4421 int __kmp_aux_get_affinity(void **mask) { 4422 int gtid; 4423 int retval; 4424 #if KMP_OS_WINDOWS || KMP_DEBUG 4425 kmp_info_t *th; 4426 #endif 4427 if (!KMP_AFFINITY_CAPABLE()) { 4428 return -1; 4429 } 4430 4431 gtid = __kmp_entry_gtid(); 4432 #if KMP_OS_WINDOWS || KMP_DEBUG 4433 th = __kmp_threads[gtid]; 4434 #else 4435 (void)gtid; // unused variable 4436 #endif 4437 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4438 4439 KA_TRACE( 4440 1000, (""); { 4441 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4442 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4443 th->th.th_affin_mask); 4444 __kmp_printf( 4445 "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, 4446 buf); 4447 }); 4448 4449 if (__kmp_env_consistency_check) { 4450 if ((mask == NULL) || (*mask == NULL)) { 4451 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4452 } 4453 } 4454 4455 #if !KMP_OS_WINDOWS 4456 4457 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4458 KA_TRACE( 4459 1000, (""); { 4460 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4461 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4462 (kmp_affin_mask_t *)(*mask)); 4463 __kmp_printf( 4464 "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, 4465 buf); 4466 }); 4467 return retval; 4468 4469 #else 4470 (void)retval; 4471 4472 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4473 return 0; 4474 4475 #endif /* KMP_OS_WINDOWS */ 4476 } 4477 4478 int __kmp_aux_get_affinity_max_proc() { 4479 if (!KMP_AFFINITY_CAPABLE()) { 4480 return 0; 4481 } 4482 #if KMP_GROUP_AFFINITY 4483 if (__kmp_num_proc_groups > 1) { 4484 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4485 } 4486 #endif 4487 return __kmp_xproc; 4488 } 4489 4490 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4491 if (!KMP_AFFINITY_CAPABLE()) { 4492 return -1; 4493 } 4494 4495 KA_TRACE( 4496 1000, (""); { 4497 int gtid = __kmp_entry_gtid(); 4498 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4499 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4500 (kmp_affin_mask_t *)(*mask)); 4501 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4502 "affinity mask for thread %d = %s\n", 4503 proc, gtid, buf); 4504 }); 4505 4506 if (__kmp_env_consistency_check) { 4507 if ((mask == NULL) || (*mask == NULL)) { 4508 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4509 } 4510 } 4511 4512 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4513 return -1; 4514 } 4515 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4516 return -2; 4517 } 4518 4519 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4520 return 0; 4521 } 4522 4523 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4524 if (!KMP_AFFINITY_CAPABLE()) { 4525 return -1; 4526 } 4527 4528 KA_TRACE( 4529 1000, (""); { 4530 int gtid = __kmp_entry_gtid(); 4531 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4532 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4533 (kmp_affin_mask_t *)(*mask)); 4534 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4535 "affinity mask for thread %d = %s\n", 4536 proc, gtid, buf); 4537 }); 4538 4539 if (__kmp_env_consistency_check) { 4540 if ((mask == NULL) || (*mask == NULL)) { 4541 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4542 } 4543 } 4544 4545 if ((proc < 0) || (proc >= 
__kmp_aux_get_affinity_max_proc())) { 4546 return -1; 4547 } 4548 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4549 return -2; 4550 } 4551 4552 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4553 return 0; 4554 } 4555 4556 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4557 if (!KMP_AFFINITY_CAPABLE()) { 4558 return -1; 4559 } 4560 4561 KA_TRACE( 4562 1000, (""); { 4563 int gtid = __kmp_entry_gtid(); 4564 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4565 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4566 (kmp_affin_mask_t *)(*mask)); 4567 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4568 "affinity mask for thread %d = %s\n", 4569 proc, gtid, buf); 4570 }); 4571 4572 if (__kmp_env_consistency_check) { 4573 if ((mask == NULL) || (*mask == NULL)) { 4574 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4575 } 4576 } 4577 4578 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4579 return -1; 4580 } 4581 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4582 return 0; 4583 } 4584 4585 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4586 } 4587 4588 // Dynamic affinity settings - Affinity balanced 4589 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { 4590 KMP_DEBUG_ASSERT(th); 4591 bool fine_gran = true; 4592 int tid = th->th.th_info.ds.ds_tid; 4593 4594 // Do not perform balanced affinity for the hidden helper threads 4595 if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th))) 4596 return; 4597 4598 switch (__kmp_affinity_gran) { 4599 case KMP_HW_THREAD: 4600 break; 4601 case KMP_HW_CORE: 4602 if (__kmp_nThreadsPerCore > 1) { 4603 fine_gran = false; 4604 } 4605 break; 4606 case KMP_HW_SOCKET: 4607 if (nCoresPerPkg > 1) { 4608 fine_gran = false; 4609 } 4610 break; 4611 default: 4612 fine_gran = false; 4613 } 4614 4615 if (__kmp_topology->is_uniform()) { 4616 int coreID; 4617 int threadID; 4618 // Number of hyper threads per core in HT machine 4619 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4620 // Number of cores 4621 int ncores = __kmp_ncores; 4622 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4623 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4624 ncores = nPackages; 4625 } 4626 // How many threads will be bound to each core 4627 int chunk = nthreads / ncores; 4628 // How many cores will have an additional thread bound to it - "big cores" 4629 int big_cores = nthreads % ncores; 4630 // Number of threads on the big cores 4631 int big_nth = (chunk + 1) * big_cores; 4632 if (tid < big_nth) { 4633 coreID = tid / (chunk + 1); 4634 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4635 } else { // tid >= big_nth 4636 coreID = (tid - big_cores) / chunk; 4637 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4638 } 4639 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4640 "Illegal set affinity operation when not capable"); 4641 4642 kmp_affin_mask_t *mask = th->th.th_affin_mask; 4643 KMP_CPU_ZERO(mask); 4644 4645 if (fine_gran) { 4646 int osID = 4647 __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id; 4648 KMP_CPU_SET(osID, mask); 4649 } else { 4650 for (int i = 0; i < __kmp_nth_per_core; i++) { 4651 int osID; 4652 osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id; 4653 KMP_CPU_SET(osID, mask); 4654 } 4655 } 4656 if (__kmp_affinity_verbose) { 4657 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4658 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4659 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4660 __kmp_gettid(), 
tid, buf); 4661 } 4662 __kmp_set_system_affinity(mask, TRUE); 4663 } else { // Non-uniform topology 4664 4665 kmp_affin_mask_t *mask = th->th.th_affin_mask; 4666 KMP_CPU_ZERO(mask); 4667 4668 int core_level = 4669 __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1); 4670 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, 4671 __kmp_aff_depth - 1, core_level); 4672 int nth_per_core = __kmp_affinity_max_proc_per_core( 4673 __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4674 4675 // For performance gain consider the special case nthreads == 4676 // __kmp_avail_proc 4677 if (nthreads == __kmp_avail_proc) { 4678 if (fine_gran) { 4679 int osID = __kmp_topology->at(tid).os_id; 4680 KMP_CPU_SET(osID, mask); 4681 } else { 4682 int core = 4683 __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level); 4684 for (int i = 0; i < __kmp_avail_proc; i++) { 4685 int osID = __kmp_topology->at(i).os_id; 4686 if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) == 4687 core) { 4688 KMP_CPU_SET(osID, mask); 4689 } 4690 } 4691 } 4692 } else if (nthreads <= ncores) { 4693 4694 int core = 0; 4695 for (int i = 0; i < ncores; i++) { 4696 // Check if this core from procarr[] is in the mask 4697 int in_mask = 0; 4698 for (int j = 0; j < nth_per_core; j++) { 4699 if (procarr[i * nth_per_core + j] != -1) { 4700 in_mask = 1; 4701 break; 4702 } 4703 } 4704 if (in_mask) { 4705 if (tid == core) { 4706 for (int j = 0; j < nth_per_core; j++) { 4707 int osID = procarr[i * nth_per_core + j]; 4708 if (osID != -1) { 4709 KMP_CPU_SET(osID, mask); 4710 // For fine granularity it is enough to set the first available 4711 // osID for this core 4712 if (fine_gran) { 4713 break; 4714 } 4715 } 4716 } 4717 break; 4718 } else { 4719 core++; 4720 } 4721 } 4722 } 4723 } else { // nthreads > ncores 4724 // Array to save the number of processors at each core 4725 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 4726 // Array to save the number of cores with "x" available processors; 4727 int *ncores_with_x_procs = 4728 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4729 // Array to save the number of cores with # procs from x to nth_per_core 4730 int *ncores_with_x_to_max_procs = 4731 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4732 4733 for (int i = 0; i <= nth_per_core; i++) { 4734 ncores_with_x_procs[i] = 0; 4735 ncores_with_x_to_max_procs[i] = 0; 4736 } 4737 4738 for (int i = 0; i < ncores; i++) { 4739 int cnt = 0; 4740 for (int j = 0; j < nth_per_core; j++) { 4741 if (procarr[i * nth_per_core + j] != -1) { 4742 cnt++; 4743 } 4744 } 4745 nproc_at_core[i] = cnt; 4746 ncores_with_x_procs[cnt]++; 4747 } 4748 4749 for (int i = 0; i <= nth_per_core; i++) { 4750 for (int j = i; j <= nth_per_core; j++) { 4751 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 4752 } 4753 } 4754 4755 // Max number of processors 4756 int nproc = nth_per_core * ncores; 4757 // An array to keep number of threads per each context 4758 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4759 for (int i = 0; i < nproc; i++) { 4760 newarr[i] = 0; 4761 } 4762 4763 int nth = nthreads; 4764 int flag = 0; 4765 while (nth > 0) { 4766 for (int j = 1; j <= nth_per_core; j++) { 4767 int cnt = ncores_with_x_to_max_procs[j]; 4768 for (int i = 0; i < ncores; i++) { 4769 // Skip the core with 0 processors 4770 if (nproc_at_core[i] == 0) { 4771 continue; 4772 } 4773 for (int k = 0; k < nth_per_core; k++) { 4774 if (procarr[i * nth_per_core + k] != -1) { 4775 if (newarr[i * nth_per_core + k] == 
0) { 4776 newarr[i * nth_per_core + k] = 1; 4777 cnt--; 4778 nth--; 4779 break; 4780 } else { 4781 if (flag != 0) { 4782 newarr[i * nth_per_core + k]++; 4783 cnt--; 4784 nth--; 4785 break; 4786 } 4787 } 4788 } 4789 } 4790 if (cnt == 0 || nth == 0) { 4791 break; 4792 } 4793 } 4794 if (nth == 0) { 4795 break; 4796 } 4797 } 4798 flag = 1; 4799 } 4800 int sum = 0; 4801 for (int i = 0; i < nproc; i++) { 4802 sum += newarr[i]; 4803 if (sum > tid) { 4804 if (fine_gran) { 4805 int osID = procarr[i]; 4806 KMP_CPU_SET(osID, mask); 4807 } else { 4808 int coreID = i / nth_per_core; 4809 for (int ii = 0; ii < nth_per_core; ii++) { 4810 int osID = procarr[coreID * nth_per_core + ii]; 4811 if (osID != -1) { 4812 KMP_CPU_SET(osID, mask); 4813 } 4814 } 4815 } 4816 break; 4817 } 4818 } 4819 __kmp_free(newarr); 4820 } 4821 4822 if (__kmp_affinity_verbose) { 4823 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4824 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4825 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4826 __kmp_gettid(), tid, buf); 4827 } 4828 __kmp_set_system_affinity(mask, TRUE); 4829 } 4830 } 4831 4832 #if KMP_OS_LINUX || KMP_OS_FREEBSD 4833 // We don't need this entry for Windows because 4834 // there is GetProcessAffinityMask() api 4835 // 4836 // The intended usage is indicated by these steps: 4837 // 1) The user gets the current affinity mask 4838 // 2) Then sets the affinity by calling this function 4839 // 3) Error check the return value 4840 // 4) Use non-OpenMP parallelization 4841 // 5) Reset the affinity to what was stored in step 1) 4842 #ifdef __cplusplus 4843 extern "C" 4844 #endif 4845 int 4846 kmp_set_thread_affinity_mask_initial() 4847 // the function returns 0 on success, 4848 // -1 if we cannot bind thread 4849 // >0 (errno) if an error happened during binding 4850 { 4851 int gtid = __kmp_get_gtid(); 4852 if (gtid < 0) { 4853 // Do not touch non-omp threads 4854 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4855 "non-omp thread, returning\n")); 4856 return -1; 4857 } 4858 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 4859 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4860 "affinity not initialized, returning\n")); 4861 return -1; 4862 } 4863 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4864 "set full mask for thread %d\n", 4865 gtid)); 4866 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 4867 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 4868 } 4869 #endif 4870 4871 #endif // KMP_AFFINITY_SUPPORTED 4872
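/* Illustrative usage sketch for kmp_set_thread_affinity_mask_initial(),
   following the five steps listed above the function. It is not part of the
   library: run_non_openmp_work() is a hypothetical placeholder, error
   handling is elided, and the pthread calls assume a glibc-style environment.

     #define _GNU_SOURCE
     #include <pthread.h> // pthread_{get,set}affinity_np, cpu_set_t
     extern "C" int kmp_set_thread_affinity_mask_initial(void);

     void run_with_external_threading(void) {
       cpu_set_t saved;
       // 1) save the current affinity mask
       pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);
       // 2)-3) widen this thread's mask to the full initial mask, check result
       if (kmp_set_thread_affinity_mask_initial() == 0) {
         run_non_openmp_work(); // 4) non-OpenMP parallelization
       }
       // 5) restore the mask saved in step 1
       pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved);
     }
*/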