1 /* 2 * kmp_affinity.cpp -- affinity management 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_i18n.h" 16 #include "kmp_io.h" 17 #include "kmp_str.h" 18 #include "kmp_wrapper_getpid.h" 19 #if KMP_USE_HIER_SCHED 20 #include "kmp_dispatch_hier.h" 21 #endif 22 #if KMP_USE_HWLOC 23 // Copied from hwloc 24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102 25 #define HWLOC_GROUP_KIND_INTEL_TILE 103 26 #define HWLOC_GROUP_KIND_INTEL_DIE 104 27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 28 #endif 29 #include <ctype.h> 30 31 // The machine topology 32 kmp_topology_t *__kmp_topology = nullptr; 33 // KMP_HW_SUBSET environment variable 34 kmp_hw_subset_t *__kmp_hw_subset = nullptr; 35 36 // Store the real or imagined machine hierarchy here 37 static hierarchy_info machine_hierarchy; 38 39 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); } 40 41 #if KMP_AFFINITY_SUPPORTED 42 // Helper class to see if place lists further restrict the fullMask 43 class kmp_full_mask_modifier_t { 44 kmp_affin_mask_t *mask; 45 46 public: 47 kmp_full_mask_modifier_t() { 48 KMP_CPU_ALLOC(mask); 49 KMP_CPU_ZERO(mask); 50 } 51 ~kmp_full_mask_modifier_t() { 52 KMP_CPU_FREE(mask); 53 mask = nullptr; 54 } 55 void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); } 56 // If the new full mask is different from the current full mask, 57 // then switch them. Returns true if full mask was affected, false otherwise. 58 bool restrict_to_mask() { 59 // See if the new mask further restricts or changes the full mask 60 if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask)) 61 return false; 62 return __kmp_topology->restrict_to_mask(mask); 63 } 64 }; 65 66 static inline const char * 67 __kmp_get_affinity_env_var(const kmp_affinity_t &affinity, 68 bool for_binding = false) { 69 if (affinity.flags.omp_places) { 70 if (for_binding) 71 return "OMP_PROC_BIND"; 72 return "OMP_PLACES"; 73 } 74 return affinity.env_var; 75 } 76 #endif // KMP_AFFINITY_SUPPORTED 77 78 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 79 kmp_uint32 depth; 80 // The test below is true if affinity is available, but set to "none". Need to 81 // init on first use of hierarchical barrier. 82 if (TCR_1(machine_hierarchy.uninitialized)) 83 machine_hierarchy.init(nproc); 84 85 // Adjust the hierarchy in case num threads exceeds original 86 if (nproc > machine_hierarchy.base_num_threads) 87 machine_hierarchy.resize(nproc); 88 89 depth = machine_hierarchy.depth; 90 KMP_DEBUG_ASSERT(depth > 0); 91 92 thr_bar->depth = depth; 93 __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1, 94 &(thr_bar->base_leaf_kids)); 95 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; 96 } 97 98 static int nCoresPerPkg, nPackages; 99 static int __kmp_nThreadsPerCore; 100 #ifndef KMP_DFLT_NTH_CORES 101 static int __kmp_ncores; 102 #endif 103 104 const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { 105 switch (type) { 106 case KMP_HW_SOCKET: 107 return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket)); 108 case KMP_HW_DIE: 109 return ((plural) ? 
KMP_I18N_STR(Dice) : KMP_I18N_STR(Die)); 110 case KMP_HW_MODULE: 111 return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module)); 112 case KMP_HW_TILE: 113 return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile)); 114 case KMP_HW_NUMA: 115 return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain)); 116 case KMP_HW_L3: 117 return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache)); 118 case KMP_HW_L2: 119 return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache)); 120 case KMP_HW_L1: 121 return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache)); 122 case KMP_HW_LLC: 123 return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache)); 124 case KMP_HW_CORE: 125 return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core)); 126 case KMP_HW_THREAD: 127 return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread)); 128 case KMP_HW_PROC_GROUP: 129 return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup)); 130 } 131 return KMP_I18N_STR(Unknown); 132 } 133 134 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { 135 switch (type) { 136 case KMP_HW_SOCKET: 137 return ((plural) ? "sockets" : "socket"); 138 case KMP_HW_DIE: 139 return ((plural) ? "dice" : "die"); 140 case KMP_HW_MODULE: 141 return ((plural) ? "modules" : "module"); 142 case KMP_HW_TILE: 143 return ((plural) ? "tiles" : "tile"); 144 case KMP_HW_NUMA: 145 return ((plural) ? "numa_domains" : "numa_domain"); 146 case KMP_HW_L3: 147 return ((plural) ? "l3_caches" : "l3_cache"); 148 case KMP_HW_L2: 149 return ((plural) ? "l2_caches" : "l2_cache"); 150 case KMP_HW_L1: 151 return ((plural) ? "l1_caches" : "l1_cache"); 152 case KMP_HW_LLC: 153 return ((plural) ? "ll_caches" : "ll_cache"); 154 case KMP_HW_CORE: 155 return ((plural) ? "cores" : "core"); 156 case KMP_HW_THREAD: 157 return ((plural) ? "threads" : "thread"); 158 case KMP_HW_PROC_GROUP: 159 return ((plural) ? "proc_groups" : "proc_group"); 160 } 161 return ((plural) ? "unknowns" : "unknown"); 162 } 163 164 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { 165 switch (type) { 166 case KMP_HW_CORE_TYPE_UNKNOWN: 167 return "unknown"; 168 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 169 case KMP_HW_CORE_TYPE_ATOM: 170 return "Intel Atom(R) processor"; 171 case KMP_HW_CORE_TYPE_CORE: 172 return "Intel(R) Core(TM) processor"; 173 #endif 174 } 175 return "unknown"; 176 } 177 178 #if KMP_AFFINITY_SUPPORTED 179 // If affinity is supported, check the affinity 180 // verbose and warning flags before printing warning 181 #define KMP_AFF_WARNING(s, ...) \ 182 if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \ 183 KMP_WARNING(__VA_ARGS__); \ 184 } 185 #else 186 #define KMP_AFF_WARNING(s, ...) 
KMP_WARNING(__VA_ARGS__) 187 #endif 188 189 //////////////////////////////////////////////////////////////////////////////// 190 // kmp_hw_thread_t methods 191 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { 192 const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a; 193 const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b; 194 int depth = __kmp_topology->get_depth(); 195 for (int level = 0; level < depth; ++level) { 196 if (ahwthread->ids[level] < bhwthread->ids[level]) 197 return -1; 198 else if (ahwthread->ids[level] > bhwthread->ids[level]) 199 return 1; 200 } 201 if (ahwthread->os_id < bhwthread->os_id) 202 return -1; 203 else if (ahwthread->os_id > bhwthread->os_id) 204 return 1; 205 return 0; 206 } 207 208 #if KMP_AFFINITY_SUPPORTED 209 int kmp_hw_thread_t::compare_compact(const void *a, const void *b) { 210 int i; 211 const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a; 212 const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b; 213 int depth = __kmp_topology->get_depth(); 214 int compact = __kmp_topology->compact; 215 KMP_DEBUG_ASSERT(compact >= 0); 216 KMP_DEBUG_ASSERT(compact <= depth); 217 for (i = 0; i < compact; i++) { 218 int j = depth - i - 1; 219 if (aa->sub_ids[j] < bb->sub_ids[j]) 220 return -1; 221 if (aa->sub_ids[j] > bb->sub_ids[j]) 222 return 1; 223 } 224 for (; i < depth; i++) { 225 int j = i - compact; 226 if (aa->sub_ids[j] < bb->sub_ids[j]) 227 return -1; 228 if (aa->sub_ids[j] > bb->sub_ids[j]) 229 return 1; 230 } 231 return 0; 232 } 233 #endif 234 235 void kmp_hw_thread_t::print() const { 236 int depth = __kmp_topology->get_depth(); 237 printf("%4d ", os_id); 238 for (int i = 0; i < depth; ++i) { 239 printf("%4d ", ids[i]); 240 } 241 if (attrs) { 242 if (attrs.is_core_type_valid()) 243 printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type())); 244 if (attrs.is_core_eff_valid()) 245 printf(" (eff=%d)", attrs.get_core_eff()); 246 } 247 if (leader) 248 printf(" (leader)"); 249 printf("\n"); 250 } 251 252 //////////////////////////////////////////////////////////////////////////////// 253 // kmp_topology_t methods 254 255 // Add a layer to the topology based on the ids. Assume the topology 256 // is perfectly nested (i.e., so no object has more than one parent) 257 void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) { 258 // Figure out where the layer should go by comparing the ids of the current 259 // layers with the new ids 260 int target_layer; 261 int previous_id = kmp_hw_thread_t::UNKNOWN_ID; 262 int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID; 263 264 // Start from the highest layer and work down to find target layer 265 // If new layer is equal to another layer then put the new layer above 266 for (target_layer = 0; target_layer < depth; ++target_layer) { 267 bool layers_equal = true; 268 bool strictly_above_target_layer = false; 269 for (int i = 0; i < num_hw_threads; ++i) { 270 int id = hw_threads[i].ids[target_layer]; 271 int new_id = ids[i]; 272 if (id != previous_id && new_id == previous_new_id) { 273 // Found the layer we are strictly above 274 strictly_above_target_layer = true; 275 layers_equal = false; 276 break; 277 } else if (id == previous_id && new_id != previous_new_id) { 278 // Found a layer we are below. Move to next layer and check. 279 layers_equal = false; 280 break; 281 } 282 previous_id = id; 283 previous_new_id = new_id; 284 } 285 if (strictly_above_target_layer || layers_equal) 286 break; 287 } 288 289 // Found the layer we are above. 
Now move everything to accommodate the new 290 // layer. And put the new ids and type into the topology. 291 for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) 292 types[j] = types[i]; 293 types[target_layer] = type; 294 for (int k = 0; k < num_hw_threads; ++k) { 295 for (int i = depth - 1, j = depth; i >= target_layer; --i, --j) 296 hw_threads[k].ids[j] = hw_threads[k].ids[i]; 297 hw_threads[k].ids[target_layer] = ids[k]; 298 } 299 equivalent[type] = type; 300 depth++; 301 } 302 303 #if KMP_GROUP_AFFINITY 304 // Insert the Windows Processor Group structure into the topology 305 void kmp_topology_t::_insert_windows_proc_groups() { 306 // Do not insert the processor group structure for a single group 307 if (__kmp_num_proc_groups == 1) 308 return; 309 kmp_affin_mask_t *mask; 310 int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads); 311 KMP_CPU_ALLOC(mask); 312 for (int i = 0; i < num_hw_threads; ++i) { 313 KMP_CPU_ZERO(mask); 314 KMP_CPU_SET(hw_threads[i].os_id, mask); 315 ids[i] = __kmp_get_proc_group(mask); 316 } 317 KMP_CPU_FREE(mask); 318 _insert_layer(KMP_HW_PROC_GROUP, ids); 319 __kmp_free(ids); 320 } 321 #endif 322 323 // Remove layers that don't add information to the topology. 324 // This is done by having the layer take on the id = UNKNOWN_ID (-1) 325 void kmp_topology_t::_remove_radix1_layers() { 326 int preference[KMP_HW_LAST]; 327 int top_index1, top_index2; 328 // Set up preference associative array 329 preference[KMP_HW_SOCKET] = 110; 330 preference[KMP_HW_PROC_GROUP] = 100; 331 preference[KMP_HW_CORE] = 95; 332 preference[KMP_HW_THREAD] = 90; 333 preference[KMP_HW_NUMA] = 85; 334 preference[KMP_HW_DIE] = 80; 335 preference[KMP_HW_TILE] = 75; 336 preference[KMP_HW_MODULE] = 73; 337 preference[KMP_HW_L3] = 70; 338 preference[KMP_HW_L2] = 65; 339 preference[KMP_HW_L1] = 60; 340 preference[KMP_HW_LLC] = 5; 341 top_index1 = 0; 342 top_index2 = 1; 343 while (top_index1 < depth - 1 && top_index2 < depth) { 344 kmp_hw_t type1 = types[top_index1]; 345 kmp_hw_t type2 = types[top_index2]; 346 KMP_ASSERT_VALID_HW_TYPE(type1); 347 KMP_ASSERT_VALID_HW_TYPE(type2); 348 // Do not allow the three main topology levels (sockets, cores, threads) to 349 // be compacted down 350 if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE || 351 type1 == KMP_HW_SOCKET) && 352 (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE || 353 type2 == KMP_HW_SOCKET)) { 354 top_index1 = top_index2++; 355 continue; 356 } 357 bool radix1 = true; 358 bool all_same = true; 359 int id1 = hw_threads[0].ids[top_index1]; 360 int id2 = hw_threads[0].ids[top_index2]; 361 int pref1 = preference[type1]; 362 int pref2 = preference[type2]; 363 for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) { 364 if (hw_threads[hwidx].ids[top_index1] == id1 && 365 hw_threads[hwidx].ids[top_index2] != id2) { 366 radix1 = false; 367 break; 368 } 369 if (hw_threads[hwidx].ids[top_index2] != id2) 370 all_same = false; 371 id1 = hw_threads[hwidx].ids[top_index1]; 372 id2 = hw_threads[hwidx].ids[top_index2]; 373 } 374 if (radix1) { 375 // Select the layer to remove based on preference 376 kmp_hw_t remove_type, keep_type; 377 int remove_layer, remove_layer_ids; 378 if (pref1 > pref2) { 379 remove_type = type2; 380 remove_layer = remove_layer_ids = top_index2; 381 keep_type = type1; 382 } else { 383 remove_type = type1; 384 remove_layer = remove_layer_ids = top_index1; 385 keep_type = type2; 386 } 387 // If all the indexes for the second (deeper) layer are the same. 
388 // e.g., all are zero, then make sure to keep the first layer's ids 389 if (all_same) 390 remove_layer_ids = top_index2; 391 // Remove radix one type by setting the equivalence, removing the id from 392 // the hw threads and removing the layer from types and depth 393 set_equivalent_type(remove_type, keep_type); 394 for (int idx = 0; idx < num_hw_threads; ++idx) { 395 kmp_hw_thread_t &hw_thread = hw_threads[idx]; 396 for (int d = remove_layer_ids; d < depth - 1; ++d) 397 hw_thread.ids[d] = hw_thread.ids[d + 1]; 398 } 399 for (int idx = remove_layer; idx < depth - 1; ++idx) 400 types[idx] = types[idx + 1]; 401 depth--; 402 } else { 403 top_index1 = top_index2++; 404 } 405 } 406 KMP_ASSERT(depth > 0); 407 } 408 409 void kmp_topology_t::_set_last_level_cache() { 410 if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN) 411 set_equivalent_type(KMP_HW_LLC, KMP_HW_L3); 412 else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) 413 set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); 414 #if KMP_MIC_SUPPORTED 415 else if (__kmp_mic_type == mic3) { 416 if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) 417 set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); 418 else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN) 419 set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE); 420 // L2/Tile wasn't detected so just say L1 421 else 422 set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); 423 } 424 #endif 425 else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN) 426 set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); 427 // Fallback is to set last level cache to socket or core 428 if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) { 429 if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN) 430 set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET); 431 else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN) 432 set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE); 433 } 434 KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN); 435 } 436 437 // Gather the count of each topology layer and the ratio 438 void kmp_topology_t::_gather_enumeration_information() { 439 int previous_id[KMP_HW_LAST]; 440 int max[KMP_HW_LAST]; 441 442 for (int i = 0; i < depth; ++i) { 443 previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; 444 max[i] = 0; 445 count[i] = 0; 446 ratio[i] = 0; 447 } 448 int core_level = get_level(KMP_HW_CORE); 449 for (int i = 0; i < num_hw_threads; ++i) { 450 kmp_hw_thread_t &hw_thread = hw_threads[i]; 451 for (int layer = 0; layer < depth; ++layer) { 452 int id = hw_thread.ids[layer]; 453 if (id != previous_id[layer]) { 454 // Add an additional increment to each count 455 for (int l = layer; l < depth; ++l) 456 count[l]++; 457 // Keep track of topology layer ratio statistics 458 max[layer]++; 459 for (int l = layer + 1; l < depth; ++l) { 460 if (max[l] > ratio[l]) 461 ratio[l] = max[l]; 462 max[l] = 1; 463 } 464 // Figure out the number of different core types 465 // and efficiencies for hybrid CPUs 466 if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) { 467 if (hw_thread.attrs.is_core_eff_valid() && 468 hw_thread.attrs.core_eff >= num_core_efficiencies) { 469 // Because efficiencies can range from 0 to max efficiency - 1, 470 // the number of efficiencies is max efficiency + 1 471 num_core_efficiencies = hw_thread.attrs.core_eff + 1; 472 } 473 if (hw_thread.attrs.is_core_type_valid()) { 474 bool found = false; 475 for (int j = 0; j < num_core_types; ++j) { 476 if (hw_thread.attrs.get_core_type() == core_types[j]) { 477 found = true; 478 break; 479 } 480 } 481 if (!found) { 482 
KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES); 483 core_types[num_core_types++] = hw_thread.attrs.get_core_type(); 484 } 485 } 486 } 487 break; 488 } 489 } 490 for (int layer = 0; layer < depth; ++layer) { 491 previous_id[layer] = hw_thread.ids[layer]; 492 } 493 } 494 for (int layer = 0; layer < depth; ++layer) { 495 if (max[layer] > ratio[layer]) 496 ratio[layer] = max[layer]; 497 } 498 } 499 500 int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr, 501 int above_level, 502 bool find_all) const { 503 int current, current_max; 504 int previous_id[KMP_HW_LAST]; 505 for (int i = 0; i < depth; ++i) 506 previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; 507 int core_level = get_level(KMP_HW_CORE); 508 if (find_all) 509 above_level = -1; 510 KMP_ASSERT(above_level < core_level); 511 current_max = 0; 512 current = 0; 513 for (int i = 0; i < num_hw_threads; ++i) { 514 kmp_hw_thread_t &hw_thread = hw_threads[i]; 515 if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) { 516 if (current > current_max) 517 current_max = current; 518 current = hw_thread.attrs.contains(attr); 519 } else { 520 for (int level = above_level + 1; level <= core_level; ++level) { 521 if (hw_thread.ids[level] != previous_id[level]) { 522 if (hw_thread.attrs.contains(attr)) 523 current++; 524 break; 525 } 526 } 527 } 528 for (int level = 0; level < depth; ++level) 529 previous_id[level] = hw_thread.ids[level]; 530 } 531 if (current > current_max) 532 current_max = current; 533 return current_max; 534 } 535 536 // Find out if the topology is uniform 537 void kmp_topology_t::_discover_uniformity() { 538 int num = 1; 539 for (int level = 0; level < depth; ++level) 540 num *= ratio[level]; 541 flags.uniform = (num == count[depth - 1]); 542 } 543 544 // Set all the sub_ids for each hardware thread 545 void kmp_topology_t::_set_sub_ids() { 546 int previous_id[KMP_HW_LAST]; 547 int sub_id[KMP_HW_LAST]; 548 549 for (int i = 0; i < depth; ++i) { 550 previous_id[i] = -1; 551 sub_id[i] = -1; 552 } 553 for (int i = 0; i < num_hw_threads; ++i) { 554 kmp_hw_thread_t &hw_thread = hw_threads[i]; 555 // Setup the sub_id 556 for (int j = 0; j < depth; ++j) { 557 if (hw_thread.ids[j] != previous_id[j]) { 558 sub_id[j]++; 559 for (int k = j + 1; k < depth; ++k) { 560 sub_id[k] = 0; 561 } 562 break; 563 } 564 } 565 // Set previous_id 566 for (int j = 0; j < depth; ++j) { 567 previous_id[j] = hw_thread.ids[j]; 568 } 569 // Set the sub_ids field 570 for (int j = 0; j < depth; ++j) { 571 hw_thread.sub_ids[j] = sub_id[j]; 572 } 573 } 574 } 575 576 void kmp_topology_t::_set_globals() { 577 // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores 578 int core_level, thread_level, package_level; 579 package_level = get_level(KMP_HW_SOCKET); 580 #if KMP_GROUP_AFFINITY 581 if (package_level == -1) 582 package_level = get_level(KMP_HW_PROC_GROUP); 583 #endif 584 core_level = get_level(KMP_HW_CORE); 585 thread_level = get_level(KMP_HW_THREAD); 586 587 KMP_ASSERT(core_level != -1); 588 KMP_ASSERT(thread_level != -1); 589 590 __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level); 591 if (package_level != -1) { 592 nCoresPerPkg = calculate_ratio(core_level, package_level); 593 nPackages = get_count(package_level); 594 } else { 595 // assume one socket 596 nCoresPerPkg = get_count(core_level); 597 nPackages = 1; 598 } 599 #ifndef KMP_DFLT_NTH_CORES 600 __kmp_ncores = get_count(core_level); 601 #endif 602 } 603 604 kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, 605 const kmp_hw_t 
*types) {
  kmp_topology_t *retval;
  // Allocate all data in one large allocation
  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
                sizeof(int) * (size_t)KMP_HW_LAST * 3;
  char *bytes = (char *)__kmp_allocate(size);
  retval = (kmp_topology_t *)bytes;
  if (nproc > 0) {
    retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
  } else {
    retval->hw_threads = nullptr;
  }
  retval->num_hw_threads = nproc;
  retval->depth = ndepth;
  int *arr =
      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
  retval->types = (kmp_hw_t *)arr;
  retval->ratio = arr + (size_t)KMP_HW_LAST;
  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
  retval->num_core_efficiencies = 0;
  retval->num_core_types = 0;
  retval->compact = 0;
  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
    retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < ndepth; ++i) {
    retval->types[i] = types[i];
    retval->equivalent[types[i]] = types[i];
  }
  return retval;
}

void kmp_topology_t::deallocate(kmp_topology_t *topology) {
  if (topology)
    __kmp_free(topology);
}

bool kmp_topology_t::check_ids() const {
  // Assume ids have been sorted
  if (num_hw_threads == 0)
    return true;
  for (int i = 1; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &current_thread = hw_threads[i];
    kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
    bool unique = false;
    for (int j = 0; j < depth; ++j) {
      if (previous_thread.ids[j] != current_thread.ids[j]) {
        unique = true;
        break;
      }
    }
    if (unique)
      continue;
    return false;
  }
  return true;
}

void kmp_topology_t::dump() const {
  printf("***********************\n");
  printf("*** __kmp_topology: ***\n");
  printf("***********************\n");
  printf("* depth: %d\n", depth);

  printf("* types: ");
  for (int i = 0; i < depth; ++i)
    printf("%15s ", __kmp_hw_get_keyword(types[i]));
  printf("\n");

  printf("* ratio: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", ratio[i]);
  }
  printf("\n");

  printf("* count: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", count[i]);
  }
  printf("\n");

  printf("* num_core_eff: %d\n", num_core_efficiencies);
  printf("* num_core_types: %d\n", num_core_types);
  printf("* core_types: ");
  for (int i = 0; i < num_core_types; ++i)
    printf("%3d ", core_types[i]);
  printf("\n");

  printf("* equivalent map:\n");
  KMP_FOREACH_HW_TYPE(i) {
    const char *key = __kmp_hw_get_keyword(i);
    const char *value = __kmp_hw_get_keyword(equivalent[i]);
    printf("%-15s -> %-15s\n", key, value);
  }

  printf("* uniform: %s\n", (is_uniform() ?
"Yes" : "No")); 701 702 printf("* num_hw_threads: %d\n", num_hw_threads); 703 printf("* hw_threads:\n"); 704 for (int i = 0; i < num_hw_threads; ++i) { 705 hw_threads[i].print(); 706 } 707 printf("***********************\n"); 708 } 709 710 void kmp_topology_t::print(const char *env_var) const { 711 kmp_str_buf_t buf; 712 int print_types_depth; 713 __kmp_str_buf_init(&buf); 714 kmp_hw_t print_types[KMP_HW_LAST + 2]; 715 716 // Num Available Threads 717 if (num_hw_threads) { 718 KMP_INFORM(AvailableOSProc, env_var, num_hw_threads); 719 } else { 720 KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc); 721 } 722 723 // Uniform or not 724 if (is_uniform()) { 725 KMP_INFORM(Uniform, env_var); 726 } else { 727 KMP_INFORM(NonUniform, env_var); 728 } 729 730 // Equivalent types 731 KMP_FOREACH_HW_TYPE(type) { 732 kmp_hw_t eq_type = equivalent[type]; 733 if (eq_type != KMP_HW_UNKNOWN && eq_type != type) { 734 KMP_INFORM(AffEqualTopologyTypes, env_var, 735 __kmp_hw_get_catalog_string(type), 736 __kmp_hw_get_catalog_string(eq_type)); 737 } 738 } 739 740 // Quick topology 741 KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST); 742 // Create a print types array that always guarantees printing 743 // the core and thread level 744 print_types_depth = 0; 745 for (int level = 0; level < depth; ++level) 746 print_types[print_types_depth++] = types[level]; 747 if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) { 748 // Force in the core level for quick topology 749 if (print_types[print_types_depth - 1] == KMP_HW_THREAD) { 750 // Force core before thread e.g., 1 socket X 2 threads/socket 751 // becomes 1 socket X 1 core/socket X 2 threads/socket 752 print_types[print_types_depth - 1] = KMP_HW_CORE; 753 print_types[print_types_depth++] = KMP_HW_THREAD; 754 } else { 755 print_types[print_types_depth++] = KMP_HW_CORE; 756 } 757 } 758 // Always put threads at very end of quick topology 759 if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD) 760 print_types[print_types_depth++] = KMP_HW_THREAD; 761 762 __kmp_str_buf_clear(&buf); 763 kmp_hw_t numerator_type; 764 kmp_hw_t denominator_type = KMP_HW_UNKNOWN; 765 int core_level = get_level(KMP_HW_CORE); 766 int ncores = get_count(core_level); 767 768 for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) { 769 int c; 770 bool plural; 771 numerator_type = print_types[plevel]; 772 KMP_ASSERT_VALID_HW_TYPE(numerator_type); 773 if (equivalent[numerator_type] != numerator_type) 774 c = 1; 775 else 776 c = get_ratio(level++); 777 plural = (c > 1); 778 if (plevel == 0) { 779 __kmp_str_buf_print(&buf, "%d %s", c, 780 __kmp_hw_get_catalog_string(numerator_type, plural)); 781 } else { 782 __kmp_str_buf_print(&buf, " x %d %s/%s", c, 783 __kmp_hw_get_catalog_string(numerator_type, plural), 784 __kmp_hw_get_catalog_string(denominator_type)); 785 } 786 denominator_type = numerator_type; 787 } 788 KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); 789 790 // Hybrid topology information 791 if (__kmp_is_hybrid_cpu()) { 792 for (int i = 0; i < num_core_types; ++i) { 793 kmp_hw_core_type_t core_type = core_types[i]; 794 kmp_hw_attr_t attr; 795 attr.clear(); 796 attr.set_core_type(core_type); 797 int ncores = get_ncores_with_attr(attr); 798 if (ncores > 0) { 799 KMP_INFORM(TopologyHybrid, env_var, ncores, 800 __kmp_hw_get_core_type_string(core_type)); 801 KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS) 802 for (int eff = 0; eff < num_core_efficiencies; ++eff) { 803 attr.set_core_eff(eff); 804 int ncores_with_eff = get_ncores_with_attr(attr); 805 if (ncores_with_eff > 
0) { 806 KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff); 807 } 808 } 809 } 810 } 811 } 812 813 if (num_hw_threads <= 0) { 814 __kmp_str_buf_free(&buf); 815 return; 816 } 817 818 // Full OS proc to hardware thread map 819 KMP_INFORM(OSProcToPhysicalThreadMap, env_var); 820 for (int i = 0; i < num_hw_threads; i++) { 821 __kmp_str_buf_clear(&buf); 822 for (int level = 0; level < depth; ++level) { 823 kmp_hw_t type = types[level]; 824 __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); 825 __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); 826 } 827 if (__kmp_is_hybrid_cpu()) 828 __kmp_str_buf_print( 829 &buf, "(%s)", 830 __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type())); 831 KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); 832 } 833 834 __kmp_str_buf_free(&buf); 835 } 836 837 #if KMP_AFFINITY_SUPPORTED 838 void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const { 839 const char *env_var = __kmp_get_affinity_env_var(affinity); 840 // If requested hybrid CPU attributes for granularity (either OMP_PLACES or 841 // KMP_AFFINITY), but none exist, then reset granularity and have below method 842 // select a granularity and warn user. 843 if (!__kmp_is_hybrid_cpu()) { 844 if (affinity.core_attr_gran.valid) { 845 // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores 846 // instead 847 KMP_AFF_WARNING( 848 affinity, AffIgnoringNonHybrid, env_var, 849 __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)); 850 affinity.gran = KMP_HW_CORE; 851 affinity.gran_levels = -1; 852 affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN; 853 affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0; 854 } else if (affinity.flags.core_types_gran || 855 affinity.flags.core_effs_gran) { 856 // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead 857 if (affinity.flags.omp_places) { 858 KMP_AFF_WARNING( 859 affinity, AffIgnoringNonHybrid, env_var, 860 __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)); 861 } else { 862 // KMP_AFFINITY=granularity=core_type|core_eff,... 
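        // The requested core-attribute granularity cannot be honored on a
        // non-hybrid machine, so warn and fall back to granularity=core below
        // (the same recovery as the OMP_PLACES branch above).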
863 KMP_AFF_WARNING(affinity, AffGranularityBad, env_var, 864 "Intel(R) Hybrid Technology core attribute", 865 __kmp_hw_get_catalog_string(KMP_HW_CORE)); 866 } 867 affinity.gran = KMP_HW_CORE; 868 affinity.gran_levels = -1; 869 affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN; 870 affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0; 871 } 872 } 873 // Set the number of affinity granularity levels 874 if (affinity.gran_levels < 0) { 875 kmp_hw_t gran_type = get_equivalent_type(affinity.gran); 876 // Check if user's granularity request is valid 877 if (gran_type == KMP_HW_UNKNOWN) { 878 // First try core, then thread, then package 879 kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET}; 880 for (auto g : gran_types) { 881 if (get_equivalent_type(g) != KMP_HW_UNKNOWN) { 882 gran_type = g; 883 break; 884 } 885 } 886 KMP_ASSERT(gran_type != KMP_HW_UNKNOWN); 887 // Warn user what granularity setting will be used instead 888 KMP_AFF_WARNING(affinity, AffGranularityBad, env_var, 889 __kmp_hw_get_catalog_string(affinity.gran), 890 __kmp_hw_get_catalog_string(gran_type)); 891 affinity.gran = gran_type; 892 } 893 #if KMP_GROUP_AFFINITY 894 // If more than one processor group exists, and the level of 895 // granularity specified by the user is too coarse, then the 896 // granularity must be adjusted "down" to processor group affinity 897 // because threads can only exist within one processor group. 898 // For example, if a user sets granularity=socket and there are two 899 // processor groups that cover a socket, then the runtime must 900 // restrict the granularity down to the processor group level. 901 if (__kmp_num_proc_groups > 1) { 902 int gran_depth = get_level(gran_type); 903 int proc_group_depth = get_level(KMP_HW_PROC_GROUP); 904 if (gran_depth >= 0 && proc_group_depth >= 0 && 905 gran_depth < proc_group_depth) { 906 KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var, 907 __kmp_hw_get_catalog_string(affinity.gran)); 908 affinity.gran = gran_type = KMP_HW_PROC_GROUP; 909 } 910 } 911 #endif 912 affinity.gran_levels = 0; 913 for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) 914 affinity.gran_levels++; 915 } 916 } 917 #endif 918 919 void kmp_topology_t::canonicalize() { 920 #if KMP_GROUP_AFFINITY 921 _insert_windows_proc_groups(); 922 #endif 923 _remove_radix1_layers(); 924 _gather_enumeration_information(); 925 _discover_uniformity(); 926 _set_sub_ids(); 927 _set_globals(); 928 _set_last_level_cache(); 929 930 #if KMP_MIC_SUPPORTED 931 // Manually Add L2 = Tile equivalence 932 if (__kmp_mic_type == mic3) { 933 if (get_level(KMP_HW_L2) != -1) 934 set_equivalent_type(KMP_HW_TILE, KMP_HW_L2); 935 else if (get_level(KMP_HW_TILE) != -1) 936 set_equivalent_type(KMP_HW_L2, KMP_HW_TILE); 937 } 938 #endif 939 940 // Perform post canonicalization checking 941 KMP_ASSERT(depth > 0); 942 for (int level = 0; level < depth; ++level) { 943 // All counts, ratios, and types must be valid 944 KMP_ASSERT(count[level] > 0 && ratio[level] > 0); 945 KMP_ASSERT_VALID_HW_TYPE(types[level]); 946 // Detected types must point to themselves 947 KMP_ASSERT(equivalent[types[level]] == types[level]); 948 } 949 } 950 951 // Canonicalize an explicit packages X cores/pkg X threads/core topology 952 void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, 953 int nthreads_per_core, int ncores) { 954 int ndepth = 3; 955 depth = ndepth; 956 KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; } 957 for (int level = 0; level < depth; ++level) { 958 
count[level] = 0;
    ratio[level] = 0;
  }
  count[0] = npackages;
  count[1] = ncores;
  count[2] = __kmp_xproc;
  ratio[0] = npackages;
  ratio[1] = ncores_per_pkg;
  ratio[2] = nthreads_per_core;
  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
  types[0] = KMP_HW_SOCKET;
  types[1] = KMP_HW_CORE;
  types[2] = KMP_HW_THREAD;
  //__kmp_avail_proc = __kmp_xproc;
  _discover_uniformity();
}

// Represents running sub IDs for a single core attribute where
// attribute values have SIZE possibilities.
template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
  int last_level; // last level in topology to consider for sub_ids
  int sub_id[SIZE]; // The sub ID for a given attribute value
  int prev_sub_id[KMP_HW_LAST];
  IndexFunc indexer;

public:
  kmp_sub_ids_t(int last_level) : last_level(last_level) {
    KMP_ASSERT(last_level < KMP_HW_LAST);
    for (size_t i = 0; i < SIZE; ++i)
      sub_id[i] = -1;
    for (size_t i = 0; i < KMP_HW_LAST; ++i)
      prev_sub_id[i] = -1;
  }
  void update(const kmp_hw_thread_t &hw_thread) {
    int idx = indexer(hw_thread);
    KMP_ASSERT(idx < (int)SIZE);
    for (int level = 0; level <= last_level; ++level) {
      if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
        if (level < last_level)
          sub_id[idx] = -1;
        sub_id[idx]++;
        break;
      }
    }
    for (int level = 0; level <= last_level; ++level)
      prev_sub_id[level] = hw_thread.sub_ids[level];
  }
  int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
    return sub_id[indexer(hw_thread)];
  }
};

#if KMP_AFFINITY_SUPPORTED
static kmp_str_buf_t *
__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
                                 bool plural) {
  __kmp_str_buf_init(buf);
  if (attr.is_core_type_valid())
    __kmp_str_buf_print(buf, "%s %s",
                        __kmp_hw_get_core_type_string(attr.get_core_type()),
                        __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
  else
    __kmp_str_buf_print(buf, "%s eff=%d",
                        __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
                        attr.get_core_eff());
  return buf;
}

bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
  // Apply the filter
  bool affected;
  int new_index = 0;
  for (int i = 0; i < num_hw_threads; ++i) {
    int os_id = hw_threads[i].os_id;
    if (KMP_CPU_ISSET(os_id, mask)) {
      if (i != new_index)
        hw_threads[new_index] = hw_threads[i];
      new_index++;
    } else {
      KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
      __kmp_avail_proc--;
    }
  }

  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
  affected = (num_hw_threads != new_index);
  num_hw_threads = new_index;

  // Post hardware subset canonicalization
  if (affected) {
    _gather_enumeration_information();
    _discover_uniformity();
    _set_globals();
    _set_last_level_cache();
#if KMP_OS_WINDOWS
    // Copy filtered full mask if topology has single processor group
    if (__kmp_num_proc_groups <= 1)
#endif
      __kmp_affin_origMask->copy(__kmp_affin_fullMask);
  }
  return affected;
}

// Apply the KMP_HW_SUBSET environment variable to the topology.
// Returns true if KMP_HW_SUBSET filtered any processors;
// otherwise, returns false.
bool kmp_topology_t::filter_hw_subset() {
  // If KMP_HW_SUBSET wasn't requested, then do nothing.
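  // For example, a value such as KMP_HW_SUBSET=2s,4c,2t asks for 2 sockets x
  // 4 cores/socket x 2 threads/core; the checks and filtering below enforce
  // that request against the detected topology.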
  if (!__kmp_hw_subset)
    return false;

  // First, sort the KMP_HW_SUBSET items by the machine topology
  __kmp_hw_subset->sort();

  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
  bool using_core_types = false;
  bool using_core_effs = false;
  int hw_subset_depth = __kmp_hw_subset->get_depth();
  kmp_hw_t specified[KMP_HW_LAST];
  int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
  KMP_ASSERT(hw_subset_depth > 0);
  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
  int core_level = get_level(KMP_HW_CORE);
  for (int i = 0; i < hw_subset_depth; ++i) {
    int max_count;
    const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
    int num = item.num[0];
    int offset = item.offset[0];
    kmp_hw_t type = item.type;
    kmp_hw_t equivalent_type = equivalent[type];
    int level = get_level(type);
    topology_levels[i] = level;

    // Check to see if current layer is in detected machine topology
    if (equivalent_type != KMP_HW_UNKNOWN) {
      __kmp_hw_subset->at(i).type = equivalent_type;
    } else {
      KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
                      __kmp_hw_get_catalog_string(type));
      return false;
    }

    // Check to see if current layer has already been
    // specified either directly or through an equivalent type
    if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
      KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
                      __kmp_hw_get_catalog_string(type),
                      __kmp_hw_get_catalog_string(specified[equivalent_type]));
      return false;
    }
    specified[equivalent_type] = type;

    // Check to see if each layer's num & offset parameters are valid
    max_count = get_ratio(level);
    if (max_count < 0 ||
        (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
      bool plural = (num > 1);
      KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
                      __kmp_hw_get_catalog_string(type, plural));
      return false;
    }

    // Check to see if core attributes are consistent
    if (core_level == level) {
      // Determine which core attributes are specified
      for (int j = 0; j < item.num_attrs; ++j) {
        if (item.attr[j].is_core_type_valid())
          using_core_types = true;
        if (item.attr[j].is_core_eff_valid())
          using_core_effs = true;
      }

      // Check if using a single core attribute on non-hybrid arch.
      // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
      //
      // Check if using multiple core attributes on non-hybrid arch.
      // Ignore all of KMP_HW_SUBSET if this is the case.
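      // (Illustrative) On such a machine a lone core_type or efficiency
      // attribute is simply dropped and the plain core count is used, whereas
      // an item combining several attributes invalidates the whole subset.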
1137 if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) { 1138 if (item.num_attrs == 1) { 1139 if (using_core_effs) { 1140 KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr, 1141 "efficiency"); 1142 } else { 1143 KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr, 1144 "core_type"); 1145 } 1146 using_core_effs = false; 1147 using_core_types = false; 1148 } else { 1149 KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid); 1150 return false; 1151 } 1152 } 1153 1154 // Check if using both core types and core efficiencies together 1155 if (using_core_types && using_core_effs) { 1156 KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type", 1157 "efficiency"); 1158 return false; 1159 } 1160 1161 // Check that core efficiency values are valid 1162 if (using_core_effs) { 1163 for (int j = 0; j < item.num_attrs; ++j) { 1164 if (item.attr[j].is_core_eff_valid()) { 1165 int core_eff = item.attr[j].get_core_eff(); 1166 if (core_eff < 0 || core_eff >= num_core_efficiencies) { 1167 kmp_str_buf_t buf; 1168 __kmp_str_buf_init(&buf); 1169 __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff()); 1170 __kmp_msg(kmp_ms_warning, 1171 KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str), 1172 KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1), 1173 __kmp_msg_null); 1174 __kmp_str_buf_free(&buf); 1175 return false; 1176 } 1177 } 1178 } 1179 } 1180 1181 // Check that the number of requested cores with attributes is valid 1182 if (using_core_types || using_core_effs) { 1183 for (int j = 0; j < item.num_attrs; ++j) { 1184 int num = item.num[j]; 1185 int offset = item.offset[j]; 1186 int level_above = core_level - 1; 1187 if (level_above >= 0) { 1188 max_count = get_ncores_with_attr_per(item.attr[j], level_above); 1189 if (max_count <= 0 || 1190 (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) { 1191 kmp_str_buf_t buf; 1192 __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0); 1193 KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str); 1194 __kmp_str_buf_free(&buf); 1195 return false; 1196 } 1197 } 1198 } 1199 } 1200 1201 if ((using_core_types || using_core_effs) && item.num_attrs > 1) { 1202 for (int j = 0; j < item.num_attrs; ++j) { 1203 // Ambiguous use of specific core attribute + generic core 1204 // e.g., 4c & 3c:intel_core or 4c & 3c:eff1 1205 if (!item.attr[j]) { 1206 kmp_hw_attr_t other_attr; 1207 for (int k = 0; k < item.num_attrs; ++k) { 1208 if (item.attr[k] != item.attr[j]) { 1209 other_attr = item.attr[k]; 1210 break; 1211 } 1212 } 1213 kmp_str_buf_t buf; 1214 __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0); 1215 KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, 1216 __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str); 1217 __kmp_str_buf_free(&buf); 1218 return false; 1219 } 1220 // Allow specifying a specific core type or core eff exactly once 1221 for (int k = 0; k < j; ++k) { 1222 if (!item.attr[j] || !item.attr[k]) 1223 continue; 1224 if (item.attr[k] == item.attr[j]) { 1225 kmp_str_buf_t buf; 1226 __kmp_hw_get_catalog_core_string(item.attr[j], &buf, 1227 item.num[j] > 0); 1228 KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str); 1229 __kmp_str_buf_free(&buf); 1230 return false; 1231 } 1232 } 1233 } 1234 } 1235 } 1236 } 1237 1238 struct core_type_indexer { 1239 int operator()(const kmp_hw_thread_t &t) const { 1240 switch (t.attrs.get_core_type()) { 1241 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1242 case KMP_HW_CORE_TYPE_ATOM: 1243 return 1; 1244 case KMP_HW_CORE_TYPE_CORE: 1245 
return 2; 1246 #endif 1247 case KMP_HW_CORE_TYPE_UNKNOWN: 1248 return 0; 1249 } 1250 KMP_ASSERT(0); 1251 return 0; 1252 } 1253 }; 1254 struct core_eff_indexer { 1255 int operator()(const kmp_hw_thread_t &t) const { 1256 return t.attrs.get_core_eff(); 1257 } 1258 }; 1259 1260 kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids( 1261 core_level); 1262 kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids( 1263 core_level); 1264 1265 // Determine which hardware threads should be filtered. 1266 int num_filtered = 0; 1267 kmp_affin_mask_t *filtered_mask; 1268 KMP_CPU_ALLOC(filtered_mask); 1269 KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask); 1270 for (int i = 0; i < num_hw_threads; ++i) { 1271 kmp_hw_thread_t &hw_thread = hw_threads[i]; 1272 // Update type_sub_id 1273 if (using_core_types) 1274 core_type_sub_ids.update(hw_thread); 1275 if (using_core_effs) 1276 core_eff_sub_ids.update(hw_thread); 1277 1278 // Check to see if this hardware thread should be filtered 1279 bool should_be_filtered = false; 1280 for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth; 1281 ++hw_subset_index) { 1282 const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index); 1283 int level = topology_levels[hw_subset_index]; 1284 if (level == -1) 1285 continue; 1286 if ((using_core_effs || using_core_types) && level == core_level) { 1287 // Look for the core attribute in KMP_HW_SUBSET which corresponds 1288 // to this hardware thread's core attribute. Use this num,offset plus 1289 // the running sub_id for the particular core attribute of this hardware 1290 // thread to determine if the hardware thread should be filtered or not. 1291 int attr_idx; 1292 kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type(); 1293 int core_eff = hw_thread.attrs.get_core_eff(); 1294 for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) { 1295 if (using_core_types && 1296 hw_subset_item.attr[attr_idx].get_core_type() == core_type) 1297 break; 1298 if (using_core_effs && 1299 hw_subset_item.attr[attr_idx].get_core_eff() == core_eff) 1300 break; 1301 } 1302 // This core attribute isn't in the KMP_HW_SUBSET so always filter it. 
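        // (Illustrative) When the attribute is present, the code below keeps
        // only the hardware threads whose running sub_id for that attribute
        // falls in [offset, offset + num): e.g. num=2, offset=2 keeps the
        // third and fourth cores of that core type.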
1303 if (attr_idx == hw_subset_item.num_attrs) { 1304 should_be_filtered = true; 1305 break; 1306 } 1307 int sub_id; 1308 int num = hw_subset_item.num[attr_idx]; 1309 int offset = hw_subset_item.offset[attr_idx]; 1310 if (using_core_types) 1311 sub_id = core_type_sub_ids.get_sub_id(hw_thread); 1312 else 1313 sub_id = core_eff_sub_ids.get_sub_id(hw_thread); 1314 if (sub_id < offset || 1315 (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) { 1316 should_be_filtered = true; 1317 break; 1318 } 1319 } else { 1320 int num = hw_subset_item.num[0]; 1321 int offset = hw_subset_item.offset[0]; 1322 if (hw_thread.sub_ids[level] < offset || 1323 (num != kmp_hw_subset_t::USE_ALL && 1324 hw_thread.sub_ids[level] >= offset + num)) { 1325 should_be_filtered = true; 1326 break; 1327 } 1328 } 1329 } 1330 // Collect filtering information 1331 if (should_be_filtered) { 1332 KMP_CPU_CLR(hw_thread.os_id, filtered_mask); 1333 num_filtered++; 1334 } 1335 } 1336 1337 // One last check that we shouldn't allow filtering entire machine 1338 if (num_filtered == num_hw_threads) { 1339 KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered); 1340 return false; 1341 } 1342 1343 // Apply the filter 1344 restrict_to_mask(filtered_mask); 1345 return true; 1346 } 1347 1348 bool kmp_topology_t::is_close(int hwt1, int hwt2, 1349 const kmp_affinity_t &stgs) const { 1350 int hw_level = stgs.gran_levels; 1351 if (hw_level >= depth) 1352 return true; 1353 bool retval = true; 1354 const kmp_hw_thread_t &t1 = hw_threads[hwt1]; 1355 const kmp_hw_thread_t &t2 = hw_threads[hwt2]; 1356 if (stgs.flags.core_types_gran) 1357 return t1.attrs.get_core_type() == t2.attrs.get_core_type(); 1358 if (stgs.flags.core_effs_gran) 1359 return t1.attrs.get_core_eff() == t2.attrs.get_core_eff(); 1360 for (int i = 0; i < (depth - hw_level); ++i) { 1361 if (t1.ids[i] != t2.ids[i]) 1362 return false; 1363 } 1364 return retval; 1365 } 1366 1367 //////////////////////////////////////////////////////////////////////////////// 1368 1369 bool KMPAffinity::picked_api = false; 1370 1371 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } 1372 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); } 1373 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); } 1374 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); } 1375 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); } 1376 void KMPAffinity::operator delete(void *p) { __kmp_free(p); } 1377 1378 void KMPAffinity::pick_api() { 1379 KMPAffinity *affinity_dispatch; 1380 if (picked_api) 1381 return; 1382 #if KMP_USE_HWLOC 1383 // Only use Hwloc if affinity isn't explicitly disabled and 1384 // user requests Hwloc topology method 1385 if (__kmp_affinity_top_method == affinity_top_method_hwloc && 1386 __kmp_affinity.type != affinity_disabled) { 1387 affinity_dispatch = new KMPHwlocAffinity(); 1388 } else 1389 #endif 1390 { 1391 affinity_dispatch = new KMPNativeAffinity(); 1392 } 1393 __kmp_affinity_dispatch = affinity_dispatch; 1394 picked_api = true; 1395 } 1396 1397 void KMPAffinity::destroy_api() { 1398 if (__kmp_affinity_dispatch != NULL) { 1399 delete __kmp_affinity_dispatch; 1400 __kmp_affinity_dispatch = NULL; 1401 picked_api = false; 1402 } 1403 } 1404 1405 #define KMP_ADVANCE_SCAN(scan) \ 1406 while (*scan != '\0') { \ 1407 scan++; \ 1408 } 1409 1410 // Print the affinity mask to the character array in a pretty format. 
1411 // The format is a comma separated list of non-negative integers or integer 1412 // ranges: e.g., 1,2,3-5,7,9-15 1413 // The format can also be the string "{<empty>}" if no bits are set in mask 1414 char *__kmp_affinity_print_mask(char *buf, int buf_len, 1415 kmp_affin_mask_t *mask) { 1416 int start = 0, finish = 0, previous = 0; 1417 bool first_range; 1418 KMP_ASSERT(buf); 1419 KMP_ASSERT(buf_len >= 40); 1420 KMP_ASSERT(mask); 1421 char *scan = buf; 1422 char *end = buf + buf_len - 1; 1423 1424 // Check for empty set. 1425 if (mask->begin() == mask->end()) { 1426 KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}"); 1427 KMP_ADVANCE_SCAN(scan); 1428 KMP_ASSERT(scan <= end); 1429 return buf; 1430 } 1431 1432 first_range = true; 1433 start = mask->begin(); 1434 while (1) { 1435 // Find next range 1436 // [start, previous] is inclusive range of contiguous bits in mask 1437 for (finish = mask->next(start), previous = start; 1438 finish == previous + 1 && finish != mask->end(); 1439 finish = mask->next(finish)) { 1440 previous = finish; 1441 } 1442 1443 // The first range does not need a comma printed before it, but the rest 1444 // of the ranges do need a comma beforehand 1445 if (!first_range) { 1446 KMP_SNPRINTF(scan, end - scan + 1, "%s", ","); 1447 KMP_ADVANCE_SCAN(scan); 1448 } else { 1449 first_range = false; 1450 } 1451 // Range with three or more contiguous bits in the affinity mask 1452 if (previous - start > 1) { 1453 KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous); 1454 } else { 1455 // Range with one or two contiguous bits in the affinity mask 1456 KMP_SNPRINTF(scan, end - scan + 1, "%u", start); 1457 KMP_ADVANCE_SCAN(scan); 1458 if (previous - start > 0) { 1459 KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous); 1460 } 1461 } 1462 KMP_ADVANCE_SCAN(scan); 1463 // Start over with new start point 1464 start = finish; 1465 if (start == mask->end()) 1466 break; 1467 // Check for overflow 1468 if (end - scan < 2) 1469 break; 1470 } 1471 1472 // Check for overflow 1473 KMP_ASSERT(scan <= end); 1474 return buf; 1475 } 1476 #undef KMP_ADVANCE_SCAN 1477 1478 // Print the affinity mask to the string buffer object in a pretty format 1479 // The format is a comma separated list of non-negative integers or integer 1480 // ranges: e.g., 1,2,3-5,7,9-15 1481 // The format can also be the string "{<empty>}" if no bits are set in mask 1482 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, 1483 kmp_affin_mask_t *mask) { 1484 int start = 0, finish = 0, previous = 0; 1485 bool first_range; 1486 KMP_ASSERT(buf); 1487 KMP_ASSERT(mask); 1488 1489 __kmp_str_buf_clear(buf); 1490 1491 // Check for empty set. 
1492 if (mask->begin() == mask->end()) { 1493 __kmp_str_buf_print(buf, "%s", "{<empty>}"); 1494 return buf; 1495 } 1496 1497 first_range = true; 1498 start = mask->begin(); 1499 while (1) { 1500 // Find next range 1501 // [start, previous] is inclusive range of contiguous bits in mask 1502 for (finish = mask->next(start), previous = start; 1503 finish == previous + 1 && finish != mask->end(); 1504 finish = mask->next(finish)) { 1505 previous = finish; 1506 } 1507 1508 // The first range does not need a comma printed before it, but the rest 1509 // of the ranges do need a comma beforehand 1510 if (!first_range) { 1511 __kmp_str_buf_print(buf, "%s", ","); 1512 } else { 1513 first_range = false; 1514 } 1515 // Range with three or more contiguous bits in the affinity mask 1516 if (previous - start > 1) { 1517 __kmp_str_buf_print(buf, "%u-%u", start, previous); 1518 } else { 1519 // Range with one or two contiguous bits in the affinity mask 1520 __kmp_str_buf_print(buf, "%u", start); 1521 if (previous - start > 0) { 1522 __kmp_str_buf_print(buf, ",%u", previous); 1523 } 1524 } 1525 // Start over with new start point 1526 start = finish; 1527 if (start == mask->end()) 1528 break; 1529 } 1530 return buf; 1531 } 1532 1533 // Return (possibly empty) affinity mask representing the offline CPUs 1534 // Caller must free the mask 1535 kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() { 1536 kmp_affin_mask_t *offline; 1537 KMP_CPU_ALLOC(offline); 1538 KMP_CPU_ZERO(offline); 1539 #if KMP_OS_LINUX 1540 int n, begin_cpu, end_cpu; 1541 kmp_safe_raii_file_t offline_file; 1542 auto skip_ws = [](FILE *f) { 1543 int c; 1544 do { 1545 c = fgetc(f); 1546 } while (isspace(c)); 1547 if (c != EOF) 1548 ungetc(c, f); 1549 }; 1550 // File contains CSV of integer ranges representing the offline CPUs 1551 // e.g., 1,2,4-7,9,11-15 1552 int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r"); 1553 if (status != 0) 1554 return offline; 1555 while (!feof(offline_file)) { 1556 skip_ws(offline_file); 1557 n = fscanf(offline_file, "%d", &begin_cpu); 1558 if (n != 1) 1559 break; 1560 skip_ws(offline_file); 1561 int c = fgetc(offline_file); 1562 if (c == EOF || c == ',') { 1563 // Just single CPU 1564 end_cpu = begin_cpu; 1565 } else if (c == '-') { 1566 // Range of CPUs 1567 skip_ws(offline_file); 1568 n = fscanf(offline_file, "%d", &end_cpu); 1569 if (n != 1) 1570 break; 1571 skip_ws(offline_file); 1572 c = fgetc(offline_file); // skip ',' 1573 } else { 1574 // Syntax problem 1575 break; 1576 } 1577 // Ensure a valid range of CPUs 1578 if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 || 1579 end_cpu >= __kmp_xproc || begin_cpu > end_cpu) { 1580 continue; 1581 } 1582 // Insert [begin_cpu, end_cpu] into offline mask 1583 for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) { 1584 KMP_CPU_SET(cpu, offline); 1585 } 1586 } 1587 #endif 1588 return offline; 1589 } 1590 1591 // Return the number of available procs 1592 int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { 1593 int avail_proc = 0; 1594 KMP_CPU_ZERO(mask); 1595 1596 #if KMP_GROUP_AFFINITY 1597 1598 if (__kmp_num_proc_groups > 1) { 1599 int group; 1600 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); 1601 for (group = 0; group < __kmp_num_proc_groups; group++) { 1602 int i; 1603 int num = __kmp_GetActiveProcessorCount(group); 1604 for (i = 0; i < num; i++) { 1605 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); 1606 avail_proc++; 1607 } 1608 } 1609 } else 1610 1611 #endif /* KMP_GROUP_AFFINITY */ 1612 1613 { 
1614 int proc; 1615 kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus(); 1616 for (proc = 0; proc < __kmp_xproc; proc++) { 1617 // Skip offline CPUs 1618 if (KMP_CPU_ISSET(proc, offline_cpus)) 1619 continue; 1620 KMP_CPU_SET(proc, mask); 1621 avail_proc++; 1622 } 1623 KMP_CPU_FREE(offline_cpus); 1624 } 1625 1626 return avail_proc; 1627 } 1628 1629 // All of the __kmp_affinity_create_*_map() routines should allocate the 1630 // internal topology object and set the layer ids for it. Each routine 1631 // returns a boolean on whether it was successful at doing so. 1632 kmp_affin_mask_t *__kmp_affin_fullMask = NULL; 1633 // Original mask is a subset of full mask in multiple processor groups topology 1634 kmp_affin_mask_t *__kmp_affin_origMask = NULL; 1635 1636 #if KMP_USE_HWLOC 1637 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) { 1638 #if HWLOC_API_VERSION >= 0x00020000 1639 return hwloc_obj_type_is_cache(obj->type); 1640 #else 1641 return obj->type == HWLOC_OBJ_CACHE; 1642 #endif 1643 } 1644 1645 // Returns KMP_HW_* type derived from HWLOC_* type 1646 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) { 1647 1648 if (__kmp_hwloc_is_cache_type(obj)) { 1649 if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION) 1650 return KMP_HW_UNKNOWN; 1651 switch (obj->attr->cache.depth) { 1652 case 1: 1653 return KMP_HW_L1; 1654 case 2: 1655 #if KMP_MIC_SUPPORTED 1656 if (__kmp_mic_type == mic3) { 1657 return KMP_HW_TILE; 1658 } 1659 #endif 1660 return KMP_HW_L2; 1661 case 3: 1662 return KMP_HW_L3; 1663 } 1664 return KMP_HW_UNKNOWN; 1665 } 1666 1667 switch (obj->type) { 1668 case HWLOC_OBJ_PACKAGE: 1669 return KMP_HW_SOCKET; 1670 case HWLOC_OBJ_NUMANODE: 1671 return KMP_HW_NUMA; 1672 case HWLOC_OBJ_CORE: 1673 return KMP_HW_CORE; 1674 case HWLOC_OBJ_PU: 1675 return KMP_HW_THREAD; 1676 case HWLOC_OBJ_GROUP: 1677 #if HWLOC_API_VERSION >= 0x00020000 1678 if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE) 1679 return KMP_HW_DIE; 1680 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE) 1681 return KMP_HW_TILE; 1682 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE) 1683 return KMP_HW_MODULE; 1684 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP) 1685 return KMP_HW_PROC_GROUP; 1686 #endif 1687 return KMP_HW_UNKNOWN; 1688 #if HWLOC_API_VERSION >= 0x00020100 1689 case HWLOC_OBJ_DIE: 1690 return KMP_HW_DIE; 1691 #endif 1692 } 1693 return KMP_HW_UNKNOWN; 1694 } 1695 1696 // Returns the number of objects of type 'type' below 'obj' within the topology 1697 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is 1698 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET 1699 // object. 
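// For example, the !KMP_AFFINITY_CAPABLE() fallback below calls this with a
// PACKAGE object and HWLOC_OBJ_CORE to estimate the number of cores per
// package.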
1700 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, 1701 hwloc_obj_type_t type) { 1702 int retval = 0; 1703 hwloc_obj_t first; 1704 for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, 1705 obj->logical_index, type, 0); 1706 first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, 1707 obj->type, first) == obj; 1708 first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, 1709 first)) { 1710 ++retval; 1711 } 1712 return retval; 1713 } 1714 1715 // This gets the sub_id for a lower object under a higher object in the 1716 // topology tree 1717 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher, 1718 hwloc_obj_t lower) { 1719 hwloc_obj_t obj; 1720 hwloc_obj_type_t ltype = lower->type; 1721 int lindex = lower->logical_index - 1; 1722 int sub_id = 0; 1723 // Get the previous lower object 1724 obj = hwloc_get_obj_by_type(t, ltype, lindex); 1725 while (obj && lindex >= 0 && 1726 hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) { 1727 if (obj->userdata) { 1728 sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata)); 1729 break; 1730 } 1731 sub_id++; 1732 lindex--; 1733 obj = hwloc_get_obj_by_type(t, ltype, lindex); 1734 } 1735 // store sub_id + 1 so that 0 is differed from NULL 1736 lower->userdata = RCAST(void *, sub_id + 1); 1737 return sub_id; 1738 } 1739 1740 static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) { 1741 kmp_hw_t type; 1742 int hw_thread_index, sub_id; 1743 int depth; 1744 hwloc_obj_t pu, obj, root, prev; 1745 kmp_hw_t types[KMP_HW_LAST]; 1746 hwloc_obj_type_t hwloc_types[KMP_HW_LAST]; 1747 1748 hwloc_topology_t tp = __kmp_hwloc_topology; 1749 *msg_id = kmp_i18n_null; 1750 if (__kmp_affinity.flags.verbose) { 1751 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 1752 } 1753 1754 if (!KMP_AFFINITY_CAPABLE()) { 1755 // Hack to try and infer the machine topology using only the data 1756 // available from hwloc on the current thread, and __kmp_xproc. 
1757 KMP_ASSERT(__kmp_affinity.type == affinity_none); 1758 // hwloc only guarantees existance of PU object, so check PACKAGE and CORE 1759 hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); 1760 if (o != NULL) 1761 nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE); 1762 else 1763 nCoresPerPkg = 1; // no PACKAGE found 1764 o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0); 1765 if (o != NULL) 1766 __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU); 1767 else 1768 __kmp_nThreadsPerCore = 1; // no CORE found 1769 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1770 if (nCoresPerPkg == 0) 1771 nCoresPerPkg = 1; // to prevent possible division by 0 1772 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1773 return true; 1774 } 1775 1776 #if HWLOC_API_VERSION >= 0x00020400 1777 // Handle multiple types of cores if they exist on the system 1778 int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0); 1779 1780 typedef struct kmp_hwloc_cpukinds_info_t { 1781 int efficiency; 1782 kmp_hw_core_type_t core_type; 1783 hwloc_bitmap_t mask; 1784 } kmp_hwloc_cpukinds_info_t; 1785 kmp_hwloc_cpukinds_info_t *cpukinds = nullptr; 1786 1787 if (nr_cpu_kinds > 0) { 1788 unsigned nr_infos; 1789 struct hwloc_info_s *infos; 1790 cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate( 1791 sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds); 1792 for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) { 1793 cpukinds[idx].efficiency = -1; 1794 cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN; 1795 cpukinds[idx].mask = hwloc_bitmap_alloc(); 1796 if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask, 1797 &cpukinds[idx].efficiency, &nr_infos, &infos, 1798 0) == 0) { 1799 for (unsigned i = 0; i < nr_infos; ++i) { 1800 if (__kmp_str_match("CoreType", 8, infos[i].name)) { 1801 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1802 if (__kmp_str_match("IntelAtom", 9, infos[i].value)) { 1803 cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM; 1804 break; 1805 } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) { 1806 cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE; 1807 break; 1808 } 1809 #endif 1810 } 1811 } 1812 } 1813 } 1814 } 1815 #endif 1816 1817 root = hwloc_get_root_obj(tp); 1818 1819 // Figure out the depth and types in the topology 1820 depth = 0; 1821 pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin()); 1822 KMP_ASSERT(pu); 1823 obj = pu; 1824 types[depth] = KMP_HW_THREAD; 1825 hwloc_types[depth] = obj->type; 1826 depth++; 1827 while (obj != root && obj != NULL) { 1828 obj = obj->parent; 1829 #if HWLOC_API_VERSION >= 0x00020000 1830 if (obj->memory_arity) { 1831 hwloc_obj_t memory; 1832 for (memory = obj->memory_first_child; memory; 1833 memory = hwloc_get_next_child(tp, obj, memory)) { 1834 if (memory->type == HWLOC_OBJ_NUMANODE) 1835 break; 1836 } 1837 if (memory && memory->type == HWLOC_OBJ_NUMANODE) { 1838 types[depth] = KMP_HW_NUMA; 1839 hwloc_types[depth] = memory->type; 1840 depth++; 1841 } 1842 } 1843 #endif 1844 type = __kmp_hwloc_type_2_topology_type(obj); 1845 if (type != KMP_HW_UNKNOWN) { 1846 types[depth] = type; 1847 hwloc_types[depth] = obj->type; 1848 depth++; 1849 } 1850 } 1851 KMP_ASSERT(depth > 0); 1852 1853 // Get the order for the types correct 1854 for (int i = 0, j = depth - 1; i < j; ++i, --j) { 1855 hwloc_obj_type_t hwloc_temp = hwloc_types[i]; 1856 kmp_hw_t temp = types[i]; 1857 types[i] = types[j]; 1858 types[j] = temp; 1859 hwloc_types[i] = hwloc_types[j]; 1860 hwloc_types[j] = hwloc_temp; 1861 } 1862 1863 // 
Allocate the data structure to be returned. 1864 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); 1865 1866 hw_thread_index = 0; 1867 pu = NULL; 1868 while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) { 1869 int index = depth - 1; 1870 bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask); 1871 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 1872 if (included) { 1873 hw_thread.clear(); 1874 hw_thread.ids[index] = pu->logical_index; 1875 hw_thread.os_id = pu->os_index; 1876 // If multiple core types, then set that attribute for the hardware thread 1877 #if HWLOC_API_VERSION >= 0x00020400 1878 if (cpukinds) { 1879 int cpukind_index = -1; 1880 for (int i = 0; i < nr_cpu_kinds; ++i) { 1881 if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) { 1882 cpukind_index = i; 1883 break; 1884 } 1885 } 1886 if (cpukind_index >= 0) { 1887 hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type); 1888 hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency); 1889 } 1890 } 1891 #endif 1892 index--; 1893 } 1894 obj = pu; 1895 prev = obj; 1896 while (obj != root && obj != NULL) { 1897 obj = obj->parent; 1898 #if HWLOC_API_VERSION >= 0x00020000 1899 // NUMA Nodes are handled differently since they are not within the 1900 // parent/child structure anymore. They are separate children 1901 // of obj (memory_first_child points to first memory child) 1902 if (obj->memory_arity) { 1903 hwloc_obj_t memory; 1904 for (memory = obj->memory_first_child; memory; 1905 memory = hwloc_get_next_child(tp, obj, memory)) { 1906 if (memory->type == HWLOC_OBJ_NUMANODE) 1907 break; 1908 } 1909 if (memory && memory->type == HWLOC_OBJ_NUMANODE) { 1910 sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev); 1911 if (included) { 1912 hw_thread.ids[index] = memory->logical_index; 1913 hw_thread.ids[index + 1] = sub_id; 1914 index--; 1915 } 1916 prev = memory; 1917 } 1918 prev = obj; 1919 } 1920 #endif 1921 type = __kmp_hwloc_type_2_topology_type(obj); 1922 if (type != KMP_HW_UNKNOWN) { 1923 sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev); 1924 if (included) { 1925 hw_thread.ids[index] = obj->logical_index; 1926 hw_thread.ids[index + 1] = sub_id; 1927 index--; 1928 } 1929 prev = obj; 1930 } 1931 } 1932 if (included) 1933 hw_thread_index++; 1934 } 1935 1936 #if HWLOC_API_VERSION >= 0x00020400 1937 // Free the core types information 1938 if (cpukinds) { 1939 for (int idx = 0; idx < nr_cpu_kinds; ++idx) 1940 hwloc_bitmap_free(cpukinds[idx].mask); 1941 __kmp_free(cpukinds); 1942 } 1943 #endif 1944 __kmp_topology->sort_ids(); 1945 return true; 1946 } 1947 #endif // KMP_USE_HWLOC 1948 1949 // If we don't know how to retrieve the machine's processor topology, or 1950 // encounter an error in doing so, this routine is called to form a "flat" 1951 // mapping of os thread id's <-> processor id's. 1952 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) { 1953 *msg_id = kmp_i18n_null; 1954 int depth = 3; 1955 kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD}; 1956 1957 if (__kmp_affinity.flags.verbose) { 1958 KMP_INFORM(UsingFlatOS, "KMP_AFFINITY"); 1959 } 1960 1961 // Even if __kmp_affinity.type == affinity_none, this routine might still 1962 // be called to set __kmp_ncores, as well as 1963 // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 
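// As an illustration (hypothetical 4-processor machine with all procs in the
// full mask; not taken from this file), the flat map treats every OS proc as
// its own single-core, single-thread package, so OS proc i becomes
//   { socket = i, core = 0, thread = 0 }
// and the code below ends up with __kmp_ncores == nPackages == 4 and
// nCoresPerPkg == __kmp_nThreadsPerCore == 1.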
1964 if (!KMP_AFFINITY_CAPABLE()) { 1965 KMP_ASSERT(__kmp_affinity.type == affinity_none); 1966 __kmp_ncores = nPackages = __kmp_xproc; 1967 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1968 return true; 1969 } 1970 1971 // When affinity is off, this routine will still be called to set 1972 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1973 // Make sure all these vars are set correctly, and return now if affinity is 1974 // not enabled. 1975 __kmp_ncores = nPackages = __kmp_avail_proc; 1976 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1977 1978 // Construct the data structure to be returned. 1979 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); 1980 int avail_ct = 0; 1981 int i; 1982 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 1983 // Skip this proc if it is not included in the machine model. 1984 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 1985 continue; 1986 } 1987 kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct); 1988 hw_thread.clear(); 1989 hw_thread.os_id = i; 1990 hw_thread.ids[0] = i; 1991 hw_thread.ids[1] = 0; 1992 hw_thread.ids[2] = 0; 1993 avail_ct++; 1994 } 1995 if (__kmp_affinity.flags.verbose) { 1996 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); 1997 } 1998 return true; 1999 } 2000 2001 #if KMP_GROUP_AFFINITY 2002 // If multiple Windows* OS processor groups exist, we can create a 2-level 2003 // topology map with the groups at level 0 and the individual procs at level 1. 2004 // This facilitates letting the threads float among all procs in a group, 2005 // if granularity=group (the default when there are multiple groups). 2006 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) { 2007 *msg_id = kmp_i18n_null; 2008 int depth = 3; 2009 kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD}; 2010 const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR); 2011 2012 if (__kmp_affinity.flags.verbose) { 2013 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 2014 } 2015 2016 // If we aren't affinity capable, then use flat topology 2017 if (!KMP_AFFINITY_CAPABLE()) { 2018 KMP_ASSERT(__kmp_affinity.type == affinity_none); 2019 nPackages = __kmp_num_proc_groups; 2020 __kmp_nThreadsPerCore = 1; 2021 __kmp_ncores = __kmp_xproc; 2022 nCoresPerPkg = nPackages / __kmp_ncores; 2023 return true; 2024 } 2025 2026 // Construct the data structure to be returned. 2027 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); 2028 int avail_ct = 0; 2029 int i; 2030 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 2031 // Skip this proc if it is not included in the machine model. 
2032 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 2033 continue; 2034 } 2035 kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++); 2036 hw_thread.clear(); 2037 hw_thread.os_id = i; 2038 hw_thread.ids[0] = i / BITS_PER_GROUP; 2039 hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP; 2040 } 2041 return true; 2042 } 2043 #endif /* KMP_GROUP_AFFINITY */ 2044 2045 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 2046 2047 template <kmp_uint32 LSB, kmp_uint32 MSB> 2048 static inline unsigned __kmp_extract_bits(kmp_uint32 v) { 2049 const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB; 2050 const kmp_uint32 SHIFT_RIGHT = LSB; 2051 kmp_uint32 retval = v; 2052 retval <<= SHIFT_LEFT; 2053 retval >>= (SHIFT_LEFT + SHIFT_RIGHT); 2054 return retval; 2055 } 2056 2057 static int __kmp_cpuid_mask_width(int count) { 2058 int r = 0; 2059 2060 while ((1 << r) < count) 2061 ++r; 2062 return r; 2063 } 2064 2065 class apicThreadInfo { 2066 public: 2067 unsigned osId; // param to __kmp_affinity_bind_thread 2068 unsigned apicId; // from cpuid after binding 2069 unsigned maxCoresPerPkg; // "" 2070 unsigned maxThreadsPerPkg; // "" 2071 unsigned pkgId; // inferred from above values 2072 unsigned coreId; // "" 2073 unsigned threadId; // "" 2074 }; 2075 2076 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, 2077 const void *b) { 2078 const apicThreadInfo *aa = (const apicThreadInfo *)a; 2079 const apicThreadInfo *bb = (const apicThreadInfo *)b; 2080 if (aa->pkgId < bb->pkgId) 2081 return -1; 2082 if (aa->pkgId > bb->pkgId) 2083 return 1; 2084 if (aa->coreId < bb->coreId) 2085 return -1; 2086 if (aa->coreId > bb->coreId) 2087 return 1; 2088 if (aa->threadId < bb->threadId) 2089 return -1; 2090 if (aa->threadId > bb->threadId) 2091 return 1; 2092 return 0; 2093 } 2094 2095 class kmp_cache_info_t { 2096 public: 2097 struct info_t { 2098 unsigned level, mask; 2099 }; 2100 kmp_cache_info_t() : depth(0) { get_leaf4_levels(); } 2101 size_t get_depth() const { return depth; } 2102 info_t &operator[](size_t index) { return table[index]; } 2103 const info_t &operator[](size_t index) const { return table[index]; } 2104 2105 static kmp_hw_t get_topology_type(unsigned level) { 2106 KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL); 2107 switch (level) { 2108 case 1: 2109 return KMP_HW_L1; 2110 case 2: 2111 return KMP_HW_L2; 2112 case 3: 2113 return KMP_HW_L3; 2114 } 2115 return KMP_HW_UNKNOWN; 2116 } 2117 2118 private: 2119 static const int MAX_CACHE_LEVEL = 3; 2120 2121 size_t depth; 2122 info_t table[MAX_CACHE_LEVEL]; 2123 2124 void get_leaf4_levels() { 2125 unsigned level = 0; 2126 while (depth < MAX_CACHE_LEVEL) { 2127 unsigned cache_type, max_threads_sharing; 2128 unsigned cache_level, cache_mask_width; 2129 kmp_cpuid buf2; 2130 __kmp_x86_cpuid(4, level, &buf2); 2131 cache_type = __kmp_extract_bits<0, 4>(buf2.eax); 2132 if (!cache_type) 2133 break; 2134 // Skip instruction caches 2135 if (cache_type == 2) { 2136 level++; 2137 continue; 2138 } 2139 max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1; 2140 cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing); 2141 cache_level = __kmp_extract_bits<5, 7>(buf2.eax); 2142 table[depth].level = cache_level; 2143 table[depth].mask = ((-1) << cache_mask_width); 2144 depth++; 2145 level++; 2146 } 2147 } 2148 }; 2149 2150 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use 2151 // an algorithm which cycles through the available os threads, setting 2152 // the current thread's affinity mask to that thread, and 
then retrieves 2153 // the Apic Id for each thread context using the cpuid instruction. 2154 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) { 2155 kmp_cpuid buf; 2156 *msg_id = kmp_i18n_null; 2157 2158 if (__kmp_affinity.flags.verbose) { 2159 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 2160 } 2161 2162 // Check if cpuid leaf 4 is supported. 2163 __kmp_x86_cpuid(0, 0, &buf); 2164 if (buf.eax < 4) { 2165 *msg_id = kmp_i18n_str_NoLeaf4Support; 2166 return false; 2167 } 2168 2169 // The algorithm used starts by setting the affinity to each available thread 2170 // and retrieving info from the cpuid instruction, so if we are not capable of 2171 // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we 2172 // need to do something else - use the defaults that we calculated from 2173 // issuing cpuid without binding to each proc. 2174 if (!KMP_AFFINITY_CAPABLE()) { 2175 // Hack to try and infer the machine topology using only the data 2176 // available from cpuid on the current thread, and __kmp_xproc. 2177 KMP_ASSERT(__kmp_affinity.type == affinity_none); 2178 2179 // Get an upper bound on the number of threads per package using cpuid(1). 2180 // On some OS/chip combinations where HT is supported by the chip but is 2181 // disabled, this value will be 2 on a single core chip. Usually, it will be 2182 // 2 if HT is enabled and 1 if HT is disabled. 2183 __kmp_x86_cpuid(1, 0, &buf); 2184 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 2185 if (maxThreadsPerPkg == 0) { 2186 maxThreadsPerPkg = 1; 2187 } 2188 2189 // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded 2190 // value. 2191 // 2192 // The author of cpu_count.cpp treated this as only an upper bound on the 2193 // number of cores, but I haven't seen any cases where it was greater than 2194 // the actual number of cores, so we will treat it as exact in this block of 2195 // code. 2196 // 2197 // First, we need to check if cpuid(4) is supported on this chip. To see if 2198 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or 2199 // greater. 2200 __kmp_x86_cpuid(0, 0, &buf); 2201 if (buf.eax >= 4) { 2202 __kmp_x86_cpuid(4, 0, &buf); 2203 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 2204 } else { 2205 nCoresPerPkg = 1; 2206 } 2207 2208 // There is no way to reliably tell if HT is enabled without issuing the 2209 // cpuid instruction from every thread, and correlating the cpuid info, so 2210 // if the machine is not affinity capable, we assume that HT is off. We have 2211 // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine 2212 // does not support HT. 2213 // 2214 // - Older OSes are usually found on machines with older chips, which do not 2215 // support HT. 2216 // - The performance penalty for mistakenly identifying a machine as HT when 2217 // it isn't (which results in blocktime being incorrectly set to 0) is 2218 // greater than the penalty for mistakenly identifying a machine as 2219 // being 1 thread/core when it is really HT enabled (which results in 2220 // blocktime being incorrectly set to a positive value). 2221 __kmp_ncores = __kmp_xproc; 2222 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 2223 __kmp_nThreadsPerCore = 1; 2224 return true; 2225 } 2226 2227 // From here on, we can assume that it is safe to call 2228 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 2229 // __kmp_affinity.type = affinity_none.
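// Worked example of the fallback arithmetic in the !KMP_AFFINITY_CAPABLE()
// block above (hypothetical values, not taken from this file): with
// __kmp_xproc = 12 and cpuid(4) reporting nCoresPerPkg = 8, the rounded-up
// division gives nPackages = (12 + 8 - 1) / 8 = 2, while __kmp_ncores = 12 and
// __kmp_nThreadsPerCore = 1 because HT is assumed to be off.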
2230 2231 // Save the affinity mask for the current thread. 2232 kmp_affinity_raii_t previous_affinity; 2233 2234 // Run through each of the available contexts, binding the current thread 2235 // to it, and obtaining the pertinent information using the cpuid instr. 2236 // 2237 // The relevant information is: 2238 // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context 2239 // has a unique Apic Id, which is of the form pkg# : core# : thread#. 2240 // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value 2241 // of this field determines the width of the core# + thread# fields in the 2242 // Apic Id. It is also an upper bound on the number of threads per 2243 // package, but it has been verified that situations happen where it is not 2244 // exact. In particular, on certain OS/chip combinations where Intel(R) 2245 // Hyper-Threading Technology is supported by the chip but has been 2246 // disabled, the value of this field will be 2 (for a single core chip). 2247 // On other OS/chip combinations supporting Intel(R) Hyper-Threading 2248 // Technology, the value of this field will be 1 when Intel(R) 2249 // Hyper-Threading Technology is disabled and 2 when it is enabled. 2250 // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value 2251 // of this field (+1) determines the width of the core# field in the Apic 2252 // Id. The comments in "cpucount.cpp" say that this value is an upper 2253 // bound, but the IA-32 architecture manual says that it is exactly the 2254 // number of cores per package, and I haven't seen any case where it 2255 // wasn't. 2256 // 2257 // From this information, deduce the package Id, core Id, and thread Id, 2258 // and set the corresponding fields in the apicThreadInfo struct. 2259 unsigned i; 2260 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate( 2261 __kmp_avail_proc * sizeof(apicThreadInfo)); 2262 unsigned nApics = 0; 2263 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 2264 // Skip this proc if it is not included in the machine model. 2265 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 2266 continue; 2267 } 2268 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); 2269 2270 __kmp_affinity_dispatch->bind_thread(i); 2271 threadInfo[nApics].osId = i; 2272 2273 // The apic id and max threads per pkg come from cpuid(1). 2274 __kmp_x86_cpuid(1, 0, &buf); 2275 if (((buf.edx >> 9) & 1) == 0) { 2276 __kmp_free(threadInfo); 2277 *msg_id = kmp_i18n_str_ApicNotPresent; 2278 return false; 2279 } 2280 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 2281 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 2282 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 2283 threadInfo[nApics].maxThreadsPerPkg = 1; 2284 } 2285 2286 // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded 2287 // value. 2288 // 2289 // First, we need to check if cpuid(4) is supported on this chip. To see if 2290 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n 2291 // or greater. 2292 __kmp_x86_cpuid(0, 0, &buf); 2293 if (buf.eax >= 4) { 2294 __kmp_x86_cpuid(4, 0, &buf); 2295 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 2296 } else { 2297 threadInfo[nApics].maxCoresPerPkg = 1; 2298 } 2299 2300 // Infer the pkgId / coreId / threadId using only the info obtained locally.
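// Worked example of the decoding below (hypothetical values, not taken from
// this file): suppose cpuid(1) reports maxThreadsPerPkg = 8, cpuid(4) reports
// maxCoresPerPkg = 4, and the Apic Id for this context is 0x1d (binary 11101).
//   widthCT  = __kmp_cpuid_mask_width(8) = 3   // core# + thread# bits
//   pkgId    = 0x1d >> 3 = 3
//   widthC   = __kmp_cpuid_mask_width(4) = 2   // core# bits
//   widthT   = 3 - 2 = 1                       // thread# bits
//   coreId   = (0x1d >> 1) & 0x3 = 2
//   threadId = 0x1d & 0x1 = 1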
2301 int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg); 2302 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 2303 2304 int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg); 2305 int widthT = widthCT - widthC; 2306 if (widthT < 0) { 2307 // I've never seen this one happen, but I suppose it could, if the cpuid 2308 // instruction on a chip was really screwed up. Make sure to restore the 2309 // affinity mask before the tail call. 2310 __kmp_free(threadInfo); 2311 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2312 return false; 2313 } 2314 2315 int maskC = (1 << widthC) - 1; 2316 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC; 2317 2318 int maskT = (1 << widthT) - 1; 2319 threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT; 2320 2321 nApics++; 2322 } 2323 2324 // We've collected all the info we need. 2325 // Restore the old affinity mask for this thread. 2326 previous_affinity.restore(); 2327 2328 // Sort the threadInfo table by physical Id. 2329 qsort(threadInfo, nApics, sizeof(*threadInfo), 2330 __kmp_affinity_cmp_apicThreadInfo_phys_id); 2331 2332 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2333 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2334 // the chips on a system. Although coreId's are usually assigned 2335 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2336 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2337 // 2338 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2339 // total # packages) are at this point - we want to determine that now. We 2340 // only have an upper bound on the first two figures. 2341 // 2342 // We also perform a consistency check at this point: the values returned by 2343 // the cpuid instruction for any thread bound to a given package had better 2344 // return the same info for maxThreadsPerPkg and maxCoresPerPkg. 2345 nPackages = 1; 2346 nCoresPerPkg = 1; 2347 __kmp_nThreadsPerCore = 1; 2348 unsigned nCores = 1; 2349 2350 unsigned pkgCt = 1; // to determine radii 2351 unsigned lastPkgId = threadInfo[0].pkgId; 2352 unsigned coreCt = 1; 2353 unsigned lastCoreId = threadInfo[0].coreId; 2354 unsigned threadCt = 1; 2355 unsigned lastThreadId = threadInfo[0].threadId; 2356 2357 // intra-pkg consist checks 2358 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 2359 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 2360 2361 for (i = 1; i < nApics; i++) { 2362 if (threadInfo[i].pkgId != lastPkgId) { 2363 nCores++; 2364 pkgCt++; 2365 lastPkgId = threadInfo[i].pkgId; 2366 if ((int)coreCt > nCoresPerPkg) 2367 nCoresPerPkg = coreCt; 2368 coreCt = 1; 2369 lastCoreId = threadInfo[i].coreId; 2370 if ((int)threadCt > __kmp_nThreadsPerCore) 2371 __kmp_nThreadsPerCore = threadCt; 2372 threadCt = 1; 2373 lastThreadId = threadInfo[i].threadId; 2374 2375 // This is a different package, so go on to the next iteration without 2376 // doing any consistency checks. Reset the consistency check vars, though. 
2377 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 2378 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 2379 continue; 2380 } 2381 2382 if (threadInfo[i].coreId != lastCoreId) { 2383 nCores++; 2384 coreCt++; 2385 lastCoreId = threadInfo[i].coreId; 2386 if ((int)threadCt > __kmp_nThreadsPerCore) 2387 __kmp_nThreadsPerCore = threadCt; 2388 threadCt = 1; 2389 lastThreadId = threadInfo[i].threadId; 2390 } else if (threadInfo[i].threadId != lastThreadId) { 2391 threadCt++; 2392 lastThreadId = threadInfo[i].threadId; 2393 } else { 2394 __kmp_free(threadInfo); 2395 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 2396 return false; 2397 } 2398 2399 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 2400 // fields agree between all the threads bound to a given package. 2401 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || 2402 (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 2403 __kmp_free(threadInfo); 2404 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 2405 return false; 2406 } 2407 } 2408 // When affinity is off, this routine will still be called to set 2409 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2410 // Make sure all these vars are set correctly 2411 nPackages = pkgCt; 2412 if ((int)coreCt > nCoresPerPkg) 2413 nCoresPerPkg = coreCt; 2414 if ((int)threadCt > __kmp_nThreadsPerCore) 2415 __kmp_nThreadsPerCore = threadCt; 2416 __kmp_ncores = nCores; 2417 KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc); 2418 2419 // Now that we've determined the number of packages, the number of cores per 2420 // package, and the number of threads per core, we can construct the data 2421 // structure that is to be returned. 2422 int idx = 0; 2423 int pkgLevel = 0; 2424 int coreLevel = 1; 2425 int threadLevel = 2; 2426 //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ?
2 : 1); 2427 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 2428 kmp_hw_t types[3]; 2429 if (pkgLevel >= 0) 2430 types[idx++] = KMP_HW_SOCKET; 2431 if (coreLevel >= 0) 2432 types[idx++] = KMP_HW_CORE; 2433 if (threadLevel >= 0) 2434 types[idx++] = KMP_HW_THREAD; 2435 2436 KMP_ASSERT(depth > 0); 2437 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); 2438 2439 for (i = 0; i < nApics; ++i) { 2440 idx = 0; 2441 unsigned os = threadInfo[i].osId; 2442 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2443 hw_thread.clear(); 2444 2445 if (pkgLevel >= 0) { 2446 hw_thread.ids[idx++] = threadInfo[i].pkgId; 2447 } 2448 if (coreLevel >= 0) { 2449 hw_thread.ids[idx++] = threadInfo[i].coreId; 2450 } 2451 if (threadLevel >= 0) { 2452 hw_thread.ids[idx++] = threadInfo[i].threadId; 2453 } 2454 hw_thread.os_id = os; 2455 } 2456 2457 __kmp_free(threadInfo); 2458 __kmp_topology->sort_ids(); 2459 if (!__kmp_topology->check_ids()) { 2460 kmp_topology_t::deallocate(__kmp_topology); 2461 __kmp_topology = nullptr; 2462 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 2463 return false; 2464 } 2465 return true; 2466 } 2467 2468 // Hybrid cpu detection using CPUID.1A 2469 // Thread should be pinned to processor already 2470 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency, 2471 unsigned *native_model_id) { 2472 kmp_cpuid buf; 2473 __kmp_x86_cpuid(0x1a, 0, &buf); 2474 *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax); 2475 switch (*type) { 2476 case KMP_HW_CORE_TYPE_ATOM: 2477 *efficiency = 0; 2478 break; 2479 case KMP_HW_CORE_TYPE_CORE: 2480 *efficiency = 1; 2481 break; 2482 default: 2483 *efficiency = 0; 2484 } 2485 *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); 2486 } 2487 2488 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 2489 // architectures support a newer interface for specifying the x2APIC Ids, 2490 // based on CPUID.B or CPUID.1F 2491 /* 2492 * CPUID.B or 1F, Input ECX (sub leaf # aka level number) 2493 Bits Bits Bits Bits 2494 31-16 15-8 7-4 4-0 2495 ---+-----------+--------------+-------------+-----------------+ 2496 EAX| reserved | reserved | reserved | Bits to Shift | 2497 ---+-----------|--------------+-------------+-----------------| 2498 EBX| reserved | Num logical processors at level (16 bits) | 2499 ---+-----------|--------------+-------------------------------| 2500 ECX| reserved | Level Type | Level Number (8 bits) | 2501 ---+-----------+--------------+-------------------------------| 2502 EDX| X2APIC ID (32 bits) | 2503 ---+----------------------------------------------------------+ 2504 */ 2505 2506 enum { 2507 INTEL_LEVEL_TYPE_INVALID = 0, // Package level 2508 INTEL_LEVEL_TYPE_SMT = 1, 2509 INTEL_LEVEL_TYPE_CORE = 2, 2510 INTEL_LEVEL_TYPE_MODULE = 3, 2511 INTEL_LEVEL_TYPE_TILE = 4, 2512 INTEL_LEVEL_TYPE_DIE = 5, 2513 INTEL_LEVEL_TYPE_LAST = 6, 2514 }; 2515 2516 struct cpuid_level_info_t { 2517 unsigned level_type, mask, mask_width, nitems, cache_mask; 2518 }; 2519 2520 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { 2521 switch (intel_type) { 2522 case INTEL_LEVEL_TYPE_INVALID: 2523 return KMP_HW_SOCKET; 2524 case INTEL_LEVEL_TYPE_SMT: 2525 return KMP_HW_THREAD; 2526 case INTEL_LEVEL_TYPE_CORE: 2527 return KMP_HW_CORE; 2528 case INTEL_LEVEL_TYPE_TILE: 2529 return KMP_HW_TILE; 2530 case INTEL_LEVEL_TYPE_MODULE: 2531 return KMP_HW_MODULE; 2532 case INTEL_LEVEL_TYPE_DIE: 2533 return KMP_HW_DIE; 2534 } 2535 return KMP_HW_UNKNOWN; 2536 } 2537 2538 // This function takes 
the topology leaf, a levels array to store the levels 2539 // detected and a bitmap of the known levels. 2540 // Returns the number of levels in the topology 2541 static unsigned 2542 __kmp_x2apicid_get_levels(int leaf, 2543 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], 2544 kmp_uint64 known_levels) { 2545 unsigned level, levels_index; 2546 unsigned level_type, mask_width, nitems; 2547 kmp_cpuid buf; 2548 2549 // New algorithm has known topology layers act as highest unknown topology 2550 // layers when unknown topology layers exist. 2551 // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z> 2552 // are unknown topology layers, Then SMT will take the characteristics of 2553 // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>). 2554 // This eliminates unknown portions of the topology while still keeping the 2555 // correct structure. 2556 level = levels_index = 0; 2557 do { 2558 __kmp_x86_cpuid(leaf, level, &buf); 2559 level_type = __kmp_extract_bits<8, 15>(buf.ecx); 2560 mask_width = __kmp_extract_bits<0, 4>(buf.eax); 2561 nitems = __kmp_extract_bits<0, 15>(buf.ebx); 2562 if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) 2563 return 0; 2564 2565 if (known_levels & (1ull << level_type)) { 2566 // Add a new level to the topology 2567 KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); 2568 levels[levels_index].level_type = level_type; 2569 levels[levels_index].mask_width = mask_width; 2570 levels[levels_index].nitems = nitems; 2571 levels_index++; 2572 } else { 2573 // If it is an unknown level, then logically move the previous layer up 2574 if (levels_index > 0) { 2575 levels[levels_index - 1].mask_width = mask_width; 2576 levels[levels_index - 1].nitems = nitems; 2577 } 2578 } 2579 level++; 2580 } while (level_type != INTEL_LEVEL_TYPE_INVALID); 2581 2582 // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first 2583 if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID) 2584 return 0; 2585 2586 // Set the masks to & with apicid 2587 for (unsigned i = 0; i < levels_index; ++i) { 2588 if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { 2589 levels[i].mask = ~((-1) << levels[i].mask_width); 2590 levels[i].cache_mask = (-1) << levels[i].mask_width; 2591 for (unsigned j = 0; j < i; ++j) 2592 levels[i].mask ^= levels[j].mask; 2593 } else { 2594 KMP_DEBUG_ASSERT(i > 0); 2595 levels[i].mask = (-1) << levels[i - 1].mask_width; 2596 levels[i].cache_mask = 0; 2597 } 2598 } 2599 return levels_index; 2600 } 2601 2602 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { 2603 2604 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; 2605 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; 2606 unsigned levels_index; 2607 kmp_cpuid buf; 2608 kmp_uint64 known_levels; 2609 int topology_leaf, highest_leaf, apic_id; 2610 int num_leaves; 2611 static int leaves[] = {0, 0}; 2612 2613 kmp_i18n_id_t leaf_message_id; 2614 2615 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 2616 2617 *msg_id = kmp_i18n_null; 2618 if (__kmp_affinity.flags.verbose) { 2619 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 2620 } 2621 2622 // Figure out the known topology levels 2623 known_levels = 0ull; 2624 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 2625 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 2626 known_levels |= (1ull << i); 2627 } 2628 } 2629 2630 // Get the highest cpuid leaf supported 2631 __kmp_x86_cpuid(0, 0, &buf); 2632 highest_leaf = buf.eax; 2633 2634 // If a 
specific topology method was requested, only allow that specific leaf; 2635 // otherwise, try both leaves 31 and 11 in that order 2636 num_leaves = 0; 2637 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 2638 num_leaves = 1; 2639 leaves[0] = 11; 2640 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 2641 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 2642 num_leaves = 1; 2643 leaves[0] = 31; 2644 leaf_message_id = kmp_i18n_str_NoLeaf31Support; 2645 } else { 2646 num_leaves = 2; 2647 leaves[0] = 31; 2648 leaves[1] = 11; 2649 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 2650 } 2651 2652 // Check to see if cpuid leaf 31 or 11 is supported. 2653 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2654 topology_leaf = -1; 2655 for (int i = 0; i < num_leaves; ++i) { 2656 int leaf = leaves[i]; 2657 if (highest_leaf < leaf) 2658 continue; 2659 __kmp_x86_cpuid(leaf, 0, &buf); 2660 if (buf.ebx == 0) 2661 continue; 2662 topology_leaf = leaf; 2663 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); 2664 if (levels_index == 0) 2665 continue; 2666 break; 2667 } 2668 if (topology_leaf == -1 || levels_index == 0) { 2669 *msg_id = leaf_message_id; 2670 return false; 2671 } 2672 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); 2673 2674 // The algorithm used starts by setting the affinity to each available thread 2675 // and retrieving info from the cpuid instruction, so if we are not capable of 2676 // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then 2677 // we need to do something else - use the defaults that we calculated from 2678 // issuing cpuid without binding to each proc. 2679 if (!KMP_AFFINITY_CAPABLE()) { 2680 // Hack to try and infer the machine topology using only the data 2681 // available from cpuid on the current thread, and __kmp_xproc. 2682 KMP_ASSERT(__kmp_affinity.type == affinity_none); 2683 for (unsigned i = 0; i < levels_index; ++i) { 2684 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 2685 __kmp_nThreadsPerCore = levels[i].nitems; 2686 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 2687 nCoresPerPkg = levels[i].nitems; 2688 } 2689 } 2690 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 2691 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 2692 return true; 2693 } 2694 2695 // Allocate the data structure to be returned. 2696 int depth = levels_index; 2697 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 2698 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 2699 __kmp_topology = 2700 kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); 2701 2702 // Insert equivalent cache types if they exist 2703 kmp_cache_info_t cache_info; 2704 for (size_t i = 0; i < cache_info.get_depth(); ++i) { 2705 const kmp_cache_info_t::info_t &info = cache_info[i]; 2706 unsigned cache_mask = info.mask; 2707 unsigned cache_level = info.level; 2708 for (unsigned j = 0; j < levels_index; ++j) { 2709 unsigned hw_cache_mask = levels[j].cache_mask; 2710 kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); 2711 if (hw_cache_mask == cache_mask && j < levels_index - 1) { 2712 kmp_hw_t type = 2713 __kmp_intel_type_2_topology_type(levels[j + 1].level_type); 2714 __kmp_topology->set_equivalent_type(cache_type, type); 2715 } 2716 } 2717 } 2718 2719 // From here on, we can assume that it is safe to call 2720 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 2721 // __kmp_affinity.type = affinity_none.
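// Worked example of the level masks computed above (hypothetical widths, not
// taken from this file): if the topology leaf reports an SMT level with
// mask_width 1 and a CORE level with mask_width 5, __kmp_x2apicid_get_levels()
// yields
//   SMT:  mask = 0x01, cache_mask = ~0x01
//   CORE: mask = 0x1e (0x1f with the SMT bit removed), cache_mask = ~0x1f
//   PKG:  mask = ~0x1f (all bits above the core field)
// so an x2APIC id of 0x53 decodes in the per-thread loop below as
//   thread = 0x53 & 0x01          = 1
//   core   = (0x53 & 0x1e) >> 1   = 9
//   socket = (0x53 & ~0x1f) >> 5  = 2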
2722 2723 // Save the affinity mask for the current thread. 2724 kmp_affinity_raii_t previous_affinity; 2725 2726 // Run through each of the available contexts, binding the current thread 2727 // to it, and obtaining the pertinent information using the cpuid instr. 2728 unsigned int proc; 2729 int hw_thread_index = 0; 2730 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 2731 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 2732 unsigned my_levels_index; 2733 2734 // Skip this proc if it is not included in the machine model. 2735 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 2736 continue; 2737 } 2738 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); 2739 2740 __kmp_affinity_dispatch->bind_thread(proc); 2741 2742 // New algorithm 2743 __kmp_x86_cpuid(topology_leaf, 0, &buf); 2744 apic_id = buf.edx; 2745 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 2746 my_levels_index = 2747 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 2748 if (my_levels_index == 0 || my_levels_index != levels_index) { 2749 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2750 return false; 2751 } 2752 hw_thread.clear(); 2753 hw_thread.os_id = proc; 2754 // Put in topology information 2755 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 2756 hw_thread.ids[idx] = apic_id & my_levels[j].mask; 2757 if (j > 0) { 2758 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; 2759 } 2760 } 2761 // Hybrid information 2762 if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { 2763 kmp_hw_core_type_t type; 2764 unsigned native_model_id; 2765 int efficiency; 2766 __kmp_get_hybrid_info(&type, &efficiency, &native_model_id); 2767 hw_thread.attrs.set_core_type(type); 2768 hw_thread.attrs.set_core_eff(efficiency); 2769 } 2770 hw_thread_index++; 2771 } 2772 KMP_ASSERT(hw_thread_index > 0); 2773 __kmp_topology->sort_ids(); 2774 if (!__kmp_topology->check_ids()) { 2775 kmp_topology_t::deallocate(__kmp_topology); 2776 __kmp_topology = nullptr; 2777 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 2778 return false; 2779 } 2780 return true; 2781 } 2782 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2783 2784 #define osIdIndex 0 2785 #define threadIdIndex 1 2786 #define coreIdIndex 2 2787 #define pkgIdIndex 3 2788 #define nodeIdIndex 4 2789 2790 typedef unsigned *ProcCpuInfo; 2791 static unsigned maxIndex = pkgIdIndex; 2792 2793 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2794 const void *b) { 2795 unsigned i; 2796 const unsigned *aa = *(unsigned *const *)a; 2797 const unsigned *bb = *(unsigned *const *)b; 2798 for (i = maxIndex;; i--) { 2799 if (aa[i] < bb[i]) 2800 return -1; 2801 if (aa[i] > bb[i]) 2802 return 1; 2803 if (i == osIdIndex) 2804 break; 2805 } 2806 return 0; 2807 } 2808 2809 #if KMP_USE_HIER_SCHED 2810 // Set the array sizes for the hierarchy layers 2811 static void __kmp_dispatch_set_hierarchy_values() { 2812 // Set the maximum number of L1's to number of cores 2813 // Set the maximum number of L2's to either number of cores / 2 for 2814 // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing 2815 // Or the number of cores for Intel(R) Xeon(R) processors 2816 // Set the maximum number of NUMA nodes and L3's to number of packages 2817 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2818 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2819 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2820 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2821 KMP_MIC_SUPPORTED 2822 if 
(__kmp_mic_type >= mic3) 2823 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2824 else 2825 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2826 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2827 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2828 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2829 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2830 // Set the number of threads per unit 2831 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2832 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2833 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2834 __kmp_nThreadsPerCore; 2835 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2836 KMP_MIC_SUPPORTED 2837 if (__kmp_mic_type >= mic3) 2838 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2839 2 * __kmp_nThreadsPerCore; 2840 else 2841 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2842 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2843 __kmp_nThreadsPerCore; 2844 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2845 nCoresPerPkg * __kmp_nThreadsPerCore; 2846 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2847 nCoresPerPkg * __kmp_nThreadsPerCore; 2848 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2849 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2850 } 2851 2852 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2853 // i.e., this thread's L1 or this thread's L2, etc. 2854 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2855 int index = type + 1; 2856 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2857 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2858 if (type == kmp_hier_layer_e::LAYER_THREAD) 2859 return tid; 2860 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2861 return 0; 2862 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2863 if (tid >= num_hw_threads) 2864 tid = tid % num_hw_threads; 2865 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2866 } 2867 2868 // Return the number of t1's per t2 2869 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2870 int i1 = t1 + 1; 2871 int i2 = t2 + 1; 2872 KMP_DEBUG_ASSERT(i1 <= i2); 2873 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2874 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2875 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2876 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2877 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2878 } 2879 #endif // KMP_USE_HIER_SCHED 2880 2881 static inline const char *__kmp_cpuinfo_get_filename() { 2882 const char *filename; 2883 if (__kmp_cpuinfo_file != nullptr) 2884 filename = __kmp_cpuinfo_file; 2885 else 2886 filename = "/proc/cpuinfo"; 2887 return filename; 2888 } 2889 2890 static inline const char *__kmp_cpuinfo_get_envvar() { 2891 const char *envvar = nullptr; 2892 if (__kmp_cpuinfo_file != nullptr) 2893 envvar = "KMP_CPUINFO_FILE"; 2894 return envvar; 2895 } 2896 2897 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2898 // affinity map. 
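// For reference, the parser below expects records shaped like the ones Linux
// /proc/cpuinfo produces on x86 (illustrative excerpt, not taken from this
// file):
//   processor   : 0
//   physical id : 0
//   core id     : 1
//   <blank line ends the record>
// "processor" fills osIdIndex, "physical id" fills pkgIdIndex, "core id" fills
// coreIdIndex, and optional "thread id" / "node_<n> id" lines fill
// threadIdIndex and nodeIdIndex + n.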
2899 static bool __kmp_affinity_create_cpuinfo_map(int *line, 2900 kmp_i18n_id_t *const msg_id) { 2901 const char *filename = __kmp_cpuinfo_get_filename(); 2902 const char *envvar = __kmp_cpuinfo_get_envvar(); 2903 *msg_id = kmp_i18n_null; 2904 2905 if (__kmp_affinity.flags.verbose) { 2906 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 2907 } 2908 2909 kmp_safe_raii_file_t f(filename, "r", envvar); 2910 2911 // Scan the file and count the number of "processor" (osId) fields, 2912 // and find the highest value of <n> for a node_<n> field. 2913 char buf[256]; 2914 unsigned num_records = 0; 2915 while (!feof(f)) { 2916 buf[sizeof(buf) - 1] = 1; 2917 if (!fgets(buf, sizeof(buf), f)) { 2918 // Read errors presumably because of EOF 2919 break; 2920 } 2921 2922 char s1[] = "processor"; 2923 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2924 num_records++; 2925 continue; 2926 } 2927 2928 // FIXME - this will match "node_<n> <garbage>" 2929 unsigned level; 2930 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2931 // validate the input first: 2932 if (level > (unsigned)__kmp_xproc) { // level is too big 2933 level = __kmp_xproc; 2934 } 2935 if (nodeIdIndex + level >= maxIndex) { 2936 maxIndex = nodeIdIndex + level; 2937 } 2938 continue; 2939 } 2940 } 2941 2942 // Check for empty file / no valid processor records, or too many. The number 2943 // of records can't exceed the number of valid bits in the affinity mask. 2944 if (num_records == 0) { 2945 *msg_id = kmp_i18n_str_NoProcRecords; 2946 return false; 2947 } 2948 if (num_records > (unsigned)__kmp_xproc) { 2949 *msg_id = kmp_i18n_str_TooManyProcRecords; 2950 return false; 2951 } 2952 2953 // Set the file pointer back to the beginning, so that we can scan the file 2954 // again, this time performing a full parse of the data. Allocate a vector of 2955 // ProcCpuInfo objects, where we will place the data. Adding an extra element 2956 // at the end allows us to remove a lot of extra checks for termination 2957 // conditions. 2958 if (fseek(f, 0, SEEK_SET) != 0) { 2959 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2960 return false; 2961 } 2962 2963 // Allocate the array of records to store the proc info in. The dummy 2964 // element at the end makes the logic in filling them out easier to code. 2965 unsigned **threadInfo = 2966 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2967 unsigned i; 2968 for (i = 0; i <= num_records; i++) { 2969 threadInfo[i] = 2970 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2971 } 2972 2973 #define CLEANUP_THREAD_INFO \ 2974 for (i = 0; i <= num_records; i++) { \ 2975 __kmp_free(threadInfo[i]); \ 2976 } \ 2977 __kmp_free(threadInfo); 2978 2979 // A value of UINT_MAX means that we didn't find the field 2980 unsigned __index; 2981 2982 #define INIT_PROC_INFO(p) \ 2983 for (__index = 0; __index <= maxIndex; __index++) { \ 2984 (p)[__index] = UINT_MAX; \ 2985 } 2986 2987 for (i = 0; i <= num_records; i++) { 2988 INIT_PROC_INFO(threadInfo[i]); 2989 } 2990 2991 unsigned num_avail = 0; 2992 *line = 0; 2993 #if KMP_ARCH_S390X 2994 bool reading_s390x_sys_info = true; 2995 #endif 2996 while (!feof(f)) { 2997 // Create an inner scoping level, so that all the goto targets at the end of 2998 // the loop appear in an outer scoping level. This avoids warnings about 2999 // jumping past an initialization to a target in the same block.
3000 { 3001 buf[sizeof(buf) - 1] = 1; 3002 bool long_line = false; 3003 if (!fgets(buf, sizeof(buf), f)) { 3004 // Read errors presumably because of EOF 3005 // If there is valid data in threadInfo[num_avail], then fake 3006 // a blank line to ensure that the last address gets parsed. 3007 bool valid = false; 3008 for (i = 0; i <= maxIndex; i++) { 3009 if (threadInfo[num_avail][i] != UINT_MAX) { 3010 valid = true; 3011 } 3012 } 3013 if (!valid) { 3014 break; 3015 } 3016 buf[0] = 0; 3017 } else if (!buf[sizeof(buf) - 1]) { 3018 // The line is longer than the buffer. Set a flag and don't 3019 // emit an error if we were going to ignore the line, anyway. 3020 long_line = true; 3021 3022 #define CHECK_LINE \ 3023 if (long_line) { \ 3024 CLEANUP_THREAD_INFO; \ 3025 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 3026 return false; \ 3027 } 3028 } 3029 (*line)++; 3030 3031 #if KMP_ARCH_LOONGARCH64 3032 // The parsing logic of /proc/cpuinfo in this function highly depends on 3033 // the blank lines between each processor info block. But on LoongArch a 3034 // blank line exists before the first processor info block (i.e. after the 3035 // "system type" line). This blank line was added because the "system 3036 // type" line is unrelated to any of the CPUs. We must skip this line so 3037 // that the original logic works on LoongArch. 3038 if (*buf == '\n' && *line == 2) 3039 continue; 3040 #endif 3041 #if KMP_ARCH_S390X 3042 // s390x /proc/cpuinfo starts with a variable number of lines containing 3043 // the overall system information. Skip them. 3044 if (reading_s390x_sys_info) { 3045 if (*buf == '\n') 3046 reading_s390x_sys_info = false; 3047 continue; 3048 } 3049 #endif 3050 3051 #if KMP_ARCH_S390X 3052 char s1[] = "cpu number"; 3053 #else 3054 char s1[] = "processor"; 3055 #endif 3056 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 3057 CHECK_LINE; 3058 char *p = strchr(buf + sizeof(s1) - 1, ':'); 3059 unsigned val; 3060 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 3061 goto no_val; 3062 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 3063 #if KMP_ARCH_AARCH64 3064 // Handle the old AArch64 /proc/cpuinfo layout differently; 3065 // it contains all of the 'processor' entries listed in a 3066 // single 'Processor' section, therefore the normal looking 3067 // for duplicates in that section will always fail. 3068 num_avail++; 3069 #else 3070 goto dup_field; 3071 #endif 3072 threadInfo[num_avail][osIdIndex] = val; 3073 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 3074 char path[256]; 3075 KMP_SNPRINTF( 3076 path, sizeof(path), 3077 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 3078 threadInfo[num_avail][osIdIndex]); 3079 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 3080 3081 #if KMP_ARCH_S390X 3082 // Disambiguate physical_package_id.
3083 unsigned book_id; 3084 KMP_SNPRINTF(path, sizeof(path), 3085 "/sys/devices/system/cpu/cpu%u/topology/book_id", 3086 threadInfo[num_avail][osIdIndex]); 3087 __kmp_read_from_file(path, "%u", &book_id); 3088 threadInfo[num_avail][pkgIdIndex] |= (book_id << 8); 3089 3090 unsigned drawer_id; 3091 KMP_SNPRINTF(path, sizeof(path), 3092 "/sys/devices/system/cpu/cpu%u/topology/drawer_id", 3093 threadInfo[num_avail][osIdIndex]); 3094 __kmp_read_from_file(path, "%u", &drawer_id); 3095 threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16); 3096 #endif 3097 3098 KMP_SNPRINTF(path, sizeof(path), 3099 "/sys/devices/system/cpu/cpu%u/topology/core_id", 3100 threadInfo[num_avail][osIdIndex]); 3101 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 3102 continue; 3103 #else 3104 } 3105 char s2[] = "physical id"; 3106 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 3107 CHECK_LINE; 3108 char *p = strchr(buf + sizeof(s2) - 1, ':'); 3109 unsigned val; 3110 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 3111 goto no_val; 3112 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 3113 goto dup_field; 3114 threadInfo[num_avail][pkgIdIndex] = val; 3115 continue; 3116 } 3117 char s3[] = "core id"; 3118 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 3119 CHECK_LINE; 3120 char *p = strchr(buf + sizeof(s3) - 1, ':'); 3121 unsigned val; 3122 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 3123 goto no_val; 3124 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 3125 goto dup_field; 3126 threadInfo[num_avail][coreIdIndex] = val; 3127 continue; 3128 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 3129 } 3130 char s4[] = "thread id"; 3131 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 3132 CHECK_LINE; 3133 char *p = strchr(buf + sizeof(s4) - 1, ':'); 3134 unsigned val; 3135 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 3136 goto no_val; 3137 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 3138 goto dup_field; 3139 threadInfo[num_avail][threadIdIndex] = val; 3140 continue; 3141 } 3142 unsigned level; 3143 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 3144 CHECK_LINE; 3145 char *p = strchr(buf + sizeof(s4) - 1, ':'); 3146 unsigned val; 3147 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 3148 goto no_val; 3149 // validate the input before using level: 3150 if (level > (unsigned)__kmp_xproc) { // level is too big 3151 level = __kmp_xproc; 3152 } 3153 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 3154 goto dup_field; 3155 threadInfo[num_avail][nodeIdIndex + level] = val; 3156 continue; 3157 } 3158 3159 // We didn't recognize the leading token on the line. There are lots of 3160 // leading tokens that we don't recognize - if the line isn't empty, go on 3161 // to the next line. 3162 if ((*buf != 0) && (*buf != '\n')) { 3163 // If the line is longer than the buffer, read characters 3164 // until we find a newline. 3165 if (long_line) { 3166 int ch; 3167 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 3168 ; 3169 } 3170 continue; 3171 } 3172 3173 // A newline has signalled the end of the processor record. 3174 // Check that there aren't too many procs specified. 3175 if ((int)num_avail == __kmp_xproc) { 3176 CLEANUP_THREAD_INFO; 3177 *msg_id = kmp_i18n_str_TooManyEntries; 3178 return false; 3179 } 3180 3181 // Check for missing fields. The osId field must be there, and we 3182 // currently require that the physical id field is specified, also. 
3183 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 3184 CLEANUP_THREAD_INFO; 3185 *msg_id = kmp_i18n_str_MissingProcField; 3186 return false; 3187 } 3188 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 3189 CLEANUP_THREAD_INFO; 3190 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 3191 return false; 3192 } 3193 3194 // Skip this proc if it is not included in the machine model. 3195 if (KMP_AFFINITY_CAPABLE() && 3196 !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 3197 __kmp_affin_fullMask)) { 3198 INIT_PROC_INFO(threadInfo[num_avail]); 3199 continue; 3200 } 3201 3202 // We have a successful parse of this proc's info. 3203 // Increment the counter, and prepare for the next proc. 3204 num_avail++; 3205 KMP_ASSERT(num_avail <= num_records); 3206 INIT_PROC_INFO(threadInfo[num_avail]); 3207 } 3208 continue; 3209 3210 no_val: 3211 CLEANUP_THREAD_INFO; 3212 *msg_id = kmp_i18n_str_MissingValCpuinfo; 3213 return false; 3214 3215 dup_field: 3216 CLEANUP_THREAD_INFO; 3217 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 3218 return false; 3219 } 3220 *line = 0; 3221 3222 #if KMP_MIC && REDUCE_TEAM_SIZE 3223 unsigned teamSize = 0; 3224 #endif // KMP_MIC && REDUCE_TEAM_SIZE 3225 3226 // check for num_records == __kmp_xproc ??? 3227 3228 // If it is configured to omit the package level when there is only a single 3229 // package, the logic at the end of this routine won't work if there is only a 3230 // single thread 3231 KMP_ASSERT(num_avail > 0); 3232 KMP_ASSERT(num_avail <= num_records); 3233 3234 // Sort the threadInfo table by physical Id. 3235 qsort(threadInfo, num_avail, sizeof(*threadInfo), 3236 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 3237 3238 // The table is now sorted by pkgId / coreId / threadId, but we really don't 3239 // know the radix of any of the fields. pkgId's may be sparsely assigned among 3240 // the chips on a system. Although coreId's are usually assigned 3241 // [0 .. coresPerPkg-1] and threadId's are usually assigned 3242 // [0..threadsPerCore-1], we don't want to make any such assumptions. 3243 // 3244 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 3245 // total # packages) are at this point - we want to determine that now. We 3246 // only have an upper bound on the first two figures. 3247 unsigned *counts = 3248 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 3249 unsigned *maxCt = 3250 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 3251 unsigned *totals = 3252 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 3253 unsigned *lastId = 3254 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 3255 3256 bool assign_thread_ids = false; 3257 unsigned threadIdCt; 3258 unsigned index; 3259 3260 restart_radix_check: 3261 threadIdCt = 0; 3262 3263 // Initialize the counter arrays with data from threadInfo[0]. 3264 if (assign_thread_ids) { 3265 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 3266 threadInfo[0][threadIdIndex] = threadIdCt++; 3267 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 3268 threadIdCt = threadInfo[0][threadIdIndex] + 1; 3269 } 3270 } 3271 for (index = 0; index <= maxIndex; index++) { 3272 counts[index] = 1; 3273 maxCt[index] = 1; 3274 totals[index] = 1; 3275 lastId[index] = threadInfo[0][index]; 3276 ; 3277 } 3278 3279 // Run through the rest of the OS procs. 3280 for (i = 1; i < num_avail; i++) { 3281 // Find the most significant index whose id differs from the id for the 3282 // previous OS proc. 
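// Illustrative walk-through (hypothetical sorted table, not taken from this
// file): with records (pkg, core, thread) = (0,0,0) (0,0,1) (0,1,0) (0,1,1),
// the scan below sees the thread id change at i = 1 and the core id change at
// i = 2 (which resets the per-core thread count), ending with
//   maxCt[threadIdIndex] = 2, maxCt[coreIdIndex] = 2, totals[pkgIdIndex] = 1
// i.e. 1 package x 2 cores x 2 threads per core.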
3283 for (index = maxIndex; index >= threadIdIndex; index--) {
3284 if (assign_thread_ids && (index == threadIdIndex)) {
3285 // Auto-assign the thread id field if it wasn't specified.
3286 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3287 threadInfo[i][threadIdIndex] = threadIdCt++;
3288 }
3289 // Apparently the thread id field was specified for some entries and not
3290 // others. Start the thread id counter off at the next higher thread id.
3291 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3292 threadIdCt = threadInfo[i][threadIdIndex] + 1;
3293 }
3294 }
3295 if (threadInfo[i][index] != lastId[index]) {
3296 // Run through all indices which are less significant, and reset the
3297 // counts to 1. At all levels up to and including index, we need to
3298 // increment the totals and record the last id.
3299 unsigned index2;
3300 for (index2 = threadIdIndex; index2 < index; index2++) {
3301 totals[index2]++;
3302 if (counts[index2] > maxCt[index2]) {
3303 maxCt[index2] = counts[index2];
3304 }
3305 counts[index2] = 1;
3306 lastId[index2] = threadInfo[i][index2];
3307 }
3308 counts[index]++;
3309 totals[index]++;
3310 lastId[index] = threadInfo[i][index];
3311
3312 if (assign_thread_ids && (index > threadIdIndex)) {
3313
3314 #if KMP_MIC && REDUCE_TEAM_SIZE
3315 // The default team size is the total #threads in the machine
3316 // minus 1 thread for every core that has 3 or more threads.
3317 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3318 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3319
3320 // Restart the thread counter, as we are on a new core.
3321 threadIdCt = 0;
3322
3323 // Auto-assign the thread id field if it wasn't specified.
3324 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3325 threadInfo[i][threadIdIndex] = threadIdCt++;
3326 }
3327
3328 // Apparently the thread id field was specified for some entries and
3329 // not others. Start the thread id counter off at the next higher
3330 // thread id.
3331 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3332 threadIdCt = threadInfo[i][threadIdIndex] + 1;
3333 }
3334 }
3335 break;
3336 }
3337 }
3338 if (index < threadIdIndex) {
3339 // If thread ids were specified, it is an error if they are not unique.
3340 // Also, check that we haven't already restarted the loop (to be safe -
3341 // shouldn't need to).
3342 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
3343 __kmp_free(lastId);
3344 __kmp_free(totals);
3345 __kmp_free(maxCt);
3346 __kmp_free(counts);
3347 CLEANUP_THREAD_INFO;
3348 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3349 return false;
3350 }
3351
3352 // If the thread ids were not specified and we see entries that
3353 // are duplicates, start the loop over and assign the thread ids manually.
3354 assign_thread_ids = true;
3355 goto restart_radix_check;
3356 }
3357 }
3358
3359 #if KMP_MIC && REDUCE_TEAM_SIZE
3360 // The default team size is the total #threads in the machine
3361 // minus 1 thread for every core that has 3 or more threads.
3362 teamSize += (threadIdCt <= 2) ?
(threadIdCt) : (threadIdCt - 1); 3363 #endif // KMP_MIC && REDUCE_TEAM_SIZE 3364 3365 for (index = threadIdIndex; index <= maxIndex; index++) { 3366 if (counts[index] > maxCt[index]) { 3367 maxCt[index] = counts[index]; 3368 } 3369 } 3370 3371 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 3372 nCoresPerPkg = maxCt[coreIdIndex]; 3373 nPackages = totals[pkgIdIndex]; 3374 3375 // When affinity is off, this routine will still be called to set 3376 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 3377 // Make sure all these vars are set correctly, and return now if affinity is 3378 // not enabled. 3379 __kmp_ncores = totals[coreIdIndex]; 3380 if (!KMP_AFFINITY_CAPABLE()) { 3381 KMP_ASSERT(__kmp_affinity.type == affinity_none); 3382 return true; 3383 } 3384 3385 #if KMP_MIC && REDUCE_TEAM_SIZE 3386 // Set the default team size. 3387 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 3388 __kmp_dflt_team_nth = teamSize; 3389 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 3390 "__kmp_dflt_team_nth = %d\n", 3391 __kmp_dflt_team_nth)); 3392 } 3393 #endif // KMP_MIC && REDUCE_TEAM_SIZE 3394 3395 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 3396 3397 // Count the number of levels which have more nodes at that level than at the 3398 // parent's level (with there being an implicit root node of the top level). 3399 // This is equivalent to saying that there is at least one node at this level 3400 // which has a sibling. These levels are in the map, and the package level is 3401 // always in the map. 3402 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 3403 for (index = threadIdIndex; index < maxIndex; index++) { 3404 KMP_ASSERT(totals[index] >= totals[index + 1]); 3405 inMap[index] = (totals[index] > totals[index + 1]); 3406 } 3407 inMap[maxIndex] = (totals[maxIndex] > 1); 3408 inMap[pkgIdIndex] = true; 3409 inMap[coreIdIndex] = true; 3410 inMap[threadIdIndex] = true; 3411 3412 int depth = 0; 3413 int idx = 0; 3414 kmp_hw_t types[KMP_HW_LAST]; 3415 int pkgLevel = -1; 3416 int coreLevel = -1; 3417 int threadLevel = -1; 3418 for (index = threadIdIndex; index <= maxIndex; index++) { 3419 if (inMap[index]) { 3420 depth++; 3421 } 3422 } 3423 if (inMap[pkgIdIndex]) { 3424 pkgLevel = idx; 3425 types[idx++] = KMP_HW_SOCKET; 3426 } 3427 if (inMap[coreIdIndex]) { 3428 coreLevel = idx; 3429 types[idx++] = KMP_HW_CORE; 3430 } 3431 if (inMap[threadIdIndex]) { 3432 threadLevel = idx; 3433 types[idx++] = KMP_HW_THREAD; 3434 } 3435 KMP_ASSERT(depth > 0); 3436 3437 // Construct the data structure that is to be returned. 
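// In the common case where no node_<n> levels were present in cpuinfo, this
// produces depth == 3 with types[] = {KMP_HW_SOCKET, KMP_HW_CORE,
// KMP_HW_THREAD} and pkgLevel/coreLevel/threadLevel == 0/1/2, since the
// package, core and thread levels are always kept in the map.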
3438 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); 3439 3440 for (i = 0; i < num_avail; ++i) { 3441 unsigned os = threadInfo[i][osIdIndex]; 3442 int src_index; 3443 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 3444 hw_thread.clear(); 3445 hw_thread.os_id = os; 3446 3447 idx = 0; 3448 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 3449 if (!inMap[src_index]) { 3450 continue; 3451 } 3452 if (src_index == pkgIdIndex) { 3453 hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; 3454 } else if (src_index == coreIdIndex) { 3455 hw_thread.ids[coreLevel] = threadInfo[i][src_index]; 3456 } else if (src_index == threadIdIndex) { 3457 hw_thread.ids[threadLevel] = threadInfo[i][src_index]; 3458 } 3459 } 3460 } 3461 3462 __kmp_free(inMap); 3463 __kmp_free(lastId); 3464 __kmp_free(totals); 3465 __kmp_free(maxCt); 3466 __kmp_free(counts); 3467 CLEANUP_THREAD_INFO; 3468 __kmp_topology->sort_ids(); 3469 if (!__kmp_topology->check_ids()) { 3470 kmp_topology_t::deallocate(__kmp_topology); 3471 __kmp_topology = nullptr; 3472 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 3473 return false; 3474 } 3475 return true; 3476 } 3477 3478 // Create and return a table of affinity masks, indexed by OS thread ID. 3479 // This routine handles OR'ing together all the affinity masks of threads 3480 // that are sufficiently close, if granularity > fine. 3481 template <typename FindNextFunctionType> 3482 static void __kmp_create_os_id_masks(unsigned *numUnique, 3483 kmp_affinity_t &affinity, 3484 FindNextFunctionType find_next) { 3485 // First form a table of affinity masks in order of OS thread id. 3486 int maxOsId; 3487 int i; 3488 int numAddrs = __kmp_topology->get_num_hw_threads(); 3489 int depth = __kmp_topology->get_depth(); 3490 const char *env_var = __kmp_get_affinity_env_var(affinity); 3491 KMP_ASSERT(numAddrs); 3492 KMP_ASSERT(depth); 3493 3494 i = find_next(-1); 3495 // If could not find HW thread location with attributes, then return and 3496 // fallback to increment find_next and disregard core attributes. 3497 if (i >= numAddrs) 3498 return; 3499 3500 maxOsId = 0; 3501 for (i = numAddrs - 1;; --i) { 3502 int osId = __kmp_topology->at(i).os_id; 3503 if (osId > maxOsId) { 3504 maxOsId = osId; 3505 } 3506 if (i == 0) 3507 break; 3508 } 3509 affinity.num_os_id_masks = maxOsId + 1; 3510 KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks); 3511 KMP_ASSERT(affinity.gran_levels >= 0); 3512 if (affinity.flags.verbose && (affinity.gran_levels > 0)) { 3513 KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels); 3514 } 3515 if (affinity.gran_levels >= (int)depth) { 3516 KMP_AFF_WARNING(affinity, AffThreadsMayMigrate); 3517 } 3518 3519 // Run through the table, forming the masks for all threads on each core. 3520 // Threads on the same core will have identical kmp_hw_thread_t objects, not 3521 // considering the last level, which must be the thread id. All threads on a 3522 // core will appear consecutively. 
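// For example (hypothetical machine), with an affinity granularity of "core"
// and 2 hardware threads per core, both OS procs of a core end up with the
// same mask, containing both of their ids, in affinity.os_id_masks, and
// *numUnique becomes the number of cores rather than the number of OS procs.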
3523 int unique = 0; 3524 int j = 0; // index of 1st thread on core 3525 int leader = 0; 3526 kmp_affin_mask_t *sum; 3527 KMP_CPU_ALLOC_ON_STACK(sum); 3528 KMP_CPU_ZERO(sum); 3529 3530 i = j = leader = find_next(-1); 3531 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 3532 kmp_full_mask_modifier_t full_mask; 3533 for (i = find_next(i); i < numAddrs; i = find_next(i)) { 3534 // If this thread is sufficiently close to the leader (within the 3535 // granularity setting), then set the bit for this os thread in the 3536 // affinity mask for this group, and go on to the next thread. 3537 if (__kmp_topology->is_close(leader, i, affinity)) { 3538 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 3539 continue; 3540 } 3541 3542 // For every thread in this group, copy the mask to the thread's entry in 3543 // the OS Id mask table. Mark the first address as a leader. 3544 for (; j < i; j = find_next(j)) { 3545 int osId = __kmp_topology->at(j).os_id; 3546 KMP_DEBUG_ASSERT(osId <= maxOsId); 3547 kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId); 3548 KMP_CPU_COPY(mask, sum); 3549 __kmp_topology->at(j).leader = (j == leader); 3550 } 3551 unique++; 3552 3553 // Start a new mask. 3554 leader = i; 3555 full_mask.include(sum); 3556 KMP_CPU_ZERO(sum); 3557 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 3558 } 3559 3560 // For every thread in last group, copy the mask to the thread's 3561 // entry in the OS Id mask table. 3562 for (; j < i; j = find_next(j)) { 3563 int osId = __kmp_topology->at(j).os_id; 3564 KMP_DEBUG_ASSERT(osId <= maxOsId); 3565 kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId); 3566 KMP_CPU_COPY(mask, sum); 3567 __kmp_topology->at(j).leader = (j == leader); 3568 } 3569 full_mask.include(sum); 3570 unique++; 3571 KMP_CPU_FREE_FROM_STACK(sum); 3572 3573 // See if the OS Id mask table further restricts or changes the full mask 3574 if (full_mask.restrict_to_mask() && affinity.flags.verbose) { 3575 __kmp_topology->print(env_var); 3576 } 3577 3578 *numUnique = unique; 3579 } 3580 3581 // Stuff for the affinity proclist parsers. It's easier to declare these vars 3582 // as file-static than to try and pass them through the calling sequence of 3583 // the recursive-descent OMP_PLACES parser. 3584 static kmp_affin_mask_t *newMasks; 3585 static int numNewMasks; 3586 static int nextNewMask; 3587 3588 #define ADD_MASK(_mask) \ 3589 { \ 3590 if (nextNewMask >= numNewMasks) { \ 3591 int i; \ 3592 numNewMasks *= 2; \ 3593 kmp_affin_mask_t *temp; \ 3594 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 3595 for (i = 0; i < numNewMasks / 2; i++) { \ 3596 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 3597 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 3598 KMP_CPU_COPY(dest, src); \ 3599 } \ 3600 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 3601 newMasks = temp; \ 3602 } \ 3603 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 3604 nextNewMask++; \ 3605 } 3606 3607 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 3608 { \ 3609 if (((_osId) > _maxOsId) || \ 3610 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 3611 KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \ 3612 } else { \ 3613 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 3614 } \ 3615 } 3616 3617 // Re-parse the proclist (for the explicit affinity type), and form the list 3618 // of affinity newMasks indexed by gtid. 
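// Illustrative example (hypothetical OS proc ids, granularity "fine"): with
// KMP_AFFINITY=explicit,proclist=[{0,2},4-7:2,9] the parser below produces
// four masks: {0,2} from the braced set, {4} and {6} from the range 4-7 taken
// with stride 2, and {9} from the trailing single id. Invalid or unavailable
// ids are dropped with a warning rather than treated as hard errors.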
3619 static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) { 3620 int i; 3621 kmp_affin_mask_t **out_masks = &affinity.masks; 3622 unsigned *out_numMasks = &affinity.num_masks; 3623 const char *proclist = affinity.proclist; 3624 kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; 3625 int maxOsId = affinity.num_os_id_masks - 1; 3626 const char *scan = proclist; 3627 const char *next = proclist; 3628 3629 // We use malloc() for the temporary mask vector, so that we can use 3630 // realloc() to extend it. 3631 numNewMasks = 2; 3632 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3633 nextNewMask = 0; 3634 kmp_affin_mask_t *sumMask; 3635 KMP_CPU_ALLOC(sumMask); 3636 int setSize = 0; 3637 3638 for (;;) { 3639 int start, end, stride; 3640 3641 SKIP_WS(scan); 3642 next = scan; 3643 if (*next == '\0') { 3644 break; 3645 } 3646 3647 if (*next == '{') { 3648 int num; 3649 setSize = 0; 3650 next++; // skip '{' 3651 SKIP_WS(next); 3652 scan = next; 3653 3654 // Read the first integer in the set. 3655 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 3656 SKIP_DIGITS(next); 3657 num = __kmp_str_to_int(scan, *next); 3658 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3659 3660 // Copy the mask for that osId to the sum (union) mask. 3661 if ((num > maxOsId) || 3662 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3663 KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); 3664 KMP_CPU_ZERO(sumMask); 3665 } else { 3666 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3667 setSize = 1; 3668 } 3669 3670 for (;;) { 3671 // Check for end of set. 3672 SKIP_WS(next); 3673 if (*next == '}') { 3674 next++; // skip '}' 3675 break; 3676 } 3677 3678 // Skip optional comma. 3679 if (*next == ',') { 3680 next++; 3681 } 3682 SKIP_WS(next); 3683 3684 // Read the next integer in the set. 3685 scan = next; 3686 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3687 3688 SKIP_DIGITS(next); 3689 num = __kmp_str_to_int(scan, *next); 3690 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3691 3692 // Add the mask for that osId to the sum mask. 3693 if ((num > maxOsId) || 3694 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3695 KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); 3696 } else { 3697 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3698 setSize++; 3699 } 3700 } 3701 if (setSize > 0) { 3702 ADD_MASK(sumMask); 3703 } 3704 3705 SKIP_WS(next); 3706 if (*next == ',') { 3707 next++; 3708 } 3709 scan = next; 3710 continue; 3711 } 3712 3713 // Read the first integer. 3714 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3715 SKIP_DIGITS(next); 3716 start = __kmp_str_to_int(scan, *next); 3717 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 3718 SKIP_WS(next); 3719 3720 // If this isn't a range, then add a mask to the list and go on. 3721 if (*next != '-') { 3722 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3723 3724 // Skip optional comma. 3725 if (*next == ',') { 3726 next++; 3727 } 3728 scan = next; 3729 continue; 3730 } 3731 3732 // This is a range. Skip over the '-' and read in the 2nd int. 3733 next++; // skip '-' 3734 SKIP_WS(next); 3735 scan = next; 3736 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3737 SKIP_DIGITS(next); 3738 end = __kmp_str_to_int(scan, *next); 3739 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 3740 3741 // Check for a stride parameter 3742 stride = 1; 3743 SKIP_WS(next); 3744 if (*next == ':') { 3745 // A stride is specified. 
Skip over the ':' and read the 3rd int.
3746 int sign = +1;
3747 next++; // skip ':'
3748 SKIP_WS(next);
3749 scan = next;
3750 if (*next == '-') {
3751 sign = -1;
3752 next++;
3753 SKIP_WS(next);
3754 scan = next;
3755 }
3756 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3757 SKIP_DIGITS(next);
3758 stride = __kmp_str_to_int(scan, *next);
3759 KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3760 stride *= sign;
3761 }
3762
3763 // Do some range checks.
3764 KMP_ASSERT2(stride != 0, "bad explicit proc list");
3765 if (stride > 0) {
3766 KMP_ASSERT2(start <= end, "bad explicit proc list");
3767 } else {
3768 KMP_ASSERT2(start >= end, "bad explicit proc list");
3769 }
3770 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3771
3772 // Add the mask for each OS proc # to the list.
3773 if (stride > 0) {
3774 do {
3775 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3776 start += stride;
3777 } while (start <= end);
3778 } else {
3779 do {
3780 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3781 start += stride;
3782 } while (start >= end);
3783 }
3784
3785 // Skip optional comma.
3786 SKIP_WS(next);
3787 if (*next == ',') {
3788 next++;
3789 }
3790 scan = next;
3791 }
3792
3793 *out_numMasks = nextNewMask;
3794 if (nextNewMask == 0) {
3795 *out_masks = NULL;
3796 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3797 return;
3798 }
3799 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3800 for (i = 0; i < nextNewMask; i++) {
3801 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3802 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3803 KMP_CPU_COPY(dest, src);
3804 }
3805 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3806 KMP_CPU_FREE(sumMask);
3807 }
3808
3809 /*-----------------------------------------------------------------------------
3810 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3811 places. Again, here is the grammar:
3812
3813 place_list := place
3814 place_list := place , place_list
3815 place := num
3816 place := place : num
3817 place := place : num : signed
3818 place := { subplace_list }
3819 place := !
place // (lowest priority) 3820 subplace_list := subplace 3821 subplace_list := subplace , subplace_list 3822 subplace := num 3823 subplace := num : num 3824 subplace := num : num : signed 3825 signed := num 3826 signed := + signed 3827 signed := - signed 3828 -----------------------------------------------------------------------------*/ 3829 static void __kmp_process_subplace_list(const char **scan, 3830 kmp_affinity_t &affinity, int maxOsId, 3831 kmp_affin_mask_t *tempMask, 3832 int *setSize) { 3833 const char *next; 3834 kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; 3835 3836 for (;;) { 3837 int start, count, stride, i; 3838 3839 // Read in the starting proc id 3840 SKIP_WS(*scan); 3841 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3842 next = *scan; 3843 SKIP_DIGITS(next); 3844 start = __kmp_str_to_int(*scan, *next); 3845 KMP_ASSERT(start >= 0); 3846 *scan = next; 3847 3848 // valid follow sets are ',' ':' and '}' 3849 SKIP_WS(*scan); 3850 if (**scan == '}' || **scan == ',') { 3851 if ((start > maxOsId) || 3852 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3853 KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); 3854 } else { 3855 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3856 (*setSize)++; 3857 } 3858 if (**scan == '}') { 3859 break; 3860 } 3861 (*scan)++; // skip ',' 3862 continue; 3863 } 3864 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3865 (*scan)++; // skip ':' 3866 3867 // Read count parameter 3868 SKIP_WS(*scan); 3869 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3870 next = *scan; 3871 SKIP_DIGITS(next); 3872 count = __kmp_str_to_int(*scan, *next); 3873 KMP_ASSERT(count >= 0); 3874 *scan = next; 3875 3876 // valid follow sets are ',' ':' and '}' 3877 SKIP_WS(*scan); 3878 if (**scan == '}' || **scan == ',') { 3879 for (i = 0; i < count; i++) { 3880 if ((start > maxOsId) || 3881 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3882 KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); 3883 break; // don't proliferate warnings for large count 3884 } else { 3885 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3886 start++; 3887 (*setSize)++; 3888 } 3889 } 3890 if (**scan == '}') { 3891 break; 3892 } 3893 (*scan)++; // skip ',' 3894 continue; 3895 } 3896 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3897 (*scan)++; // skip ':' 3898 3899 // Read stride parameter 3900 int sign = +1; 3901 for (;;) { 3902 SKIP_WS(*scan); 3903 if (**scan == '+') { 3904 (*scan)++; // skip '+' 3905 continue; 3906 } 3907 if (**scan == '-') { 3908 sign *= -1; 3909 (*scan)++; // skip '-' 3910 continue; 3911 } 3912 break; 3913 } 3914 SKIP_WS(*scan); 3915 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3916 next = *scan; 3917 SKIP_DIGITS(next); 3918 stride = __kmp_str_to_int(*scan, *next); 3919 KMP_ASSERT(stride >= 0); 3920 *scan = next; 3921 stride *= sign; 3922 3923 // valid follow sets are ',' and '}' 3924 SKIP_WS(*scan); 3925 if (**scan == '}' || **scan == ',') { 3926 for (i = 0; i < count; i++) { 3927 if ((start > maxOsId) || 3928 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3929 KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start); 3930 break; // don't proliferate warnings for large count 3931 } else { 3932 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3933 start += stride; 3934 (*setSize)++; 3935 } 3936 } 3937 if (**scan == '}') { 3938 break; 3939 } 3940 (*scan)++; // skip ',' 3941 continue; 3942 
} 3943 3944 KMP_ASSERT2(0, "bad explicit places list"); 3945 } 3946 } 3947 3948 static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity, 3949 int maxOsId, kmp_affin_mask_t *tempMask, 3950 int *setSize) { 3951 const char *next; 3952 kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; 3953 3954 // valid follow sets are '{' '!' and num 3955 SKIP_WS(*scan); 3956 if (**scan == '{') { 3957 (*scan)++; // skip '{' 3958 __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize); 3959 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3960 (*scan)++; // skip '}' 3961 } else if (**scan == '!') { 3962 (*scan)++; // skip '!' 3963 __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize); 3964 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3965 } else if ((**scan >= '0') && (**scan <= '9')) { 3966 next = *scan; 3967 SKIP_DIGITS(next); 3968 int num = __kmp_str_to_int(*scan, *next); 3969 KMP_ASSERT(num >= 0); 3970 if ((num > maxOsId) || 3971 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3972 KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num); 3973 } else { 3974 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3975 (*setSize)++; 3976 } 3977 *scan = next; // skip num 3978 } else { 3979 KMP_ASSERT2(0, "bad explicit places list"); 3980 } 3981 } 3982 3983 // static void 3984 void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) { 3985 int i, j, count, stride, sign; 3986 kmp_affin_mask_t **out_masks = &affinity.masks; 3987 unsigned *out_numMasks = &affinity.num_masks; 3988 const char *placelist = affinity.proclist; 3989 kmp_affin_mask_t *osId2Mask = affinity.os_id_masks; 3990 int maxOsId = affinity.num_os_id_masks - 1; 3991 const char *scan = placelist; 3992 const char *next = placelist; 3993 3994 numNewMasks = 2; 3995 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3996 nextNewMask = 0; 3997 3998 // tempMask is modified based on the previous or initial 3999 // place to form the current place 4000 // previousMask contains the previous place 4001 kmp_affin_mask_t *tempMask; 4002 kmp_affin_mask_t *previousMask; 4003 KMP_CPU_ALLOC(tempMask); 4004 KMP_CPU_ZERO(tempMask); 4005 KMP_CPU_ALLOC(previousMask); 4006 KMP_CPU_ZERO(previousMask); 4007 int setSize = 0; 4008 4009 for (;;) { 4010 __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize); 4011 4012 // valid follow sets are ',' ':' and EOL 4013 SKIP_WS(scan); 4014 if (*scan == '\0' || *scan == ',') { 4015 if (setSize > 0) { 4016 ADD_MASK(tempMask); 4017 } 4018 KMP_CPU_ZERO(tempMask); 4019 setSize = 0; 4020 if (*scan == '\0') { 4021 break; 4022 } 4023 scan++; // skip ',' 4024 continue; 4025 } 4026 4027 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 4028 scan++; // skip ':' 4029 4030 // Read count parameter 4031 SKIP_WS(scan); 4032 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 4033 next = scan; 4034 SKIP_DIGITS(next); 4035 count = __kmp_str_to_int(scan, *next); 4036 KMP_ASSERT(count >= 0); 4037 scan = next; 4038 4039 // valid follow sets are ',' ':' and EOL 4040 SKIP_WS(scan); 4041 if (*scan == '\0' || *scan == ',') { 4042 stride = +1; 4043 } else { 4044 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 4045 scan++; // skip ':' 4046 4047 // Read stride parameter 4048 sign = +1; 4049 for (;;) { 4050 SKIP_WS(scan); 4051 if (*scan == '+') { 4052 scan++; // skip '+' 4053 continue; 4054 } 4055 if (*scan == '-') { 4056 sign *= -1; 4057 scan++; // skip '-' 4058 continue; 4059 } 4060 break; 4061 } 4062 SKIP_WS(scan); 4063 KMP_ASSERT2((*scan >= '0') && 
(*scan <= '9'), "bad explicit places list"); 4064 next = scan; 4065 SKIP_DIGITS(next); 4066 stride = __kmp_str_to_int(scan, *next); 4067 KMP_DEBUG_ASSERT(stride >= 0); 4068 scan = next; 4069 stride *= sign; 4070 } 4071 4072 // Add places determined by initial_place : count : stride 4073 for (i = 0; i < count; i++) { 4074 if (setSize == 0) { 4075 break; 4076 } 4077 // Add the current place, then build the next place (tempMask) from that 4078 KMP_CPU_COPY(previousMask, tempMask); 4079 ADD_MASK(previousMask); 4080 KMP_CPU_ZERO(tempMask); 4081 setSize = 0; 4082 KMP_CPU_SET_ITERATE(j, previousMask) { 4083 if (!KMP_CPU_ISSET(j, previousMask)) { 4084 continue; 4085 } 4086 if ((j + stride > maxOsId) || (j + stride < 0) || 4087 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 4088 (!KMP_CPU_ISSET(j + stride, 4089 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 4090 if (i < count - 1) { 4091 KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride); 4092 } 4093 continue; 4094 } 4095 KMP_CPU_SET(j + stride, tempMask); 4096 setSize++; 4097 } 4098 } 4099 KMP_CPU_ZERO(tempMask); 4100 setSize = 0; 4101 4102 // valid follow sets are ',' and EOL 4103 SKIP_WS(scan); 4104 if (*scan == '\0') { 4105 break; 4106 } 4107 if (*scan == ',') { 4108 scan++; // skip ',' 4109 continue; 4110 } 4111 4112 KMP_ASSERT2(0, "bad explicit places list"); 4113 } 4114 4115 *out_numMasks = nextNewMask; 4116 if (nextNewMask == 0) { 4117 *out_masks = NULL; 4118 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 4119 return; 4120 } 4121 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 4122 KMP_CPU_FREE(tempMask); 4123 KMP_CPU_FREE(previousMask); 4124 for (i = 0; i < nextNewMask; i++) { 4125 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 4126 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 4127 KMP_CPU_COPY(dest, src); 4128 } 4129 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 4130 } 4131 4132 #undef ADD_MASK 4133 #undef ADD_MASK_OSID 4134 4135 // This function figures out the deepest level at which there is at least one 4136 // cluster/core with more than one processing unit bound to it. 4137 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { 4138 int core_level = 0; 4139 4140 for (int i = 0; i < nprocs; i++) { 4141 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 4142 for (int j = bottom_level; j > 0; j--) { 4143 if (hw_thread.ids[j] > 0) { 4144 if (core_level < (j - 1)) { 4145 core_level = j - 1; 4146 } 4147 } 4148 } 4149 } 4150 return core_level; 4151 } 4152 4153 // This function counts number of clusters/cores at given level. 4154 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, 4155 int core_level) { 4156 return __kmp_topology->get_count(core_level); 4157 } 4158 // This function finds to which cluster/core given processing unit is bound. 4159 static int __kmp_affinity_find_core(int proc, int bottom_level, 4160 int core_level) { 4161 int core = 0; 4162 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); 4163 for (int i = 0; i <= proc; ++i) { 4164 if (i + 1 <= proc) { 4165 for (int j = 0; j <= core_level; ++j) { 4166 if (__kmp_topology->at(i + 1).sub_ids[j] != 4167 __kmp_topology->at(i).sub_ids[j]) { 4168 core++; 4169 break; 4170 } 4171 } 4172 } 4173 } 4174 return core; 4175 } 4176 4177 // This function finds maximal number of processing units bound to a 4178 // cluster/core at given level. 
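// e.g., with core_level at the core layer of a machine with 2 hardware
// threads per core, this returns 2; when core_level is already the bottom
// level, each "core" is a single processing unit and the result is 1.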
4179 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
4180 int core_level) {
4181 if (core_level >= bottom_level)
4182 return 1;
4183 int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
4184 return __kmp_topology->calculate_ratio(thread_level, core_level);
4185 }
4186
4187 static int *procarr = NULL;
4188 static int __kmp_aff_depth = 0;
4189 static int *__kmp_osid_to_hwthread_map = NULL;
4190
4191 static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
4192 kmp_affinity_ids_t &ids,
4193 kmp_affinity_attrs_t &attrs) {
4194 if (!KMP_AFFINITY_CAPABLE())
4195 return;
4196
4197 // Initialize ids and attrs thread data
4198 for (int i = 0; i < KMP_HW_LAST; ++i)
4199 ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
4200 attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4201
4202 // Iterate through each os id within the mask and determine
4203 // the topology id and attribute information
4204 int cpu;
4205 int depth = __kmp_topology->get_depth();
4206 KMP_CPU_SET_ITERATE(cpu, mask) {
4207 int osid_idx = __kmp_osid_to_hwthread_map[cpu];
4208 ids.os_id = cpu;
4209 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
4210 for (int level = 0; level < depth; ++level) {
4211 kmp_hw_t type = __kmp_topology->get_type(level);
4212 int id = hw_thread.sub_ids[level];
4213 if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
4214 ids.ids[type] = id;
4215 } else {
4216 // This mask spans across multiple topology units, set it as such
4217 // and mark every level below as such as well.
4218 ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4219 for (; level < depth; ++level) {
4220 kmp_hw_t type = __kmp_topology->get_type(level);
4221 ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4222 }
4223 }
4224 }
4225 if (!attrs.valid) {
4226 attrs.core_type = hw_thread.attrs.get_core_type();
4227 attrs.core_eff = hw_thread.attrs.get_core_eff();
4228 attrs.valid = 1;
4229 } else {
4230 // This mask spans across multiple attributes, set it as such
4231 if (attrs.core_type != hw_thread.attrs.get_core_type())
4232 attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
4233 if (attrs.core_eff != hw_thread.attrs.get_core_eff())
4234 attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
4235 }
4236 }
4237 }
4238
4239 static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
4240 if (!KMP_AFFINITY_CAPABLE())
4241 return;
4242 const kmp_affin_mask_t *mask = th->th.th_affin_mask;
4243 kmp_affinity_ids_t &ids = th->th.th_topology_ids;
4244 kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
4245 __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4246 }
4247
4248 // Assign the topology information to each place in the place list
4249 // A thread can then grab not only its affinity mask, but the topology
4250 // information associated with that mask.
e.g., Which socket is a thread on 4251 static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) { 4252 if (!KMP_AFFINITY_CAPABLE()) 4253 return; 4254 if (affinity.type != affinity_none) { 4255 KMP_ASSERT(affinity.num_os_id_masks); 4256 KMP_ASSERT(affinity.os_id_masks); 4257 } 4258 KMP_ASSERT(affinity.num_masks); 4259 KMP_ASSERT(affinity.masks); 4260 KMP_ASSERT(__kmp_affin_fullMask); 4261 4262 int max_cpu = __kmp_affin_fullMask->get_max_cpu(); 4263 int num_hw_threads = __kmp_topology->get_num_hw_threads(); 4264 4265 // Allocate thread topology information 4266 if (!affinity.ids) { 4267 affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate( 4268 sizeof(kmp_affinity_ids_t) * affinity.num_masks); 4269 } 4270 if (!affinity.attrs) { 4271 affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate( 4272 sizeof(kmp_affinity_attrs_t) * affinity.num_masks); 4273 } 4274 if (!__kmp_osid_to_hwthread_map) { 4275 // Want the +1 because max_cpu should be valid index into map 4276 __kmp_osid_to_hwthread_map = 4277 (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1)); 4278 } 4279 4280 // Create the OS proc to hardware thread map 4281 for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) { 4282 int os_id = __kmp_topology->at(hw_thread).os_id; 4283 if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask)) 4284 __kmp_osid_to_hwthread_map[os_id] = hw_thread; 4285 } 4286 4287 for (unsigned i = 0; i < affinity.num_masks; ++i) { 4288 kmp_affinity_ids_t &ids = affinity.ids[i]; 4289 kmp_affinity_attrs_t &attrs = affinity.attrs[i]; 4290 kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i); 4291 __kmp_affinity_get_mask_topology_info(mask, ids, attrs); 4292 } 4293 } 4294 4295 // Called when __kmp_topology is ready 4296 static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) { 4297 // Initialize other data structures which depend on the topology 4298 if (__kmp_topology && __kmp_topology->get_num_hw_threads()) { 4299 machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); 4300 __kmp_affinity_get_topology_info(affinity); 4301 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED 4302 __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore(); 4303 #endif 4304 } 4305 } 4306 4307 // Create a one element mask array (set of places) which only contains the 4308 // initial process's affinity mask 4309 static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) { 4310 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4311 KMP_ASSERT(affinity.type == affinity_none); 4312 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); 4313 affinity.num_masks = 1; 4314 KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks); 4315 kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0); 4316 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 4317 __kmp_aux_affinity_initialize_other_data(affinity); 4318 } 4319 4320 static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) { 4321 // Create the "full" mask - this defines all of the processors that we 4322 // consider to be in the machine model. If respect is set, then it is the 4323 // initialization thread's affinity mask. Otherwise, it is all processors that 4324 // we know about on the machine. 
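// For example (hypothetical scenario): if the process was launched under
// "taskset -c 0-3" on a 16-proc Linux machine, then with "respect" set the
// full mask stays at {0,1,2,3}, while with "norespect" it is expanded to all
// 16 procs. __kmp_affin_origMask always keeps the pre-expansion copy so the
// original affinity can be restored in __kmp_affinity_uninitialize().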
4325 int verbose = affinity.flags.verbose;
4326 const char *env_var = affinity.env_var;
4327
4328 // Already initialized
4329 if (__kmp_affin_fullMask && __kmp_affin_origMask)
4330 return;
4331
4332 if (__kmp_affin_fullMask == NULL) {
4333 KMP_CPU_ALLOC(__kmp_affin_fullMask);
4334 }
4335 if (__kmp_affin_origMask == NULL) {
4336 KMP_CPU_ALLOC(__kmp_affin_origMask);
4337 }
4338 if (KMP_AFFINITY_CAPABLE()) {
4339 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4340 // Make a copy before possibly expanding to the entire machine mask
4341 __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4342 if (affinity.flags.respect) {
4343 // Count the number of available processors.
4344 unsigned i;
4345 __kmp_avail_proc = 0;
4346 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4347 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4348 continue;
4349 }
4350 __kmp_avail_proc++;
4351 }
4352 if (__kmp_avail_proc > __kmp_xproc) {
4353 KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4354 affinity.type = affinity_none;
4355 KMP_AFFINITY_DISABLE();
4356 return;
4357 }
4358
4359 if (verbose) {
4360 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4361 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4362 __kmp_affin_fullMask);
4363 KMP_INFORM(InitOSProcSetRespect, env_var, buf);
4364 }
4365 } else {
4366 if (verbose) {
4367 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4368 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4369 __kmp_affin_fullMask);
4370 KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
4371 }
4372 __kmp_avail_proc =
4373 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4374 #if KMP_OS_WINDOWS
4375 if (__kmp_num_proc_groups <= 1) {
4376 // Copy expanded full mask if topology has single processor group
4377 __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4378 }
4379 // Set the process affinity mask since threads' affinity
4380 // masks must be a subset of the process mask in Windows* OS
4381 __kmp_affin_fullMask->set_process_affinity(true);
4382 #endif
4383 }
4384 }
4385 }
4386
4387 static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
4388 bool success = false;
4389 const char *env_var = affinity.env_var;
4390 kmp_i18n_id_t msg_id = kmp_i18n_null;
4391 int verbose = affinity.flags.verbose;
4392
4393 // For backward compatibility, setting KMP_CPUINFO_FILE =>
4394 // KMP_TOPOLOGY_METHOD=cpuinfo
4395 if ((__kmp_cpuinfo_file != NULL) &&
4396 (__kmp_affinity_top_method == affinity_top_method_all)) {
4397 __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4398 }
4399
4400 if (__kmp_affinity_top_method == affinity_top_method_all) {
4401 // In the default code path, errors are not fatal - we just try using
4402 // another method. We only emit a warning message if affinity is on, or the
4403 // verbose flag is set, and the nowarnings flag was not set.
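// The discovery methods below are tried in order: hwloc (when built in), the
// x2apic id and legacy apic id methods (x86), /proc/cpuinfo (Linux), Windows
// processor groups, and finally the flat map, which is not allowed to fail.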
4404 #if KMP_USE_HWLOC
4405 if (!success &&
4406 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4407 if (!__kmp_hwloc_error) {
4408 success = __kmp_affinity_create_hwloc_map(&msg_id);
4409 if (!success && verbose) {
4410 KMP_INFORM(AffIgnoringHwloc, env_var);
4411 }
4412 } else if (verbose) {
4413 KMP_INFORM(AffIgnoringHwloc, env_var);
4414 }
4415 }
4416 #endif
4417
4418 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4419 if (!success) {
4420 success = __kmp_affinity_create_x2apicid_map(&msg_id);
4421 if (!success && verbose && msg_id != kmp_i18n_null) {
4422 KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4423 }
4424 }
4425 if (!success) {
4426 success = __kmp_affinity_create_apicid_map(&msg_id);
4427 if (!success && verbose && msg_id != kmp_i18n_null) {
4428 KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4429 }
4430 }
4431 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4432
4433 #if KMP_OS_LINUX
4434 if (!success) {
4435 int line = 0;
4436 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4437 if (!success && verbose && msg_id != kmp_i18n_null) {
4438 KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4439 }
4440 }
4441 #endif /* KMP_OS_LINUX */
4442
4443 #if KMP_GROUP_AFFINITY
4444 if (!success && (__kmp_num_proc_groups > 1)) {
4445 success = __kmp_affinity_create_proc_group_map(&msg_id);
4446 if (!success && verbose && msg_id != kmp_i18n_null) {
4447 KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4448 }
4449 }
4450 #endif /* KMP_GROUP_AFFINITY */
4451
4452 if (!success) {
4453 success = __kmp_affinity_create_flat_map(&msg_id);
4454 if (!success && verbose && msg_id != kmp_i18n_null) {
4455 KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4456 }
4457 KMP_ASSERT(success);
4458 }
4459 }
4460
4461 // If the user has specified that a particular topology discovery method is to be
4462 // used, then we abort if that method fails. The exception is group affinity,
4463 // which might have been implicitly set.
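// For example, KMP_TOPOLOGY_METHOD=cpuinfo (or setting KMP_CPUINFO_FILE,
// which implies it per the backward-compatibility check above) makes a
// cpuinfo parsing failure fatal, reporting the offending file and line
// instead of silently falling back to another discovery method.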
4464 #if KMP_USE_HWLOC 4465 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4466 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4467 success = __kmp_affinity_create_hwloc_map(&msg_id); 4468 if (!success) { 4469 KMP_ASSERT(msg_id != kmp_i18n_null); 4470 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4471 } 4472 } 4473 #endif // KMP_USE_HWLOC 4474 4475 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4476 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || 4477 __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 4478 success = __kmp_affinity_create_x2apicid_map(&msg_id); 4479 if (!success) { 4480 KMP_ASSERT(msg_id != kmp_i18n_null); 4481 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4482 } 4483 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 4484 success = __kmp_affinity_create_apicid_map(&msg_id); 4485 if (!success) { 4486 KMP_ASSERT(msg_id != kmp_i18n_null); 4487 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4488 } 4489 } 4490 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4491 4492 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 4493 int line = 0; 4494 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 4495 if (!success) { 4496 KMP_ASSERT(msg_id != kmp_i18n_null); 4497 const char *filename = __kmp_cpuinfo_get_filename(); 4498 if (line > 0) { 4499 KMP_FATAL(FileLineMsgExiting, filename, line, 4500 __kmp_i18n_catgets(msg_id)); 4501 } else { 4502 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 4503 } 4504 } 4505 } 4506 4507 #if KMP_GROUP_AFFINITY 4508 else if (__kmp_affinity_top_method == affinity_top_method_group) { 4509 success = __kmp_affinity_create_proc_group_map(&msg_id); 4510 KMP_ASSERT(success); 4511 if (!success) { 4512 KMP_ASSERT(msg_id != kmp_i18n_null); 4513 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4514 } 4515 } 4516 #endif /* KMP_GROUP_AFFINITY */ 4517 4518 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4519 success = __kmp_affinity_create_flat_map(&msg_id); 4520 // should not fail 4521 KMP_ASSERT(success); 4522 } 4523 4524 // Early exit if topology could not be created 4525 if (!__kmp_topology) { 4526 if (KMP_AFFINITY_CAPABLE()) { 4527 KMP_AFF_WARNING(affinity, ErrorInitializeAffinity); 4528 } 4529 if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && 4530 __kmp_ncores > 0) { 4531 __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); 4532 __kmp_topology->canonicalize(nPackages, nCoresPerPkg, 4533 __kmp_nThreadsPerCore, __kmp_ncores); 4534 if (verbose) { 4535 __kmp_topology->print(env_var); 4536 } 4537 } 4538 return false; 4539 } 4540 4541 // Canonicalize, print (if requested), apply KMP_HW_SUBSET 4542 __kmp_topology->canonicalize(); 4543 if (verbose) 4544 __kmp_topology->print(env_var); 4545 bool filtered = __kmp_topology->filter_hw_subset(); 4546 if (filtered && verbose) 4547 __kmp_topology->print("KMP_HW_SUBSET"); 4548 return success; 4549 } 4550 4551 static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { 4552 bool is_regular_affinity = (&affinity == &__kmp_affinity); 4553 bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity); 4554 const char *env_var = __kmp_get_affinity_env_var(affinity); 4555 4556 if (affinity.flags.initialized) { 4557 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4558 return; 4559 } 4560 4561 if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask)) 4562 __kmp_aux_affinity_initialize_masks(affinity); 4563 4564 if 
(is_regular_affinity && !__kmp_topology) { 4565 bool success = __kmp_aux_affinity_initialize_topology(affinity); 4566 if (success) { 4567 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); 4568 } else { 4569 affinity.type = affinity_none; 4570 KMP_AFFINITY_DISABLE(); 4571 } 4572 } 4573 4574 // If KMP_AFFINITY=none, then only create the single "none" place 4575 // which is the process's initial affinity mask or the number of 4576 // hardware threads depending on respect,norespect 4577 if (affinity.type == affinity_none) { 4578 __kmp_create_affinity_none_places(affinity); 4579 #if KMP_USE_HIER_SCHED 4580 __kmp_dispatch_set_hierarchy_values(); 4581 #endif 4582 affinity.flags.initialized = TRUE; 4583 return; 4584 } 4585 4586 __kmp_topology->set_granularity(affinity); 4587 int depth = __kmp_topology->get_depth(); 4588 4589 // Create the table of masks, indexed by thread Id. 4590 unsigned numUnique; 4591 int numAddrs = __kmp_topology->get_num_hw_threads(); 4592 // If OMP_PLACES=cores:<attribute> specified, then attempt 4593 // to make OS Id mask table using those attributes 4594 if (affinity.core_attr_gran.valid) { 4595 __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) { 4596 KMP_ASSERT(idx >= -1); 4597 for (int i = idx + 1; i < numAddrs; ++i) 4598 if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran)) 4599 return i; 4600 return numAddrs; 4601 }); 4602 if (!affinity.os_id_masks) { 4603 const char *core_attribute; 4604 if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF) 4605 core_attribute = "core_efficiency"; 4606 else 4607 core_attribute = "core_type"; 4608 KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var, 4609 core_attribute, 4610 __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)) 4611 } 4612 } 4613 // If core attributes did not work, or none were specified, 4614 // then make OS Id mask table using typical incremental way. 4615 if (!affinity.os_id_masks) { 4616 __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) { 4617 KMP_ASSERT(idx >= -1); 4618 return idx + 1; 4619 }); 4620 } 4621 if (affinity.gran_levels == 0) { 4622 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4623 } 4624 4625 switch (affinity.type) { 4626 4627 case affinity_explicit: 4628 KMP_DEBUG_ASSERT(affinity.proclist != NULL); 4629 if (is_hidden_helper_affinity || 4630 __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { 4631 __kmp_affinity_process_proclist(affinity); 4632 } else { 4633 __kmp_affinity_process_placelist(affinity); 4634 } 4635 if (affinity.num_masks == 0) { 4636 KMP_AFF_WARNING(affinity, AffNoValidProcID); 4637 affinity.type = affinity_none; 4638 __kmp_create_affinity_none_places(affinity); 4639 affinity.flags.initialized = TRUE; 4640 return; 4641 } 4642 break; 4643 4644 // The other affinity types rely on sorting the hardware threads according to 4645 // some permutation of the machine topology tree. Set affinity.compact 4646 // and affinity.offset appropriately, then jump to a common code 4647 // fragment to do the sort and create the array of affinity masks. 
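// Roughly speaking, on a depth-3 (socket/core/thread) topology,
// affinity_compact keeps neighbouring places on the same core and socket,
// while affinity_scatter inverts the permutation (compact becomes
// depth - 1 - compact below) so that consecutive places are spread across
// sockets first.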
4648 case affinity_logical: 4649 affinity.compact = 0; 4650 if (affinity.offset) { 4651 affinity.offset = 4652 __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc; 4653 } 4654 goto sortTopology; 4655 4656 case affinity_physical: 4657 if (__kmp_nThreadsPerCore > 1) { 4658 affinity.compact = 1; 4659 if (affinity.compact >= depth) { 4660 affinity.compact = 0; 4661 } 4662 } else { 4663 affinity.compact = 0; 4664 } 4665 if (affinity.offset) { 4666 affinity.offset = 4667 __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc; 4668 } 4669 goto sortTopology; 4670 4671 case affinity_scatter: 4672 if (affinity.compact >= depth) { 4673 affinity.compact = 0; 4674 } else { 4675 affinity.compact = depth - 1 - affinity.compact; 4676 } 4677 goto sortTopology; 4678 4679 case affinity_compact: 4680 if (affinity.compact >= depth) { 4681 affinity.compact = depth - 1; 4682 } 4683 goto sortTopology; 4684 4685 case affinity_balanced: 4686 if (depth <= 1 || is_hidden_helper_affinity) { 4687 KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var); 4688 affinity.type = affinity_none; 4689 __kmp_create_affinity_none_places(affinity); 4690 affinity.flags.initialized = TRUE; 4691 return; 4692 } else if (!__kmp_topology->is_uniform()) { 4693 // Save the depth for further usage 4694 __kmp_aff_depth = depth; 4695 4696 int core_level = 4697 __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1); 4698 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1, 4699 core_level); 4700 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4701 __kmp_avail_proc, depth - 1, core_level); 4702 4703 int nproc = ncores * maxprocpercore; 4704 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4705 KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var); 4706 affinity.type = affinity_none; 4707 __kmp_create_affinity_none_places(affinity); 4708 affinity.flags.initialized = TRUE; 4709 return; 4710 } 4711 4712 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4713 for (int i = 0; i < nproc; i++) { 4714 procarr[i] = -1; 4715 } 4716 4717 int lastcore = -1; 4718 int inlastcore = 0; 4719 for (int i = 0; i < __kmp_avail_proc; i++) { 4720 int proc = __kmp_topology->at(i).os_id; 4721 int core = __kmp_affinity_find_core(i, depth - 1, core_level); 4722 4723 if (core == lastcore) { 4724 inlastcore++; 4725 } else { 4726 inlastcore = 0; 4727 } 4728 lastcore = core; 4729 4730 procarr[core * maxprocpercore + inlastcore] = proc; 4731 } 4732 } 4733 if (affinity.compact >= depth) { 4734 affinity.compact = depth - 1; 4735 } 4736 4737 sortTopology: 4738 // Allocate the gtid->affinity mask table. 4739 if (affinity.flags.dups) { 4740 affinity.num_masks = __kmp_avail_proc; 4741 } else { 4742 affinity.num_masks = numUnique; 4743 } 4744 4745 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4746 (__kmp_affinity_num_places > 0) && 4747 ((unsigned)__kmp_affinity_num_places < affinity.num_masks) && 4748 !is_hidden_helper_affinity) { 4749 affinity.num_masks = __kmp_affinity_num_places; 4750 } 4751 4752 KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks); 4753 4754 // Sort the topology table according to the current setting of 4755 // affinity.compact, then fill out affinity.masks. 
4756 __kmp_topology->sort_compact(affinity); 4757 { 4758 int i; 4759 unsigned j; 4760 int num_hw_threads = __kmp_topology->get_num_hw_threads(); 4761 kmp_full_mask_modifier_t full_mask; 4762 for (i = 0, j = 0; i < num_hw_threads; i++) { 4763 if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) { 4764 continue; 4765 } 4766 int osId = __kmp_topology->at(i).os_id; 4767 4768 kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId); 4769 kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j); 4770 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4771 KMP_CPU_COPY(dest, src); 4772 full_mask.include(src); 4773 if (++j >= affinity.num_masks) { 4774 break; 4775 } 4776 } 4777 KMP_DEBUG_ASSERT(j == affinity.num_masks); 4778 // See if the places list further restricts or changes the full mask 4779 if (full_mask.restrict_to_mask() && affinity.flags.verbose) { 4780 __kmp_topology->print(env_var); 4781 } 4782 } 4783 // Sort the topology back using ids 4784 __kmp_topology->sort_ids(); 4785 break; 4786 4787 default: 4788 KMP_ASSERT2(0, "Unexpected affinity setting"); 4789 } 4790 __kmp_aux_affinity_initialize_other_data(affinity); 4791 affinity.flags.initialized = TRUE; 4792 } 4793 4794 void __kmp_affinity_initialize(kmp_affinity_t &affinity) { 4795 // Much of the code above was written assuming that if a machine was not 4796 // affinity capable, then affinity type == affinity_none. 4797 // We now explicitly represent this as affinity type == affinity_disabled. 4798 // There are too many checks for affinity type == affinity_none in this code. 4799 // Instead of trying to change them all, check if 4800 // affinity type == affinity_disabled, and if so, slam it with affinity_none, 4801 // call the real initialization routine, then restore affinity type to 4802 // affinity_disabled. 
4803 int disabled = (affinity.type == affinity_disabled); 4804 if (!KMP_AFFINITY_CAPABLE()) 4805 KMP_ASSERT(disabled); 4806 if (disabled) 4807 affinity.type = affinity_none; 4808 __kmp_aux_affinity_initialize(affinity); 4809 if (disabled) 4810 affinity.type = affinity_disabled; 4811 } 4812 4813 void __kmp_affinity_uninitialize(void) { 4814 for (kmp_affinity_t *affinity : __kmp_affinities) { 4815 if (affinity->masks != NULL) 4816 KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks); 4817 if (affinity->os_id_masks != NULL) 4818 KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks); 4819 if (affinity->proclist != NULL) 4820 __kmp_free(affinity->proclist); 4821 if (affinity->ids != NULL) 4822 __kmp_free(affinity->ids); 4823 if (affinity->attrs != NULL) 4824 __kmp_free(affinity->attrs); 4825 *affinity = KMP_AFFINITY_INIT(affinity->env_var); 4826 } 4827 if (__kmp_affin_origMask != NULL) { 4828 if (KMP_AFFINITY_CAPABLE()) { 4829 __kmp_set_system_affinity(__kmp_affin_origMask, FALSE); 4830 } 4831 KMP_CPU_FREE(__kmp_affin_origMask); 4832 __kmp_affin_origMask = NULL; 4833 } 4834 __kmp_affinity_num_places = 0; 4835 if (procarr != NULL) { 4836 __kmp_free(procarr); 4837 procarr = NULL; 4838 } 4839 if (__kmp_osid_to_hwthread_map) { 4840 __kmp_free(__kmp_osid_to_hwthread_map); 4841 __kmp_osid_to_hwthread_map = NULL; 4842 } 4843 #if KMP_USE_HWLOC 4844 if (__kmp_hwloc_topology != NULL) { 4845 hwloc_topology_destroy(__kmp_hwloc_topology); 4846 __kmp_hwloc_topology = NULL; 4847 } 4848 #endif 4849 if (__kmp_hw_subset) { 4850 kmp_hw_subset_t::deallocate(__kmp_hw_subset); 4851 __kmp_hw_subset = nullptr; 4852 } 4853 if (__kmp_topology) { 4854 kmp_topology_t::deallocate(__kmp_topology); 4855 __kmp_topology = nullptr; 4856 } 4857 KMPAffinity::destroy_api(); 4858 } 4859 4860 static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity, 4861 int *place, kmp_affin_mask_t **mask) { 4862 int mask_idx; 4863 bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); 4864 if (is_hidden_helper) 4865 // The first gtid is the regular primary thread, the second gtid is the main 4866 // thread of hidden team which does not participate in task execution. 4867 mask_idx = gtid - 2; 4868 else 4869 mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 4870 KMP_DEBUG_ASSERT(affinity->num_masks > 0); 4871 *place = (mask_idx + affinity->offset) % affinity->num_masks; 4872 *mask = KMP_CPU_INDEX(affinity->masks, *place); 4873 } 4874 4875 // This function initializes the per-thread data concerning affinity including 4876 // the mask and topology information 4877 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4878 4879 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4880 4881 // Set the thread topology information to default of unknown 4882 for (int id = 0; id < KMP_HW_LAST; ++id) 4883 th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID; 4884 th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN; 4885 4886 if (!KMP_AFFINITY_CAPABLE()) { 4887 return; 4888 } 4889 4890 if (th->th.th_affin_mask == NULL) { 4891 KMP_CPU_ALLOC(th->th.th_affin_mask); 4892 } else { 4893 KMP_CPU_ZERO(th->th.th_affin_mask); 4894 } 4895 4896 // Copy the thread mask to the kmp_info_t structure. If 4897 // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e. 4898 // one that has all of the OS proc ids set, or if 4899 // __kmp_affinity.flags.respect is set, then the full mask is the 4900 // same as the mask of the initialization thread. 
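// Otherwise a place is picked per thread: __kmp_select_mask_by_gtid() above
// computes *place = (mask_idx + affinity->offset) % affinity->num_masks, so
// threads are bound to places round-robin, rotated by the offset. For
// example (hypothetical), with 4 places and offset 1, the thread with
// mask_idx 0 gets place 1.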
4901 kmp_affin_mask_t *mask; 4902 int i; 4903 const kmp_affinity_t *affinity; 4904 bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); 4905 4906 if (is_hidden_helper) 4907 affinity = &__kmp_hh_affinity; 4908 else 4909 affinity = &__kmp_affinity; 4910 4911 if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) { 4912 if ((affinity->type == affinity_none) || 4913 (affinity->type == affinity_balanced) || 4914 KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) { 4915 #if KMP_GROUP_AFFINITY 4916 if (__kmp_num_proc_groups > 1) { 4917 return; 4918 } 4919 #endif 4920 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4921 i = 0; 4922 mask = __kmp_affin_fullMask; 4923 } else { 4924 __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask); 4925 } 4926 } else { 4927 if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) { 4928 #if KMP_GROUP_AFFINITY 4929 if (__kmp_num_proc_groups > 1) { 4930 return; 4931 } 4932 #endif 4933 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4934 i = KMP_PLACE_ALL; 4935 mask = __kmp_affin_fullMask; 4936 } else { 4937 __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask); 4938 } 4939 } 4940 4941 th->th.th_current_place = i; 4942 if (isa_root && !is_hidden_helper) { 4943 th->th.th_new_place = i; 4944 th->th.th_first_place = 0; 4945 th->th.th_last_place = affinity->num_masks - 1; 4946 } else if (KMP_AFFINITY_NON_PROC_BIND) { 4947 // When using a Non-OMP_PROC_BIND affinity method, 4948 // set all threads' place-partition-var to the entire place list 4949 th->th.th_first_place = 0; 4950 th->th.th_last_place = affinity->num_masks - 1; 4951 } 4952 // Copy topology information associated with the place 4953 if (i >= 0) { 4954 th->th.th_topology_ids = __kmp_affinity.ids[i]; 4955 th->th.th_topology_attrs = __kmp_affinity.attrs[i]; 4956 } 4957 4958 if (i == KMP_PLACE_ALL) { 4959 KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n", 4960 gtid)); 4961 } else { 4962 KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n", 4963 gtid, i)); 4964 } 4965 4966 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4967 } 4968 4969 void __kmp_affinity_bind_init_mask(int gtid) { 4970 if (!KMP_AFFINITY_CAPABLE()) { 4971 return; 4972 } 4973 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4974 const kmp_affinity_t *affinity; 4975 const char *env_var; 4976 bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); 4977 4978 if (is_hidden_helper) 4979 affinity = &__kmp_hh_affinity; 4980 else 4981 affinity = &__kmp_affinity; 4982 env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true); 4983 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4984 if (affinity->flags.verbose && (affinity->type == affinity_none || 4985 (th->th.th_current_place != KMP_PLACE_ALL && 4986 affinity->type != affinity_balanced)) && 4987 !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) { 4988 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4989 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4990 th->th.th_affin_mask); 4991 KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), 4992 gtid, buf); 4993 } 4994 4995 #if KMP_OS_WINDOWS 4996 // On Windows* OS, the process affinity mask might have changed. If the user 4997 // didn't request affinity and this call fails, just continue silently. 4998 // See CQ171393. 
4999 if (affinity->type == affinity_none) { 5000 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 5001 } else 5002 #endif 5003 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 5004 } 5005 5006 void __kmp_affinity_bind_place(int gtid) { 5007 // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND 5008 if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) { 5009 return; 5010 } 5011 5012 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 5013 5014 KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current " 5015 "place = %d)\n", 5016 gtid, th->th.th_new_place, th->th.th_current_place)); 5017 5018 // Check that the new place is within this thread's partition. 5019 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 5020 KMP_ASSERT(th->th.th_new_place >= 0); 5021 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks); 5022 if (th->th.th_first_place <= th->th.th_last_place) { 5023 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && 5024 (th->th.th_new_place <= th->th.th_last_place)); 5025 } else { 5026 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || 5027 (th->th.th_new_place >= th->th.th_last_place)); 5028 } 5029 5030 // Copy the thread mask to the kmp_info_t structure, 5031 // and set this thread's affinity. 5032 kmp_affin_mask_t *mask = 5033 KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place); 5034 KMP_CPU_COPY(th->th.th_affin_mask, mask); 5035 th->th.th_current_place = th->th.th_new_place; 5036 5037 if (__kmp_affinity.flags.verbose) { 5038 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5039 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5040 th->th.th_affin_mask); 5041 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 5042 __kmp_gettid(), gtid, buf); 5043 } 5044 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 5045 } 5046 5047 int __kmp_aux_set_affinity(void **mask) { 5048 int gtid; 5049 kmp_info_t *th; 5050 int retval; 5051 5052 if (!KMP_AFFINITY_CAPABLE()) { 5053 return -1; 5054 } 5055 5056 gtid = __kmp_entry_gtid(); 5057 KA_TRACE( 5058 1000, (""); { 5059 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5060 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5061 (kmp_affin_mask_t *)(*mask)); 5062 __kmp_debug_printf( 5063 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", 5064 gtid, buf); 5065 }); 5066 5067 if (__kmp_env_consistency_check) { 5068 if ((mask == NULL) || (*mask == NULL)) { 5069 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 5070 } else { 5071 unsigned proc; 5072 int num_procs = 0; 5073 5074 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { 5075 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5076 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 5077 } 5078 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 5079 continue; 5080 } 5081 num_procs++; 5082 } 5083 if (num_procs == 0) { 5084 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 5085 } 5086 5087 #if KMP_GROUP_AFFINITY 5088 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 5089 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 5090 } 5091 #endif /* KMP_GROUP_AFFINITY */ 5092 } 5093 } 5094 5095 th = __kmp_threads[gtid]; 5096 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 5097 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 5098 if (retval == 0) { 5099 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 5100 } 5101 5102 th->th.th_current_place = KMP_PLACE_UNDEFINED; 5103 th->th.th_new_place = 
KMP_PLACE_UNDEFINED; 5104 th->th.th_first_place = 0; 5105 th->th.th_last_place = __kmp_affinity.num_masks - 1; 5106 5107 // Turn off 4.0 affinity for the current thread at this parallel level. 5108 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 5109 5110 return retval; 5111 } 5112 5113 int __kmp_aux_get_affinity(void **mask) { 5114 int gtid; 5115 int retval; 5116 #if KMP_OS_WINDOWS || KMP_DEBUG 5117 kmp_info_t *th; 5118 #endif 5119 if (!KMP_AFFINITY_CAPABLE()) { 5120 return -1; 5121 } 5122 5123 gtid = __kmp_entry_gtid(); 5124 #if KMP_OS_WINDOWS || KMP_DEBUG 5125 th = __kmp_threads[gtid]; 5126 #else 5127 (void)gtid; // unused variable 5128 #endif 5129 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 5130 5131 KA_TRACE( 5132 1000, (""); { 5133 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5134 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5135 th->th.th_affin_mask); 5136 __kmp_printf( 5137 "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, 5138 buf); 5139 }); 5140 5141 if (__kmp_env_consistency_check) { 5142 if ((mask == NULL) || (*mask == NULL)) { 5143 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 5144 } 5145 } 5146 5147 #if !KMP_OS_WINDOWS 5148 5149 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 5150 KA_TRACE( 5151 1000, (""); { 5152 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5153 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5154 (kmp_affin_mask_t *)(*mask)); 5155 __kmp_printf( 5156 "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, 5157 buf); 5158 }); 5159 return retval; 5160 5161 #else 5162 (void)retval; 5163 5164 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 5165 return 0; 5166 5167 #endif /* KMP_OS_WINDOWS */ 5168 } 5169 5170 int __kmp_aux_get_affinity_max_proc() { 5171 if (!KMP_AFFINITY_CAPABLE()) { 5172 return 0; 5173 } 5174 #if KMP_GROUP_AFFINITY 5175 if (__kmp_num_proc_groups > 1) { 5176 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 5177 } 5178 #endif 5179 return __kmp_xproc; 5180 } 5181 5182 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 5183 if (!KMP_AFFINITY_CAPABLE()) { 5184 return -1; 5185 } 5186 5187 KA_TRACE( 5188 1000, (""); { 5189 int gtid = __kmp_entry_gtid(); 5190 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5191 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5192 (kmp_affin_mask_t *)(*mask)); 5193 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 5194 "affinity mask for thread %d = %s\n", 5195 proc, gtid, buf); 5196 }); 5197 5198 if (__kmp_env_consistency_check) { 5199 if ((mask == NULL) || (*mask == NULL)) { 5200 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 5201 } 5202 } 5203 5204 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5205 return -1; 5206 } 5207 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5208 return -2; 5209 } 5210 5211 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 5212 return 0; 5213 } 5214 5215 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 5216 if (!KMP_AFFINITY_CAPABLE()) { 5217 return -1; 5218 } 5219 5220 KA_TRACE( 5221 1000, (""); { 5222 int gtid = __kmp_entry_gtid(); 5223 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5224 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5225 (kmp_affin_mask_t *)(*mask)); 5226 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 5227 "affinity mask for thread %d = %s\n", 5228 proc, gtid, buf); 5229 }); 5230 5231 if (__kmp_env_consistency_check) { 5232 if ((mask == NULL) || (*mask ==
NULL)) { 5233 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 5234 } 5235 } 5236 5237 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5238 return -1; 5239 } 5240 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5241 return -2; 5242 } 5243 5244 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 5245 return 0; 5246 } 5247 5248 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 5249 if (!KMP_AFFINITY_CAPABLE()) { 5250 return -1; 5251 } 5252 5253 KA_TRACE( 5254 1000, (""); { 5255 int gtid = __kmp_entry_gtid(); 5256 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5257 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5258 (kmp_affin_mask_t *)(*mask)); 5259 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 5260 "affinity mask for thread %d = %s\n", 5261 proc, gtid, buf); 5262 }); 5263 5264 if (__kmp_env_consistency_check) { 5265 if ((mask == NULL) || (*mask == NULL)) { 5266 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 5267 } 5268 } 5269 5270 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5271 return -1; 5272 } 5273 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5274 return 0; 5275 } 5276 5277 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 5278 } 5279 5280 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED 5281 // Returns first os proc id with ATOM core 5282 int __kmp_get_first_osid_with_ecore(void) { 5283 int low = 0; 5284 int high = __kmp_topology->get_num_hw_threads() - 1; 5285 int mid = 0; 5286 while (high - low > 1) { 5287 mid = (high + low) / 2; 5288 if (__kmp_topology->at(mid).attrs.get_core_type() == 5289 KMP_HW_CORE_TYPE_CORE) { 5290 low = mid + 1; 5291 } else { 5292 high = mid; 5293 } 5294 } 5295 if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) { 5296 return mid; 5297 } 5298 return -1; 5299 } 5300 #endif 5301 5302 // Dynamic affinity settings - Affinity balanced 5303 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { 5304 KMP_DEBUG_ASSERT(th); 5305 bool fine_gran = true; 5306 int tid = th->th.th_info.ds.ds_tid; 5307 const char *env_var = "KMP_AFFINITY"; 5308 5309 // Do not perform balanced affinity for the hidden helper threads 5310 if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th))) 5311 return; 5312 5313 switch (__kmp_affinity.gran) { 5314 case KMP_HW_THREAD: 5315 break; 5316 case KMP_HW_CORE: 5317 if (__kmp_nThreadsPerCore > 1) { 5318 fine_gran = false; 5319 } 5320 break; 5321 case KMP_HW_SOCKET: 5322 if (nCoresPerPkg > 1) { 5323 fine_gran = false; 5324 } 5325 break; 5326 default: 5327 fine_gran = false; 5328 } 5329 5330 if (__kmp_topology->is_uniform()) { 5331 int coreID; 5332 int threadID; 5333 // Number of hyper threads per core in HT machine 5334 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 5335 // Number of cores 5336 int ncores = __kmp_ncores; 5337 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 5338 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 5339 ncores = nPackages; 5340 } 5341 // How many threads will be bound to each core 5342 int chunk = nthreads / ncores; 5343 // How many cores will have an additional thread bound to it - "big cores" 5344 int big_cores = nthreads % ncores; 5345 // Number of threads on the big cores 5346 int big_nth = (chunk + 1) * big_cores; 5347 if (tid < big_nth) { 5348 coreID = tid / (chunk + 1); 5349 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 5350 } else { // tid >= big_nth 5351 coreID = (tid - big_cores) / chunk; 5352 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 5353 } 
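// Worked illustration (hypothetical numbers): on a uniform topology with
// ncores == 4, __kmp_nth_per_core == 2 and nthreads == 6 the chunking above
// gives
//   chunk     = 6 / 4 = 1
//   big_cores = 6 % 4 = 2   (two cores receive chunk + 1 == 2 threads)
//   big_nth   = (1 + 1) * 2 = 4
// so tids 0-3 map to cores 0 and 1 via tid / (chunk + 1), and tids 4-5 map to
// cores 2 and 3 via (tid - big_cores) / chunk, i.e. the 2-2-1-1 spread the
// balanced policy aims for.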
5354 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 5355 "Illegal set affinity operation when not capable"); 5356 5357 kmp_affin_mask_t *mask = th->th.th_affin_mask; 5358 KMP_CPU_ZERO(mask); 5359 5360 if (fine_gran) { 5361 int osID = 5362 __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id; 5363 KMP_CPU_SET(osID, mask); 5364 } else { 5365 for (int i = 0; i < __kmp_nth_per_core; i++) { 5366 int osID; 5367 osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id; 5368 KMP_CPU_SET(osID, mask); 5369 } 5370 } 5371 if (__kmp_affinity.flags.verbose) { 5372 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5373 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5374 KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), 5375 tid, buf); 5376 } 5377 __kmp_affinity_get_thread_topology_info(th); 5378 __kmp_set_system_affinity(mask, TRUE); 5379 } else { // Non-uniform topology 5380 5381 kmp_affin_mask_t *mask = th->th.th_affin_mask; 5382 KMP_CPU_ZERO(mask); 5383 5384 int core_level = 5385 __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1); 5386 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, 5387 __kmp_aff_depth - 1, core_level); 5388 int nth_per_core = __kmp_affinity_max_proc_per_core( 5389 __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 5390 5391 // For performance gain consider the special case nthreads == 5392 // __kmp_avail_proc 5393 if (nthreads == __kmp_avail_proc) { 5394 if (fine_gran) { 5395 int osID = __kmp_topology->at(tid).os_id; 5396 KMP_CPU_SET(osID, mask); 5397 } else { 5398 int core = 5399 __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level); 5400 for (int i = 0; i < __kmp_avail_proc; i++) { 5401 int osID = __kmp_topology->at(i).os_id; 5402 if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) == 5403 core) { 5404 KMP_CPU_SET(osID, mask); 5405 } 5406 } 5407 } 5408 } else if (nthreads <= ncores) { 5409 5410 int core = 0; 5411 for (int i = 0; i < ncores; i++) { 5412 // Check if this core from procarr[] is in the mask 5413 int in_mask = 0; 5414 for (int j = 0; j < nth_per_core; j++) { 5415 if (procarr[i * nth_per_core + j] != -1) { 5416 in_mask = 1; 5417 break; 5418 } 5419 } 5420 if (in_mask) { 5421 if (tid == core) { 5422 for (int j = 0; j < nth_per_core; j++) { 5423 int osID = procarr[i * nth_per_core + j]; 5424 if (osID != -1) { 5425 KMP_CPU_SET(osID, mask); 5426 // For fine granularity it is enough to set the first available 5427 // osID for this core 5428 if (fine_gran) { 5429 break; 5430 } 5431 } 5432 } 5433 break; 5434 } else { 5435 core++; 5436 } 5437 } 5438 } 5439 } else { // nthreads > ncores 5440 // Array to save the number of processors at each core 5441 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 5442 // Array to save the number of cores with "x" available processors; 5443 int *ncores_with_x_procs = 5444 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 5445 // Array to save the number of cores with # procs from x to nth_per_core 5446 int *ncores_with_x_to_max_procs = 5447 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 5448 5449 for (int i = 0; i <= nth_per_core; i++) { 5450 ncores_with_x_procs[i] = 0; 5451 ncores_with_x_to_max_procs[i] = 0; 5452 } 5453 5454 for (int i = 0; i < ncores; i++) { 5455 int cnt = 0; 5456 for (int j = 0; j < nth_per_core; j++) { 5457 if (procarr[i * nth_per_core + j] != -1) { 5458 cnt++; 5459 } 5460 } 5461 nproc_at_core[i] = cnt; 5462 ncores_with_x_procs[cnt]++; 5463 } 5464 5465 for (int i = 0; i <= nth_per_core; i++) { 5466 for (int 
j = i; j <= nth_per_core; j++) { 5467 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 5468 } 5469 } 5470 5471 // Max number of processors 5472 int nproc = nth_per_core * ncores; 5473 // An array to keep number of threads per each context 5474 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 5475 for (int i = 0; i < nproc; i++) { 5476 newarr[i] = 0; 5477 } 5478 5479 int nth = nthreads; 5480 int flag = 0; 5481 while (nth > 0) { 5482 for (int j = 1; j <= nth_per_core; j++) { 5483 int cnt = ncores_with_x_to_max_procs[j]; 5484 for (int i = 0; i < ncores; i++) { 5485 // Skip the core with 0 processors 5486 if (nproc_at_core[i] == 0) { 5487 continue; 5488 } 5489 for (int k = 0; k < nth_per_core; k++) { 5490 if (procarr[i * nth_per_core + k] != -1) { 5491 if (newarr[i * nth_per_core + k] == 0) { 5492 newarr[i * nth_per_core + k] = 1; 5493 cnt--; 5494 nth--; 5495 break; 5496 } else { 5497 if (flag != 0) { 5498 newarr[i * nth_per_core + k]++; 5499 cnt--; 5500 nth--; 5501 break; 5502 } 5503 } 5504 } 5505 } 5506 if (cnt == 0 || nth == 0) { 5507 break; 5508 } 5509 } 5510 if (nth == 0) { 5511 break; 5512 } 5513 } 5514 flag = 1; 5515 } 5516 int sum = 0; 5517 for (int i = 0; i < nproc; i++) { 5518 sum += newarr[i]; 5519 if (sum > tid) { 5520 if (fine_gran) { 5521 int osID = procarr[i]; 5522 KMP_CPU_SET(osID, mask); 5523 } else { 5524 int coreID = i / nth_per_core; 5525 for (int ii = 0; ii < nth_per_core; ii++) { 5526 int osID = procarr[coreID * nth_per_core + ii]; 5527 if (osID != -1) { 5528 KMP_CPU_SET(osID, mask); 5529 } 5530 } 5531 } 5532 break; 5533 } 5534 } 5535 __kmp_free(newarr); 5536 } 5537 5538 if (__kmp_affinity.flags.verbose) { 5539 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5540 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5541 KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(), 5542 tid, buf); 5543 } 5544 __kmp_affinity_get_thread_topology_info(th); 5545 __kmp_set_system_affinity(mask, TRUE); 5546 } 5547 } 5548 5549 #if KMP_OS_LINUX || KMP_OS_FREEBSD 5550 // We don't need this entry for Windows because 5551 // there is GetProcessAffinityMask() api 5552 // 5553 // The intended usage is indicated by these steps: 5554 // 1) The user gets the current affinity mask 5555 // 2) Then sets the affinity by calling this function 5556 // 3) Error check the return value 5557 // 4) Use non-OpenMP parallelization 5558 // 5) Reset the affinity to what was stored in step 1) 5559 #ifdef __cplusplus 5560 extern "C" 5561 #endif 5562 int 5563 kmp_set_thread_affinity_mask_initial() 5564 // the function returns 0 on success, 5565 // -1 if we cannot bind thread 5566 // >0 (errno) if an error happened during binding 5567 { 5568 int gtid = __kmp_get_gtid(); 5569 if (gtid < 0) { 5570 // Do not touch non-omp threads 5571 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5572 "non-omp thread, returning\n")); 5573 return -1; 5574 } 5575 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 5576 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5577 "affinity not initialized, returning\n")); 5578 return -1; 5579 } 5580 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5581 "set full mask for thread %d\n", 5582 gtid)); 5583 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 5584 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 5585 } 5586 #endif 5587 5588 #endif // KMP_AFFINITY_SUPPORTED 5589
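// Usage sketch for kmp_set_thread_affinity_mask_initial() following steps 1-5
// listed above (illustrative user code for Linux with _GNU_SOURCE; not part of
// the runtime, and the helper name run_non_openmp_work is hypothetical):
//
//   #include <pthread.h>
//   #include <sched.h>
//
//   extern "C" int kmp_set_thread_affinity_mask_initial();
//
//   void run_non_openmp_work() {
//     cpu_set_t saved;
//     // 1) save this thread's current (OpenMP-managed) affinity mask
//     pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);
//     // 2)-3) widen to the runtime's full initial mask and check the result:
//     //       0 on success, -1 if binding is not possible, errno otherwise
//     if (kmp_set_thread_affinity_mask_initial() == 0) {
//       // 4) run the non-OpenMP parallel work here (raw pthreads, TBB, ...)
//     }
//     // 5) restore the mask saved in step 1
//     pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved);
//   }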