Lines Matching +full:layer +full:- +full:depth

2  * kmp_affinity.cpp -- affinity management
5 //===----------------------------------------------------------------------===//
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
62 return __kmp_topology->restrict_to_mask(mask);
79 kmp_uint32 depth;
89 depth = machine_hierarchy.depth;
90 KMP_DEBUG_ASSERT(depth > 0);
92 thr_bar->depth = depth;
93 __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
94 &(thr_bar->base_leaf_kids));
95 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
204 int depth = __kmp_topology->get_depth();
205 for (int level = 0; level < depth; ++level) {
206 if (ahwthread->ids[level] < bhwthread->ids[level])
207 return -1;
208 else if (ahwthread->ids[level] > bhwthread->ids[level])
211 if (ahwthread->os_id < bhwthread->os_id)
212 return -1;
213 else if (ahwthread->os_id > bhwthread->os_id)
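The comparator fragments above (source lines 204-213) order hardware threads lexicographically by their per-level topology ids, falling back to the OS id as a tie-breaker. A minimal standalone sketch of the same idea, using a hypothetical hw_thread_t with a std::vector of ids rather than the runtime's kmp_hw_thread_t:

    #include <vector>

    // Hypothetical stand-in for kmp_hw_thread_t: ids run from the outermost
    // topology level (e.g. socket) down to the innermost (thread).
    struct hw_thread_t {
      std::vector<int> ids;
      int os_id;
    };

    // Lexicographic compare: the first differing level decides; the OS id
    // breaks ties between otherwise identical threads.
    int compare_ids(const hw_thread_t &a, const hw_thread_t &b) {
      const int depth = (int)a.ids.size(); // both assumed to have equal depth
      for (int level = 0; level < depth; ++level) {
        if (a.ids[level] < b.ids[level])
          return -1;
        if (a.ids[level] > b.ids[level])
          return 1;
      }
      if (a.os_id < b.os_id)
        return -1;
      if (a.os_id > b.os_id)
        return 1;
      return 0;
    }
    // e.g. std::sort with a lambda wrapping compare_ids(x, y) < 0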
223 int depth = __kmp_topology->get_depth();
224 int compact = __kmp_topology->compact;
226 KMP_DEBUG_ASSERT(compact <= depth);
228 int j = depth - i - 1;
229 if (aa->sub_ids[j] < bb->sub_ids[j])
230 return -1;
231 if (aa->sub_ids[j] > bb->sub_ids[j])
234 for (; i < depth; i++) {
235 int j = i - compact;
236 if (aa->sub_ids[j] < bb->sub_ids[j])
237 return -1;
238 if (aa->sub_ids[j] > bb->sub_ids[j])
246 int depth = __kmp_topology->get_depth();
248 for (int i = 0; i < depth; ++i) {
265 // Add a layer to the topology based on the ids. Assume the topology
268 // Figure out where the layer should go by comparing the ids of the current
274 // Start from the highest layer and work down to find target layer
275 // If new layer is equal to another layer then put the new layer above
276 for (target_layer = 0; target_layer < depth; ++target_layer) {
283 // Found the layer we are strictly above
288 // Found a layer we are below. Move to next layer and check.
299 // Found the layer we are above. Now move everything to accommodate the new
300 // layer. And put the new ids and type into the topology.
301 for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
305 for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
310 depth++;
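The loops at source lines 301-310 make room for a new layer by shifting the existing entries down one slot before writing the new ids and type at target_layer. A compact sketch of that shift-and-insert pattern on a fixed-capacity array (MAX_LEVELS and insert_layer are illustrative names, not the runtime's):

    #include <cassert>

    const int MAX_LEVELS = 16; // illustrative capacity, similar in spirit to KMP_HW_LAST

    // Insert 'value' at position 'target' within the first 'depth' slots of
    // 'ids', shifting everything at or below 'target' down by one slot.
    void insert_layer(int ids[MAX_LEVELS], int &depth, int target, int value) {
      assert(depth < MAX_LEVELS && target >= 0 && target <= depth);
      for (int i = depth - 1, j = depth; i >= target; --i, --j)
        ids[j] = ids[i]; // make room by moving deeper layers down
      ids[target] = value;
      depth++;
    }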
332 __kmp_topology->sort_ids();
337 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
356 while (top_index1 < depth - 1 && top_index2 < depth) {
388 // Select the layer to remove based on preference
400 // If all the indexes for the second (deeper) layer are the same
401 // (e.g., all are zero), then make sure to keep the first layer's ids
405 // the hw threads and removing the layer from types and depth
409 for (int d = remove_layer_ids; d < depth - 1; ++d)
412 for (int idx = remove_layer; idx < depth - 1; ++idx)
414 depth--;
419 KMP_ASSERT(depth > 0);
450 // Gather the count of each topology layer and the ratio
455 for (int i = 0; i < depth; ++i) {
464 for (int layer = 0; layer < depth; ++layer) {
465 int id = hw_thread.ids[layer];
466 if (id != previous_id[layer]) {
468 for (int l = layer; l < depth; ++l)
470 // Keep track of topology layer ratio statistics
471 max[layer]++;
472 for (int l = layer + 1; l < depth; ++l) {
479 if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
482 // Because efficiencies can range from 0 to max efficiency - 1,
503 for (int layer = 0; layer < depth; ++layer) {
504 previous_id[layer] = hw_thread.ids[layer];
507 for (int layer = 0; layer < depth; ++layer) {
508 if (max[layer] > ratio[layer])
509 ratio[layer] = max[layer];
518 for (int i = 0; i < depth; ++i)
522 above_level = -1;
541 for (int level = 0; level < depth; ++level)
552 for (int level = 0; level < depth; ++level)
554 flags.uniform = (num == count[depth - 1]);
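The pass over hardware threads at source lines 455-509 relies on the table being sorted: whenever an id changes at some level, a new object begins there and at every deeper level, so count[l] is bumped for all l >= layer, while max[layer] tracks how many children the current parent has produced and its peak becomes ratio[layer]. A self-contained sketch of that counting scheme over sorted id tuples (topo_stats and gather are illustrative names):

    #include <vector>

    // Per-level statistics gathered from hardware-thread id tuples that are
    // sorted lexicographically (outermost level first):
    //   count[l] = number of distinct objects at level l
    //   ratio[l] = maximum number of level-l objects under one level-(l-1)
    //              parent (for l == 0, the total number of top-level objects)
    struct topo_stats {
      std::vector<int> count, ratio;
    };

    topo_stats gather(const std::vector<std::vector<int>> &ids, int depth) {
      topo_stats s;
      s.count.assign(depth, 0);
      s.ratio.assign(depth, 0);
      std::vector<int> previous(depth, -1), max_children(depth, 0);
      for (const std::vector<int> &t : ids) {
        for (int layer = 0; layer < depth; ++layer) {
          if (t[layer] != previous[layer]) {
            // A new object starts at this level and at every deeper level.
            for (int l = layer; l < depth; ++l)
              s.count[l]++;
            max_children[layer]++;   // one more child under the same parent
            for (int l = layer + 1; l < depth; ++l)
              max_children[l] = 1;   // deeper levels restart their child count
            break;                   // deeper levels were already handled
          }
        }
        for (int layer = 0; layer < depth; ++layer) {
          previous[layer] = t[layer];
          if (max_children[layer] > s.ratio[layer])
            s.ratio[layer] = max_children[layer];
        }
      }
      return s;
    }
    // e.g. two sockets x two cores x two threads yields
    // count = {2, 4, 8} and ratio = {2, 2, 2}.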
562 for (int i = 0; i < depth; ++i) {
563 previous_id[i] = -1;
564 sub_id[i] = -1;
569 for (int j = 0; j < depth; ++j) {
572 for (int k = j + 1; k < depth; ++k) {
579 for (int j = 0; j < depth; ++j) {
583 for (int j = 0; j < depth; ++j) {
594 if (package_level == -1)
600 KMP_ASSERT(core_level != -1);
601 KMP_ASSERT(thread_level != -1);
604 if (package_level != -1) {
626 retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
628 retval->hw_threads = nullptr;
630 retval->num_hw_threads = nproc;
631 retval->depth = ndepth;
634 retval->types = (kmp_hw_t *)arr;
635 retval->ratio = arr + (size_t)KMP_HW_LAST;
636 retval->count = arr + 2 * (size_t)KMP_HW_LAST;
637 retval->num_core_efficiencies = 0;
638 retval->num_core_types = 0;
639 retval->compact = 0;
641 retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
642 KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
644 retval->types[i] = types[i];
645 retval->equivalent[types[i]] = types[i];
661 kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
663 for (int j = 0; j < depth; ++j) {
680 printf("* depth: %d\n", depth);
683 for (int i = 0; i < depth; ++i)
688 for (int i = 0; i < depth; ++i) {
694 for (int i = 0; i < depth; ++i) {
710 printf("%-15s -> %-15s\n", key, value);
754 KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
758 for (int level = 0; level < depth; ++level)
762 if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
765 print_types[print_types_depth - 1] = KMP_HW_CORE;
835 for (int level = 0; level < depth; ++level) {
858 // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
864 affinity.gran_levels = -1;
869 // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
881 affinity.gran_levels = -1;
926 for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
946 if (get_level(KMP_HW_L2) != -1)
948 else if (get_level(KMP_HW_TILE) != -1)
954 KMP_ASSERT(depth > 0);
955 for (int level = 0; level < depth; ++level) {
968 depth = ndepth;
970 for (int level = 0; level < depth; ++level) {
1018 __kmp_avail_proc--;
1036 __kmp_affin_origMask->copy(__kmp_affin_fullMask);
1050 __kmp_hw_subset->sort();
1052 __kmp_hw_subset->canonicalize(__kmp_topology);
1057 bool is_absolute = __kmp_hw_subset->is_absolute();
1058 int hw_subset_depth = __kmp_hw_subset->get_depth();
1066 const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
1074 // Check to see if current layer is in detected machine topology
1076 __kmp_hw_subset->at(i).type = equivalent_type;
1083 // Check to see if current layer has already been
1093 // Check to see if each layer's num & offset parameters are valid
1115 // Check if using a single core attribute on non-hybrid arch.
1118 // Check if using multiple core attributes on non-hybrid arch.
1155 KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
1169 int level_above = core_level - 1;
1228 abs_sub_ids[i] = -1;
1229 prev_sub_ids[i] = -1;
1232 core_eff_sub_ids[i] = -1;
1234 core_type_sub_ids[i] = -1;
1238 // Helpful to determine if a topology layer is targeted by an absolute subset
1246 // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted
1312 const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
1314 if (level == -1)
1385 if (hw_level >= depth)
1394 for (int i = 0; i < (depth - hw_level); ++i) {
1445 // The format is a comma separated list of non-negative integers or integer
1446 // ranges: e.g., 1,2,3-5,7,9-15
1456 char *end = buf + buf_len - 1;
1459 if (mask->begin() == mask->end()) {
1460 KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
1467 start = mask->begin();
1471 for (finish = mask->next(start), previous = start;
1472 finish == previous + 1 && finish != mask->end();
1473 finish = mask->next(finish)) {
1480 KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
1486 if (previous - start > 1) {
1487 KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
1490 KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
1492 if (previous - start > 0) {
1493 KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
1499 if (start == mask->end())
1502 if (end - scan < 2)
1513 // The format is a comma separated list of non-negative integers or integer
1514 // ranges: e.g., 1,2,3-5,7,9-15
1526 if (mask->begin() == mask->end()) {
1532 start = mask->begin();
1536 for (finish = mask->next(start), previous = start;
1537 finish == previous + 1 && finish != mask->end();
1538 finish = mask->next(finish)) {
1550 if (previous - start > 1) {
1551 __kmp_str_buf_print(buf, "%u-%u", start, previous);
1555 if (previous - start > 0) {
1561 if (start == mask->end())
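Both mask printers above (source lines 1445-1561) emit a comma-separated list of non-negative integers and ranges such as 1,2,3-5,7,9-15 by folding maximal runs of consecutive ids. A standalone sketch of that run-folding step over a sorted list of CPU ids, using std::string instead of the runtime's bounded buffers:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Fold a sorted list of distinct CPU ids into "1,2,3-5,7,9-15" style text.
    // Runs of length 1 print as "n", length 2 as "n,m", length >= 3 as "n-m".
    std::string mask_to_string(const std::vector<unsigned> &cpus) {
      if (cpus.empty())
        return "{<empty>}";
      std::string out;
      char tmp[32];
      size_t i = 0;
      while (i < cpus.size()) {
        unsigned start = cpus[i];
        unsigned previous = start;
        // Extend the run while the ids stay consecutive.
        while (i + 1 < cpus.size() && cpus[i + 1] == previous + 1)
          previous = cpus[++i];
        if (!out.empty())
          out += ",";
        if (previous - start > 1)
          std::snprintf(tmp, sizeof(tmp), "%u-%u", start, previous);
        else if (previous - start > 0)
          std::snprintf(tmp, sizeof(tmp), "%u,%u", start, previous);
        else
          std::snprintf(tmp, sizeof(tmp), "%u", start);
        out += tmp;
        ++i;
      }
      return out;
    }
    // e.g. {0,1,2,3,5,7,8,9} -> "0-3,5,7-9"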
1585 // e.g., 1,2,4-7,9,11-15
1599 } else if (c == '-') {
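The dash handling at source line 1599 belongs to scanning the same syntax in the other direction, expanding text like 1,2,4-7,9,11-15 back into individual ids. A minimal sketch of such a parser; it deliberately leaves out the stride syntax and error reporting that the runtime's explicit proc-list handling provides:

    #include <cstdlib>
    #include <vector>

    // Expand "1,2,4-7,9,11-15" into individual non-negative ids.
    std::vector<int> parse_id_list(const char *s) {
      std::vector<int> ids;
      while (*s) {
        char *end;
        long start = std::strtol(s, &end, 10);
        if (end == s)
          break;                 // not a number: stop quietly
        long finish = start;
        if (*end == '-')         // a range: read the upper bound
          finish = std::strtol(end + 1, &end, 10);
        for (long id = start; id <= finish; ++id)
          ids.push_back((int)id);
        if (*end == ',')
          ++end;                 // skip the separator
        s = end;
      }
      return ids;
    }
    // e.g. parse_id_list("1,2,4-7") -> {1, 2, 4, 5, 6, 7}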
1664 // internal topology object and set the layer ids for it. Each routine
1673 return hwloc_obj_type_is_cache(obj->type);
1675 return obj->type == HWLOC_OBJ_CACHE;
1683 if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1685 switch (obj->attr->cache.depth) {
1701 switch (obj->type) {
1712 if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1714 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1716 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1718 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1738 for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1739 obj->logical_index, type, 0);
1741 obj->type, first) == obj;
1742 first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1754 hwloc_obj_type_t ltype = lower->type;
1755 int lindex = lower->logical_index - 1;
1760 hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1761 if (obj->userdata) {
1762 sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1766 lindex--;
1770 lower->userdata = RCAST(void *, sub_id + 1);
1777 int depth;
1808 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1829 cpukinds[idx].efficiency = -1;
1855 // Figure out the depth and types in the topology
1856 depth = 0;
1857 obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1860 if (obj->memory_arity) {
1862 for (memory = obj->memory_first_child; memory;
1864 if (memory->type == HWLOC_OBJ_NUMANODE)
1867 if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1868 types[depth] = KMP_HW_NUMA;
1869 hwloc_types[depth] = memory->type;
1870 depth++;
1876 types[depth] = type;
1877 hwloc_types[depth] = obj->type;
1878 depth++;
1880 obj = obj->parent;
1882 KMP_ASSERT(depth > 0);
1885 for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1895 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1900 int index = depth - 1;
1901 bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1902 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1905 hw_thread.ids[index] = pu->logical_index;
1906 hw_thread.os_id = pu->os_index;
1910 int cpukind_index = -1;
1923 index--;
1928 obj = obj->parent;
1933 if (obj->memory_arity) {
1935 for (memory = obj->memory_first_child; memory;
1937 if (memory->type == HWLOC_OBJ_NUMANODE)
1940 if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1943 hw_thread.ids[index] = memory->logical_index;
1945 index--;
1956 hw_thread.ids[index] = obj->logical_index;
1958 index--;
1975 __kmp_topology->sort_ids();
1982 // mapping of os thread id's <-> processor id's.
1985 int depth = 3;
2010 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2018 kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
2033 // If multiple Windows* OS processor groups exist, we can create a 2-level
2039 int depth = 3;
2058 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2066 kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
2080 const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
2111 if (aa->pkgId < bb->pkgId)
2112 return -1;
2113 if (aa->pkgId > bb->pkgId)
2115 if (aa->coreId < bb->coreId)
2116 return -1;
2117 if (aa->coreId > bb->coreId)
2119 if (aa->threadId < bb->threadId)
2120 return -1;
2121 if (aa->threadId > bb->threadId)
2131 kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
2132 size_t get_depth() const { return depth; }
2152 size_t depth;
2157 while (depth < MAX_CACHE_LEVEL) {
2173 table[depth].level = cache_level;
2174 table[depth].mask = ((-1) << cache_mask_width);
2175 depth++;
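The mask built at source line 2174 keeps only the APIC-id bits that identify which cache a logical processor is attached to: two hardware threads share that cache level exactly when their APIC ids agree under the mask. A small sketch with made-up values (in the runtime the width would be derived from the CPUID leaf 4 count of logical processors sharing the cache):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Made-up width: e.g. up to 8 logical processors share the cache.
      unsigned cache_mask_width = 3;
      std::uint32_t mask = ~0u << cache_mask_width; // keep the "which cache" bits

      std::uint32_t apic_a = 0x12, apic_b = 0x15, apic_c = 0x1a;
      // Two threads share the cache iff their APIC ids agree under the mask.
      std::printf("a,b share: %d\n", (apic_a & mask) == (apic_b & mask)); // 1
      std::printf("a,c share: %d\n", (apic_a & mask) == (apic_c & mask)); // 0
      return 0;
    }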
2181 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
2203 // need to do something else - use the defaults that we calculated from
2245 // - Older OSes are usually found on machines with older chips, which do not
2247 // - The performance penalty for mistakenly identifying a machine as HT when
2253 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2269 // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
2271 // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
2276 // Hyper-Threading Technology is supported by the chip but has been
2278 // On other OS/chip combinations supporting Intel(R) Hyper-Threading
2280 // Hyper-Threading Technology is disabled and 2 when it is enabled.
2281 // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
2284 // bound, but the IA-32 architecture manual says that it is exactly the
2301 __kmp_affinity_dispatch->bind_thread(i);
2336 int widthT = widthCT - widthC;
2346 int maskC = (1 << widthC) - 1;
2349 int maskT = (1 << widthT) - 1;
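With the cpuid(1)/cpuid(4) method, the APIC id is split into thread, core, and package fields using the widths computed above: the low widthT bits give the thread id, the next widthC bits the core id, and the remaining high bits the package id. A hedged sketch of that decode with example widths (the runtime derives widthCT from the maximum logical processors per package and widthC from the maximum cores per package, per the comments above; here they are hard-coded):

    #include <cstdio>

    // Decode an initial APIC id into (pkgId, coreId, threadId) given the
    // field widths; the widths below are example values only.
    int main() {
      int widthCT = 4;               // bits for core+thread within a package
      int widthC = 3;                // bits for the core id
      int widthT = widthCT - widthC; // low bits that identify the thread
      int maskC = (1 << widthC) - 1;
      int maskT = (1 << widthT) - 1;

      unsigned apicId = 0x2b;        // example: package 2, core 5, thread 1
      int pkgId = (int)(apicId >> widthCT);
      int coreId = (int)(apicId >> widthT) & maskC;
      int threadId = (int)apicId & maskT;
      std::printf("pkg=%d core=%d thread=%d\n", pkgId, coreId, threadId);
      return 0;
    }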
2366 // [0 .. coresPerPkg-1] and threadId's are usually assigned
2367 // [0..threadsPerCore-1], we don't want to make any such assumptions.
2370 // total # packages) are at this point - we want to determine that now. We
2388 // intra-pkg consistency checks
2457 //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2458 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
2467 KMP_ASSERT(depth > 0);
2468 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
2473 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2489 __kmp_topology->sort_ids();
2490 if (!__kmp_topology->check_ids()) {
2525 [comment diagram spanning source lines 2525-2534: 32-bit register bit-field layout with columns 31-16 | 15-8 | 7-4 | 4-0]
2604 // If it is an unknown level, then logically move the previous layer up
2606 levels[levels_index - 1].mask_width = mask_width;
2607 levels[levels_index - 1].nitems = nitems;
2613 // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
2620 levels[i].mask = ~((-1) << levels[i].mask_width);
2621 levels[i].cache_mask = (-1) << levels[i].mask_width;
2626 levels[i].mask = (-1) << levels[i - 1].mask_width;
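With cpuid leaf 0x1F/0xB each topology level reports a cumulative mask width, so per-level masks are formed as above and a level id is recovered by masking the x2apic id and shifting off the width of the level below it (as at source lines 2786-2789). A short sketch of that extraction with made-up widths:

    #include <cstdint>
    #include <cstdio>

    // Cumulative mask widths as reported by cpuid leaf 0x1F/0xB, innermost
    // level first (example values: 1 SMT bit, then 4 core bits; whatever is
    // left identifies the package).
    int main() {
      int mask_width[2] = {1, 5};              // cumulative widths
      std::uint32_t x2apic = 0x57;             // example id: 0b1010111

      std::uint32_t mask[2];
      for (int i = 0; i < 2; ++i)
        mask[i] = ~(~0u << mask_width[i]);     // bits of this level and below

      std::uint32_t thread_id = x2apic & mask[0];
      std::uint32_t core_id = (x2apic & mask[1]) >> mask_width[0];
      std::uint32_t pkg_id = x2apic >> mask_width[1];

      std::printf("thread=%u core=%u package=%u\n", thread_id, core_id, pkg_id);
      return 0;
    }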
2685 topology_leaf = -1;
2699 if (topology_leaf == -1 || levels_index == 0) {
2708 // we need to do something else - use the defaults that we calculated from
2722 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2727 int depth = levels_index;
2728 for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2742 if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2745 __kmp_topology->set_equivalent_type(cache_type, type);
2771 __kmp_affinity_dispatch->bind_thread(proc);
2776 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2786 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2789 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2804 __kmp_topology->sort_ids();
2805 if (!__kmp_topology->check_ids()) {
2829 for (i = maxIndex;; i--) {
2831 return -1;
2887 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2956 buf[sizeof(buf) - 1] = 1;
2963 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2968 // FIXME - this will match "node_<n> <garbage>"
3127 buf[sizeof(buf) - 1] = 1;
3143 } else if (!buf[sizeof(buf) - 1]) {
3182 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
3184 char *p = strchr(buf + sizeof(s1) - 1, ':');
3232 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
3234 char *p = strchr(buf + sizeof(s2) - 1, ':');
3244 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
3246 char *p = strchr(buf + sizeof(s3) - 1, ':');
3257 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
3259 char *p = strchr(buf + sizeof(s4) - 1, ':');
3271 char *p = strchr(buf + sizeof(s4) - 1, ':');
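The cpuinfo method around source lines 3127-3271 reads the file line by line, matching key prefixes and taking the integer after the ':'. A simplified sketch of that parsing style; it keeps only three hypothetical keys and none of the runtime's validation or extra fields:

    #include <cstdlib>
    #include <cstring>
    #include <fstream>
    #include <string>
    #include <vector>

    struct cpu_rec { int proc = -1, pkg = -1, core = -1; };

    // Read /proc/cpuinfo-style text: records are separated by blank lines and
    // each line looks like "key<tab>: value". Only three keys are kept here.
    std::vector<cpu_rec> parse_cpuinfo(const char *path) {
      std::vector<cpu_rec> cpus;
      std::ifstream in(path);
      std::string line;
      cpu_rec cur;
      while (std::getline(in, line)) {
        if (line.empty()) {                   // a blank line ends the record
          if (cur.proc >= 0)
            cpus.push_back(cur);
          cur = cpu_rec();
          continue;
        }
        const char *buf = line.c_str();
        const char *colon = std::strchr(buf, ':');
        if (!colon)
          continue;
        int val = std::atoi(colon + 1);
        if (std::strncmp(buf, "processor", 9) == 0)
          cur.proc = val;
        else if (std::strncmp(buf, "physical id", 11) == 0)
          cur.pkg = val;
        else if (std::strncmp(buf, "core id", 7) == 0)
          cur.core = val;
      }
      if (cur.proc >= 0)                      // the last record may not be
        cpus.push_back(cur);                  // followed by a blank line
      return cpus;
    }
    // e.g. parse_cpuinfo("/proc/cpuinfo")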
3286 // leading tokens that we don't recognize - if the line isn't empty, go on
3369 // [0 .. coresPerPkg-1] and threadId's are usually assigned
3370 // [0..threadsPerCore-1], we don't want to make any such assumptions.
3373 // total # packages) are at this point - we want to determine that now. We
3411 for (index = maxIndex; index >= threadIdIndex; index--) {
3413 // Auto-assign the thread id field if it wasn't specified.
3445 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3451 // Auto-assign the thread id field if it wasn't specified.
3468 // Also, check that we haven't already restarted the loop (to be safe -
3490 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3540 int depth = 0;
3543 int pkgLevel = -1;
3544 int coreLevel = -1;
3545 int threadLevel = -1;
3548 depth++;
3563 KMP_ASSERT(depth > 0);
3566 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
3571 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3576 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
3596 __kmp_topology->sort_ids();
3597 if (!__kmp_topology->check_ids()) {
3616 int numAddrs = __kmp_topology->get_num_hw_threads();
3617 int depth = __kmp_topology->get_depth();
3620 KMP_ASSERT(depth);
3622 i = find_next(-1);
3629 for (i = numAddrs - 1;; --i) {
3630 int osId = __kmp_topology->at(i).os_id;
3643 if (affinity.gran_levels >= (int)depth) {
3658 i = j = leader = find_next(-1);
3659 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3665 if (__kmp_topology->is_close(leader, i, affinity)) {
3666 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3673 int osId = __kmp_topology->at(j).os_id;
3677 __kmp_topology->at(j).leader = (j == leader);
3685 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3691 int osId = __kmp_topology->at(j).os_id;
3695 __kmp_topology->at(j).leader = (j == leader);
3703 __kmp_topology->print(env_var);
3710 // as file-static than to try and pass them through the calling sequence of
3711 // the recursive-descent OMP_PLACES parser.
3745 // Re-parse the proclist (for the explicit affinity type), and form the list
3753 int maxOsId = affinity.num_os_id_masks - 1;
3849 if (*next != '-') {
3860 // This is a range. Skip over the '-' and read in the 2nd int.
3861 next++; // skip '-'
3878 if (*next == '-') {
3879 sign = -1;
3898 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3937 /*-----------------------------------------------------------------------------
3938 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3955 signed := - signed
3956 -----------------------------------------------------------------------------*/
4035 if (**scan == '-') {
4036 sign *= -1;
4037 (*scan)++; // skip '-'
4118 int maxOsId = affinity.num_os_id_masks - 1;
4183 if (*scan == '-') {
4184 sign *= -1;
4185 scan++; // skip '-'
4218 if (i < count - 1) {
4269 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
4270 for (int j = bottom_level; j > 0; j--) {
4272 if (core_level < (j - 1)) {
4273 core_level = j - 1;
4284 return __kmp_topology->get_count(core_level);
4290 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
4294 if (__kmp_topology->at(i + 1).sub_ids[j] !=
4295 __kmp_topology->at(i).sub_ids[j]) {
4311 int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
4312 return __kmp_topology->calculate_ratio(thread_level, core_level);
4333 int depth = __kmp_topology->get_depth();
4337 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
4338 for (int level = 0; level < depth; ++level) {
4339 kmp_hw_t type = __kmp_topology->get_type(level);
4347 for (; level < depth; ++level) {
4348 kmp_hw_t type = __kmp_topology->get_type(level);
4370 const kmp_affin_mask_t *mask = th->th.th_affin_mask;
4371 kmp_affinity_ids_t &ids = th->th.th_topology_ids;
4372 kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
4390 int max_cpu = __kmp_affin_fullMask->get_max_cpu();
4391 int num_hw_threads = __kmp_topology->get_num_hw_threads();
4410 int os_id = __kmp_topology->at(hw_thread).os_id;
4426 if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
4427 machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
4440 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4449 // Create the "full" mask - this defines all of the processors that we
4469 __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4505 __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4509 __kmp_affin_fullMask->set_process_affinity(true);
4529 // In the default code path, errors are not fatal - we just try using
4534 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4594 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4660 __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
4663 __kmp_topology->print(env_var);
4670 __kmp_topology->canonicalize();
4672 __kmp_topology->print(env_var);
4673 bool filtered = __kmp_topology->filter_hw_subset();
4675 __kmp_topology->print("KMP_HW_SUBSET");
4695 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4714 __kmp_topology->set_granularity(affinity);
4715 int depth = __kmp_topology->get_depth();
4719 int numAddrs = __kmp_topology->get_num_hw_threads();
4724 KMP_ASSERT(idx >= -1);
4726 if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
4745 KMP_ASSERT(idx >= -1);
4787 if (affinity.compact >= depth) {
4800 if (affinity.compact >= depth) {
4803 affinity.compact = depth - 1 - affinity.compact;
4808 if (affinity.compact >= depth) {
4809 affinity.compact = depth - 1;
4814 if (depth <= 1 || is_hidden_helper_affinity) {
4820 } else if (!__kmp_topology->is_uniform()) {
4821 // Save the depth for further usage
4822 __kmp_aff_depth = depth;
4825 __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
4826 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
4829 __kmp_avail_proc, depth - 1, core_level);
4842 procarr[i] = -1;
4845 int lastcore = -1;
4848 int proc = __kmp_topology->at(i).os_id;
4849 int core = __kmp_affinity_find_core(i, depth - 1, core_level);
4861 if (affinity.compact >= depth) {
4862 affinity.compact = depth - 1;
4866 // Allocate the gtid->affinity mask table.
4884 __kmp_topology->sort_compact(affinity);
4888 int num_hw_threads = __kmp_topology->get_num_hw_threads();
4891 if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
4894 int osId = __kmp_topology->at(i).os_id;
4908 __kmp_topology->print(env_var);
4912 __kmp_topology->sort_ids();
4943 if (affinity->masks != NULL)
4944 KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
4945 if (affinity->os_id_masks != NULL)
4946 KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
4947 if (affinity->proclist != NULL)
4948 __kmp_free(affinity->proclist);
4949 if (affinity->ids != NULL)
4950 __kmp_free(affinity->ids);
4951 if (affinity->attrs != NULL)
4952 __kmp_free(affinity->attrs);
4953 *affinity = KMP_AFFINITY_INIT(affinity->env_var);
5000 mask_idx = gtid - 2;
5003 KMP_DEBUG_ASSERT(affinity->num_masks > 0);
5004 *place = (mask_idx + affinity->offset) % affinity->num_masks;
5005 *mask = KMP_CPU_INDEX(affinity->masks, *place);
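The fragment above assigns each thread a place round-robin: a per-thread index (derived here from the gtid) plus the user-specified offset, modulo the number of place masks. The arithmetic, as a tiny sketch:

    #include <cstdio>

    // Round-robin place selection: a 0-based thread index plus the user
    // offset, modulo the number of places.
    int pick_place(int thread_idx, int offset, int num_places) {
      return (thread_idx + offset) % num_places;
    }

    int main() {
      // e.g. 4 places with offset 1: threads 0..4 land on places 1,2,3,0,1
      for (int t = 0; t < 5; ++t)
        std::printf("thread %d -> place %d\n", t, pick_place(t, 1, 4));
      return 0;
    }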
5008 // This function initializes the per-thread data concerning affinity including
5016 th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
5017 th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
5023 if (th->th.th_affin_mask == NULL) {
5024 KMP_CPU_ALLOC(th->th.th_affin_mask);
5026 KMP_CPU_ZERO(th->th.th_affin_mask);
5045 if ((affinity->type == affinity_none) ||
5046 (affinity->type == affinity_balanced) ||
5074 th->th.th_current_place = i;
5076 th->th.th_new_place = i;
5077 th->th.th_first_place = 0;
5078 th->th.th_last_place = affinity->num_masks - 1;
5080 // When using a Non-OMP_PROC_BIND affinity method,
5081 // set all threads' place-partition-var to the entire place list
5082 th->th.th_first_place = 0;
5083 th->th.th_last_place = affinity->num_masks - 1;
5087 th->th.th_topology_ids = __kmp_affinity.ids[i];
5088 th->th.th_topology_attrs = __kmp_affinity.attrs[i];
5099 KMP_CPU_COPY(th->th.th_affin_mask, mask);
5117 if (affinity->flags.verbose && (affinity->type == affinity_none ||
5118 (th->th.th_current_place != KMP_PLACE_ALL &&
5119 affinity->type != affinity_balanced)) &&
5123 th->th.th_affin_mask);
5132 if (affinity->type == affinity_none) {
5133 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
5138 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5152 gtid, th->th.th_new_place, th->th.th_current_place));
5155 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5156 KMP_ASSERT(th->th.th_new_place >= 0);
5157 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
5158 if (th->th.th_first_place <= th->th.th_last_place) {
5159 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
5160 (th->th.th_new_place <= th->th.th_last_place));
5162 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
5163 (th->th.th_new_place >= th->th.th_last_place));
5169 KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
5170 KMP_CPU_COPY(th->th.th_affin_mask, mask);
5171 th->th.th_current_place = th->th.th_new_place;
5176 th->th.th_affin_mask);
5180 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5189 return -1;
5232 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5235 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
5238 th->th.th_current_place = KMP_PLACE_UNDEFINED;
5239 th->th.th_new_place = KMP_PLACE_UNDEFINED;
5240 th->th.th_first_place = 0;
5241 th->th.th_last_place = __kmp_affinity.num_masks - 1;
5244 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
5256 return -1;
5265 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5271 th->th.th_affin_mask);
5300 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
5320 return -1;
5341 return -1;
5344 return -2;
5353 return -1;
5374 return -1;
5377 return -2;
5386 return -1;
5407 return -1;
5420 int high = __kmp_topology->get_num_hw_threads() - 1;
5422 while (high - low > 1) {
5424 if (__kmp_topology->at(mid).attrs.get_core_type() ==
5431 if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
5434 return -1;
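The loop at source lines 5420-5434 binary-searches the sorted hardware-thread table for the boundary between the two core types on a hybrid CPU. A standalone sketch of the same kind of boundary search, written as a conventional lower bound over an array of illustrative core-type tags:

    #include <cstdio>
    #include <vector>

    enum core_type { CORE_PERF, CORE_EFF }; // illustrative tags only

    // Hardware threads sorted so that all performance cores precede the
    // efficiency cores: binary-search the index of the first efficiency
    // core, or return -1 if there is none.
    int first_efficiency_index(const std::vector<core_type> &types) {
      int low = 0, high = (int)types.size();
      while (low < high) {
        int mid = low + (high - low) / 2;
        if (types[mid] == CORE_PERF)
          low = mid + 1;  // boundary lies to the right of mid
        else
          high = mid;     // mid could be the first efficiency core
      }
      return (low < (int)types.size()) ? low : -1;
    }

    int main() {
      std::vector<core_type> t = {CORE_PERF, CORE_PERF, CORE_PERF,
                                  CORE_EFF, CORE_EFF};
      std::printf("first efficiency core at index %d\n",
                  first_efficiency_index(t)); // prints 3
      return 0;
    }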
5438 // Dynamic affinity settings - Affinity balanced
5442 int tid = th->th.th_info.ds.ds_tid;
5466 if (__kmp_topology->is_uniform()) {
5479 // How many cores will have an additional thread bound to it - "big cores"
5487 coreID = (tid - big_cores) / chunk;
5488 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
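For a uniform topology, balanced affinity spreads nthreads across ncores as evenly as possible: big_cores cores receive chunk + 1 threads and the rest receive chunk, and the expressions at source lines 5487-5488 map a thread id in the second region back to its core and in-core slot. A hedged arithmetic sketch of that mapping; only the second branch appears in the fragments above, so the first branch is an inferred counterpart:

    #include <cstdio>

    // Map a thread id onto (core, slot within core) when nthreads threads are
    // spread over ncores cores as evenly as possible; nth_per_core bounds the
    // slot index.
    void balanced_map(int tid, int nthreads, int ncores, int nth_per_core,
                      int *coreID, int *threadID) {
      int chunk = nthreads / ncores;     // every core gets at least this many
      int big_cores = nthreads % ncores; // these cores get one extra thread
      int big_nth = (chunk + 1) * big_cores;
      if (tid < big_nth) {               // thread lands on one of the big cores
        *coreID = tid / (chunk + 1);
        *threadID = (tid % (chunk + 1)) % nth_per_core;
      } else {                           // thread lands on a regular core
        *coreID = (tid - big_cores) / chunk;
        *threadID = ((tid - big_cores) % chunk) % nth_per_core;
      }
    }

    int main() {
      // Example: 10 threads on 4 cores with 2 hw threads per core:
      // cores 0 and 1 take 3 threads each, cores 2 and 3 take 2 each.
      for (int tid = 0; tid < 10; ++tid) {
        int c, t;
        balanced_map(tid, 10, 4, 2, &c, &t);
        std::printf("tid %d -> core %d, slot %d\n", tid, c, t);
      }
      return 0;
    }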
5493 kmp_affin_mask_t *mask = th->th.th_affin_mask;
5498 __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
5503 osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
5515 } else { // Non-uniform topology
5517 kmp_affin_mask_t *mask = th->th.th_affin_mask;
5521 __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
5523 __kmp_aff_depth - 1, core_level);
5525 __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5531 int osID = __kmp_topology->at(tid).os_id;
5535 __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
5537 int osID = __kmp_topology->at(i).os_id;
5538 if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
5551 if (procarr[i * nth_per_core + j] != -1) {
5560 if (osID != -1) {
5593 if (procarr[i * nth_per_core + j] != -1) {
5626 if (procarr[i * nth_per_core + k] != -1) {
5629 cnt--;
5630 nth--;
5635 cnt--;
5636 nth--;
5663 if (osID != -1) {
5694 // 4) Use non-OpenMP parallelization
5702 // -1 if we cannot bind thread
5707 // Do not touch non-omp threads
5709 "non-omp thread, returning\n"));
5710 return -1;
5715 return -1;