1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24   class Mask : public KMPAffinity::Mask {
25     hwloc_cpuset_t mask;
26 
27   public:
28     Mask() {
29       mask = hwloc_bitmap_alloc();
30       this->zero();
31     }
32     Mask(const Mask &other) = delete;
33     Mask &operator=(const Mask &other) = delete;
34     ~Mask() { hwloc_bitmap_free(mask); }
35     void set(int i) override { hwloc_bitmap_set(mask, i); }
36     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
37     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
38     void zero() override { hwloc_bitmap_zero(mask); }
39     bool empty() const override { return hwloc_bitmap_iszero(mask); }
40     void copy(const KMPAffinity::Mask *src) override {
41       const Mask *convert = static_cast<const Mask *>(src);
42       hwloc_bitmap_copy(mask, convert->mask);
43     }
44     void bitwise_and(const KMPAffinity::Mask *rhs) override {
45       const Mask *convert = static_cast<const Mask *>(rhs);
46       hwloc_bitmap_and(mask, mask, convert->mask);
47     }
48     void bitwise_or(const KMPAffinity::Mask *rhs) override {
49       const Mask *convert = static_cast<const Mask *>(rhs);
50       hwloc_bitmap_or(mask, mask, convert->mask);
51     }
52     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
53     bool is_equal(const KMPAffinity::Mask *rhs) const override {
54       const Mask *convert = static_cast<const Mask *>(rhs);
55       return hwloc_bitmap_isequal(mask, convert->mask);
56     }
57     int begin() const override { return hwloc_bitmap_first(mask); }
58     int end() const override { return -1; }
59     int next(int previous) const override {
60       return hwloc_bitmap_next(mask, previous);
61     }
62     int get_system_affinity(bool abort_on_error) override {
63       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
64                   "Illegal get affinity operation when not capable");
65       long retval =
66           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
67       if (retval >= 0) {
68         return 0;
69       }
70       int error = errno;
71       if (abort_on_error) {
72         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
73                     KMP_ERR(error), __kmp_msg_null);
74       }
75       return error;
76     }
77     int set_system_affinity(bool abort_on_error) const override {
78       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
79                   "Illegal set affinity operation when not capable");
80       long retval =
81           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
82       if (retval >= 0) {
83         return 0;
84       }
85       int error = errno;
86       if (abort_on_error) {
87         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
88                     KMP_ERR(error), __kmp_msg_null);
89       }
90       return error;
91     }
92 #if KMP_OS_WINDOWS
93     int set_process_affinity(bool abort_on_error) const override {
94       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
95                   "Illegal set process affinity operation when not capable");
96       int error = 0;
97       const hwloc_topology_support *support =
98           hwloc_topology_get_support(__kmp_hwloc_topology);
99       if (support->cpubind->set_proc_cpubind) {
100         int retval;
101         retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
102                                    HWLOC_CPUBIND_PROCESS);
103         if (retval >= 0)
104           return 0;
105         error = errno;
106         if (abort_on_error)
107           __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
108                       KMP_ERR(error), __kmp_msg_null);
109       }
110       return error;
111     }
112 #endif
113     int get_proc_group() const override {
114       int group = -1;
115 #if KMP_OS_WINDOWS
116       if (__kmp_num_proc_groups == 1) {
117         return 1;
118       }
119       for (int i = 0; i < __kmp_num_proc_groups; i++) {
120         // On Windows, the long type is always 32 bits
121         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
122         unsigned long second_32_bits =
123             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
124         if (first_32_bits == 0 && second_32_bits == 0) {
125           continue;
126         }
127         if (group >= 0) {
128           return -1;
129         }
130         group = i;
131       }
132 #endif /* KMP_OS_WINDOWS */
133       return group;
134     }
135   };
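  // Illustrative note (an addition for clarity, not part of the interface):
  // begin(), next() and end() form a simple iteration protocol. For the hwloc
  // mask, begin() and next() return -1 once no further bit is set, which
  // matches the -1 returned by end(), so all set CPUs can be walked with:
  //
  //   for (int cpu = m->begin(); cpu != m->end(); cpu = m->next(cpu)) {
  //     // ... use cpu ...
  //   }
  //
  // The KMP_CPU_SET_ITERATE macro used later in this file wraps this pattern.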
136   void determine_capable(const char *var) override {
137     const hwloc_topology_support *topology_support;
138     if (__kmp_hwloc_topology == NULL) {
139       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
140         __kmp_hwloc_error = TRUE;
141         if (__kmp_affinity.flags.verbose) {
142           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
143         }
144       }
145       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
146         __kmp_hwloc_error = TRUE;
147         if (__kmp_affinity.flags.verbose) {
148           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
149         }
150       }
151     }
152     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
153     // Is the system capable of setting/getting this thread's affinity?
154     // Also, is topology discovery possible? (pu indicates ability to discover
155     // processing units). And finally, were there no errors when calling any
156     // hwloc_* API functions?
157     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
158         topology_support->cpubind->get_thisthread_cpubind &&
159         topology_support->discovery->pu && !__kmp_hwloc_error) {
160       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
161       KMP_AFFINITY_ENABLE(TRUE);
162     } else {
163       // indicate that hwloc didn't work and disable affinity
164       __kmp_hwloc_error = TRUE;
165       KMP_AFFINITY_DISABLE();
166     }
167   }
168   void bind_thread(int which) override {
169     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
170                 "Illegal set affinity operation when not capable");
171     KMPAffinity::Mask *mask;
172     KMP_CPU_ALLOC_ON_STACK(mask);
173     KMP_CPU_ZERO(mask);
174     KMP_CPU_SET(which, mask);
175     __kmp_set_system_affinity(mask, TRUE);
176     KMP_CPU_FREE_FROM_STACK(mask);
177   }
178   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
179   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
180   KMPAffinity::Mask *allocate_mask_array(int num) override {
181     return new Mask[num];
182   }
183   void deallocate_mask_array(KMPAffinity::Mask *array) override {
184     Mask *hwloc_array = static_cast<Mask *>(array);
185     delete[] hwloc_array;
186   }
187   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
188                                       int index) override {
189     Mask *hwloc_array = static_cast<Mask *>(array);
190     return &(hwloc_array[index]);
191   }
192   api_type get_api_type() const override { return HWLOC; }
193 };
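// Illustrative usage sketch (a hypothetical caller, not part of the runtime):
// binding the calling thread to logical CPU 3 through the hwloc-backed
// implementation, assuming affinity has already been determined capable and
// __kmp_hwloc_topology has been loaded. Only the interface defined above is
// used.
//
//   KMPHwlocAffinity api;
//   KMPAffinity::Mask *m = api.allocate_mask();
//   m->zero();
//   m->set(3); // request logical CPU 3
//   m->set_system_affinity(/*abort_on_error=*/true);
//   api.deallocate_mask(m);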
194 #endif /* KMP_USE_HWLOC */
195 
196 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||     \
197     KMP_OS_AIX
198 #if KMP_OS_LINUX
199 /* On some of the older OS's that we build on, these constants aren't present
200    in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
201    all systems of the same arch where they are defined, and they cannot
202    change; they are set in stone forever. */
203 #include <sys/syscall.h>
204 #if KMP_ARCH_X86 || KMP_ARCH_ARM
205 #ifndef __NR_sched_setaffinity
206 #define __NR_sched_setaffinity 241
207 #elif __NR_sched_setaffinity != 241
208 #error Wrong code for setaffinity system call.
209 #endif /* __NR_sched_setaffinity */
210 #ifndef __NR_sched_getaffinity
211 #define __NR_sched_getaffinity 242
212 #elif __NR_sched_getaffinity != 242
213 #error Wrong code for getaffinity system call.
214 #endif /* __NR_sched_getaffinity */
215 #elif KMP_ARCH_AARCH64
216 #ifndef __NR_sched_setaffinity
217 #define __NR_sched_setaffinity 122
218 #elif __NR_sched_setaffinity != 122
219 #error Wrong code for setaffinity system call.
220 #endif /* __NR_sched_setaffinity */
221 #ifndef __NR_sched_getaffinity
222 #define __NR_sched_getaffinity 123
223 #elif __NR_sched_getaffinity != 123
224 #error Wrong code for getaffinity system call.
225 #endif /* __NR_sched_getaffinity */
226 #elif KMP_ARCH_X86_64
227 #ifndef __NR_sched_setaffinity
228 #define __NR_sched_setaffinity 203
229 #elif __NR_sched_setaffinity != 203
230 #error Wrong code for setaffinity system call.
231 #endif /* __NR_sched_setaffinity */
232 #ifndef __NR_sched_getaffinity
233 #define __NR_sched_getaffinity 204
234 #elif __NR_sched_getaffinity != 204
235 #error Wrong code for getaffinity system call.
236 #endif /* __NR_sched_getaffinity */
237 #elif KMP_ARCH_PPC64
238 #ifndef __NR_sched_setaffinity
239 #define __NR_sched_setaffinity 222
240 #elif __NR_sched_setaffinity != 222
241 #error Wrong code for setaffinity system call.
242 #endif /* __NR_sched_setaffinity */
243 #ifndef __NR_sched_getaffinity
244 #define __NR_sched_getaffinity 223
245 #elif __NR_sched_getaffinity != 223
246 #error Wrong code for getaffinity system call.
247 #endif /* __NR_sched_getaffinity */
248 #elif KMP_ARCH_MIPS
249 #ifndef __NR_sched_setaffinity
250 #define __NR_sched_setaffinity 4239
251 #elif __NR_sched_setaffinity != 4239
252 #error Wrong code for setaffinity system call.
253 #endif /* __NR_sched_setaffinity */
254 #ifndef __NR_sched_getaffinity
255 #define __NR_sched_getaffinity 4240
256 #elif __NR_sched_getaffinity != 4240
257 #error Wrong code for getaffinity system call.
258 #endif /* __NR_sched_getaffinity */
259 #elif KMP_ARCH_MIPS64
260 #ifndef __NR_sched_setaffinity
261 #define __NR_sched_setaffinity 5195
262 #elif __NR_sched_setaffinity != 5195
263 #error Wrong code for setaffinity system call.
264 #endif /* __NR_sched_setaffinity */
265 #ifndef __NR_sched_getaffinity
266 #define __NR_sched_getaffinity 5196
267 #elif __NR_sched_getaffinity != 5196
268 #error Wrong code for getaffinity system call.
269 #endif /* __NR_sched_getaffinity */
270 #elif KMP_ARCH_LOONGARCH64
271 #ifndef __NR_sched_setaffinity
272 #define __NR_sched_setaffinity 122
273 #elif __NR_sched_setaffinity != 122
274 #error Wrong code for setaffinity system call.
275 #endif /* __NR_sched_setaffinity */
276 #ifndef __NR_sched_getaffinity
277 #define __NR_sched_getaffinity 123
278 #elif __NR_sched_getaffinity != 123
279 #error Wrong code for getaffinity system call.
280 #endif /* __NR_sched_getaffinity */
281 #elif KMP_ARCH_RISCV64
282 #ifndef __NR_sched_setaffinity
283 #define __NR_sched_setaffinity 122
284 #elif __NR_sched_setaffinity != 122
285 #error Wrong code for setaffinity system call.
286 #endif /* __NR_sched_setaffinity */
287 #ifndef __NR_sched_getaffinity
288 #define __NR_sched_getaffinity 123
289 #elif __NR_sched_getaffinity != 123
290 #error Wrong code for getaffinity system call.
291 #endif /* __NR_sched_getaffinity */
292 #elif KMP_ARCH_VE
293 #ifndef __NR_sched_setaffinity
294 #define __NR_sched_setaffinity 203
295 #elif __NR_sched_setaffinity != 203
296 #error Wrong code for setaffinity system call.
297 #endif /* __NR_sched_setaffinity */
298 #ifndef __NR_sched_getaffinity
299 #define __NR_sched_getaffinity 204
300 #elif __NR_sched_getaffinity != 204
301 #error Wrong code for getaffinity system call.
302 #endif /* __NR_sched_getaffinity */
303 #elif KMP_ARCH_S390X
304 #ifndef __NR_sched_setaffinity
305 #define __NR_sched_setaffinity 239
306 #elif __NR_sched_setaffinity != 239
307 #error Wrong code for setaffinity system call.
308 #endif /* __NR_sched_setaffinity */
309 #ifndef __NR_sched_getaffinity
310 #define __NR_sched_getaffinity 240
311 #elif __NR_sched_getaffinity != 240
312 #error Wrong code for getaffinity system call.
313 #endif /* __NR_sched_getaffinity */
314 #else
315 #error Unknown or unsupported architecture
316 #endif /* KMP_ARCH_* */
317 #elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
318 #include <pthread.h>
319 #include <pthread_np.h>
320 #elif KMP_OS_NETBSD
321 #include <pthread.h>
322 #include <sched.h>
323 #elif KMP_OS_AIX
324 #include <sys/dr.h>
325 #include <sys/rset.h>
326 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
327 #define GET_NUMBER_SMT_SETS 0x0004
328 extern "C" int syssmt(int flags, int, int, int *);
329 #endif
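// Note on the representation used below (a worked example added for clarity;
// the sizes are illustrative assumptions, not measured values): the native
// mask is an array of mask_t (unsigned long) words spanning
// __kmp_affin_mask_size bytes, and bit i lives in word i / BITS_PER_MASK_T at
// position i % BITS_PER_MASK_T. For instance, with __kmp_affin_mask_size == 16
// on an LP64 system, BITS_PER_MASK_T is 64, the mask holds two words, and
// CPU 70 maps to mask[1], bit 6.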
330 class KMPNativeAffinity : public KMPAffinity {
331   class Mask : public KMPAffinity::Mask {
332     typedef unsigned long mask_t;
333     typedef decltype(__kmp_affin_mask_size) mask_size_type;
334     static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
335     static const mask_t ONE = 1;
336     mask_size_type get_num_mask_types() const {
337       return __kmp_affin_mask_size / sizeof(mask_t);
338     }
339 
340   public:
341     mask_t *mask;
342     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
343     ~Mask() {
344       if (mask)
345         __kmp_free(mask);
346     }
347     void set(int i) override {
348       mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
349     }
350     bool is_set(int i) const override {
351       return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
352     }
353     void clear(int i) override {
354       mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
355     }
356     void zero() override {
357       mask_size_type e = get_num_mask_types();
358       for (mask_size_type i = 0; i < e; ++i)
359         mask[i] = (mask_t)0;
360     }
361     bool empty() const override {
362       mask_size_type e = get_num_mask_types();
363       for (mask_size_type i = 0; i < e; ++i)
364         if (mask[i] != (mask_t)0)
365           return false;
366       return true;
367     }
368     void copy(const KMPAffinity::Mask *src) override {
369       const Mask *convert = static_cast<const Mask *>(src);
370       mask_size_type e = get_num_mask_types();
371       for (mask_size_type i = 0; i < e; ++i)
372         mask[i] = convert->mask[i];
373     }
374     void bitwise_and(const KMPAffinity::Mask *rhs) override {
375       const Mask *convert = static_cast<const Mask *>(rhs);
376       mask_size_type e = get_num_mask_types();
377       for (mask_size_type i = 0; i < e; ++i)
378         mask[i] &= convert->mask[i];
379     }
380     void bitwise_or(const KMPAffinity::Mask *rhs) override {
381       const Mask *convert = static_cast<const Mask *>(rhs);
382       mask_size_type e = get_num_mask_types();
383       for (mask_size_type i = 0; i < e; ++i)
384         mask[i] |= convert->mask[i];
385     }
386     void bitwise_not() override {
387       mask_size_type e = get_num_mask_types();
388       for (mask_size_type i = 0; i < e; ++i)
389         mask[i] = ~(mask[i]);
390     }
391     bool is_equal(const KMPAffinity::Mask *rhs) const override {
392       const Mask *convert = static_cast<const Mask *>(rhs);
393       mask_size_type e = get_num_mask_types();
394       for (mask_size_type i = 0; i < e; ++i)
395         if (mask[i] != convert->mask[i])
396           return false;
397       return true;
398     }
399     int begin() const override {
400       int retval = 0;
401       while (retval < end() && !is_set(retval))
402         ++retval;
403       return retval;
404     }
405     int end() const override {
406       int e;
407       __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
408       return e;
409     }
410     int next(int previous) const override {
411       int retval = previous + 1;
412       while (retval < end() && !is_set(retval))
413         ++retval;
414       return retval;
415     }
416 #if KMP_OS_AIX
417     // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
418     // This routine is only used to get the full mask.
419     int get_system_affinity(bool abort_on_error) override {
420       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
421                   "Illegal get affinity operation when not capable");
422 
423       (void)abort_on_error;
424 
425       // Set the mask with all CPUs that are available.
426       for (int i = 0; i < __kmp_xproc; ++i)
427         KMP_CPU_SET(i, this);
428       return 0;
429     }
430     int set_system_affinity(bool abort_on_error) const override {
431       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
433                   "Illegal set affinity operation when not capable");
434 
435       int location;
436       int gtid = __kmp_entry_gtid();
437       int tid = thread_self();
438 
439       // Unbind the thread if it was previously bound to any processors so
440       // that we can bind it only to the CPUs specified by the mask.
441       int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
442 
443       // On AIX, the bindprocessor() system call can only bind the thread to
444       // one CPU at a time, rather than to a set of CPUs.
445       KMP_CPU_SET_ITERATE(location, this) {
446         if (KMP_CPU_ISSET(location, this)) {
447           retval = bindprocessor(BINDTHREAD, tid, location);
448           if (retval == -1 && errno == 1) {
449             rsid_t rsid;
450             rsethandle_t rsh;
451             // Put something in rsh to prevent compiler warning
452             // about uninitialized use
453             rsh = rs_alloc(RS_EMPTY);
454             rsid.at_pid = getpid();
455             if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
456               retval = ra_detachrset(R_PROCESS, rsid, 0);
457               retval = bindprocessor(BINDTHREAD, tid, location);
458             }
459           }
460           if (retval == 0) {
461             KA_TRACE(10, ("__kmp_set_system_affinity:  Done binding "
462                           "T#%d to cpu=%d.\n",
463                           gtid, location));
464             continue;
465           }
466           int error = errno;
467           if (abort_on_error) {
468             __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
469                         KMP_ERR(error), __kmp_msg_null);
470             KA_TRACE(10, ("__kmp_set_system_affinity:  Error binding "
471                           "T#%d to cpu=%d, errno=%d.\n",
472                           gtid, location, error));
473             return error;
474           }
475         }
476       }
477       return 0;
478     }
479 #else // !KMP_OS_AIX
480     int get_system_affinity(bool abort_on_error) override {
481       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
482                   "Illegal get affinity operation when not capable");
483 #if KMP_OS_LINUX
484       long retval =
485           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
486 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
487       int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
488                                      reinterpret_cast<cpuset_t *>(mask));
489       int retval = (r == 0 ? 0 : -1);
490 #endif
491       if (retval >= 0) {
492         return 0;
493       }
494       int error = errno;
495       if (abort_on_error) {
496         __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
497                     KMP_ERR(error), __kmp_msg_null);
498       }
499       return error;
500     }
501     int set_system_affinity(bool abort_on_error) const override {
502       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
503                   "Illegal set affinity operation when not capable");
504 #if KMP_OS_LINUX
505       long retval =
506           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
507 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
508       int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
509                                      reinterpret_cast<cpuset_t *>(mask));
510       int retval = (r == 0 ? 0 : -1);
511 #endif
512       if (retval >= 0) {
513         return 0;
514       }
515       int error = errno;
516       if (abort_on_error) {
517         __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
518                     KMP_ERR(error), __kmp_msg_null);
519       }
520       return error;
521     }
522 #endif // KMP_OS_AIX
523   };
524   void determine_capable(const char *env_var) override {
525     __kmp_affinity_determine_capable(env_var);
526   }
527   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
528   KMPAffinity::Mask *allocate_mask() override {
529     KMPNativeAffinity::Mask *retval = new Mask();
530     return retval;
531   }
532   void deallocate_mask(KMPAffinity::Mask *m) override {
533     KMPNativeAffinity::Mask *native_mask =
534         static_cast<KMPNativeAffinity::Mask *>(m);
535     delete native_mask;
536   }
537   KMPAffinity::Mask *allocate_mask_array(int num) override {
538     return new Mask[num];
539   }
540   void deallocate_mask_array(KMPAffinity::Mask *array) override {
541     Mask *linux_array = static_cast<Mask *>(array);
542     delete[] linux_array;
543   }
544   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
545                                       int index) override {
546     Mask *linux_array = static_cast<Mask *>(array);
547     return &(linux_array[index]);
548   }
549   api_type get_api_type() const override { return NATIVE_OS; }
550 };
551 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY  \
552           || KMP_OS_AIX */
553 
554 #if KMP_OS_WINDOWS
555 class KMPNativeAffinity : public KMPAffinity {
556   class Mask : public KMPAffinity::Mask {
557     typedef ULONG_PTR mask_t;
558     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
559     mask_t *mask;
560 
561   public:
562     Mask() {
563       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
564     }
565     ~Mask() {
566       if (mask)
567         __kmp_free(mask);
568     }
569     void set(int i) override {
570       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
571     }
572     bool is_set(int i) const override {
573       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
574     }
575     void clear(int i) override {
576       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
577     }
578     void zero() override {
579       for (int i = 0; i < __kmp_num_proc_groups; ++i)
580         mask[i] = 0;
581     }
582     bool empty() const override {
583       for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
584         if (mask[i])
585           return false;
586       return true;
587     }
588     void copy(const KMPAffinity::Mask *src) override {
589       const Mask *convert = static_cast<const Mask *>(src);
590       for (int i = 0; i < __kmp_num_proc_groups; ++i)
591         mask[i] = convert->mask[i];
592     }
593     void bitwise_and(const KMPAffinity::Mask *rhs) override {
594       const Mask *convert = static_cast<const Mask *>(rhs);
595       for (int i = 0; i < __kmp_num_proc_groups; ++i)
596         mask[i] &= convert->mask[i];
597     }
598     void bitwise_or(const KMPAffinity::Mask *rhs) override {
599       const Mask *convert = static_cast<const Mask *>(rhs);
600       for (int i = 0; i < __kmp_num_proc_groups; ++i)
601         mask[i] |= convert->mask[i];
602     }
603     void bitwise_not() override {
604       for (int i = 0; i < __kmp_num_proc_groups; ++i)
605         mask[i] = ~(mask[i]);
606     }
607     bool is_equal(const KMPAffinity::Mask *rhs) const override {
608       const Mask *convert = static_cast<const Mask *>(rhs);
609       for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
610         if (mask[i] != convert->mask[i])
611           return false;
612       return true;
613     }
614     int begin() const override {
615       int retval = 0;
616       while (retval < end() && !is_set(retval))
617         ++retval;
618       return retval;
619     }
620     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
621     int next(int previous) const override {
622       int retval = previous + 1;
623       while (retval < end() && !is_set(retval))
624         ++retval;
625       return retval;
626     }
627     int set_process_affinity(bool abort_on_error) const override {
628       if (__kmp_num_proc_groups <= 1) {
629         if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
630           DWORD error = GetLastError();
631           if (abort_on_error) {
632             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
633                         __kmp_msg_null);
634           }
635           return error;
636         }
637       }
638       return 0;
639     }
640     int set_system_affinity(bool abort_on_error) const override {
641       if (__kmp_num_proc_groups > 1) {
642         // Check for a valid mask.
643         GROUP_AFFINITY ga;
644         int group = get_proc_group();
645         if (group < 0) {
646           if (abort_on_error) {
647             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
648           }
649           return -1;
650         }
651         // Transform the bit vector into a GROUP_AFFINITY struct
652         // and make the system call to set affinity.
653         ga.Group = group;
654         ga.Mask = mask[group];
655         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
656 
657         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
658         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
659           DWORD error = GetLastError();
660           if (abort_on_error) {
661             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
662                         __kmp_msg_null);
663           }
664           return error;
665         }
666       } else {
667         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
668           DWORD error = GetLastError();
669           if (abort_on_error) {
670             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
671                         __kmp_msg_null);
672           }
673           return error;
674         }
675       }
676       return 0;
677     }
678     int get_system_affinity(bool abort_on_error) override {
679       if (__kmp_num_proc_groups > 1) {
680         this->zero();
681         GROUP_AFFINITY ga;
682         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
683         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
684           DWORD error = GetLastError();
685           if (abort_on_error) {
686             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
687                         KMP_ERR(error), __kmp_msg_null);
688           }
689           return error;
690         }
691         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
692             (ga.Mask == 0)) {
693           return -1;
694         }
695         mask[ga.Group] = ga.Mask;
696       } else {
697         mask_t newMask, sysMask, retval;
698         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
699           DWORD error = GetLastError();
700           if (abort_on_error) {
701             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
702                         KMP_ERR(error), __kmp_msg_null);
703           }
704           return error;
705         }
706         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
707         if (!retval) {
708           DWORD error = GetLastError();
709           if (abort_on_error) {
710             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
711                         KMP_ERR(error), __kmp_msg_null);
712           }
713           return error;
714         }
715         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
716         if (!newMask) {
717           DWORD error = GetLastError();
718           if (abort_on_error) {
719             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
720                         KMP_ERR(error), __kmp_msg_null);
721           }
722         }
723         *mask = retval;
724       }
725       return 0;
726     }
727     int get_proc_group() const override {
728       int group = -1;
729       if (__kmp_num_proc_groups == 1) {
730         return 1;
731       }
732       for (int i = 0; i < __kmp_num_proc_groups; i++) {
733         if (mask[i] == 0)
734           continue;
735         if (group >= 0)
736           return -1;
737         group = i;
738       }
739       return group;
740     }
741   };
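  // Illustrative note (added for clarity): when there are multiple Windows
  // processor groups, mask[] holds one mask_t per group. Assuming two groups
  // of 64 logical processors each, CPU 70 is group 1, bit 6, and
  // get_proc_group() returns 1 only if every set bit falls within group 1;
  // if the set bits span more than one group it returns -1, because a thread
  // affinity mask can only target a single group.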
742   void determine_capable(const char *env_var) override {
743     __kmp_affinity_determine_capable(env_var);
744   }
745   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
746   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
747   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
748   KMPAffinity::Mask *allocate_mask_array(int num) override {
749     return new Mask[num];
750   }
751   void deallocate_mask_array(KMPAffinity::Mask *array) override {
752     Mask *windows_array = static_cast<Mask *>(array);
753     delete[] windows_array;
754   }
755   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
756                                       int index) override {
757     Mask *windows_array = static_cast<Mask *>(array);
758     return &(windows_array[index]);
759   }
760   api_type get_api_type() const override { return NATIVE_OS; }
761 };
762 #endif /* KMP_OS_WINDOWS */
763 #endif /* KMP_AFFINITY_SUPPORTED */
764 
765 // Describe an attribute for a level in the machine topology
766 struct kmp_hw_attr_t {
767   int core_type : 8;
768   int core_eff : 8;
769   unsigned valid : 1;
770   unsigned reserved : 15;
771 
772   static const int UNKNOWN_CORE_EFF = -1;
773 
774   kmp_hw_attr_t()
775       : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
776         valid(0), reserved(0) {}
777   void set_core_type(kmp_hw_core_type_t type) {
778     valid = 1;
779     core_type = type;
780   }
781   void set_core_eff(int eff) {
782     valid = 1;
783     core_eff = eff;
784   }
785   kmp_hw_core_type_t get_core_type() const {
786     return (kmp_hw_core_type_t)core_type;
787   }
788   int get_core_eff() const { return core_eff; }
789   bool is_core_type_valid() const {
790     return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
791   }
792   bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
793   operator bool() const { return valid; }
794   void clear() {
795     core_type = KMP_HW_CORE_TYPE_UNKNOWN;
796     core_eff = UNKNOWN_CORE_EFF;
797     valid = 0;
798   }
799   bool contains(const kmp_hw_attr_t &other) const {
800     if (!valid && !other.valid)
801       return true;
802     if (valid && other.valid) {
803       if (other.is_core_type_valid()) {
804         if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
805           return false;
806       }
807       if (other.is_core_eff_valid()) {
808         if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
809           return false;
810       }
811       return true;
812     }
813     return false;
814   }
815 #if KMP_AFFINITY_SUPPORTED
816   bool contains(const kmp_affinity_attrs_t &attr) const {
817     if (!valid && !attr.valid)
818       return true;
819     if (valid && attr.valid) {
820       if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
821         return (is_core_type_valid() &&
822                 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
823       if (attr.core_eff != UNKNOWN_CORE_EFF)
824         return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
825       return true;
826     }
827     return false;
828   }
829 #endif // KMP_AFFINITY_SUPPORTED
830   bool operator==(const kmp_hw_attr_t &rhs) const {
831     return (rhs.valid == valid && rhs.core_eff == core_eff &&
832             rhs.core_type == core_type);
833   }
834   bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
835 };
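// Illustrative sketch (a hypothetical snippet, not part of the runtime):
// requesting cores of efficiency 0 and testing a core's recorded attributes
// against that request, using only the members defined above.
//
//   kmp_hw_attr_t want;       // request: any core with efficiency 0
//   want.set_core_eff(0);
//   kmp_hw_attr_t core_attrs; // what the topology recorded for some core
//   core_attrs.set_core_eff(0);
//   bool ok = core_attrs.contains(want); // true: the efficiencies match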
836 
837 #if KMP_AFFINITY_SUPPORTED
838 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
839 #endif
840 
841 class kmp_hw_thread_t {
842 public:
843   static const int UNKNOWN_ID = -1;
844   static const int MULTIPLE_ID = -2;
845   static int compare_ids(const void *a, const void *b);
846   static int compare_compact(const void *a, const void *b);
847   int ids[KMP_HW_LAST];
848   int sub_ids[KMP_HW_LAST];
849   bool leader;
850   int os_id;
851   int original_idx;
852   kmp_hw_attr_t attrs;
853 
854   void print() const;
855   void clear() {
856     for (int i = 0; i < (int)KMP_HW_LAST; ++i)
857       ids[i] = UNKNOWN_ID;
858     leader = false;
859     attrs.clear();
860   }
861 };
862 
863 class kmp_topology_t {
864 
865   struct flags_t {
866     int uniform : 1;
867     int reserved : 31;
868   };
869 
870   int depth;
871 
872   // The following arrays are all 'depth' long. They are allocated to hold
873   // up to KMP_HW_LAST objects if needed, so layers can be added without
874   // reallocating any array.
875 
876   // Ordered array of the types in the topology
877   kmp_hw_t *types;
878 
879   // Keep quick topology ratios; for non-uniform topologies, each entry
880   // holds the max number of itemAs per itemB,
881   // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
882   int *ratio;
883 
884   // Storage containing the absolute number of each topology layer
885   int *count;
886 
887   // The number of core efficiencies. This is only useful for hybrid
888   // topologies. Core efficiencies will range from 0 to num efficiencies - 1
889   int num_core_efficiencies;
890   int num_core_types;
891   kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
892 
893   // The hardware threads array
894   // hw_threads is num_hw_threads long
895   // Each hw_thread's ids and sub_ids are depth deep
896   int num_hw_threads;
897   kmp_hw_thread_t *hw_threads;
898 
899   // Equivalence hash where the key is the hardware topology item
900   // and the value is the equivalent hardware topology type in the
901   // types[] array. If the value is KMP_HW_UNKNOWN, then there is no
902   // known equivalence for the topology type.
903   kmp_hw_t equivalent[KMP_HW_LAST];
904 
905   // Flags describing the topology
906   flags_t flags;
907 
908   // Compact value used during sort_compact()
909   int compact;
910 
911 #if KMP_GROUP_AFFINITY
912   // Insert topology information about Windows Processor groups
913   void _insert_windows_proc_groups();
914 #endif
915 
916   // Count each item & get the num x's per y
917   // e.g., get the number of cores and the number of threads per core
918   // for each (x, y) in (KMP_HW_* , KMP_HW_*)
919   void _gather_enumeration_information();
920 
921   // Remove layers that don't add information to the topology.
922   // This is done by having the layer take on the id = UNKNOWN_ID (-1)
923   void _remove_radix1_layers();
924 
925   // Find out if the topology is uniform
926   void _discover_uniformity();
927 
928   // Set all the sub_ids for each hardware thread
929   void _set_sub_ids();
930 
931   // Set global affinity variables describing the number of threads per
932   // core, the number of packages, the number of cores per package, and
933   // the number of cores.
934   void _set_globals();
935 
936   // Set the last level cache equivalent type
937   void _set_last_level_cache();
938 
939   // Return the number of cores with a particular attribute, 'attr'.
940   // If 'find_all' is true, then find all cores on the machine, otherwise find
941   // all cores per the layer 'above'
942   int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
943                             bool find_all = false) const;
944 
945 public:
946   // Force use of allocate()/deallocate()
947   kmp_topology_t() = delete;
948   kmp_topology_t(const kmp_topology_t &t) = delete;
949   kmp_topology_t(kmp_topology_t &&t) = delete;
950   kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
951   kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
952 
953   static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
954   static void deallocate(kmp_topology_t *);
955 
956   // Functions used in create_map() routines
957   kmp_hw_thread_t &at(int index) {
958     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
959     return hw_threads[index];
960   }
961   const kmp_hw_thread_t &at(int index) const {
962     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
963     return hw_threads[index];
964   }
965   int get_num_hw_threads() const { return num_hw_threads; }
966   void sort_ids() {
967     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
968           kmp_hw_thread_t::compare_ids);
969   }
970 
971   // Insert a new topology layer after allocation
972   void insert_layer(kmp_hw_t type, const int *ids);
973   // Check if the hardware ids are unique; return true if they are,
974   // false otherwise.
975   // return true, otherwise return false
976   bool check_ids() const;
977 
978   // Function to call after the create_map() routine
979   void canonicalize();
980   void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
981 
982 // Functions used after canonicalize() called
983 
984 #if KMP_AFFINITY_SUPPORTED
985   // Set the granularity for affinity settings
986   void set_granularity(kmp_affinity_t &stgs) const;
987   bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
988   bool restrict_to_mask(const kmp_affin_mask_t *mask);
989   bool filter_hw_subset();
990 #endif
991   bool is_uniform() const { return flags.uniform; }
992   // Return the equivalent type for 'type' in the topology (this also tells
993   // whether 'type' is valid); returns KMP_HW_UNKNOWN if there is no equivalent.
994   kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
995     if (type == KMP_HW_UNKNOWN)
996       return KMP_HW_UNKNOWN;
997     return equivalent[type];
998   }
999   // Set type1 = type2
1000   void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1001     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1002     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1003     kmp_hw_t real_type2 = equivalent[type2];
1004     if (real_type2 == KMP_HW_UNKNOWN)
1005       real_type2 = type2;
1006     equivalent[type1] = real_type2;
1007     // This loop is required since any of the types may have been set to
1008     // be equivalent to type1.  They all must be checked and reset to type2.
1009     KMP_FOREACH_HW_TYPE(type) {
1010       if (equivalent[type] == type1) {
1011         equivalent[type] = real_type2;
1012       }
1013     }
1014   }
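  // Illustrative example (a plausible use, not a specific call site): if
  // detection determines that the L3 cache is the last-level cache, calling
  // set_equivalent_type(KMP_HW_LLC, KMP_HW_L3) makes later queries such as
  // get_level(KMP_HW_LLC) resolve to the L3 layer.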
1015   // Calculate number of types corresponding to level1
1016   // per types corresponding to level2 (e.g., number of threads per core)
1017   int calculate_ratio(int level1, int level2) const {
1018     KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1019     KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1020     int r = 1;
1021     for (int level = level1; level > level2; --level)
1022       r *= ratio[level];
1023     return r;
1024   }
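  // Worked example (the numbers are illustrative): for the topology
  // [ 4 packages | 6 cores / package | 2 threads / core ] with ratio = {4, 6, 2}
  // and level 0 = package, calculate_ratio(2, 0) multiplies the ratios above
  // the package level, giving 2 * 6 = 12 hardware threads per package.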
1025   int get_ratio(int level) const {
1026     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1027     return ratio[level];
1028   }
1029   int get_depth() const { return depth; }
1030   kmp_hw_t get_type(int level) const {
1031     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1032     return types[level];
1033   }
1034   int get_level(kmp_hw_t type) const {
1035     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1036     int eq_type = equivalent[type];
1037     if (eq_type == KMP_HW_UNKNOWN)
1038       return -1;
1039     for (int i = 0; i < depth; ++i)
1040       if (types[i] == eq_type)
1041         return i;
1042     return -1;
1043   }
1044   int get_count(int level) const {
1045     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1046     return count[level];
1047   }
1048   // Return the total number of cores with attribute 'attr'
1049   int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1050     return _get_ncores_with_attr(attr, -1, true);
1051   }
1052   // Return the number of cores with attribute
1053   // 'attr' per topology level 'above'
1054   int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1055     return _get_ncores_with_attr(attr, above, false);
1056   }
1057 
1058 #if KMP_AFFINITY_SUPPORTED
1059   friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1060   void sort_compact(kmp_affinity_t &affinity) {
1061     compact = affinity.compact;
1062     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1063           kmp_hw_thread_t::compare_compact);
1064   }
1065 #endif
1066   void print(const char *env_var = "KMP_AFFINITY") const;
1067   void dump() const;
1068 };
1069 extern kmp_topology_t *__kmp_topology;
1070 
1071 class kmp_hw_subset_t {
1072   const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1073 
1074 public:
1075   // Describe a machine topology item in KMP_HW_SUBSET
1076   struct item_t {
1077     kmp_hw_t type;
1078     int num_attrs;
1079     int num[MAX_ATTRS];
1080     int offset[MAX_ATTRS];
1081     kmp_hw_attr_t attr[MAX_ATTRS];
1082   };
1083   // Put parentheses around max to avoid accidental use of Windows max macro.
1084   const static int USE_ALL = (std::numeric_limits<int>::max)();
1085 
1086 private:
1087   int depth;
1088   int capacity;
1089   item_t *items;
1090   kmp_uint64 set;
1091   bool absolute;
1092   // The set must be able to handle up to KMP_HW_LAST number of layers
1093   KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1094   // Sorting the KMP_HW_SUBSET items to follow topology order
1095   // All unknown topology types will be at the beginning of the subset
1096   static int hw_subset_compare(const void *i1, const void *i2) {
1097     kmp_hw_t type1 = ((const item_t *)i1)->type;
1098     kmp_hw_t type2 = ((const item_t *)i2)->type;
1099     int level1 = __kmp_topology->get_level(type1);
1100     int level2 = __kmp_topology->get_level(type2);
1101     return level1 - level2;
1102   }
1103 
1104 public:
1105   // Force use of allocate()/deallocate()
1106   kmp_hw_subset_t() = delete;
1107   kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1108   kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1109   kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1110   kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1111 
1112   static kmp_hw_subset_t *allocate() {
1113     int initial_capacity = 5;
1114     kmp_hw_subset_t *retval =
1115         (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1116     retval->depth = 0;
1117     retval->capacity = initial_capacity;
1118     retval->set = 0ull;
1119     retval->absolute = false;
1120     retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1121     return retval;
1122   }
1123   static void deallocate(kmp_hw_subset_t *subset) {
1124     __kmp_free(subset->items);
1125     __kmp_free(subset);
1126   }
1127   void set_absolute() { absolute = true; }
1128   bool is_absolute() const { return absolute; }
1129   void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1130     for (int i = 0; i < depth; ++i) {
1131       // Found an existing item for this layer type
1132       // Add the num, offset, and attr to this item
1133       if (items[i].type == type) {
1134         int idx = items[i].num_attrs++;
1135         if ((size_t)idx >= MAX_ATTRS)
1136           return;
1137         items[i].num[idx] = num;
1138         items[i].offset[idx] = offset;
1139         items[i].attr[idx] = attr;
1140         return;
1141       }
1142     }
1143     if (depth == capacity - 1) {
1144       capacity *= 2;
1145       item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1146       for (int i = 0; i < depth; ++i)
1147         new_items[i] = items[i];
1148       __kmp_free(items);
1149       items = new_items;
1150     }
1151     items[depth].num_attrs = 1;
1152     items[depth].type = type;
1153     items[depth].num[0] = num;
1154     items[depth].offset[0] = offset;
1155     items[depth].attr[0] = attr;
1156     depth++;
1157     set |= (1ull << type);
1158   }
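  // Illustrative sketch (a hypothetical snippet, not part of the runtime):
  // building roughly what parsing KMP_HW_SUBSET=2c,2t might produce, using
  // only the methods defined in this class.
  //
  //   kmp_hw_subset_t *s = kmp_hw_subset_t::allocate();
  //   s->push_back(2, KMP_HW_CORE, 0, kmp_hw_attr_t{});
  //   s->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});
  //   // ... later, once __kmp_topology has been built:
  //   // s->sort();
  //   kmp_hw_subset_t::deallocate(s);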
1159   int get_depth() const { return depth; }
1160   const item_t &at(int index) const {
1161     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1162     return items[index];
1163   }
1164   item_t &at(int index) {
1165     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1166     return items[index];
1167   }
1168   void remove(int index) {
1169     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1170     set &= ~(1ull << items[index].type);
1171     for (int j = index + 1; j < depth; ++j) {
1172       items[j - 1] = items[j];
1173     }
1174     depth--;
1175   }
1176   void sort() {
1177     KMP_DEBUG_ASSERT(__kmp_topology);
1178     qsort(items, depth, sizeof(item_t), hw_subset_compare);
1179   }
1180   bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1181 
1182   // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1183   // This means putting each of {sockets, cores, threads} in the topology if
1184   // they are not specified:
1185   // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1186   // e.g., 3module => *s,3module,*c,*t
1187   // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1188   // are expecting the traditional sockets/cores/threads topology. For newer
1189   // hardware, there can be intervening layers like dies/tiles/modules
1190   // (usually corresponding to a cache level). So when a user asks for
1191   // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1192   // should get 12 hardware threads across 6 cores and effectively ignore the
1193   // module layer.
1194   void canonicalize(const kmp_topology_t *top) {
1195     // Layers to target for KMP_HW_SUBSET canonicalization
1196     kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1197 
1198     // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1199     if (is_absolute())
1200       return;
1201 
1202     // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1203     // topology doesn't have these layers
1204     for (kmp_hw_t type : targeted)
1205       if (top->get_level(type) == KMP_HW_UNKNOWN)
1206         return;
1207 
1208     // Put targeted layers in topology if they do not exist
1209     for (kmp_hw_t type : targeted) {
1210       bool found = false;
1211       for (int i = 0; i < get_depth(); ++i) {
1212         if (top->get_equivalent_type(items[i].type) == type) {
1213           found = true;
1214           break;
1215         }
1216       }
1217       if (!found) {
1218         push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1219       }
1220     }
1221     sort();
1222     // Set as an absolute topology that only targets the targeted layers
1223     set_absolute();
1224   }
1225   void dump() const {
1226     printf("**********************\n");
1227     printf("*** kmp_hw_subset: ***\n");
1228     printf("* depth: %d\n", depth);
1229     printf("* items:\n");
1230     for (int i = 0; i < depth; ++i) {
1231       printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1232       for (int j = 0; j < items[i].num_attrs; ++j) {
1233         printf("  num: %d, offset: %d, attr: ", items[i].num[j],
1234                items[i].offset[j]);
1235         if (!items[i].attr[j]) {
1236           printf(" (none)\n");
1237         } else {
1238           printf(
1239               " core_type = %s, core_eff = %d\n",
1240               __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1241               items[i].attr[j].get_core_eff());
1242         }
1243       }
1244     }
1245     printf("* set: 0x%llx\n", set);
1246     printf("* absolute: %d\n", absolute);
1247     printf("**********************\n");
1248   }
1249 };
1250 extern kmp_hw_subset_t *__kmp_hw_subset;
1251 
1252 /* A structure for holding machine-specific hierarchy info to be computed once
1253    at init. This structure represents a mapping of threads to the actual machine
1254    hierarchy, or to our best guess at what the hierarchy might be, for the
1255    purpose of performing an efficient barrier. In the worst case, when there is
1256    no machine hierarchy information, it produces a tree suitable for a barrier,
1257    similar to the tree used in the hyper barrier. */
1258 class hierarchy_info {
1259 public:
1260   /* Good default values for number of leaves and branching factor, given no
1261      affinity information. Behaves a bit like hyper barrier. */
1262   static const kmp_uint32 maxLeaves = 4;
1263   static const kmp_uint32 minBranch = 4;
1264   /** Number of levels in the hierarchy. Typical levels are threads/core,
1265       cores/package or socket, packages/node, nodes/machine, etc. We don't want
1266       to get specific with nomenclature. When the machine is oversubscribed we
1267       add levels to duplicate the hierarchy, doubling the thread capacity of the
1268       hierarchy each time we add a level. */
1269   kmp_uint32 maxLevels;
1270 
1271   /** This is specifically the depth of the machine configuration hierarchy, in
1272       terms of the number of levels along the longest path from root to any
1273       leaf. It corresponds to the number of entries in numPerLevel if we exclude
1274       all but one trailing 1. */
1275   kmp_uint32 depth;
1276   kmp_uint32 base_num_threads = 0;
1277   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1278   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1279   // 2=initialization in progress
1280   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1281 
1282   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
1283       the parent of a node at level i has. For example, if we have a machine
1284       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
1285       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
1286   kmp_uint32 *numPerLevel = nullptr;
1287   kmp_uint32 *skipPerLevel = nullptr;
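  // Note (added for clarity): skipPerLevel[i] ends up being the product of
  // numPerLevel[0..i-1] (see init()), i.e. the stride in leaf (thread) indices
  // between consecutive nodes at level i. Continuing the example above,
  // numPerLevel = {2, 4, 4, ...} yields skipPerLevel = {1, 2, 8, 32, ...}
  // before the oversubscription doubling.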
1288 
1289   void deriveLevels() {
1290     int hier_depth = __kmp_topology->get_depth();
1291     for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1292       numPerLevel[level] = __kmp_topology->get_ratio(i);
1293     }
1294   }
1295 
1296   hierarchy_info()
1297       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1298 
1299   void fini() {
1300     if (!uninitialized && numPerLevel) {
1301       __kmp_free(numPerLevel);
1302       numPerLevel = NULL;
1303       uninitialized = not_initialized;
1304     }
1305   }
1306 
1307   void init(int num_addrs) {
1308     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1309         &uninitialized, not_initialized, initializing);
1310     if (bool_result == 0) { // Wait for initialization
1311       while (TCR_1(uninitialized) != initialized)
1312         KMP_CPU_PAUSE();
1313       return;
1314     }
1315     KMP_DEBUG_ASSERT(bool_result == 1);
1316 
1317     /* Explicitly initialize the data fields here to prevent use of dirty
1318        values observed when the static library is re-initialized multiple
1319        times (e.g., when a non-OpenMP thread repeatedly launches/joins a
1320        thread that uses OpenMP). */
1321     depth = 1;
1322     resizing = 0;
1323     maxLevels = 7;
1324     numPerLevel =
1325         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1326     skipPerLevel = &(numPerLevel[maxLevels]);
1327     for (kmp_uint32 i = 0; i < maxLevels;
1328          ++i) { // init numPerLevel[*] to 1 item per level
1329       numPerLevel[i] = 1;
1330       skipPerLevel[i] = 1;
1331     }
1332 
1333     // Derive levels from the machine topology if one has been detected
1334     if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1335       deriveLevels();
1336     } else {
1337       numPerLevel[0] = maxLeaves;
1338       numPerLevel[1] = num_addrs / maxLeaves;
1339       if (num_addrs % maxLeaves)
1340         numPerLevel[1]++;
1341     }
1342 
1343     base_num_threads = num_addrs;
1344     for (int i = maxLevels - 1; i >= 0;
1345          --i) // count non-empty levels to get depth
1346       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1347         depth++;
1348 
1349     kmp_uint32 branch = minBranch;
1350     if (numPerLevel[0] == 1)
1351       branch = num_addrs / maxLeaves;
1352     if (branch < minBranch)
1353       branch = minBranch;
1354     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1355       while (numPerLevel[d] > branch ||
1356              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1357         if (numPerLevel[d] & 1)
1358           numPerLevel[d]++;
1359         numPerLevel[d] = numPerLevel[d] >> 1;
1360         if (numPerLevel[d + 1] == 1)
1361           depth++;
1362         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1363       }
1364       if (numPerLevel[0] == 1) {
1365         branch = branch >> 1;
1366         if (branch < 4)
1367           branch = minBranch;
1368       }
1369     }
1370 
1371     for (kmp_uint32 i = 1; i < depth; ++i)
1372       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1373     // Fill in hierarchy in the case of oversubscription
1374     for (kmp_uint32 i = depth; i < maxLevels; ++i)
1375       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1376 
1377     uninitialized = initialized; // One writer
1378   }
1379 
1380   // Resize the hierarchy if nproc changes to something larger than before
1381   void resize(kmp_uint32 nproc) {
1382     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1383     while (bool_result == 0) { // someone else is trying to resize
1384       KMP_CPU_PAUSE();
1385       if (nproc <= base_num_threads) // happy with other thread's resize
1386         return;
1387       else // try to resize
1388         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1389     }
1390     KMP_DEBUG_ASSERT(bool_result != 0);
1391     if (nproc <= base_num_threads)
1392       return; // happy with other thread's resize
1393 
1394     // Calculate new maxLevels
1395     kmp_uint32 old_sz = skipPerLevel[depth - 1];
1396     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1397     // First see if old maxLevels is enough to contain new size
1398     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1399       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1400       numPerLevel[i - 1] *= 2;
1401       old_sz *= 2;
1402       depth++;
1403     }
1404     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1405       while (nproc > old_sz) {
1406         old_sz *= 2;
1407         incs++;
1408         depth++;
1409       }
1410       maxLevels += incs;
1411 
1412       // Resize arrays
1413       kmp_uint32 *old_numPerLevel = numPerLevel;
1414       kmp_uint32 *old_skipPerLevel = skipPerLevel;
1415       numPerLevel = skipPerLevel = NULL;
1416       numPerLevel =
1417           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1418       skipPerLevel = &(numPerLevel[maxLevels]);
1419 
1420       // Copy old elements from old arrays
1421       for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1422         // copy the previously computed per-level values
1423         numPerLevel[i] = old_numPerLevel[i];
1424         skipPerLevel[i] = old_skipPerLevel[i];
1425       }
1426 
1427       // Init new elements in arrays to 1
1428       for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1429         // init numPerLevel[*] to 1 item per level
1430         numPerLevel[i] = 1;
1431         skipPerLevel[i] = 1;
1432       }
1433 
1434       // Free old arrays
1435       __kmp_free(old_numPerLevel);
1436     }
1437 
1438     // Fill in oversubscription levels of hierarchy
1439     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1440       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1441 
1442     base_num_threads = nproc;
1443     resizing = 0; // One writer
1444   }
1445 };
1446 #endif // KMP_AFFINITY_H
1447