1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 #include "kmp_str.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
21 #endif
22 #if KMP_USE_HWLOC
23 // Copied from hwloc
24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25 #define HWLOC_GROUP_KIND_INTEL_TILE 103
26 #define HWLOC_GROUP_KIND_INTEL_DIE 104
27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28 #endif
29 #include <ctype.h>
30 
31 // The machine topology
32 kmp_topology_t *__kmp_topology = nullptr;
33 // KMP_HW_SUBSET environment variable
34 kmp_hw_subset_t *__kmp_hw_subset = nullptr;
35 
36 // Store the real or imagined machine hierarchy here
37 static hierarchy_info machine_hierarchy;
38 
39 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
40 
41 #if KMP_AFFINITY_SUPPORTED
42 // Helper class to see if place lists further restrict the fullMask
43 class kmp_full_mask_modifier_t {
44  kmp_affin_mask_t *mask;
45 
46 public:
47  kmp_full_mask_modifier_t() {
48  KMP_CPU_ALLOC(mask);
49  KMP_CPU_ZERO(mask);
50  }
51  ~kmp_full_mask_modifier_t() {
52  KMP_CPU_FREE(mask);
53  mask = nullptr;
54  }
55  void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
56  // If the new full mask is different from the current full mask,
57  // then switch them. Returns true if full mask was affected, false otherwise.
58  bool restrict_to_mask() {
59  // See if the new mask further restricts or changes the full mask
60  if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
61  return false;
62  return __kmp_topology->restrict_to_mask(mask);
63  }
64 };
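// Illustrative usage sketch (not part of the runtime source): code parsing a
// place list could accumulate each parsed place into the modifier and then
// ask the topology to narrow the full mask. The place_mask variable below is
// assumed to be a kmp_affin_mask_t* built elsewhere.
//
//   kmp_full_mask_modifier_t full_mask;
//   full_mask.include(place_mask); // union in one parsed place
//   if (full_mask.restrict_to_mask()) {
//     // __kmp_affin_fullMask and the topology were narrowed to the places
//   }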
65 
66 static inline const char *
67 __kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
68  bool for_binding = false) {
69  if (affinity.flags.omp_places) {
70  if (for_binding)
71  return "OMP_PROC_BIND";
72  return "OMP_PLACES";
73  }
74  return affinity.env_var;
75 }
76 #endif // KMP_AFFINITY_SUPPORTED
77 
78 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
79  kmp_uint32 depth;
80  // The test below is true if affinity is available, but set to "none". Need to
81  // init on first use of hierarchical barrier.
82  if (TCR_1(machine_hierarchy.uninitialized))
83  machine_hierarchy.init(nproc);
84 
85  // Adjust the hierarchy in case the number of threads exceeds the original
86  if (nproc > machine_hierarchy.base_num_threads)
87  machine_hierarchy.resize(nproc);
88 
89  depth = machine_hierarchy.depth;
90  KMP_DEBUG_ASSERT(depth > 0);
91 
92  thr_bar->depth = depth;
93  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
94  &(thr_bar->base_leaf_kids));
95  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
96 }
97 
98 static int nCoresPerPkg, nPackages;
99 static int __kmp_nThreadsPerCore;
100 #ifndef KMP_DFLT_NTH_CORES
101 static int __kmp_ncores;
102 #endif
103 
104 const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
105  switch (type) {
106  case KMP_HW_SOCKET:
107  return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
108  case KMP_HW_DIE:
109  return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
110  case KMP_HW_MODULE:
111  return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
112  case KMP_HW_TILE:
113  return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
114  case KMP_HW_NUMA:
115  return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
116  case KMP_HW_L3:
117  return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
118  case KMP_HW_L2:
119  return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
120  case KMP_HW_L1:
121  return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
122  case KMP_HW_LLC:
123  return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
124  case KMP_HW_CORE:
125  return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
126  case KMP_HW_THREAD:
127  return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
128  case KMP_HW_PROC_GROUP:
129  return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
130  case KMP_HW_UNKNOWN:
131  case KMP_HW_LAST:
132  return KMP_I18N_STR(Unknown);
133  }
134  KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
135  KMP_BUILTIN_UNREACHABLE;
136 }
137 
138 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
139  switch (type) {
140  case KMP_HW_SOCKET:
141  return ((plural) ? "sockets" : "socket");
142  case KMP_HW_DIE:
143  return ((plural) ? "dice" : "die");
144  case KMP_HW_MODULE:
145  return ((plural) ? "modules" : "module");
146  case KMP_HW_TILE:
147  return ((plural) ? "tiles" : "tile");
148  case KMP_HW_NUMA:
149  return ((plural) ? "numa_domains" : "numa_domain");
150  case KMP_HW_L3:
151  return ((plural) ? "l3_caches" : "l3_cache");
152  case KMP_HW_L2:
153  return ((plural) ? "l2_caches" : "l2_cache");
154  case KMP_HW_L1:
155  return ((plural) ? "l1_caches" : "l1_cache");
156  case KMP_HW_LLC:
157  return ((plural) ? "ll_caches" : "ll_cache");
158  case KMP_HW_CORE:
159  return ((plural) ? "cores" : "core");
160  case KMP_HW_THREAD:
161  return ((plural) ? "threads" : "thread");
162  case KMP_HW_PROC_GROUP:
163  return ((plural) ? "proc_groups" : "proc_group");
164  case KMP_HW_UNKNOWN:
165  case KMP_HW_LAST:
166  return ((plural) ? "unknowns" : "unknown");
167  }
168  KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
169  KMP_BUILTIN_UNREACHABLE;
170 }
171 
172 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
173  switch (type) {
174  case KMP_HW_CORE_TYPE_UNKNOWN:
175  case KMP_HW_MAX_NUM_CORE_TYPES:
176  return "unknown";
177 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
178  case KMP_HW_CORE_TYPE_ATOM:
179  return "Intel Atom(R) processor";
180  case KMP_HW_CORE_TYPE_CORE:
181  return "Intel(R) Core(TM) processor";
182 #endif
183  }
184  KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration");
185  KMP_BUILTIN_UNREACHABLE;
186 }
187 
188 #if KMP_AFFINITY_SUPPORTED
189 // If affinity is supported, check the affinity
190 // verbose and warning flags before printing a warning
191 #define KMP_AFF_WARNING(s, ...) \
192  if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \
193  KMP_WARNING(__VA_ARGS__); \
194  }
195 #else
196 #define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
197 #endif
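// For reference, a typical invocation of the macro above appears later in
// this file, e.g.:
//
//   KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);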
198 
200 // kmp_hw_thread_t methods
201 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
202  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
203  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
204  int depth = __kmp_topology->get_depth();
205  for (int level = 0; level < depth; ++level) {
206  if (ahwthread->ids[level] < bhwthread->ids[level])
207  return -1;
208  else if (ahwthread->ids[level] > bhwthread->ids[level])
209  return 1;
210  }
211  if (ahwthread->os_id < bhwthread->os_id)
212  return -1;
213  else if (ahwthread->os_id > bhwthread->os_id)
214  return 1;
215  return 0;
216 }
217 
218 #if KMP_AFFINITY_SUPPORTED
219 int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
220  int i;
221  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
222  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
223  int depth = __kmp_topology->get_depth();
224  int compact = __kmp_topology->compact;
225  KMP_DEBUG_ASSERT(compact >= 0);
226  KMP_DEBUG_ASSERT(compact <= depth);
227  for (i = 0; i < compact; i++) {
228  int j = depth - i - 1;
229  if (aa->sub_ids[j] < bb->sub_ids[j])
230  return -1;
231  if (aa->sub_ids[j] > bb->sub_ids[j])
232  return 1;
233  }
234  for (; i < depth; i++) {
235  int j = i - compact;
236  if (aa->sub_ids[j] < bb->sub_ids[j])
237  return -1;
238  if (aa->sub_ids[j] > bb->sub_ids[j])
239  return 1;
240  }
241  return 0;
242 }
243 #endif
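// Sketch of how these comparators are consumed (assuming the topology's
// sorting helpers use qsort over the flat hw_threads array):
//
//   qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
//         kmp_hw_thread_t::compare_ids);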
244 
245 void kmp_hw_thread_t::print() const {
246  int depth = __kmp_topology->get_depth();
247  printf("%4d ", os_id);
248  for (int i = 0; i < depth; ++i) {
249  printf("%4d ", ids[i]);
250  }
251  if (attrs) {
252  if (attrs.is_core_type_valid())
253  printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type()));
254  if (attrs.is_core_eff_valid())
255  printf(" (eff=%d)", attrs.get_core_eff());
256  }
257  if (leader)
258  printf(" (leader)");
259  printf("\n");
260 }
261 
263 // kmp_topology_t methods
264 
265 // Add a layer to the topology based on the ids. Assume the topology
266 // is perfectly nested (i.e., no object has more than one parent).
267 void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
268  // Figure out where the layer should go by comparing the ids of the current
269  // layers with the new ids
270  int target_layer;
271  int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
272  int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
273 
274  // Start from the highest layer and work down to find target layer
275  // If new layer is equal to another layer then put the new layer above
276  for (target_layer = 0; target_layer < depth; ++target_layer) {
277  bool layers_equal = true;
278  bool strictly_above_target_layer = false;
279  for (int i = 0; i < num_hw_threads; ++i) {
280  int id = hw_threads[i].ids[target_layer];
281  int new_id = ids[i];
282  if (id != previous_id && new_id == previous_new_id) {
283  // Found the layer we are strictly above
284  strictly_above_target_layer = true;
285  layers_equal = false;
286  break;
287  } else if (id == previous_id && new_id != previous_new_id) {
288  // Found a layer we are below. Move to next layer and check.
289  layers_equal = false;
290  break;
291  }
292  previous_id = id;
293  previous_new_id = new_id;
294  }
295  if (strictly_above_target_layer || layers_equal)
296  break;
297  }
298 
299 // Found the layer we are above. Now move everything down to accommodate the
300 // new layer, and put the new ids and type into the topology.
301  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
302  types[j] = types[i];
303  types[target_layer] = type;
304  for (int k = 0; k < num_hw_threads; ++k) {
305  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
306  hw_threads[k].ids[j] = hw_threads[k].ids[i];
307  hw_threads[k].ids[target_layer] = ids[k];
308  }
309  equivalent[type] = type;
310  depth++;
311 }
312 
313 #if KMP_GROUP_AFFINITY
314 // Insert the Windows Processor Group structure into the topology
315 void kmp_topology_t::_insert_windows_proc_groups() {
316  // Do not insert the processor group structure for a single group
317  if (__kmp_num_proc_groups == 1)
318  return;
319  kmp_affin_mask_t *mask;
320  int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
321  KMP_CPU_ALLOC(mask);
322  for (int i = 0; i < num_hw_threads; ++i) {
323  KMP_CPU_ZERO(mask);
324  KMP_CPU_SET(hw_threads[i].os_id, mask);
325  ids[i] = __kmp_get_proc_group(mask);
326  }
327  KMP_CPU_FREE(mask);
328  _insert_layer(KMP_HW_PROC_GROUP, ids);
329  __kmp_free(ids);
330 
331  // sort topology after adding proc groups
332  __kmp_topology->sort_ids();
333 }
334 #endif
335 
336 // Remove layers that don't add information to the topology.
337 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
338 void kmp_topology_t::_remove_radix1_layers() {
339  int preference[KMP_HW_LAST];
340  int top_index1, top_index2;
341  // Set up preference associative array
342  preference[KMP_HW_SOCKET] = 110;
343  preference[KMP_HW_PROC_GROUP] = 100;
344  preference[KMP_HW_CORE] = 95;
345  preference[KMP_HW_THREAD] = 90;
346  preference[KMP_HW_NUMA] = 85;
347  preference[KMP_HW_DIE] = 80;
348  preference[KMP_HW_TILE] = 75;
349  preference[KMP_HW_MODULE] = 73;
350  preference[KMP_HW_L3] = 70;
351  preference[KMP_HW_L2] = 65;
352  preference[KMP_HW_L1] = 60;
353  preference[KMP_HW_LLC] = 5;
354  top_index1 = 0;
355  top_index2 = 1;
356  while (top_index1 < depth - 1 && top_index2 < depth) {
357  kmp_hw_t type1 = types[top_index1];
358  kmp_hw_t type2 = types[top_index2];
359  KMP_ASSERT_VALID_HW_TYPE(type1);
360  KMP_ASSERT_VALID_HW_TYPE(type2);
361  // Do not allow the three main topology levels (sockets, cores, threads) to
362  // be compacted down
363  if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
364  type1 == KMP_HW_SOCKET) &&
365  (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
366  type2 == KMP_HW_SOCKET)) {
367  top_index1 = top_index2++;
368  continue;
369  }
370  bool radix1 = true;
371  bool all_same = true;
372  int id1 = hw_threads[0].ids[top_index1];
373  int id2 = hw_threads[0].ids[top_index2];
374  int pref1 = preference[type1];
375  int pref2 = preference[type2];
376  for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
377  if (hw_threads[hwidx].ids[top_index1] == id1 &&
378  hw_threads[hwidx].ids[top_index2] != id2) {
379  radix1 = false;
380  break;
381  }
382  if (hw_threads[hwidx].ids[top_index2] != id2)
383  all_same = false;
384  id1 = hw_threads[hwidx].ids[top_index1];
385  id2 = hw_threads[hwidx].ids[top_index2];
386  }
387  if (radix1) {
388  // Select the layer to remove based on preference
389  kmp_hw_t remove_type, keep_type;
390  int remove_layer, remove_layer_ids;
391  if (pref1 > pref2) {
392  remove_type = type2;
393  remove_layer = remove_layer_ids = top_index2;
394  keep_type = type1;
395  } else {
396  remove_type = type1;
397  remove_layer = remove_layer_ids = top_index1;
398  keep_type = type2;
399  }
400  // If all the indexes for the second (deeper) layer are the same,
401  // e.g., all are zero, then make sure to keep the first layer's ids.
402  if (all_same)
403  remove_layer_ids = top_index2;
404  // Remove radix one type by setting the equivalence, removing the id from
405  // the hw threads and removing the layer from types and depth
406  set_equivalent_type(remove_type, keep_type);
407  for (int idx = 0; idx < num_hw_threads; ++idx) {
408  kmp_hw_thread_t &hw_thread = hw_threads[idx];
409  for (int d = remove_layer_ids; d < depth - 1; ++d)
410  hw_thread.ids[d] = hw_thread.ids[d + 1];
411  }
412  for (int idx = remove_layer; idx < depth - 1; ++idx)
413  types[idx] = types[idx + 1];
414  depth--;
415  } else {
416  top_index1 = top_index2++;
417  }
418  }
419  KMP_ASSERT(depth > 0);
420 }
421 
422 void kmp_topology_t::_set_last_level_cache() {
423  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
424  set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
425  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
426  set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
427 #if KMP_MIC_SUPPORTED
428  else if (__kmp_mic_type == mic3) {
429  if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
430  set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
431  else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
432  set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
433  // L2/Tile wasn't detected so just say L1
434  else
435  set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
436  }
437 #endif
438  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
439  set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
440  // Fallback is to set last level cache to socket or core
441  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
442  if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
443  set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
444  else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
445  set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
446  }
447  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
448 }
449 
450 // Gather the count of each topology layer and the ratio
451 void kmp_topology_t::_gather_enumeration_information() {
452  int previous_id[KMP_HW_LAST];
453  int max[KMP_HW_LAST];
454 
455  for (int i = 0; i < depth; ++i) {
456  previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
457  max[i] = 0;
458  count[i] = 0;
459  ratio[i] = 0;
460  }
461  int core_level = get_level(KMP_HW_CORE);
462  for (int i = 0; i < num_hw_threads; ++i) {
463  kmp_hw_thread_t &hw_thread = hw_threads[i];
464  for (int layer = 0; layer < depth; ++layer) {
465  int id = hw_thread.ids[layer];
466  if (id != previous_id[layer]) {
467  // Add an additional increment to each count
468  for (int l = layer; l < depth; ++l)
469  count[l]++;
470  // Keep track of topology layer ratio statistics
471  max[layer]++;
472  for (int l = layer + 1; l < depth; ++l) {
473  if (max[l] > ratio[l])
474  ratio[l] = max[l];
475  max[l] = 1;
476  }
477  // Figure out the number of different core types
478  // and efficiencies for hybrid CPUs
479  if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
480  if (hw_thread.attrs.is_core_eff_valid() &&
481  hw_thread.attrs.core_eff >= num_core_efficiencies) {
482  // Because efficiencies can range from 0 to max efficiency - 1,
483  // the number of efficiencies is max efficiency + 1
484  num_core_efficiencies = hw_thread.attrs.core_eff + 1;
485  }
486  if (hw_thread.attrs.is_core_type_valid()) {
487  bool found = false;
488  for (int j = 0; j < num_core_types; ++j) {
489  if (hw_thread.attrs.get_core_type() == core_types[j]) {
490  found = true;
491  break;
492  }
493  }
494  if (!found) {
495  KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES);
496  core_types[num_core_types++] = hw_thread.attrs.get_core_type();
497  }
498  }
499  }
500  break;
501  }
502  }
503  for (int layer = 0; layer < depth; ++layer) {
504  previous_id[layer] = hw_thread.ids[layer];
505  }
506  }
507  for (int layer = 0; layer < depth; ++layer) {
508  if (max[layer] > ratio[layer])
509  ratio[layer] = max[layer];
510  }
511 }
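// Worked example (hypothetical machine with 2 sockets x 4 cores/socket x
// 2 threads/core and depth 3 = {socket, core, thread}): after this routine,
//   ratio = {2, 4, 2}  // sockets, max cores per socket, max threads per core
//   count = {2, 8, 16} // total sockets, cores, and hardware threads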
512 
513 int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
514  int above_level,
515  bool find_all) const {
516  int current, current_max;
517  int previous_id[KMP_HW_LAST];
518  for (int i = 0; i < depth; ++i)
519  previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
520  int core_level = get_level(KMP_HW_CORE);
521  if (find_all)
522  above_level = -1;
523  KMP_ASSERT(above_level < core_level);
524  current_max = 0;
525  current = 0;
526  for (int i = 0; i < num_hw_threads; ++i) {
527  kmp_hw_thread_t &hw_thread = hw_threads[i];
528  if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) {
529  if (current > current_max)
530  current_max = current;
531  current = hw_thread.attrs.contains(attr);
532  } else {
533  for (int level = above_level + 1; level <= core_level; ++level) {
534  if (hw_thread.ids[level] != previous_id[level]) {
535  if (hw_thread.attrs.contains(attr))
536  current++;
537  break;
538  }
539  }
540  }
541  for (int level = 0; level < depth; ++level)
542  previous_id[level] = hw_thread.ids[level];
543  }
544  if (current > current_max)
545  current_max = current;
546  return current_max;
547 }
548 
549 // Find out if the topology is uniform
550 void kmp_topology_t::_discover_uniformity() {
551  int num = 1;
552  for (int level = 0; level < depth; ++level)
553  num *= ratio[level];
554  flags.uniform = (num == count[depth - 1]);
555 }
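// Continuing the worked example above: 2 * 4 * 2 == 16 == count[depth - 1],
// so flags.uniform is set; any asymmetry (e.g., a socket with fewer cores)
// would clear it.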
556 
557 // Set all the sub_ids for each hardware thread
558 void kmp_topology_t::_set_sub_ids() {
559  int previous_id[KMP_HW_LAST];
560  int sub_id[KMP_HW_LAST];
561 
562  for (int i = 0; i < depth; ++i) {
563  previous_id[i] = -1;
564  sub_id[i] = -1;
565  }
566  for (int i = 0; i < num_hw_threads; ++i) {
567  kmp_hw_thread_t &hw_thread = hw_threads[i];
568  // Setup the sub_id
569  for (int j = 0; j < depth; ++j) {
570  if (hw_thread.ids[j] != previous_id[j]) {
571  sub_id[j]++;
572  for (int k = j + 1; k < depth; ++k) {
573  sub_id[k] = 0;
574  }
575  break;
576  }
577  }
578  // Set previous_id
579  for (int j = 0; j < depth; ++j) {
580  previous_id[j] = hw_thread.ids[j];
581  }
582  // Set the sub_ids field
583  for (int j = 0; j < depth; ++j) {
584  hw_thread.sub_ids[j] = sub_id[j];
585  }
586  }
587 }
588 
589 void kmp_topology_t::_set_globals() {
590  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
591  int core_level, thread_level, package_level;
592  package_level = get_level(KMP_HW_SOCKET);
593 #if KMP_GROUP_AFFINITY
594  if (package_level == -1)
595  package_level = get_level(KMP_HW_PROC_GROUP);
596 #endif
597  core_level = get_level(KMP_HW_CORE);
598  thread_level = get_level(KMP_HW_THREAD);
599 
600  KMP_ASSERT(core_level != -1);
601  KMP_ASSERT(thread_level != -1);
602 
603  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
604  if (package_level != -1) {
605  nCoresPerPkg = calculate_ratio(core_level, package_level);
606  nPackages = get_count(package_level);
607  } else {
608  // assume one socket
609  nCoresPerPkg = get_count(core_level);
610  nPackages = 1;
611  }
612 #ifndef KMP_DFLT_NTH_CORES
613  __kmp_ncores = get_count(core_level);
614 #endif
615 }
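// With the same example topology this yields nPackages = 2, nCoresPerPkg = 4,
// __kmp_nThreadsPerCore = 2, and __kmp_ncores = 8.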
616 
617 kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
618  const kmp_hw_t *types) {
619  kmp_topology_t *retval;
620  // Allocate all data in one large allocation
621  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
622  sizeof(int) * (size_t)KMP_HW_LAST * 3;
623  char *bytes = (char *)__kmp_allocate(size);
624  retval = (kmp_topology_t *)bytes;
625  if (nproc > 0) {
626  retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
627  } else {
628  retval->hw_threads = nullptr;
629  }
630  retval->num_hw_threads = nproc;
631  retval->depth = ndepth;
632  int *arr =
633  (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
634  retval->types = (kmp_hw_t *)arr;
635  retval->ratio = arr + (size_t)KMP_HW_LAST;
636  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
637  retval->num_core_efficiencies = 0;
638  retval->num_core_types = 0;
639  retval->compact = 0;
640  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
641  retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
642  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
643  for (int i = 0; i < ndepth; ++i) {
644  retval->types[i] = types[i];
645  retval->equivalent[types[i]] = types[i];
646  }
647  return retval;
648 }
649 
650 void kmp_topology_t::deallocate(kmp_topology_t *topology) {
651  if (topology)
652  __kmp_free(topology);
653 }
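// Rough lifecycle sketch, assuming one of the __kmp_affinity_create_*_map()
// detection routines drives it (nproc, depth, and types come from whichever
// detection method is in use):
//
//   __kmp_topology = kmp_topology_t::allocate(nproc, depth, types);
//   // ... fill in hw_threads[i].ids, os_id, and attrs ...
//   __kmp_topology->canonicalize();
//   ...
//   kmp_topology_t::deallocate(__kmp_topology);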
654 
655 bool kmp_topology_t::check_ids() const {
656  // Assume ids have been sorted
657  if (num_hw_threads == 0)
658  return true;
659  for (int i = 1; i < num_hw_threads; ++i) {
660  kmp_hw_thread_t &current_thread = hw_threads[i];
661  kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
662  bool unique = false;
663  for (int j = 0; j < depth; ++j) {
664  if (previous_thread.ids[j] != current_thread.ids[j]) {
665  unique = true;
666  break;
667  }
668  }
669  if (unique)
670  continue;
671  return false;
672  }
673  return true;
674 }
675 
676 void kmp_topology_t::dump() const {
677  printf("***********************\n");
678  printf("*** __kmp_topology: ***\n");
679  printf("***********************\n");
680  printf("* depth: %d\n", depth);
681 
682  printf("* types: ");
683  for (int i = 0; i < depth; ++i)
684  printf("%15s ", __kmp_hw_get_keyword(types[i]));
685  printf("\n");
686 
687  printf("* ratio: ");
688  for (int i = 0; i < depth; ++i) {
689  printf("%15d ", ratio[i]);
690  }
691  printf("\n");
692 
693  printf("* count: ");
694  for (int i = 0; i < depth; ++i) {
695  printf("%15d ", count[i]);
696  }
697  printf("\n");
698 
699  printf("* num_core_eff: %d\n", num_core_efficiencies);
700  printf("* num_core_types: %d\n", num_core_types);
701  printf("* core_types: ");
702  for (int i = 0; i < num_core_types; ++i)
703  printf("%3d ", core_types[i]);
704  printf("\n");
705 
706  printf("* equivalent map:\n");
707  KMP_FOREACH_HW_TYPE(i) {
708  const char *key = __kmp_hw_get_keyword(i);
709  const char *value = __kmp_hw_get_keyword(equivalent[i]);
710  printf("%-15s -> %-15s\n", key, value);
711  }
712 
713  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
714 
715  printf("* num_hw_threads: %d\n", num_hw_threads);
716  printf("* hw_threads:\n");
717  for (int i = 0; i < num_hw_threads; ++i) {
718  hw_threads[i].print();
719  }
720  printf("***********************\n");
721 }
722 
723 void kmp_topology_t::print(const char *env_var) const {
724  kmp_str_buf_t buf;
725  int print_types_depth;
726  __kmp_str_buf_init(&buf);
727  kmp_hw_t print_types[KMP_HW_LAST + 2];
728 
729  // Num Available Threads
730  if (num_hw_threads) {
731  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
732  } else {
733  KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc);
734  }
735 
736  // Uniform or not
737  if (is_uniform()) {
738  KMP_INFORM(Uniform, env_var);
739  } else {
740  KMP_INFORM(NonUniform, env_var);
741  }
742 
743  // Equivalent types
744  KMP_FOREACH_HW_TYPE(type) {
745  kmp_hw_t eq_type = equivalent[type];
746  if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
747  KMP_INFORM(AffEqualTopologyTypes, env_var,
748  __kmp_hw_get_catalog_string(type),
749  __kmp_hw_get_catalog_string(eq_type));
750  }
751  }
752 
753  // Quick topology
754  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
755  // Create a print types array that always guarantees printing
756  // the core and thread level
757  print_types_depth = 0;
758  for (int level = 0; level < depth; ++level)
759  print_types[print_types_depth++] = types[level];
760  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
761  // Force in the core level for quick topology
762  if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
763  // Force core before thread e.g., 1 socket X 2 threads/socket
764  // becomes 1 socket X 1 core/socket X 2 threads/socket
765  print_types[print_types_depth - 1] = KMP_HW_CORE;
766  print_types[print_types_depth++] = KMP_HW_THREAD;
767  } else {
768  print_types[print_types_depth++] = KMP_HW_CORE;
769  }
770  }
771  // Always put threads at very end of quick topology
772  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
773  print_types[print_types_depth++] = KMP_HW_THREAD;
774 
775  __kmp_str_buf_clear(&buf);
776  kmp_hw_t numerator_type;
777  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
778  int core_level = get_level(KMP_HW_CORE);
779  int ncores = get_count(core_level);
780 
781  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
782  int c;
783  bool plural;
784  numerator_type = print_types[plevel];
785  KMP_ASSERT_VALID_HW_TYPE(numerator_type);
786  if (equivalent[numerator_type] != numerator_type)
787  c = 1;
788  else
789  c = get_ratio(level++);
790  plural = (c > 1);
791  if (plevel == 0) {
792  __kmp_str_buf_print(&buf, "%d %s", c,
793  __kmp_hw_get_catalog_string(numerator_type, plural));
794  } else {
795  __kmp_str_buf_print(&buf, " x %d %s/%s", c,
796  __kmp_hw_get_catalog_string(numerator_type, plural),
797  __kmp_hw_get_catalog_string(denominator_type));
798  }
799  denominator_type = numerator_type;
800  }
801  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
802 
803  // Hybrid topology information
804  if (__kmp_is_hybrid_cpu()) {
805  for (int i = 0; i < num_core_types; ++i) {
806  kmp_hw_core_type_t core_type = core_types[i];
807  kmp_hw_attr_t attr;
808  attr.clear();
809  attr.set_core_type(core_type);
810  int ncores = get_ncores_with_attr(attr);
811  if (ncores > 0) {
812  KMP_INFORM(TopologyHybrid, env_var, ncores,
813  __kmp_hw_get_core_type_string(core_type));
814  KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS);
815  for (int eff = 0; eff < num_core_efficiencies; ++eff) {
816  attr.set_core_eff(eff);
817  int ncores_with_eff = get_ncores_with_attr(attr);
818  if (ncores_with_eff > 0) {
819  KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff);
820  }
821  }
822  }
823  }
824  }
825 
826  if (num_hw_threads <= 0) {
827  __kmp_str_buf_free(&buf);
828  return;
829  }
830 
831  // Full OS proc to hardware thread map
832  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
833  for (int i = 0; i < num_hw_threads; i++) {
834  __kmp_str_buf_clear(&buf);
835  for (int level = 0; level < depth; ++level) {
836  kmp_hw_t type = types[level];
837  __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
838  __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
839  }
840  if (__kmp_is_hybrid_cpu())
841  __kmp_str_buf_print(
842  &buf, "(%s)",
843  __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type()));
844  KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
845  }
846 
847  __kmp_str_buf_free(&buf);
848 }
849 
850 #if KMP_AFFINITY_SUPPORTED
851 void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
852  const char *env_var = __kmp_get_affinity_env_var(affinity);
853  // If the user requested hybrid CPU attributes for granularity (via either
854  // OMP_PLACES or KMP_AFFINITY) but none exist, then reset the granularity and
855  // have the method below select a granularity and warn the user.
856  if (!__kmp_is_hybrid_cpu()) {
857  if (affinity.core_attr_gran.valid) {
858  // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
859  // instead
860  KMP_AFF_WARNING(
861  affinity, AffIgnoringNonHybrid, env_var,
862  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
863  affinity.gran = KMP_HW_CORE;
864  affinity.gran_levels = -1;
865  affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
866  affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
867  } else if (affinity.flags.core_types_gran ||
868  affinity.flags.core_effs_gran) {
869  // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
870  if (affinity.flags.omp_places) {
871  KMP_AFF_WARNING(
872  affinity, AffIgnoringNonHybrid, env_var,
873  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
874  } else {
875  // KMP_AFFINITY=granularity=core_type|core_eff,...
876  KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
877  "Intel(R) Hybrid Technology core attribute",
878  __kmp_hw_get_catalog_string(KMP_HW_CORE));
879  }
880  affinity.gran = KMP_HW_CORE;
881  affinity.gran_levels = -1;
882  affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
883  affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
884  }
885  }
886  // Set the number of affinity granularity levels
887  if (affinity.gran_levels < 0) {
888  kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
889  // Check if user's granularity request is valid
890  if (gran_type == KMP_HW_UNKNOWN) {
891  // First try core, then thread, then package
892  kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
893  for (auto g : gran_types) {
894  if (get_equivalent_type(g) != KMP_HW_UNKNOWN) {
895  gran_type = g;
896  break;
897  }
898  }
899  KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
900  // Warn user what granularity setting will be used instead
901  KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
902  __kmp_hw_get_catalog_string(affinity.gran),
903  __kmp_hw_get_catalog_string(gran_type));
904  affinity.gran = gran_type;
905  }
906 #if KMP_GROUP_AFFINITY
907  // If more than one processor group exists, and the level of
908  // granularity specified by the user is too coarse, then the
909  // granularity must be adjusted "down" to processor group affinity
910  // because threads can only exist within one processor group.
911  // For example, if a user sets granularity=socket and there are two
912  // processor groups that cover a socket, then the runtime must
913  // restrict the granularity down to the processor group level.
914  if (__kmp_num_proc_groups > 1) {
915  int gran_depth = get_level(gran_type);
916  int proc_group_depth = get_level(KMP_HW_PROC_GROUP);
917  if (gran_depth >= 0 && proc_group_depth >= 0 &&
918  gran_depth < proc_group_depth) {
919  KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var,
920  __kmp_hw_get_catalog_string(affinity.gran));
921  affinity.gran = gran_type = KMP_HW_PROC_GROUP;
922  }
923  }
924 #endif
925  affinity.gran_levels = 0;
926  for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
927  affinity.gran_levels++;
928  }
929 }
930 #endif
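// For reference, the granularity adjusted above originates from settings such
// as the following (illustrative examples, not an exhaustive list):
//
//   KMP_AFFINITY=granularity=core,compact
//   OMP_PLACES=cores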
931 
932 void kmp_topology_t::canonicalize() {
933 #if KMP_GROUP_AFFINITY
934  _insert_windows_proc_groups();
935 #endif
936  _remove_radix1_layers();
937  _gather_enumeration_information();
938  _discover_uniformity();
939  _set_sub_ids();
940  _set_globals();
941  _set_last_level_cache();
942 
943 #if KMP_MIC_SUPPORTED
944  // Manually Add L2 = Tile equivalence
945  if (__kmp_mic_type == mic3) {
946  if (get_level(KMP_HW_L2) != -1)
947  set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
948  else if (get_level(KMP_HW_TILE) != -1)
949  set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
950  }
951 #endif
952 
953  // Perform post canonicalization checking
954  KMP_ASSERT(depth > 0);
955  for (int level = 0; level < depth; ++level) {
956  // All counts, ratios, and types must be valid
957  KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
958  KMP_ASSERT_VALID_HW_TYPE(types[level]);
959  // Detected types must point to themselves
960  KMP_ASSERT(equivalent[types[level]] == types[level]);
961  }
962 }
963 
964 // Canonicalize an explicit packages X cores/pkg X threads/core topology
965 void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
966  int nthreads_per_core, int ncores) {
967  int ndepth = 3;
968  depth = ndepth;
969  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
970  for (int level = 0; level < depth; ++level) {
971  count[level] = 0;
972  ratio[level] = 0;
973  }
974  count[0] = npackages;
975  count[1] = ncores;
976  count[2] = __kmp_xproc;
977  ratio[0] = npackages;
978  ratio[1] = ncores_per_pkg;
979  ratio[2] = nthreads_per_core;
980  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
981  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
982  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
983  types[0] = KMP_HW_SOCKET;
984  types[1] = KMP_HW_CORE;
985  types[2] = KMP_HW_THREAD;
986  //__kmp_avail_proc = __kmp_xproc;
987  _discover_uniformity();
988 }
989 
990 #if KMP_AFFINITY_SUPPORTED
991 static kmp_str_buf_t *
992 __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
993  bool plural) {
994  __kmp_str_buf_init(buf);
995  if (attr.is_core_type_valid())
996  __kmp_str_buf_print(buf, "%s %s",
997  __kmp_hw_get_core_type_string(attr.get_core_type()),
998  __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
999  else
1000  __kmp_str_buf_print(buf, "%s eff=%d",
1001  __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
1002  attr.get_core_eff());
1003  return buf;
1004 }
1005 
1006 bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
1007  // Apply the filter
1008  bool affected;
1009  int new_index = 0;
1010  for (int i = 0; i < num_hw_threads; ++i) {
1011  int os_id = hw_threads[i].os_id;
1012  if (KMP_CPU_ISSET(os_id, mask)) {
1013  if (i != new_index)
1014  hw_threads[new_index] = hw_threads[i];
1015  new_index++;
1016  } else {
1017  KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
1018  __kmp_avail_proc--;
1019  }
1020  }
1021 
1022  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
1023  affected = (num_hw_threads != new_index);
1024  num_hw_threads = new_index;
1025 
1026  // Post hardware subset canonicalization
1027  if (affected) {
1028  _gather_enumeration_information();
1029  _discover_uniformity();
1030  _set_globals();
1031  _set_last_level_cache();
1032 #if KMP_OS_WINDOWS
1033  // Copy filtered full mask if topology has single processor group
1034  if (__kmp_num_proc_groups <= 1)
1035 #endif
1036  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
1037  }
1038  return affected;
1039 }
1040 
1041 // Apply the KMP_HW_SUBSET environment variable to the topology
1042 // Returns true if KMP_HW_SUBSET filtered any processors
1043 // otherwise, returns false
1044 bool kmp_topology_t::filter_hw_subset() {
1045  // If KMP_HW_SUBSET wasn't requested, then do nothing.
1046  if (!__kmp_hw_subset)
1047  return false;
1048 
1049  // First, sort the KMP_HW_SUBSET items by the machine topology
1050  __kmp_hw_subset->sort();
1051 
1052  __kmp_hw_subset->canonicalize(__kmp_topology);
1053 
1054  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
1055  bool using_core_types = false;
1056  bool using_core_effs = false;
1057  bool is_absolute = __kmp_hw_subset->is_absolute();
1058  int hw_subset_depth = __kmp_hw_subset->get_depth();
1059  kmp_hw_t specified[KMP_HW_LAST];
1060  int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
1061  KMP_ASSERT(hw_subset_depth > 0);
1062  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
1063  int core_level = get_level(KMP_HW_CORE);
1064  for (int i = 0; i < hw_subset_depth; ++i) {
1065  int max_count;
1066  const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
1067  int num = item.num[0];
1068  int offset = item.offset[0];
1069  kmp_hw_t type = item.type;
1070  kmp_hw_t equivalent_type = equivalent[type];
1071  int level = get_level(type);
1072  topology_levels[i] = level;
1073 
1074  // Check to see if current layer is in detected machine topology
1075  if (equivalent_type != KMP_HW_UNKNOWN) {
1076  __kmp_hw_subset->at(i).type = equivalent_type;
1077  } else {
1078  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
1079  __kmp_hw_get_catalog_string(type));
1080  return false;
1081  }
1082 
1083  // Check to see if current layer has already been
1084  // specified either directly or through an equivalent type
1085  if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
1086  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
1087  __kmp_hw_get_catalog_string(type),
1088  __kmp_hw_get_catalog_string(specified[equivalent_type]));
1089  return false;
1090  }
1091  specified[equivalent_type] = type;
1092 
1093  // Check to see if each layer's num & offset parameters are valid
1094  max_count = get_ratio(level);
1095  if (!is_absolute) {
1096  if (max_count < 0 ||
1097  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1098  bool plural = (num > 1);
1099  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
1100  __kmp_hw_get_catalog_string(type, plural));
1101  return false;
1102  }
1103  }
1104 
1105  // Check to see if core attributes are consistent
1106  if (core_level == level) {
1107  // Determine which core attributes are specified
1108  for (int j = 0; j < item.num_attrs; ++j) {
1109  if (item.attr[j].is_core_type_valid())
1110  using_core_types = true;
1111  if (item.attr[j].is_core_eff_valid())
1112  using_core_effs = true;
1113  }
1114 
1115  // Check if using a single core attribute on non-hybrid arch.
1116  // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
1117  //
1118  // Check if using multiple core attributes on non-hybrid arch.
1119  // Ignore all of KMP_HW_SUBSET if this is the case.
1120  if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
1121  if (item.num_attrs == 1) {
1122  if (using_core_effs) {
1123  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1124  "efficiency");
1125  } else {
1126  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1127  "core_type");
1128  }
1129  using_core_effs = false;
1130  using_core_types = false;
1131  } else {
1132  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid);
1133  return false;
1134  }
1135  }
1136 
1137  // Check if using both core types and core efficiencies together
1138  if (using_core_types && using_core_effs) {
1139  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type",
1140  "efficiency");
1141  return false;
1142  }
1143 
1144  // Check that core efficiency values are valid
1145  if (using_core_effs) {
1146  for (int j = 0; j < item.num_attrs; ++j) {
1147  if (item.attr[j].is_core_eff_valid()) {
1148  int core_eff = item.attr[j].get_core_eff();
1149  if (core_eff < 0 || core_eff >= num_core_efficiencies) {
1150  kmp_str_buf_t buf;
1151  __kmp_str_buf_init(&buf);
1152  __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff());
1153  __kmp_msg(kmp_ms_warning,
1154  KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str),
1155  KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
1156  __kmp_msg_null);
1157  __kmp_str_buf_free(&buf);
1158  return false;
1159  }
1160  }
1161  }
1162  }
1163 
1164  // Check that the number of requested cores with attributes is valid
1165  if ((using_core_types || using_core_effs) && !is_absolute) {
1166  for (int j = 0; j < item.num_attrs; ++j) {
1167  int num = item.num[j];
1168  int offset = item.offset[j];
1169  int level_above = core_level - 1;
1170  if (level_above >= 0) {
1171  max_count = get_ncores_with_attr_per(item.attr[j], level_above);
1172  if (max_count <= 0 ||
1173  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1174  kmp_str_buf_t buf;
1175  __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
1176  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str);
1177  __kmp_str_buf_free(&buf);
1178  return false;
1179  }
1180  }
1181  }
1182  }
1183 
1184  if ((using_core_types || using_core_effs) && item.num_attrs > 1) {
1185  for (int j = 0; j < item.num_attrs; ++j) {
1186  // Ambiguous use of specific core attribute + generic core
1187  // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
1188  if (!item.attr[j]) {
1189  kmp_hw_attr_t other_attr;
1190  for (int k = 0; k < item.num_attrs; ++k) {
1191  if (item.attr[k] != item.attr[j]) {
1192  other_attr = item.attr[k];
1193  break;
1194  }
1195  }
1196  kmp_str_buf_t buf;
1197  __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
1198  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat,
1199  __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
1200  __kmp_str_buf_free(&buf);
1201  return false;
1202  }
1203  // Allow specifying a specific core type or core eff exactly once
1204  for (int k = 0; k < j; ++k) {
1205  if (!item.attr[j] || !item.attr[k])
1206  continue;
1207  if (item.attr[k] == item.attr[j]) {
1208  kmp_str_buf_t buf;
1209  __kmp_hw_get_catalog_core_string(item.attr[j], &buf,
1210  item.num[j] > 0);
1211  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str);
1212  __kmp_str_buf_free(&buf);
1213  return false;
1214  }
1215  }
1216  }
1217  }
1218  }
1219  }
1220 
1221  // For keeping track of sub_ids for an absolute KMP_HW_SUBSET
1222  // or core attributes (core type or efficiency)
1223  int prev_sub_ids[KMP_HW_LAST];
1224  int abs_sub_ids[KMP_HW_LAST];
1225  int core_eff_sub_ids[KMP_HW_MAX_NUM_CORE_EFFS];
1226  int core_type_sub_ids[KMP_HW_MAX_NUM_CORE_TYPES];
1227  for (size_t i = 0; i < KMP_HW_LAST; ++i) {
1228  abs_sub_ids[i] = -1;
1229  prev_sub_ids[i] = -1;
1230  }
1231  for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_EFFS; ++i)
1232  core_eff_sub_ids[i] = -1;
1233  for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
1234  core_type_sub_ids[i] = -1;
1235 
1236  // Determine which hardware threads should be filtered.
1237 
1238  // Helpful to determine if a topology layer is targeted by an absolute subset
1239  auto is_targeted = [&](int level) {
1240  if (is_absolute) {
1241  for (int i = 0; i < hw_subset_depth; ++i)
1242  if (topology_levels[i] == level)
1243  return true;
1244  return false;
1245  }
1246  // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted
1247  return true;
1248  };
1249 
1250  // Helpful to index into core type sub Ids array
1251  auto get_core_type_index = [](const kmp_hw_thread_t &t) {
1252  switch (t.attrs.get_core_type()) {
1253  case KMP_HW_CORE_TYPE_UNKNOWN:
1254  case KMP_HW_MAX_NUM_CORE_TYPES:
1255  return 0;
1256 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1257  case KMP_HW_CORE_TYPE_ATOM:
1258  return 1;
1259  case KMP_HW_CORE_TYPE_CORE:
1260  return 2;
1261 #endif
1262  }
1263  KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
1264  KMP_BUILTIN_UNREACHABLE;
1265  };
1266 
1267  // Helpful to index into core efficiencies sub Ids array
1268  auto get_core_eff_index = [](const kmp_hw_thread_t &t) {
1269  return t.attrs.get_core_eff();
1270  };
1271 
1272  int num_filtered = 0;
1273  kmp_affin_mask_t *filtered_mask;
1274  KMP_CPU_ALLOC(filtered_mask);
1275  KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
1276  for (int i = 0; i < num_hw_threads; ++i) {
1277  kmp_hw_thread_t &hw_thread = hw_threads[i];
1278 
1279  // Figure out the absolute sub ids and core eff/type sub ids
1280  if (is_absolute || using_core_effs || using_core_types) {
1281  for (int level = 0; level < get_depth(); ++level) {
1282  if (hw_thread.sub_ids[level] != prev_sub_ids[level]) {
1283  bool found_targeted = false;
1284  for (int j = level; j < get_depth(); ++j) {
1285  bool targeted = is_targeted(j);
1286  if (!found_targeted && targeted) {
1287  found_targeted = true;
1288  abs_sub_ids[j]++;
1289  if (j == core_level && using_core_effs)
1290  core_eff_sub_ids[get_core_eff_index(hw_thread)]++;
1291  if (j == core_level && using_core_types)
1292  core_type_sub_ids[get_core_type_index(hw_thread)]++;
1293  } else if (targeted) {
1294  abs_sub_ids[j] = 0;
1295  if (j == core_level && using_core_effs)
1296  core_eff_sub_ids[get_core_eff_index(hw_thread)] = 0;
1297  if (j == core_level && using_core_types)
1298  core_type_sub_ids[get_core_type_index(hw_thread)] = 0;
1299  }
1300  }
1301  break;
1302  }
1303  }
1304  for (int level = 0; level < get_depth(); ++level)
1305  prev_sub_ids[level] = hw_thread.sub_ids[level];
1306  }
1307 
1308  // Check to see if this hardware thread should be filtered
1309  bool should_be_filtered = false;
1310  for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth;
1311  ++hw_subset_index) {
1312  const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
1313  int level = topology_levels[hw_subset_index];
1314  if (level == -1)
1315  continue;
1316  if ((using_core_effs || using_core_types) && level == core_level) {
1317  // Look for the core attribute in KMP_HW_SUBSET which corresponds
1318  // to this hardware thread's core attribute. Use this num,offset plus
1319  // the running sub_id for the particular core attribute of this hardware
1320  // thread to determine if the hardware thread should be filtered or not.
1321  int attr_idx;
1322  kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type();
1323  int core_eff = hw_thread.attrs.get_core_eff();
1324  for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) {
1325  if (using_core_types &&
1326  hw_subset_item.attr[attr_idx].get_core_type() == core_type)
1327  break;
1328  if (using_core_effs &&
1329  hw_subset_item.attr[attr_idx].get_core_eff() == core_eff)
1330  break;
1331  }
1332  // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
1333  if (attr_idx == hw_subset_item.num_attrs) {
1334  should_be_filtered = true;
1335  break;
1336  }
1337  int sub_id;
1338  int num = hw_subset_item.num[attr_idx];
1339  int offset = hw_subset_item.offset[attr_idx];
1340  if (using_core_types)
1341  sub_id = core_type_sub_ids[get_core_type_index(hw_thread)];
1342  else
1343  sub_id = core_eff_sub_ids[get_core_eff_index(hw_thread)];
1344  if (sub_id < offset ||
1345  (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1346  should_be_filtered = true;
1347  break;
1348  }
1349  } else {
1350  int sub_id;
1351  int num = hw_subset_item.num[0];
1352  int offset = hw_subset_item.offset[0];
1353  if (is_absolute)
1354  sub_id = abs_sub_ids[level];
1355  else
1356  sub_id = hw_thread.sub_ids[level];
1357  if (sub_id < offset ||
1358  (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1359  should_be_filtered = true;
1360  break;
1361  }
1362  }
1363  }
1364  // Collect filtering information
1365  if (should_be_filtered) {
1366  KMP_CPU_CLR(hw_thread.os_id, filtered_mask);
1367  num_filtered++;
1368  }
1369  }
1370 
1372  // One last check that we shouldn't allow filtering the entire machine
1372  if (num_filtered == num_hw_threads) {
1373  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
1374  return false;
1375  }
1376 
1377  // Apply the filter
1378  restrict_to_mask(filtered_mask);
1379  return true;
1380 }
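// Illustrative KMP_HW_SUBSET values that this routine applies (the attribute
// syntax mirrors the "3c:intel_core" style referenced above):
//
//   KMP_HW_SUBSET=2s,4c,2t         // 2 sockets, 4 cores/socket, 2 threads/core
//   KMP_HW_SUBSET=4c:intel_core,2t // restrict the core layer by core type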
1381 
1382 bool kmp_topology_t::is_close(int hwt1, int hwt2,
1383  const kmp_affinity_t &stgs) const {
1384  int hw_level = stgs.gran_levels;
1385  if (hw_level >= depth)
1386  return true;
1387  bool retval = true;
1388  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
1389  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
1390  if (stgs.flags.core_types_gran)
1391  return t1.attrs.get_core_type() == t2.attrs.get_core_type();
1392  if (stgs.flags.core_effs_gran)
1393  return t1.attrs.get_core_eff() == t2.attrs.get_core_eff();
1394  for (int i = 0; i < (depth - hw_level); ++i) {
1395  if (t1.ids[i] != t2.ids[i])
1396  return false;
1397  }
1398  return retval;
1399 }
1400 
1402 
1403 bool KMPAffinity::picked_api = false;
1404 
1405 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
1406 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
1407 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
1408 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
1409 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
1410 void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
1411 
1412 void KMPAffinity::pick_api() {
1413  KMPAffinity *affinity_dispatch;
1414  if (picked_api)
1415  return;
1416 #if KMP_USE_HWLOC
1417  // Only use Hwloc if affinity isn't explicitly disabled and
1418  // user requests Hwloc topology method
1419  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
1420  __kmp_affinity.type != affinity_disabled) {
1421  affinity_dispatch = new KMPHwlocAffinity();
1422  } else
1423 #endif
1424  {
1425  affinity_dispatch = new KMPNativeAffinity();
1426  }
1427  __kmp_affinity_dispatch = affinity_dispatch;
1428  picked_api = true;
1429 }
1430 
1431 void KMPAffinity::destroy_api() {
1432  if (__kmp_affinity_dispatch != NULL) {
1433  delete __kmp_affinity_dispatch;
1434  __kmp_affinity_dispatch = NULL;
1435  picked_api = false;
1436  }
1437 }
1438 
1439 #define KMP_ADVANCE_SCAN(scan) \
1440  while (*scan != '\0') { \
1441  scan++; \
1442  }
1443 
1444 // Print the affinity mask to the character array in a pretty format.
1445 // The format is a comma separated list of non-negative integers or integer
1446 // ranges: e.g., 1,2,3-5,7,9-15
1447 // The format can also be the string "{<empty>}" if no bits are set in mask
1448 char *__kmp_affinity_print_mask(char *buf, int buf_len,
1449  kmp_affin_mask_t *mask) {
1450  int start = 0, finish = 0, previous = 0;
1451  bool first_range;
1452  KMP_ASSERT(buf);
1453  KMP_ASSERT(buf_len >= 40);
1454  KMP_ASSERT(mask);
1455  char *scan = buf;
1456  char *end = buf + buf_len - 1;
1457 
1458  // Check for empty set.
1459  if (mask->begin() == mask->end()) {
1460  KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
1461  KMP_ADVANCE_SCAN(scan);
1462  KMP_ASSERT(scan <= end);
1463  return buf;
1464  }
1465 
1466  first_range = true;
1467  start = mask->begin();
1468  while (1) {
1469  // Find next range
1470  // [start, previous] is inclusive range of contiguous bits in mask
1471  for (finish = mask->next(start), previous = start;
1472  finish == previous + 1 && finish != mask->end();
1473  finish = mask->next(finish)) {
1474  previous = finish;
1475  }
1476 
1477  // The first range does not need a comma printed before it, but the rest
1478  // of the ranges do need a comma beforehand
1479  if (!first_range) {
1480  KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
1481  KMP_ADVANCE_SCAN(scan);
1482  } else {
1483  first_range = false;
1484  }
1485  // Range with three or more contiguous bits in the affinity mask
1486  if (previous - start > 1) {
1487  KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
1488  } else {
1489  // Range with one or two contiguous bits in the affinity mask
1490  KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
1491  KMP_ADVANCE_SCAN(scan);
1492  if (previous - start > 0) {
1493  KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
1494  }
1495  }
1496  KMP_ADVANCE_SCAN(scan);
1497  // Start over with new start point
1498  start = finish;
1499  if (start == mask->end())
1500  break;
1501  // Check for overflow
1502  if (end - scan < 2)
1503  break;
1504  }
1505 
1506  // Check for overflow
1507  KMP_ASSERT(scan <= end);
1508  return buf;
1509 }
1510 #undef KMP_ADVANCE_SCAN
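// Hedged usage sketch (the buffer length macro is assumed to come from kmp.h):
//
//   char buf[KMP_AFFIN_MASK_PRINT_LEN];
//   __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
//                             __kmp_affin_fullMask);
//   // buf now holds something like "0-3,8-11" or "{<empty>}"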
1511 
1512 // Print the affinity mask to the string buffer object in a pretty format
1513 // The format is a comma separated list of non-negative integers or integer
1514 // ranges: e.g., 1,2,3-5,7,9-15
1515 // The format can also be the string "{<empty>}" if no bits are set in mask
1516 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
1517  kmp_affin_mask_t *mask) {
1518  int start = 0, finish = 0, previous = 0;
1519  bool first_range;
1520  KMP_ASSERT(buf);
1521  KMP_ASSERT(mask);
1522 
1523  __kmp_str_buf_clear(buf);
1524 
1525  // Check for empty set.
1526  if (mask->begin() == mask->end()) {
1527  __kmp_str_buf_print(buf, "%s", "{<empty>}");
1528  return buf;
1529  }
1530 
1531  first_range = true;
1532  start = mask->begin();
1533  while (1) {
1534  // Find next range
1535  // [start, previous] is inclusive range of contiguous bits in mask
1536  for (finish = mask->next(start), previous = start;
1537  finish == previous + 1 && finish != mask->end();
1538  finish = mask->next(finish)) {
1539  previous = finish;
1540  }
1541 
1542  // The first range does not need a comma printed before it, but the rest
1543  // of the ranges do need a comma beforehand
1544  if (!first_range) {
1545  __kmp_str_buf_print(buf, "%s", ",");
1546  } else {
1547  first_range = false;
1548  }
1549  // Range with three or more contiguous bits in the affinity mask
1550  if (previous - start > 1) {
1551  __kmp_str_buf_print(buf, "%u-%u", start, previous);
1552  } else {
1553  // Range with one or two contiguous bits in the affinity mask
1554  __kmp_str_buf_print(buf, "%u", start);
1555  if (previous - start > 0) {
1556  __kmp_str_buf_print(buf, ",%u", previous);
1557  }
1558  }
1559  // Start over with new start point
1560  start = finish;
1561  if (start == mask->end())
1562  break;
1563  }
1564  return buf;
1565 }
1566 
1567 // Return (possibly empty) affinity mask representing the offline CPUs
1568 // Caller must free the mask
1569 kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
1570  kmp_affin_mask_t *offline;
1571  KMP_CPU_ALLOC(offline);
1572  KMP_CPU_ZERO(offline);
1573 #if KMP_OS_LINUX
1574  int n, begin_cpu, end_cpu;
1575  kmp_safe_raii_file_t offline_file;
1576  auto skip_ws = [](FILE *f) {
1577  int c;
1578  do {
1579  c = fgetc(f);
1580  } while (isspace(c));
1581  if (c != EOF)
1582  ungetc(c, f);
1583  };
1584  // File contains CSV of integer ranges representing the offline CPUs
1585  // e.g., 1,2,4-7,9,11-15
1586  int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
1587  if (status != 0)
1588  return offline;
1589  while (!feof(offline_file)) {
1590  skip_ws(offline_file);
1591  n = fscanf(offline_file, "%d", &begin_cpu);
1592  if (n != 1)
1593  break;
1594  skip_ws(offline_file);
1595  int c = fgetc(offline_file);
1596  if (c == EOF || c == ',') {
1597  // Just single CPU
1598  end_cpu = begin_cpu;
1599  } else if (c == '-') {
1600  // Range of CPUs
1601  skip_ws(offline_file);
1602  n = fscanf(offline_file, "%d", &end_cpu);
1603  if (n != 1)
1604  break;
1605  skip_ws(offline_file);
1606  c = fgetc(offline_file); // skip ','
1607  } else {
1608  // Syntax problem
1609  break;
1610  }
1611  // Ensure a valid range of CPUs
1612  if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
1613  end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
1614  continue;
1615  }
1616  // Insert [begin_cpu, end_cpu] into offline mask
1617  for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
1618  KMP_CPU_SET(cpu, offline);
1619  }
1620  }
1621 #endif
1622  return offline;
1623 }
1624 
1625 // Return the number of available procs
1626 int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
1627  int avail_proc = 0;
1628  KMP_CPU_ZERO(mask);
1629 
1630 #if KMP_GROUP_AFFINITY
1631 
1632  if (__kmp_num_proc_groups > 1) {
1633  int group;
1634  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
1635  for (group = 0; group < __kmp_num_proc_groups; group++) {
1636  int i;
1637  int num = __kmp_GetActiveProcessorCount(group);
1638  for (i = 0; i < num; i++) {
1639  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
1640  avail_proc++;
1641  }
1642  }
1643  } else
1644 
1645 #endif /* KMP_GROUP_AFFINITY */
1646 
1647  {
1648  int proc;
1649  kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
1650  for (proc = 0; proc < __kmp_xproc; proc++) {
1651  // Skip offline CPUs
1652  if (KMP_CPU_ISSET(proc, offline_cpus))
1653  continue;
1654  KMP_CPU_SET(proc, mask);
1655  avail_proc++;
1656  }
1657  KMP_CPU_FREE(offline_cpus);
1658  }
1659 
1660  return avail_proc;
1661 }
1662 
1663 // All of the __kmp_affinity_create_*_map() routines should allocate the
1664 // internal topology object and set the layer ids for it. Each routine
1665 // returns a boolean on whether it was successful at doing so.
1666 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1667 // Original mask is a subset of full mask in multiple processor groups topology
1668 kmp_affin_mask_t *__kmp_affin_origMask = NULL;
1669 
1670 #if KMP_USE_HWLOC
1671 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
1672 #if HWLOC_API_VERSION >= 0x00020000
1673  return hwloc_obj_type_is_cache(obj->type);
1674 #else
1675  return obj->type == HWLOC_OBJ_CACHE;
1676 #endif
1677 }
1678 
1679 // Returns KMP_HW_* type derived from HWLOC_* type
1680 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1681 
1682  if (__kmp_hwloc_is_cache_type(obj)) {
1683  if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1684  return KMP_HW_UNKNOWN;
1685  switch (obj->attr->cache.depth) {
1686  case 1:
1687  return KMP_HW_L1;
1688  case 2:
1689 #if KMP_MIC_SUPPORTED
1690  if (__kmp_mic_type == mic3) {
1691  return KMP_HW_TILE;
1692  }
1693 #endif
1694  return KMP_HW_L2;
1695  case 3:
1696  return KMP_HW_L3;
1697  }
1698  return KMP_HW_UNKNOWN;
1699  }
1700 
1701  switch (obj->type) {
1702  case HWLOC_OBJ_PACKAGE:
1703  return KMP_HW_SOCKET;
1704  case HWLOC_OBJ_NUMANODE:
1705  return KMP_HW_NUMA;
1706  case HWLOC_OBJ_CORE:
1707  return KMP_HW_CORE;
1708  case HWLOC_OBJ_PU:
1709  return KMP_HW_THREAD;
1710  case HWLOC_OBJ_GROUP:
1711 #if HWLOC_API_VERSION >= 0x00020000
1712  if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1713  return KMP_HW_DIE;
1714  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1715  return KMP_HW_TILE;
1716  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1717  return KMP_HW_MODULE;
1718  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1719  return KMP_HW_PROC_GROUP;
1720 #endif
1721  return KMP_HW_UNKNOWN;
1722 #if HWLOC_API_VERSION >= 0x00020100
1723  case HWLOC_OBJ_DIE:
1724  return KMP_HW_DIE;
1725 #endif
1726  }
1727  return KMP_HW_UNKNOWN;
1728 }
1729 
1730 // Returns the number of objects of type 'type' below 'obj' within the topology
1731 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1732 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
1733 // object.
1734 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1735  hwloc_obj_type_t type) {
1736  int retval = 0;
1737  hwloc_obj_t first;
1738  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1739  obj->logical_index, type, 0);
1740  first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1741  obj->type, first) == obj;
1742  first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1743  first)) {
1744  ++retval;
1745  }
1746  return retval;
1747 }
1748 
1749 // This gets the sub_id for a lower object under a higher object in the
1750 // topology tree
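// The walk below counts how many earlier siblings of the same type fall
// within 'higher'; a sibling that already cached its own sub_id (stored as
// sub_id + 1 in userdata) lets the walk stop early. For example, the third
// core under a package gets sub_id 2, either by counting the two previous
// cores or by reading the cached value of the second one.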
1751 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1752  hwloc_obj_t lower) {
1753  hwloc_obj_t obj;
1754  hwloc_obj_type_t ltype = lower->type;
1755  int lindex = lower->logical_index - 1;
1756  int sub_id = 0;
1757  // Get the previous lower object
1758  obj = hwloc_get_obj_by_type(t, ltype, lindex);
1759  while (obj && lindex >= 0 &&
1760  hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1761  if (obj->userdata) {
1762  sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1763  break;
1764  }
1765  sub_id++;
1766  lindex--;
1767  obj = hwloc_get_obj_by_type(t, ltype, lindex);
1768  }
1769  // Store sub_id + 1 so that a sub_id of 0 is distinguished from NULL
1770  lower->userdata = RCAST(void *, sub_id + 1);
1771  return sub_id;
1772 }
1773 
1774 static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
1775  kmp_hw_t type;
1776  int hw_thread_index, sub_id;
1777  int depth;
1778  hwloc_obj_t pu, obj, root, prev;
1779  kmp_hw_t types[KMP_HW_LAST];
1780  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
1781 
1782  hwloc_topology_t tp = __kmp_hwloc_topology;
1783  *msg_id = kmp_i18n_null;
1784  if (__kmp_affinity.flags.verbose) {
1785  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
1786  }
1787 
1788  if (!KMP_AFFINITY_CAPABLE()) {
1789  // Hack to try and infer the machine topology using only the data
1790  // available from hwloc on the current thread, and __kmp_xproc.
1791  KMP_ASSERT(__kmp_affinity.type == affinity_none);
1792  // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
1793  hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
1794  if (o != NULL)
1795  nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
1796  else
1797  nCoresPerPkg = 1; // no PACKAGE found
1798  o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
1799  if (o != NULL)
1800  __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
1801  else
1802  __kmp_nThreadsPerCore = 1; // no CORE found
1803  if (__kmp_nThreadsPerCore == 0)
1804  __kmp_nThreadsPerCore = 1;
1805  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1806  if (nCoresPerPkg == 0)
1807  nCoresPerPkg = 1; // to prevent possible division by 0
1808  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1809  return true;
1810  }
1811 
1812 #if HWLOC_API_VERSION >= 0x00020400
1813  // Handle multiple types of cores if they exist on the system
1814  int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
1815 
1816  typedef struct kmp_hwloc_cpukinds_info_t {
1817  int efficiency;
1818  kmp_hw_core_type_t core_type;
1819  hwloc_bitmap_t mask;
1820  } kmp_hwloc_cpukinds_info_t;
1821  kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;
1822 
1823  if (nr_cpu_kinds > 0) {
1824  unsigned nr_infos;
1825  struct hwloc_info_s *infos;
1826  cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
1827  sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
1828  for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
1829  cpukinds[idx].efficiency = -1;
1830  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
1831  cpukinds[idx].mask = hwloc_bitmap_alloc();
1832  if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
1833  &cpukinds[idx].efficiency, &nr_infos, &infos,
1834  0) == 0) {
1835  for (unsigned i = 0; i < nr_infos; ++i) {
1836  if (__kmp_str_match("CoreType", 8, infos[i].name)) {
1837 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1838  if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
1839  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
1840  break;
1841  } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
1842  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
1843  break;
1844  }
1845 #endif
1846  }
1847  }
1848  }
1849  }
1850  }
1851 #endif
1852 
1853  root = hwloc_get_root_obj(tp);
1854 
1855  // Figure out the depth and types in the topology
1856  depth = 0;
1857  obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1858  while (obj && obj != root) {
1859 #if HWLOC_API_VERSION >= 0x00020000
1860  if (obj->memory_arity) {
1861  hwloc_obj_t memory;
1862  for (memory = obj->memory_first_child; memory;
1863  memory = hwloc_get_next_child(tp, obj, memory)) {
1864  if (memory->type == HWLOC_OBJ_NUMANODE)
1865  break;
1866  }
1867  if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1868  types[depth] = KMP_HW_NUMA;
1869  hwloc_types[depth] = memory->type;
1870  depth++;
1871  }
1872  }
1873 #endif
1874  type = __kmp_hwloc_type_2_topology_type(obj);
1875  if (type != KMP_HW_UNKNOWN) {
1876  types[depth] = type;
1877  hwloc_types[depth] = obj->type;
1878  depth++;
1879  }
1880  obj = obj->parent;
1881  }
1882  KMP_ASSERT(depth > 0);
1883 
1884  // Get the order for the types correct
1885  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1886  hwloc_obj_type_t hwloc_temp = hwloc_types[i];
1887  kmp_hw_t temp = types[i];
1888  types[i] = types[j];
1889  types[j] = temp;
1890  hwloc_types[i] = hwloc_types[j];
1891  hwloc_types[j] = hwloc_temp;
1892  }
1893 
1894  // Allocate the data structure to be returned.
1895  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1896 
1897  hw_thread_index = 0;
1898  pu = NULL;
1899  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
1900  int index = depth - 1;
1901  bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1902  kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1903  if (included) {
1904  hw_thread.clear();
1905  hw_thread.ids[index] = pu->logical_index;
1906  hw_thread.os_id = pu->os_index;
1907  // If multiple core types, then set that attribute for the hardware thread
1908 #if HWLOC_API_VERSION >= 0x00020400
1909  if (cpukinds) {
1910  int cpukind_index = -1;
1911  for (int i = 0; i < nr_cpu_kinds; ++i) {
1912  if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
1913  cpukind_index = i;
1914  break;
1915  }
1916  }
1917  if (cpukind_index >= 0) {
1918  hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
1919  hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
1920  }
1921  }
1922 #endif
1923  index--;
1924  }
1925  obj = pu;
1926  prev = obj;
1927  while (obj != root && obj != NULL) {
1928  obj = obj->parent;
1929 #if HWLOC_API_VERSION >= 0x00020000
1930  // NUMA Nodes are handled differently since they are not within the
1931  // parent/child structure anymore. They are separate children
1932  // of obj (memory_first_child points to first memory child)
1933  if (obj->memory_arity) {
1934  hwloc_obj_t memory;
1935  for (memory = obj->memory_first_child; memory;
1936  memory = hwloc_get_next_child(tp, obj, memory)) {
1937  if (memory->type == HWLOC_OBJ_NUMANODE)
1938  break;
1939  }
1940  if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1941  sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
1942  if (included) {
1943  hw_thread.ids[index] = memory->logical_index;
1944  hw_thread.ids[index + 1] = sub_id;
1945  index--;
1946  }
1947  prev = memory;
1948  }
1949  prev = obj;
1950  }
1951 #endif
1952  type = __kmp_hwloc_type_2_topology_type(obj);
1953  if (type != KMP_HW_UNKNOWN) {
1954  sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
1955  if (included) {
1956  hw_thread.ids[index] = obj->logical_index;
1957  hw_thread.ids[index + 1] = sub_id;
1958  index--;
1959  }
1960  prev = obj;
1961  }
1962  }
1963  if (included)
1964  hw_thread_index++;
1965  }
1966 
1967 #if HWLOC_API_VERSION >= 0x00020400
1968  // Free the core types information
1969  if (cpukinds) {
1970  for (int idx = 0; idx < nr_cpu_kinds; ++idx)
1971  hwloc_bitmap_free(cpukinds[idx].mask);
1972  __kmp_free(cpukinds);
1973  }
1974 #endif
1975  __kmp_topology->sort_ids();
1976  return true;
1977 }
1978 #endif // KMP_USE_HWLOC
1979 
1980 // If we don't know how to retrieve the machine's processor topology, or
1981 // encounter an error in doing so, this routine is called to form a "flat"
1982 // mapping of os thread id's <-> processor id's.
1983 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1984  *msg_id = kmp_i18n_null;
1985  int depth = 3;
1986  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1987 
1988  if (__kmp_affinity.flags.verbose) {
1989  KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1990  }
1991 
1992  // Even if __kmp_affinity.type == affinity_none, this routine might still
1993  // be called to set __kmp_ncores, as well as
1994  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1995  if (!KMP_AFFINITY_CAPABLE()) {
1996  KMP_ASSERT(__kmp_affinity.type == affinity_none);
1997  __kmp_ncores = nPackages = __kmp_xproc;
1998  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1999  return true;
2000  }
2001 
2002  // When affinity is off, this routine will still be called to set
2003  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2004  // Make sure all these vars are set correctly, and return now if affinity is
2005  // not enabled.
2006  __kmp_ncores = nPackages = __kmp_avail_proc;
2007  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
2008 
2009  // Construct the data structure to be returned.
2010  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2011  int avail_ct = 0;
2012  int i;
2013  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2014  // Skip this proc if it is not included in the machine model.
2015  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2016  continue;
2017  }
2018  kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
2019  hw_thread.clear();
2020  hw_thread.os_id = i;
2021  hw_thread.ids[0] = i;
2022  hw_thread.ids[1] = 0;
2023  hw_thread.ids[2] = 0;
2024  avail_ct++;
2025  }
2026  if (__kmp_affinity.flags.verbose) {
2027  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
2028  }
2029  return true;
2030 }
2031 
2032 #if KMP_GROUP_AFFINITY
2033 // If multiple Windows* OS processor groups exist, we can create a 2-level
2034 // topology map with the groups at level 0 and the individual procs at level 1.
2035 // This facilitates letting the threads float among all procs in a group,
2036 // if granularity=group (the default when there are multiple groups).
2037 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
2038  *msg_id = kmp_i18n_null;
2039  int depth = 3;
2040  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
2041  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
2042 
2043  if (__kmp_affinity.flags.verbose) {
2044  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
2045  }
2046 
2047  // If we aren't affinity capable, then use flat topology
2048  if (!KMP_AFFINITY_CAPABLE()) {
2049  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2050  nPackages = __kmp_num_proc_groups;
2051  __kmp_nThreadsPerCore = 1;
2052  __kmp_ncores = __kmp_xproc;
2053  nCoresPerPkg = __kmp_ncores / nPackages;
2054  return true;
2055  }
2056 
2057  // Construct the data structure to be returned.
2058  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2059  int avail_ct = 0;
2060  int i;
2061  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2062  // Skip this proc if it is not included in the machine model.
2063  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2064  continue;
2065  }
2066  kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
2067  hw_thread.clear();
2068  hw_thread.os_id = i;
2069  hw_thread.ids[0] = i / BITS_PER_GROUP;
2070  hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
2071  }
2072  return true;
2073 }
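// Illustrative example: with 64-bit processor groups (BITS_PER_GROUP == 64),
// OS proc 70 would be recorded with ids {1, 6, 6}, i.e. group 1, proc 6
// within that group.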
2074 #endif /* KMP_GROUP_AFFINITY */
2075 
2076 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2077 
2078 template <kmp_uint32 LSB, kmp_uint32 MSB>
2079 static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
2080  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
2081  const kmp_uint32 SHIFT_RIGHT = LSB;
2082  kmp_uint32 retval = v;
2083  retval <<= SHIFT_LEFT;
2084  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
2085  return retval;
2086 }
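// For instance, __kmp_extract_bits<24, 31>(v) yields bits 24..31 of v,
// i.e. the same value as ((v >> 24) & 0xff).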
2087 
2088 static int __kmp_cpuid_mask_width(int count) {
2089  int r = 0;
2090 
2091  while ((1 << r) < count)
2092  ++r;
2093  return r;
2094 }
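// Example: __kmp_cpuid_mask_width(6) == 3, since 2^3 = 8 is the smallest
// power of two >= 6; the result is the number of low-order APIC-id bits
// needed to enumerate 'count' items.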
2095 
2096 class apicThreadInfo {
2097 public:
2098  unsigned osId; // param to __kmp_affinity_bind_thread
2099  unsigned apicId; // from cpuid after binding
2100  unsigned maxCoresPerPkg; // ""
2101  unsigned maxThreadsPerPkg; // ""
2102  unsigned pkgId; // inferred from above values
2103  unsigned coreId; // ""
2104  unsigned threadId; // ""
2105 };
2106 
2107 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
2108  const void *b) {
2109  const apicThreadInfo *aa = (const apicThreadInfo *)a;
2110  const apicThreadInfo *bb = (const apicThreadInfo *)b;
2111  if (aa->pkgId < bb->pkgId)
2112  return -1;
2113  if (aa->pkgId > bb->pkgId)
2114  return 1;
2115  if (aa->coreId < bb->coreId)
2116  return -1;
2117  if (aa->coreId > bb->coreId)
2118  return 1;
2119  if (aa->threadId < bb->threadId)
2120  return -1;
2121  if (aa->threadId > bb->threadId)
2122  return 1;
2123  return 0;
2124 }
2125 
2126 class kmp_cache_info_t {
2127 public:
2128  struct info_t {
2129  unsigned level, mask;
2130  };
2131  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
2132  size_t get_depth() const { return depth; }
2133  info_t &operator[](size_t index) { return table[index]; }
2134  const info_t &operator[](size_t index) const { return table[index]; }
2135 
2136  static kmp_hw_t get_topology_type(unsigned level) {
2137  KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
2138  switch (level) {
2139  case 1:
2140  return KMP_HW_L1;
2141  case 2:
2142  return KMP_HW_L2;
2143  case 3:
2144  return KMP_HW_L3;
2145  }
2146  return KMP_HW_UNKNOWN;
2147  }
2148 
2149 private:
2150  static const int MAX_CACHE_LEVEL = 3;
2151 
2152  size_t depth;
2153  info_t table[MAX_CACHE_LEVEL];
2154 
2155  void get_leaf4_levels() {
2156  unsigned level = 0;
2157  while (depth < MAX_CACHE_LEVEL) {
2158  unsigned cache_type, max_threads_sharing;
2159  unsigned cache_level, cache_mask_width;
2160  kmp_cpuid buf2;
2161  __kmp_x86_cpuid(4, level, &buf2);
2162  cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
2163  if (!cache_type)
2164  break;
2165  // Skip instruction caches
2166  if (cache_type == 2) {
2167  level++;
2168  continue;
2169  }
2170  max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
2171  cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
2172  cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
2173  table[depth].level = cache_level;
2174  table[depth].mask = ((-1) << cache_mask_width);
2175  depth++;
2176  level++;
2177  }
2178  }
2179 };
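// Rough example of how the table above is used: if the cpuid(4) sub-leaf for
// an L2 cache reports max_threads_sharing == 2, then cache_mask_width == 1
// and the stored mask is ~0x1, so two logical processors whose APIC ids
// differ only in bit 0 are treated as sharing that L2.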
2180 
2181 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
2182 // an algorithm which cycles through the available os threads, setting
2183  // the current thread's affinity mask to that thread, and then retrieving
2184 // the Apic Id for each thread context using the cpuid instruction.
2185 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
2186  kmp_cpuid buf;
2187  *msg_id = kmp_i18n_null;
2188 
2189  if (__kmp_affinity.flags.verbose) {
2190  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
2191  }
2192 
2193  // Check if cpuid leaf 4 is supported.
2194  __kmp_x86_cpuid(0, 0, &buf);
2195  if (buf.eax < 4) {
2196  *msg_id = kmp_i18n_str_NoLeaf4Support;
2197  return false;
2198  }
2199 
2200  // The algorithm used starts by setting the affinity to each available thread
2201  // and retrieving info from the cpuid instruction, so if we are not capable of
2202  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
2203  // need to do something else - use the defaults that we calculated from
2204  // issuing cpuid without binding to each proc.
2205  if (!KMP_AFFINITY_CAPABLE()) {
2206  // Hack to try and infer the machine topology using only the data
2207  // available from cpuid on the current thread, and __kmp_xproc.
2208  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2209 
2210  // Get an upper bound on the number of threads per package using cpuid(1).
2211  // On some OS/chip combinations where HT is supported by the chip but is
2212  // disabled, this value will be 2 on a single core chip. Usually, it will be
2213  // 2 if HT is enabled and 1 if HT is disabled.
2214  __kmp_x86_cpuid(1, 0, &buf);
2215  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2216  if (maxThreadsPerPkg == 0) {
2217  maxThreadsPerPkg = 1;
2218  }
2219 
2220  // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
2221  // value.
2222  //
2223  // The author of cpu_count.cpp treated this as only an upper bound on the
2224  // number of cores, but I haven't seen any cases where it was greater than
2225  // the actual number of cores, so we will treat it as exact in this block of
2226  // code.
2227  //
2228  // First, we need to check if cpuid(4) is supported on this chip. To see if
2229  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
2230  // greater.
2231  __kmp_x86_cpuid(0, 0, &buf);
2232  if (buf.eax >= 4) {
2233  __kmp_x86_cpuid(4, 0, &buf);
2234  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2235  } else {
2236  nCoresPerPkg = 1;
2237  }
2238 
2239  // There is no way to reliably tell if HT is enabled without issuing the
2240  // cpuid instruction from every thread and correlating the cpuid info, so
2241  // if the machine is not affinity capable, we assume that HT is off. We have
2242  // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
2243  // does not support HT.
2244  //
2245  // - Older OSes are usually found on machines with older chips, which do not
2246  // support HT.
2247  // - The performance penalty for mistakenly identifying a machine as HT when
2248  // it isn't (which results in blocktime being incorrectly set to 0) is
2249  // greater than the penalty for mistakenly identifying a machine as
2250  // being 1 thread/core when it is really HT enabled (which results in
2251  // blocktime being incorrectly set to a positive value).
2252  __kmp_ncores = __kmp_xproc;
2253  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2254  __kmp_nThreadsPerCore = 1;
2255  return true;
2256  }
2257 
2258  // From here on, we can assume that it is safe to call
2259  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2260  // __kmp_affinity.type = affinity_none.
2261 
2262  // Save the affinity mask for the current thread.
2263  kmp_affinity_raii_t previous_affinity;
2264 
2265  // Run through each of the available contexts, binding the current thread
2266  // to it, and obtaining the pertinent information using the cpuid instr.
2267  //
2268  // The relevant information is:
2269  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
2270  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
2271  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
2272  // of this field determines the width of the core# + thread# fields in the
2273  // Apic Id. It is also an upper bound on the number of threads per
2274  // package, but it has been verified that situations happen where it is not
2275  // exact. In particular, on certain OS/chip combinations where Intel(R)
2276  // Hyper-Threading Technology is supported by the chip but has been
2277  // disabled, the value of this field will be 2 (for a single core chip).
2278  // On other OS/chip combinations supporting Intel(R) Hyper-Threading
2279  // Technology, the value of this field will be 1 when Intel(R)
2280  // Hyper-Threading Technology is disabled and 2 when it is enabled.
2281  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
2282  // of this field (+1) determines the width of the core# field in the Apic
2283  // Id. The comments in "cpucount.cpp" say that this value is an upper
2284  // bound, but the IA-32 architecture manual says that it is exactly the
2285  // number of cores per package, and I haven't seen any case where it
2286  // wasn't.
2287  //
2288  // From this information, deduce the package Id, core Id, and thread Id,
2289  // and set the corresponding fields in the apicThreadInfo struct.
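  // Worked example (hypothetical values): if maxThreadsPerPkg == 16 and
  // maxCoresPerPkg == 8, then widthCT == 4, widthC == 3 and widthT == 1.
  // An apicId of 0x5B (0b1011011) then decodes as pkgId = 0x5B >> 4 = 5,
  // coreId = (0x5B >> 1) & 0x7 = 5, and threadId = 0x5B & 0x1 = 1.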
2290  unsigned i;
2291  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
2292  __kmp_avail_proc * sizeof(apicThreadInfo));
2293  unsigned nApics = 0;
2294  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2295  // Skip this proc if it is not included in the machine model.
2296  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2297  continue;
2298  }
2299  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
2300 
2301  __kmp_affinity_dispatch->bind_thread(i);
2302  threadInfo[nApics].osId = i;
2303 
2304  // The apic id and max threads per pkg come from cpuid(1).
2305  __kmp_x86_cpuid(1, 0, &buf);
2306  if (((buf.edx >> 9) & 1) == 0) {
2307  __kmp_free(threadInfo);
2308  *msg_id = kmp_i18n_str_ApicNotPresent;
2309  return false;
2310  }
2311  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
2312  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2313  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
2314  threadInfo[nApics].maxThreadsPerPkg = 1;
2315  }
2316 
2317  // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
2318  // value.
2319  //
2320  // First, we need to check if cpuid(4) is supported on this chip. To see if
2321  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
2322  // or greater.
2323  __kmp_x86_cpuid(0, 0, &buf);
2324  if (buf.eax >= 4) {
2325  __kmp_x86_cpuid(4, 0, &buf);
2326  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2327  } else {
2328  threadInfo[nApics].maxCoresPerPkg = 1;
2329  }
2330 
2331  // Infer the pkgId / coreId / threadId using only the info obtained locally.
2332  int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
2333  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
2334 
2335  int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
2336  int widthT = widthCT - widthC;
2337  if (widthT < 0) {
2338  // I've never seen this one happen, but I suppose it could, if the cpuid
2339  // instruction on a chip was really screwed up. Make sure to restore the
2340  // affinity mask before the tail call.
2341  __kmp_free(threadInfo);
2342  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2343  return false;
2344  }
2345 
2346  int maskC = (1 << widthC) - 1;
2347  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
2348 
2349  int maskT = (1 << widthT) - 1;
2350  threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
2351 
2352  nApics++;
2353  }
2354 
2355  // We've collected all the info we need.
2356  // Restore the old affinity mask for this thread.
2357  previous_affinity.restore();
2358 
2359  // Sort the threadInfo table by physical Id.
2360  qsort(threadInfo, nApics, sizeof(*threadInfo),
2361  __kmp_affinity_cmp_apicThreadInfo_phys_id);
2362 
2363  // The table is now sorted by pkgId / coreId / threadId, but we really don't
2364  // know the radix of any of the fields. pkgId's may be sparsely assigned among
2365  // the chips on a system. Although coreId's are usually assigned
2366  // [0 .. coresPerPkg-1] and threadId's are usually assigned
2367  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2368  //
2369  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2370  // total # packages) are at this point - we want to determine that now. We
2371  // only have an upper bound on the first two figures.
2372  //
2373  // We also perform a consistency check at this point: the values returned by
2374  // the cpuid instruction for any thread bound to a given package had better
2375  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
2376  nPackages = 1;
2377  nCoresPerPkg = 1;
2378  __kmp_nThreadsPerCore = 1;
2379  unsigned nCores = 1;
2380 
2381  unsigned pkgCt = 1; // to determine radii
2382  unsigned lastPkgId = threadInfo[0].pkgId;
2383  unsigned coreCt = 1;
2384  unsigned lastCoreId = threadInfo[0].coreId;
2385  unsigned threadCt = 1;
2386  unsigned lastThreadId = threadInfo[0].threadId;
2387 
2388  // intra-pkg consistency checks
2389  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
2390  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
2391 
2392  for (i = 1; i < nApics; i++) {
2393  if (threadInfo[i].pkgId != lastPkgId) {
2394  nCores++;
2395  pkgCt++;
2396  lastPkgId = threadInfo[i].pkgId;
2397  if ((int)coreCt > nCoresPerPkg)
2398  nCoresPerPkg = coreCt;
2399  coreCt = 1;
2400  lastCoreId = threadInfo[i].coreId;
2401  if ((int)threadCt > __kmp_nThreadsPerCore)
2402  __kmp_nThreadsPerCore = threadCt;
2403  threadCt = 1;
2404  lastThreadId = threadInfo[i].threadId;
2405 
2406  // This is a different package, so go on to the next iteration without
2407  // doing any consistency checks. Reset the consistency check vars, though.
2408  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
2409  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
2410  continue;
2411  }
2412 
2413  if (threadInfo[i].coreId != lastCoreId) {
2414  nCores++;
2415  coreCt++;
2416  lastCoreId = threadInfo[i].coreId;
2417  if ((int)threadCt > __kmp_nThreadsPerCore)
2418  __kmp_nThreadsPerCore = threadCt;
2419  threadCt = 1;
2420  lastThreadId = threadInfo[i].threadId;
2421  } else if (threadInfo[i].threadId != lastThreadId) {
2422  threadCt++;
2423  lastThreadId = threadInfo[i].threadId;
2424  } else {
2425  __kmp_free(threadInfo);
2426  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2427  return false;
2428  }
2429 
2430  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
2431  // fields agree between all the threads bound to a given package.
2432  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
2433  (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
2434  __kmp_free(threadInfo);
2435  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
2436  return false;
2437  }
2438  }
2439  // When affinity is off, this routine will still be called to set
2440  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2441  // Make sure all these vars are set correctly
2442  nPackages = pkgCt;
2443  if ((int)coreCt > nCoresPerPkg)
2444  nCoresPerPkg = coreCt;
2445  if ((int)threadCt > __kmp_nThreadsPerCore)
2446  __kmp_nThreadsPerCore = threadCt;
2447  __kmp_ncores = nCores;
2448  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
2449 
2450  // Now that we've determined the number of packages, the number of cores per
2451  // package, and the number of threads per core, we can construct the data
2452  // structure that is to be returned.
2453  int idx = 0;
2454  int pkgLevel = 0;
2455  int coreLevel = 1;
2456  int threadLevel = 2;
2457  //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2458  int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
2459  kmp_hw_t types[3];
2460  if (pkgLevel >= 0)
2461  types[idx++] = KMP_HW_SOCKET;
2462  if (coreLevel >= 0)
2463  types[idx++] = KMP_HW_CORE;
2464  if (threadLevel >= 0)
2465  types[idx++] = KMP_HW_THREAD;
2466 
2467  KMP_ASSERT(depth > 0);
2468  __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
2469 
2470  for (i = 0; i < nApics; ++i) {
2471  idx = 0;
2472  unsigned os = threadInfo[i].osId;
2473  kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2474  hw_thread.clear();
2475 
2476  if (pkgLevel >= 0) {
2477  hw_thread.ids[idx++] = threadInfo[i].pkgId;
2478  }
2479  if (coreLevel >= 0) {
2480  hw_thread.ids[idx++] = threadInfo[i].coreId;
2481  }
2482  if (threadLevel >= 0) {
2483  hw_thread.ids[idx++] = threadInfo[i].threadId;
2484  }
2485  hw_thread.os_id = os;
2486  }
2487 
2488  __kmp_free(threadInfo);
2489  __kmp_topology->sort_ids();
2490  if (!__kmp_topology->check_ids()) {
2491  kmp_topology_t::deallocate(__kmp_topology);
2492  __kmp_topology = nullptr;
2493  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2494  return false;
2495  }
2496  return true;
2497 }
2498 
2499 // Hybrid cpu detection using CPUID.1A
2500 // Thread should be pinned to processor already
2501 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
2502  unsigned *native_model_id) {
2503  kmp_cpuid buf;
2504  __kmp_x86_cpuid(0x1a, 0, &buf);
2505  *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
2506  switch (*type) {
2507  case KMP_HW_CORE_TYPE_ATOM:
2508  *efficiency = 0;
2509  break;
2510  case KMP_HW_CORE_TYPE_CORE:
2511  *efficiency = 1;
2512  break;
2513  default:
2514  *efficiency = 0;
2515  }
2516  *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
2517 }
2518 
2519 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
2520 // architectures support a newer interface for specifying the x2APIC Ids,
2521 // based on CPUID.B or CPUID.1F
2522 /*
2523  * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
2524     Bits            Bits            Bits           Bits
2525     31-16           15-8            7-4            4-0
2526 ---+-----------+--------------+-------------+-----------------+
2527 EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
2528 ---+-----------|--------------+-------------+-----------------|
2529 EBX| reserved  | Num logical processors at level (16 bits)    |
2530 ---+-----------|--------------+-------------------------------|
2531 ECX| reserved  |   Level Type |      Level Number (8 bits)    |
2532 ---+-----------+--------------+-------------------------------|
2533 EDX|                    X2APIC ID (32 bits)                   |
2534 ---+----------------------------------------------------------+
2535 */
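// To make the encoding concrete (illustrative numbers only): if sub-leaf 0
// (SMT level) reports a shift of 1 and sub-leaf 1 (core level) reports a
// shift of 5, then for a given x2APIC id the thread id is bit [0], the
// core id is bits [4:1], and the package id is bits [31:5].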
2536 
2537 enum {
2538  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
2539  INTEL_LEVEL_TYPE_SMT = 1,
2540  INTEL_LEVEL_TYPE_CORE = 2,
2541  INTEL_LEVEL_TYPE_MODULE = 3,
2542  INTEL_LEVEL_TYPE_TILE = 4,
2543  INTEL_LEVEL_TYPE_DIE = 5,
2544  INTEL_LEVEL_TYPE_LAST = 6,
2545 };
2546 
2547 struct cpuid_level_info_t {
2548  unsigned level_type, mask, mask_width, nitems, cache_mask;
2549 };
2550 
2551 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
2552  switch (intel_type) {
2553  case INTEL_LEVEL_TYPE_INVALID:
2554  return KMP_HW_SOCKET;
2555  case INTEL_LEVEL_TYPE_SMT:
2556  return KMP_HW_THREAD;
2557  case INTEL_LEVEL_TYPE_CORE:
2558  return KMP_HW_CORE;
2559  case INTEL_LEVEL_TYPE_TILE:
2560  return KMP_HW_TILE;
2561  case INTEL_LEVEL_TYPE_MODULE:
2562  return KMP_HW_MODULE;
2563  case INTEL_LEVEL_TYPE_DIE:
2564  return KMP_HW_DIE;
2565  }
2566  return KMP_HW_UNKNOWN;
2567 }
2568 
2569 // This function takes the topology leaf, a levels array to store the levels
2570 // detected and a bitmap of the known levels.
2571 // Returns the number of levels in the topology
2572 static unsigned
2573 __kmp_x2apicid_get_levels(int leaf,
2574  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
2575  kmp_uint64 known_levels) {
2576  unsigned level, levels_index;
2577  unsigned level_type, mask_width, nitems;
2578  kmp_cpuid buf;
2579 
2580  // New algorithm has known topology layers act as highest unknown topology
2581  // layers when unknown topology layers exist.
2582  // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z>
2583  // are unknown topology layers, then SMT will take the characteristics of
2584  // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>).
2585  // This eliminates unknown portions of the topology while still keeping the
2586  // correct structure.
2587  level = levels_index = 0;
2588  do {
2589  __kmp_x86_cpuid(leaf, level, &buf);
2590  level_type = __kmp_extract_bits<8, 15>(buf.ecx);
2591  mask_width = __kmp_extract_bits<0, 4>(buf.eax);
2592  nitems = __kmp_extract_bits<0, 15>(buf.ebx);
2593  if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
2594  return 0;
2595 
2596  if (known_levels & (1ull << level_type)) {
2597  // Add a new level to the topology
2598  KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
2599  levels[levels_index].level_type = level_type;
2600  levels[levels_index].mask_width = mask_width;
2601  levels[levels_index].nitems = nitems;
2602  levels_index++;
2603  } else {
2604  // If it is an unknown level, then logically move the previous layer up
2605  if (levels_index > 0) {
2606  levels[levels_index - 1].mask_width = mask_width;
2607  levels[levels_index - 1].nitems = nitems;
2608  }
2609  }
2610  level++;
2611  } while (level_type != INTEL_LEVEL_TYPE_INVALID);
2612 
2613  // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
2614  if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
2615  return 0;
2616 
2617  // Set the masks to & with apicid
2618  for (unsigned i = 0; i < levels_index; ++i) {
2619  if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
2620  levels[i].mask = ~((-1) << levels[i].mask_width);
2621  levels[i].cache_mask = (-1) << levels[i].mask_width;
2622  for (unsigned j = 0; j < i; ++j)
2623  levels[i].mask ^= levels[j].mask;
2624  } else {
2625  KMP_DEBUG_ASSERT(i > 0);
2626  levels[i].mask = (-1) << levels[i - 1].mask_width;
2627  levels[i].cache_mask = 0;
2628  }
2629  }
2630  return levels_index;
2631 }
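// Example of the mask computation above (assuming SMT mask_width == 1 and
// CORE mask_width == 5): levels[SMT].mask == 0x1, levels[CORE].mask ==
// 0x1F ^ 0x1 == 0x1E, and the package (INVALID) level gets mask ==
// (-1) << 5, so (apic_id & mask) isolates each level's id bits.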
2632 
2633 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
2634 
2635  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
2636  kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
2637  unsigned levels_index;
2638  kmp_cpuid buf;
2639  kmp_uint64 known_levels;
2640  int topology_leaf, highest_leaf, apic_id;
2641  int num_leaves;
2642  static int leaves[] = {0, 0};
2643 
2644  kmp_i18n_id_t leaf_message_id;
2645 
2646  KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
2647 
2648  *msg_id = kmp_i18n_null;
2649  if (__kmp_affinity.flags.verbose) {
2650  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
2651  }
2652 
2653  // Figure out the known topology levels
2654  known_levels = 0ull;
2655  for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
2656  if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
2657  known_levels |= (1ull << i);
2658  }
2659  }
2660 
2661  // Get the highest cpuid leaf supported
2662  __kmp_x86_cpuid(0, 0, &buf);
2663  highest_leaf = buf.eax;
2664 
2665  // If a specific topology method was requested, only allow that specific leaf
2666  // otherwise, try both leaves 31 and 11 in that order
2667  num_leaves = 0;
2668  if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
2669  num_leaves = 1;
2670  leaves[0] = 11;
2671  leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2672  } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
2673  num_leaves = 1;
2674  leaves[0] = 31;
2675  leaf_message_id = kmp_i18n_str_NoLeaf31Support;
2676  } else {
2677  num_leaves = 2;
2678  leaves[0] = 31;
2679  leaves[1] = 11;
2680  leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2681  }
2682 
2683  // Check to see if cpuid leaf 31 or 11 is supported.
2684  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2685  topology_leaf = -1;
2686  for (int i = 0; i < num_leaves; ++i) {
2687  int leaf = leaves[i];
2688  if (highest_leaf < leaf)
2689  continue;
2690  __kmp_x86_cpuid(leaf, 0, &buf);
2691  if (buf.ebx == 0)
2692  continue;
2693  topology_leaf = leaf;
2694  levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
2695  if (levels_index == 0)
2696  continue;
2697  break;
2698  }
2699  if (topology_leaf == -1 || levels_index == 0) {
2700  *msg_id = leaf_message_id;
2701  return false;
2702  }
2703  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
2704 
2705  // The algorithm used starts by setting the affinity to each available thread
2706  // and retrieving info from the cpuid instruction, so if we are not capable of
2707  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
2708  // we need to do something else - use the defaults that we calculated from
2709  // issuing cpuid without binding to each proc.
2710  if (!KMP_AFFINITY_CAPABLE()) {
2711  // Hack to try and infer the machine topology using only the data
2712  // available from cpuid on the current thread, and __kmp_xproc.
2713  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2714  for (unsigned i = 0; i < levels_index; ++i) {
2715  if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
2716  __kmp_nThreadsPerCore = levels[i].nitems;
2717  } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
2718  nCoresPerPkg = levels[i].nitems;
2719  }
2720  }
2721  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
2722  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2723  return true;
2724  }
2725 
2726  // Allocate the data structure to be returned.
2727  int depth = levels_index;
2728  for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2729  types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
2730  __kmp_topology =
2731  kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
2732 
2733  // Insert equivalent cache types if they exist
2734  kmp_cache_info_t cache_info;
2735  for (size_t i = 0; i < cache_info.get_depth(); ++i) {
2736  const kmp_cache_info_t::info_t &info = cache_info[i];
2737  unsigned cache_mask = info.mask;
2738  unsigned cache_level = info.level;
2739  for (unsigned j = 0; j < levels_index; ++j) {
2740  unsigned hw_cache_mask = levels[j].cache_mask;
2741  kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2742  if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2743  kmp_hw_t type =
2744  __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2745  __kmp_topology->set_equivalent_type(cache_type, type);
2746  }
2747  }
2748  }
2749 
2750  // From here on, we can assume that it is safe to call
2751  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2752  // __kmp_affinity.type = affinity_none.
2753 
2754  // Save the affinity mask for the current thread.
2755  kmp_affinity_raii_t previous_affinity;
2756 
2757  // Run through each of the available contexts, binding the current thread
2758  // to it, and obtaining the pertinent information using the cpuid instr.
2759  unsigned int proc;
2760  int hw_thread_index = 0;
2761  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2762  cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2763  unsigned my_levels_index;
2764 
2765  // Skip this proc if it is not included in the machine model.
2766  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2767  continue;
2768  }
2769  KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2770 
2771  __kmp_affinity_dispatch->bind_thread(proc);
2772 
2773  // New algorithm
2774  __kmp_x86_cpuid(topology_leaf, 0, &buf);
2775  apic_id = buf.edx;
2776  kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2777  my_levels_index =
2778  __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2779  if (my_levels_index == 0 || my_levels_index != levels_index) {
2780  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2781  return false;
2782  }
2783  hw_thread.clear();
2784  hw_thread.os_id = proc;
2785  // Put in topology information
2786  for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2787  hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2788  if (j > 0) {
2789  hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2790  }
2791  }
2792  // Hybrid information
2793  if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
2794  kmp_hw_core_type_t type;
2795  unsigned native_model_id;
2796  int efficiency;
2797  __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
2798  hw_thread.attrs.set_core_type(type);
2799  hw_thread.attrs.set_core_eff(efficiency);
2800  }
2801  hw_thread_index++;
2802  }
2803  KMP_ASSERT(hw_thread_index > 0);
2804  __kmp_topology->sort_ids();
2805  if (!__kmp_topology->check_ids()) {
2806  kmp_topology_t::deallocate(__kmp_topology);
2807  __kmp_topology = nullptr;
2808  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2809  return false;
2810  }
2811  return true;
2812 }
2813 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2814 
2815 #define osIdIndex 0
2816 #define threadIdIndex 1
2817 #define coreIdIndex 2
2818 #define pkgIdIndex 3
2819 #define nodeIdIndex 4
2820 
2821 typedef unsigned *ProcCpuInfo;
2822 static unsigned maxIndex = pkgIdIndex;
2823 
2824 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2825  const void *b) {
2826  unsigned i;
2827  const unsigned *aa = *(unsigned *const *)a;
2828  const unsigned *bb = *(unsigned *const *)b;
2829  for (i = maxIndex;; i--) {
2830  if (aa[i] < bb[i])
2831  return -1;
2832  if (aa[i] > bb[i])
2833  return 1;
2834  if (i == osIdIndex)
2835  break;
2836  }
2837  return 0;
2838 }
2839 
2840 #if KMP_USE_HIER_SCHED
2841 // Set the array sizes for the hierarchy layers
2842 static void __kmp_dispatch_set_hierarchy_values() {
2843  // Set the maximum number of L1's to number of cores
2844  // Set the maximum number of L2's to either number of cores / 2 for
2845  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
2846  // Or the number of cores for Intel(R) Xeon(R) processors
2847  // Set the maximum number of NUMA nodes and L3's to number of packages
2848  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2849  nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2850  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2851 #if KMP_ARCH_X86_64 && \
2852  (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
2853  KMP_OS_WINDOWS) && \
2854  KMP_MIC_SUPPORTED
2855  if (__kmp_mic_type >= mic3)
2856  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2857  else
2858 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
2859  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2860  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2861  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2862  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2863  // Set the number of threads per unit
2864  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2865  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2866  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2867  __kmp_nThreadsPerCore;
2868 #if KMP_ARCH_X86_64 && \
2869  (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
2870  KMP_OS_WINDOWS) && \
2871  KMP_MIC_SUPPORTED
2872  if (__kmp_mic_type >= mic3)
2873  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2874  2 * __kmp_nThreadsPerCore;
2875  else
2876 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
2877  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2878  __kmp_nThreadsPerCore;
2879  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2880  nCoresPerPkg * __kmp_nThreadsPerCore;
2881  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2882  nCoresPerPkg * __kmp_nThreadsPerCore;
2883  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2884  nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2885 }
2886 
2887 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2888 // i.e., this thread's L1 or this thread's L2, etc.
2889 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2890  int index = type + 1;
2891  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2892  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2893  if (type == kmp_hier_layer_e::LAYER_THREAD)
2894  return tid;
2895  else if (type == kmp_hier_layer_e::LAYER_LOOP)
2896  return 0;
2897  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2898  if (tid >= num_hw_threads)
2899  tid = tid % num_hw_threads;
2900  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2901 }
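// Illustrative example: on a machine with 2 packages x 8 cores x 2 threads,
// __kmp_hier_threads_per[LAYER_L1 + 1] == 2 and
// __kmp_hier_max_units[LAYER_L1 + 1] == 16, so tid 5 maps to L1 index
// (5 / 2) % 16 == 2.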
2902 
2903 // Return the number of t1's per t2
2904 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2905  int i1 = t1 + 1;
2906  int i2 = t2 + 1;
2907  KMP_DEBUG_ASSERT(i1 <= i2);
2908  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2909  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2910  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2911  // (nthreads/t2) / (nthreads/t1) = t1 / t2
2912  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2913 }
2914 #endif // KMP_USE_HIER_SCHED
2915 
2916 static inline const char *__kmp_cpuinfo_get_filename() {
2917  const char *filename;
2918  if (__kmp_cpuinfo_file != nullptr)
2919  filename = __kmp_cpuinfo_file;
2920  else
2921  filename = "/proc/cpuinfo";
2922  return filename;
2923 }
2924 
2925 static inline const char *__kmp_cpuinfo_get_envvar() {
2926  const char *envvar = nullptr;
2927  if (__kmp_cpuinfo_file != nullptr)
2928  envvar = "KMP_CPUINFO_FILE";
2929  return envvar;
2930 }
2931 
2932 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2933 // affinity map. On AIX, the map is obtained through system SRAD (Scheduler
2934 // Resource Allocation Domain).
2935 static bool __kmp_affinity_create_cpuinfo_map(int *line,
2936  kmp_i18n_id_t *const msg_id) {
2937  *msg_id = kmp_i18n_null;
2938 
2939 #if KMP_OS_AIX
2940  unsigned num_records = __kmp_xproc;
2941 #else
2942  const char *filename = __kmp_cpuinfo_get_filename();
2943  const char *envvar = __kmp_cpuinfo_get_envvar();
2944 
2945  if (__kmp_affinity.flags.verbose) {
2946  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2947  }
2948 
2949  kmp_safe_raii_file_t f(filename, "r", envvar);
2950 
2951  // Scan the file once, counting the number of "processor" (osId) fields,
2952  // and find the highest value of <n> for a node_<n> field.
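  // A typical (x86 Linux) record looks roughly like:
  //   processor   : 0
  //   physical id : 0
  //   core id     : 0
  //   ...
  // followed by a blank line; the exact set of fields varies by architecture,
  // which is why only the fields named below are parsed.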
2953  char buf[256];
2954  unsigned num_records = 0;
2955  while (!feof(f)) {
2956  buf[sizeof(buf) - 1] = 1;
2957  if (!fgets(buf, sizeof(buf), f)) {
2958  // Read errors presumably because of EOF
2959  break;
2960  }
2961 
2962  char s1[] = "processor";
2963  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2964  num_records++;
2965  continue;
2966  }
2967 
2968  // FIXME - this will match "node_<n> <garbage>"
2969  unsigned level;
2970  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2971  // validate the input first:
2972  if (level > (unsigned)__kmp_xproc) { // level is too big
2973  level = __kmp_xproc;
2974  }
2975  if (nodeIdIndex + level >= maxIndex) {
2976  maxIndex = nodeIdIndex + level;
2977  }
2978  continue;
2979  }
2980  }
2981 
2982  // Check for empty file / no valid processor records, or too many. The number
2983  // of records can't exceed the number of valid bits in the affinity mask.
2984  if (num_records == 0) {
2985  *msg_id = kmp_i18n_str_NoProcRecords;
2986  return false;
2987  }
2988  if (num_records > (unsigned)__kmp_xproc) {
2989  *msg_id = kmp_i18n_str_TooManyProcRecords;
2990  return false;
2991  }
2992 
2993  // Set the file pointer back to the beginning, so that we can scan the file
2994  // again, this time performing a full parse of the data. Allocate a vector of
2995  // ProcCpuInfo object, where we will place the data. Adding an extra element
2996  // at the end allows us to remove a lot of extra checks for termination
2997  // conditions.
2998  if (fseek(f, 0, SEEK_SET) != 0) {
2999  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
3000  return false;
3001  }
3002 #endif // KMP_OS_AIX
3003 
3004  // Allocate the array of records to store the proc info in. The dummy
3005  // element at the end makes the logic in filling them out easier to code.
3006  unsigned **threadInfo =
3007  (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
3008  unsigned i;
3009  for (i = 0; i <= num_records; i++) {
3010  threadInfo[i] =
3011  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3012  }
3013 
3014 #define CLEANUP_THREAD_INFO \
3015  for (i = 0; i <= num_records; i++) { \
3016  __kmp_free(threadInfo[i]); \
3017  } \
3018  __kmp_free(threadInfo);
3019 
3020  // A value of UINT_MAX means that we didn't find the field
3021  unsigned __index;
3022 
3023 #define INIT_PROC_INFO(p) \
3024  for (__index = 0; __index <= maxIndex; __index++) { \
3025  (p)[__index] = UINT_MAX; \
3026  }
3027 
3028  for (i = 0; i <= num_records; i++) {
3029  INIT_PROC_INFO(threadInfo[i]);
3030  }
3031 
3032 #if KMP_OS_AIX
3033  int smt_threads;
3034  lpar_info_format1_t cpuinfo;
3035  unsigned num_avail = __kmp_xproc;
3036 
3037  if (__kmp_affinity.flags.verbose)
3038  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
3039 
3040  // Get the number of SMT threads per core.
3041  smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL);
3042 
3043  // Allocate a resource set containing available system resources.
3044  rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
3045  if (sys_rset == NULL) {
3046  CLEANUP_THREAD_INFO;
3047  *msg_id = kmp_i18n_str_UnknownTopology;
3048  return false;
3049  }
3050  // Allocate a resource set for the SRAD info.
3051  rsethandle_t srad = rs_alloc(RS_EMPTY);
3052  if (srad == NULL) {
3053  rs_free(sys_rset);
3054  CLEANUP_THREAD_INFO;
3055  *msg_id = kmp_i18n_str_UnknownTopology;
3056  return false;
3057  }
3058 
3059  // Get the SRAD system detail level.
3060  int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0);
3061  if (sradsdl < 0) {
3062  rs_free(sys_rset);
3063  rs_free(srad);
3064  CLEANUP_THREAD_INFO;
3065  *msg_id = kmp_i18n_str_UnknownTopology;
3066  return false;
3067  }
3068  // Get the number of RADs at that SRAD SDL.
3069  int num_rads = rs_numrads(sys_rset, sradsdl, 0);
3070  if (num_rads < 0) {
3071  rs_free(sys_rset);
3072  rs_free(srad);
3073  CLEANUP_THREAD_INFO;
3074  *msg_id = kmp_i18n_str_UnknownTopology;
3075  return false;
3076  }
3077 
3078  // Get the maximum number of procs that may be contained in a resource set.
3079  int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0);
3080  if (max_procs < 0) {
3081  rs_free(sys_rset);
3082  rs_free(srad);
3083  CLEANUP_THREAD_INFO;
3084  *msg_id = kmp_i18n_str_UnknownTopology;
3085  return false;
3086  }
3087 
3088  int cur_rad = 0;
3089  int num_set = 0;
3090  for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS;
3091  ++srad_idx) {
3092  // Check if the SRAD is available in the RSET.
3093  if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0)
3094  continue;
3095 
3096  for (int cpu = 0; cpu < max_procs; cpu++) {
3097  // Set the info for the cpu if it is in the SRAD.
3098  if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) {
3099  threadInfo[cpu][osIdIndex] = cpu;
3100  threadInfo[cpu][pkgIdIndex] = cur_rad;
3101  threadInfo[cpu][coreIdIndex] = cpu / smt_threads;
3102  ++num_set;
3103  if (num_set >= num_avail) {
3104  // Done if all available CPUs have been set.
3105  break;
3106  }
3107  }
3108  }
3109  ++cur_rad;
3110  }
3111  rs_free(sys_rset);
3112  rs_free(srad);
3113 
3114  // The topology is already sorted.
3115 
3116 #else // !KMP_OS_AIX
3117  unsigned num_avail = 0;
3118  *line = 0;
3119 #if KMP_ARCH_S390X
3120  bool reading_s390x_sys_info = true;
3121 #endif
3122  while (!feof(f)) {
3123  // Create an inner scoping level, so that all the goto targets at the end of
3124  // the loop appear in an outer scoping level. This avoids warnings about
3125  // jumping past an initialization to a target in the same block.
3126  {
3127  buf[sizeof(buf) - 1] = 1;
3128  bool long_line = false;
3129  if (!fgets(buf, sizeof(buf), f)) {
3130  // Read errors presumably because of EOF
3131  // If there is valid data in threadInfo[num_avail], then fake
3132  // a blank line to ensure that the last address gets parsed.
3133  bool valid = false;
3134  for (i = 0; i <= maxIndex; i++) {
3135  if (threadInfo[num_avail][i] != UINT_MAX) {
3136  valid = true;
3137  }
3138  }
3139  if (!valid) {
3140  break;
3141  }
3142  buf[0] = 0;
3143  } else if (!buf[sizeof(buf) - 1]) {
3144  // The line is longer than the buffer. Set a flag and don't
3145  // emit an error if we were going to ignore the line, anyway.
3146  long_line = true;
3147 
3148 #define CHECK_LINE \
3149  if (long_line) { \
3150  CLEANUP_THREAD_INFO; \
3151  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
3152  return false; \
3153  }
3154  }
3155  (*line)++;
3156 
3157 #if KMP_ARCH_LOONGARCH64
3158  // The parsing logic of /proc/cpuinfo in this function highly depends on
3159  // the blank lines between each processor info block. But on LoongArch a
3160  // blank line exists before the first processor info block (i.e. after the
3161  // "system type" line). This blank line was added because the "system
3162  // type" line is unrelated to any of the CPUs. We must skip this line so
3163  // that the original logic works on LoongArch.
3164  if (*buf == '\n' && *line == 2)
3165  continue;
3166 #endif
3167 #if KMP_ARCH_S390X
3168  // s390x /proc/cpuinfo starts with a variable number of lines containing
3169  // the overall system information. Skip them.
3170  if (reading_s390x_sys_info) {
3171  if (*buf == '\n')
3172  reading_s390x_sys_info = false;
3173  continue;
3174  }
3175 #endif
3176 
3177 #if KMP_ARCH_S390X
3178  char s1[] = "cpu number";
3179 #else
3180  char s1[] = "processor";
3181 #endif
3182  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
3183  CHECK_LINE;
3184  char *p = strchr(buf + sizeof(s1) - 1, ':');
3185  unsigned val;
3186  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3187  goto no_val;
3188  if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
3189 #if KMP_ARCH_AARCH64
3190  // Handle the old AArch64 /proc/cpuinfo layout differently,
3191  // it contains all of the 'processor' entries listed in a
3192  // single 'Processor' section, therefore the normal check
3193  // for duplicates in that section will always fail.
3194  num_avail++;
3195 #else
3196  goto dup_field;
3197 #endif
3198  threadInfo[num_avail][osIdIndex] = val;
3199 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3200  char path[256];
3201  KMP_SNPRINTF(
3202  path, sizeof(path),
3203  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
3204  threadInfo[num_avail][osIdIndex]);
3205  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
3206 
3207 #if KMP_ARCH_S390X
3208  // Disambiguate physical_package_id.
3209  unsigned book_id;
3210  KMP_SNPRINTF(path, sizeof(path),
3211  "/sys/devices/system/cpu/cpu%u/topology/book_id",
3212  threadInfo[num_avail][osIdIndex]);
3213  __kmp_read_from_file(path, "%u", &book_id);
3214  threadInfo[num_avail][pkgIdIndex] |= (book_id << 8);
3215 
3216  unsigned drawer_id;
3217  KMP_SNPRINTF(path, sizeof(path),
3218  "/sys/devices/system/cpu/cpu%u/topology/drawer_id",
3219  threadInfo[num_avail][osIdIndex]);
3220  __kmp_read_from_file(path, "%u", &drawer_id);
3221  threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16);
3222 #endif
3223 
3224  KMP_SNPRINTF(path, sizeof(path),
3225  "/sys/devices/system/cpu/cpu%u/topology/core_id",
3226  threadInfo[num_avail][osIdIndex]);
3227  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
3228  continue;
3229 #else
3230  }
3231  char s2[] = "physical id";
3232  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
3233  CHECK_LINE;
3234  char *p = strchr(buf + sizeof(s2) - 1, ':');
3235  unsigned val;
3236  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3237  goto no_val;
3238  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
3239  goto dup_field;
3240  threadInfo[num_avail][pkgIdIndex] = val;
3241  continue;
3242  }
3243  char s3[] = "core id";
3244  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
3245  CHECK_LINE;
3246  char *p = strchr(buf + sizeof(s3) - 1, ':');
3247  unsigned val;
3248  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3249  goto no_val;
3250  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
3251  goto dup_field;
3252  threadInfo[num_avail][coreIdIndex] = val;
3253  continue;
3254 #endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3255  }
3256  char s4[] = "thread id";
3257  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
3258  CHECK_LINE;
3259  char *p = strchr(buf + sizeof(s4) - 1, ':');
3260  unsigned val;
3261  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3262  goto no_val;
3263  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
3264  goto dup_field;
3265  threadInfo[num_avail][threadIdIndex] = val;
3266  continue;
3267  }
3268  unsigned level;
3269  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
3270  CHECK_LINE;
3271  char *p = strchr(buf + sizeof(s4) - 1, ':');
3272  unsigned val;
3273  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3274  goto no_val;
3275  // validate the input before using level:
3276  if (level > (unsigned)__kmp_xproc) { // level is too big
3277  level = __kmp_xproc;
3278  }
3279  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
3280  goto dup_field;
3281  threadInfo[num_avail][nodeIdIndex + level] = val;
3282  continue;
3283  }
3284 
3285  // We didn't recognize the leading token on the line. There are lots of
3286  // leading tokens that we don't recognize - if the line isn't empty, go on
3287  // to the next line.
3288  if ((*buf != 0) && (*buf != '\n')) {
3289  // If the line is longer than the buffer, read characters
3290  // until we find a newline.
3291  if (long_line) {
3292  int ch;
3293  while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
3294  ;
3295  }
3296  continue;
3297  }
3298 
3299  // A newline has signalled the end of the processor record.
3300  // Check that there aren't too many procs specified.
3301  if ((int)num_avail == __kmp_xproc) {
3302  CLEANUP_THREAD_INFO;
3303  *msg_id = kmp_i18n_str_TooManyEntries;
3304  return false;
3305  }
3306 
3307  // Check for missing fields. The osId field must be there, and we
3308  // currently require that the physical id field is specified, also.
3309  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
3310  CLEANUP_THREAD_INFO;
3311  *msg_id = kmp_i18n_str_MissingProcField;
3312  return false;
3313  }
3314  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
3315  CLEANUP_THREAD_INFO;
3316  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
3317  return false;
3318  }
3319 
3320  // Skip this proc if it is not included in the machine model.
3321  if (KMP_AFFINITY_CAPABLE() &&
3322  !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
3323  __kmp_affin_fullMask)) {
3324  INIT_PROC_INFO(threadInfo[num_avail]);
3325  continue;
3326  }
3327 
3328  // We have a successful parse of this proc's info.
3329  // Increment the counter, and prepare for the next proc.
3330  num_avail++;
3331  KMP_ASSERT(num_avail <= num_records);
3332  INIT_PROC_INFO(threadInfo[num_avail]);
3333  }
3334  continue;
3335 
3336  no_val:
3337  CLEANUP_THREAD_INFO;
3338  *msg_id = kmp_i18n_str_MissingValCpuinfo;
3339  return false;
3340 
3341  dup_field:
3342  CLEANUP_THREAD_INFO;
3343  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
3344  return false;
3345  }
3346  *line = 0;
3347 
3348 #if KMP_MIC && REDUCE_TEAM_SIZE
3349  unsigned teamSize = 0;
3350 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3351 
3352  // check for num_records == __kmp_xproc ???
3353 
3354  // If it is configured to omit the package level when there is only a single
3355  // package, the logic at the end of this routine won't work if there is only a
3356  // single thread.
3357  KMP_ASSERT(num_avail > 0);
3358  KMP_ASSERT(num_avail <= num_records);
3359 
3360  // Sort the threadInfo table by physical Id.
3361  qsort(threadInfo, num_avail, sizeof(*threadInfo),
3362  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
3363 
3364 #endif // KMP_OS_AIX
3365 
3366  // The table is now sorted by pkgId / coreId / threadId, but we really don't
3367  // know the radix of any of the fields. pkgId's may be sparsely assigned among
3368  // the chips on a system. Although coreId's are usually assigned
3369  // [0 .. coresPerPkg-1] and threadId's are usually assigned
3370  // [0..threadsPerCore-1], we don't want to make any such assumptions.
3371  //
3372  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
3373  // total # packages) are at this point - we want to determine that now. We
3374  // only have an upper bound on the first two figures.
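  // A hedged illustration (hypothetical machine, not derived from real data):
  // for 2 packages x 2 cores x 2 threads (8 OS procs), the counting pass below
  // ends with maxCt[threadIdIndex] == 2, maxCt[coreIdIndex] == 2,
  // totals[coreIdIndex] == 4 and totals[pkgIdIndex] == 2, which later yields
  // __kmp_nThreadsPerCore = 2, nCoresPerPkg = 2, __kmp_ncores = 4 and
  // nPackages = 2.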
3375  unsigned *counts =
3376  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3377  unsigned *maxCt =
3378  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3379  unsigned *totals =
3380  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3381  unsigned *lastId =
3382  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3383 
3384  bool assign_thread_ids = false;
3385  unsigned threadIdCt;
3386  unsigned index;
3387 
3388 restart_radix_check:
3389  threadIdCt = 0;
3390 
3391  // Initialize the counter arrays with data from threadInfo[0].
3392  if (assign_thread_ids) {
3393  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
3394  threadInfo[0][threadIdIndex] = threadIdCt++;
3395  } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
3396  threadIdCt = threadInfo[0][threadIdIndex] + 1;
3397  }
3398  }
3399  for (index = 0; index <= maxIndex; index++) {
3400  counts[index] = 1;
3401  maxCt[index] = 1;
3402  totals[index] = 1;
3403  lastId[index] = threadInfo[0][index];
3405  }
3406 
3407  // Run through the rest of the OS procs.
3408  for (i = 1; i < num_avail; i++) {
3409  // Find the most significant index whose id differs from the id for the
3410  // previous OS proc.
3411  for (index = maxIndex; index >= threadIdIndex; index--) {
3412  if (assign_thread_ids && (index == threadIdIndex)) {
3413  // Auto-assign the thread id field if it wasn't specified.
3414  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3415  threadInfo[i][threadIdIndex] = threadIdCt++;
3416  }
3417  // Apparently the thread id field was specified for some entries and not
3418  // others. Start the thread id counter off at the next higher thread id.
3419  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3420  threadIdCt = threadInfo[i][threadIdIndex] + 1;
3421  }
3422  }
3423  if (threadInfo[i][index] != lastId[index]) {
3424  // Run through all indices which are less significant, and reset the
3425  // counts to 1. At all levels up to and including index, we need to
3426  // increment the totals and record the last id.
3427  unsigned index2;
3428  for (index2 = threadIdIndex; index2 < index; index2++) {
3429  totals[index2]++;
3430  if (counts[index2] > maxCt[index2]) {
3431  maxCt[index2] = counts[index2];
3432  }
3433  counts[index2] = 1;
3434  lastId[index2] = threadInfo[i][index2];
3435  }
3436  counts[index]++;
3437  totals[index]++;
3438  lastId[index] = threadInfo[i][index];
3439 
3440  if (assign_thread_ids && (index > threadIdIndex)) {
3441 
3442 #if KMP_MIC && REDUCE_TEAM_SIZE
3443  // The default team size is the total #threads in the machine
3444  // minus 1 thread for every core that has 3 or more threads.
3445  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3446 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3447 
3448  // Restart the thread counter, as we are on a new core.
3449  threadIdCt = 0;
3450 
3451  // Auto-assign the thread id field if it wasn't specified.
3452  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3453  threadInfo[i][threadIdIndex] = threadIdCt++;
3454  }
3455 
3456  // Apparently the thread id field was specified for some entries and
3457  // not others. Start the thread id counter off at the next higher
3458  // thread id.
3459  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3460  threadIdCt = threadInfo[i][threadIdIndex] + 1;
3461  }
3462  }
3463  break;
3464  }
3465  }
3466  if (index < threadIdIndex) {
3467  // If thread ids were specified, it is an error if they are not unique.
3468  // Also, check that we haven't already restarted the loop (to be safe -
3469  // shouldn't need to).
3470  if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
3471  __kmp_free(lastId);
3472  __kmp_free(totals);
3473  __kmp_free(maxCt);
3474  __kmp_free(counts);
3475  CLEANUP_THREAD_INFO;
3476  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3477  return false;
3478  }
3479 
3480  // If the thread ids were not specified and we see entries that
3481  // are duplicates, start the loop over and assign the thread ids manually.
3482  assign_thread_ids = true;
3483  goto restart_radix_check;
3484  }
3485  }
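  // For example (hypothetical cpuinfo contents, assumed for illustration): if
  // no "thread id" lines were present and two entries share the same physical
  // id and core id, the scan above falls through with index < threadIdIndex,
  // sets assign_thread_ids, and restarts the radix check so that the siblings
  // are numbered 0, 1, ... within each core.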
3486 
3487 #if KMP_MIC && REDUCE_TEAM_SIZE
3488  // The default team size is the total #threads in the machine
3489  // minus 1 thread for every core that has 3 or more threads.
3490  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3491 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3492 
3493  for (index = threadIdIndex; index <= maxIndex; index++) {
3494  if (counts[index] > maxCt[index]) {
3495  maxCt[index] = counts[index];
3496  }
3497  }
3498 
3499  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
3500  nCoresPerPkg = maxCt[coreIdIndex];
3501  nPackages = totals[pkgIdIndex];
3502 
3503  // When affinity is off, this routine will still be called to set
3504  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
3505  // Make sure all these vars are set correctly, and return now if affinity is
3506  // not enabled.
3507  __kmp_ncores = totals[coreIdIndex];
3508  if (!KMP_AFFINITY_CAPABLE()) {
3509  KMP_ASSERT(__kmp_affinity.type == affinity_none);
3510  return true;
3511  }
3512 
3513 #if KMP_MIC && REDUCE_TEAM_SIZE
3514  // Set the default team size.
3515  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
3516  __kmp_dflt_team_nth = teamSize;
3517  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
3518  "__kmp_dflt_team_nth = %d\n",
3519  __kmp_dflt_team_nth));
3520  }
3521 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3522 
3523  KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
3524 
3525  // Count the number of levels which have more nodes at that level than at the
3526  // parent's level (with there being an implicit root node of the top level).
3527  // This is equivalent to saying that there is at least one node at this level
3528  // which has a sibling. These levels are in the map, and the package level is
3529  // always in the map.
3530  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
3531  for (index = threadIdIndex; index < maxIndex; index++) {
3532  KMP_ASSERT(totals[index] >= totals[index + 1]);
3533  inMap[index] = (totals[index] > totals[index + 1]);
3534  }
3535  inMap[maxIndex] = (totals[maxIndex] > 1);
3536  inMap[pkgIdIndex] = true;
3537  inMap[coreIdIndex] = true;
3538  inMap[threadIdIndex] = true;
3539 
3540  int depth = 0;
3541  int idx = 0;
3542  kmp_hw_t types[KMP_HW_LAST];
3543  int pkgLevel = -1;
3544  int coreLevel = -1;
3545  int threadLevel = -1;
3546  for (index = threadIdIndex; index <= maxIndex; index++) {
3547  if (inMap[index]) {
3548  depth++;
3549  }
3550  }
3551  if (inMap[pkgIdIndex]) {
3552  pkgLevel = idx;
3553  types[idx++] = KMP_HW_SOCKET;
3554  }
3555  if (inMap[coreIdIndex]) {
3556  coreLevel = idx;
3557  types[idx++] = KMP_HW_CORE;
3558  }
3559  if (inMap[threadIdIndex]) {
3560  threadLevel = idx;
3561  types[idx++] = KMP_HW_THREAD;
3562  }
3563  KMP_ASSERT(depth > 0);
3564 
3565  // Construct the data structure that is to be returned.
3566  __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
3567 
3568  for (i = 0; i < num_avail; ++i) {
3569  unsigned os = threadInfo[i][osIdIndex];
3570  int src_index;
3571  kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3572  hw_thread.clear();
3573  hw_thread.os_id = os;
3574 
3575  idx = 0;
3576  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
3577  if (!inMap[src_index]) {
3578  continue;
3579  }
3580  if (src_index == pkgIdIndex) {
3581  hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
3582  } else if (src_index == coreIdIndex) {
3583  hw_thread.ids[coreLevel] = threadInfo[i][src_index];
3584  } else if (src_index == threadIdIndex) {
3585  hw_thread.ids[threadLevel] = threadInfo[i][src_index];
3586  }
3587  }
3588  }
3589 
3590  __kmp_free(inMap);
3591  __kmp_free(lastId);
3592  __kmp_free(totals);
3593  __kmp_free(maxCt);
3594  __kmp_free(counts);
3595  CLEANUP_THREAD_INFO;
3596  __kmp_topology->sort_ids();
3597  if (!__kmp_topology->check_ids()) {
3598  kmp_topology_t::deallocate(__kmp_topology);
3599  __kmp_topology = nullptr;
3600  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3601  return false;
3602  }
3603  return true;
3604 }
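// A hedged illustration of what the parser above produces (values are
// hypothetical, not from a real machine): a /proc/cpuinfo block such as
//   processor   : 4
//   physical id : 1
//   core id     : 2
// followed by a blank line fills one threadInfo row with osId 4, pkgId 1 and
// coreId 2, which then becomes one kmp_hw_thread_t in __kmp_topology.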
3605 
3606 // Create and return a table of affinity masks, indexed by OS thread ID.
3607 // This routine handles OR'ing together all the affinity masks of threads
3608 // that are sufficiently close, if granularity > fine.
3609 template <typename FindNextFunctionType>
3610 static void __kmp_create_os_id_masks(unsigned *numUnique,
3611  kmp_affinity_t &affinity,
3612  FindNextFunctionType find_next) {
3613  // First form a table of affinity masks in order of OS thread id.
3614  int maxOsId;
3615  int i;
3616  int numAddrs = __kmp_topology->get_num_hw_threads();
3617  int depth = __kmp_topology->get_depth();
3618  const char *env_var = __kmp_get_affinity_env_var(affinity);
3619  KMP_ASSERT(numAddrs);
3620  KMP_ASSERT(depth);
3621 
3622  i = find_next(-1);
3623  // If we could not find a HW thread location with the requested attributes,
3624  // then return and fall back to an incrementing find_next that disregards
  // core attributes.
3625  if (i >= numAddrs)
3626  return;
3627 
3628  maxOsId = 0;
3629  for (i = numAddrs - 1;; --i) {
3630  int osId = __kmp_topology->at(i).os_id;
3631  if (osId > maxOsId) {
3632  maxOsId = osId;
3633  }
3634  if (i == 0)
3635  break;
3636  }
3637  affinity.num_os_id_masks = maxOsId + 1;
3638  KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
3639  KMP_ASSERT(affinity.gran_levels >= 0);
3640  if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
3641  KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
3642  }
3643  if (affinity.gran_levels >= (int)depth) {
3644  KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);
3645  }
3646 
3647  // Run through the table, forming the masks for all threads on each core.
3648  // Threads on the same core will have identical kmp_hw_thread_t objects, not
3649  // considering the last level, which must be the thread id. All threads on a
3650  // core will appear consecutively.
3651  int unique = 0;
3652  int j = 0; // index of 1st thread on core
3653  int leader = 0;
3654  kmp_affin_mask_t *sum;
3655  KMP_CPU_ALLOC_ON_STACK(sum);
3656  KMP_CPU_ZERO(sum);
3657 
3658  i = j = leader = find_next(-1);
3659  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3660  kmp_full_mask_modifier_t full_mask;
3661  for (i = find_next(i); i < numAddrs; i = find_next(i)) {
3662  // If this thread is sufficiently close to the leader (within the
3663  // granularity setting), then set the bit for this os thread in the
3664  // affinity mask for this group, and go on to the next thread.
3665  if (__kmp_topology->is_close(leader, i, affinity)) {
3666  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3667  continue;
3668  }
3669 
3670  // For every thread in this group, copy the mask to the thread's entry in
3671  // the OS Id mask table. Mark the first address as a leader.
3672  for (; j < i; j = find_next(j)) {
3673  int osId = __kmp_topology->at(j).os_id;
3674  KMP_DEBUG_ASSERT(osId <= maxOsId);
3675  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3676  KMP_CPU_COPY(mask, sum);
3677  __kmp_topology->at(j).leader = (j == leader);
3678  }
3679  unique++;
3680 
3681  // Start a new mask.
3682  leader = i;
3683  full_mask.include(sum);
3684  KMP_CPU_ZERO(sum);
3685  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3686  }
3687 
3688  // For every thread in last group, copy the mask to the thread's
3689  // entry in the OS Id mask table.
3690  for (; j < i; j = find_next(j)) {
3691  int osId = __kmp_topology->at(j).os_id;
3692  KMP_DEBUG_ASSERT(osId <= maxOsId);
3693  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3694  KMP_CPU_COPY(mask, sum);
3695  __kmp_topology->at(j).leader = (j == leader);
3696  }
3697  full_mask.include(sum);
3698  unique++;
3699  KMP_CPU_FREE_FROM_STACK(sum);
3700 
3701  // See if the OS Id mask table further restricts or changes the full mask
3702  if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
3703  __kmp_topology->print(env_var);
3704  }
3705 
3706  *numUnique = unique;
3707 }
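// Sketch of the effect above, under an assumed granularity of "core"
// (hypothetical OS ids): if one core's two hardware threads have OS ids 3 and
// 11, both os_id_masks[3] and os_id_masks[11] end up holding the union {3,11},
// so a thread bound through either entry may float between the two siblings.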
3708 
3709 // Stuff for the affinity proclist parsers. It's easier to declare these vars
3710 // as file-static than to try to pass them through the calling sequence of
3711 // the recursive-descent OMP_PLACES parser.
3712 static kmp_affin_mask_t *newMasks;
3713 static int numNewMasks;
3714 static int nextNewMask;
3715 
3716 #define ADD_MASK(_mask) \
3717  { \
3718  if (nextNewMask >= numNewMasks) { \
3719  int i; \
3720  numNewMasks *= 2; \
3721  kmp_affin_mask_t *temp; \
3722  KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
3723  for (i = 0; i < numNewMasks / 2; i++) { \
3724  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \
3725  kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \
3726  KMP_CPU_COPY(dest, src); \
3727  } \
3728  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \
3729  newMasks = temp; \
3730  } \
3731  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
3732  nextNewMask++; \
3733  }
3734 
3735 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
3736  { \
3737  if (((_osId) > _maxOsId) || \
3738  (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
3739  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \
3740  } else { \
3741  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
3742  } \
3743  }
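// Usage sketch (osId 5 is an arbitrary example value): ADD_MASK_OSID(5,
// osId2Mask, maxOsId) appends a copy of OS proc 5's mask to newMasks, doubling
// the newMasks array first if it is full, or emits the AffIgnoreInvalidProcID
// warning and skips the proc if 5 is out of range or not in the mask table.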
3744 
3745 // Re-parse the proclist (for the explicit affinity type), and form the list
3746 // of affinity newMasks indexed by gtid.
3747 static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
3748  int i;
3749  kmp_affin_mask_t **out_masks = &affinity.masks;
3750  unsigned *out_numMasks = &affinity.num_masks;
3751  const char *proclist = affinity.proclist;
3752  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3753  int maxOsId = affinity.num_os_id_masks - 1;
3754  const char *scan = proclist;
3755  const char *next = proclist;
3756 
3757  // We allocate the temporary mask vector dynamically, so that the ADD_MASK
3758  // macro can grow it (by allocating a larger array and copying) as needed.
3759  numNewMasks = 2;
3760  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3761  nextNewMask = 0;
3762  kmp_affin_mask_t *sumMask;
3763  KMP_CPU_ALLOC(sumMask);
3764  int setSize = 0;
3765 
3766  for (;;) {
3767  int start, end, stride;
3768 
3769  SKIP_WS(scan);
3770  next = scan;
3771  if (*next == '\0') {
3772  break;
3773  }
3774 
3775  if (*next == '{') {
3776  int num;
3777  setSize = 0;
3778  next++; // skip '{'
3779  SKIP_WS(next);
3780  scan = next;
3781 
3782  // Read the first integer in the set.
3783  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
3784  SKIP_DIGITS(next);
3785  num = __kmp_str_to_int(scan, *next);
3786  KMP_ASSERT2(num >= 0, "bad explicit proc list");
3787 
3788  // Copy the mask for that osId to the sum (union) mask.
3789  if ((num > maxOsId) ||
3790  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3791  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3792  KMP_CPU_ZERO(sumMask);
3793  } else {
3794  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3795  setSize = 1;
3796  }
3797 
3798  for (;;) {
3799  // Check for end of set.
3800  SKIP_WS(next);
3801  if (*next == '}') {
3802  next++; // skip '}'
3803  break;
3804  }
3805 
3806  // Skip optional comma.
3807  if (*next == ',') {
3808  next++;
3809  }
3810  SKIP_WS(next);
3811 
3812  // Read the next integer in the set.
3813  scan = next;
3814  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3815 
3816  SKIP_DIGITS(next);
3817  num = __kmp_str_to_int(scan, *next);
3818  KMP_ASSERT2(num >= 0, "bad explicit proc list");
3819 
3820  // Add the mask for that osId to the sum mask.
3821  if ((num > maxOsId) ||
3822  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3823  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3824  } else {
3825  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3826  setSize++;
3827  }
3828  }
3829  if (setSize > 0) {
3830  ADD_MASK(sumMask);
3831  }
3832 
3833  SKIP_WS(next);
3834  if (*next == ',') {
3835  next++;
3836  }
3837  scan = next;
3838  continue;
3839  }
3840 
3841  // Read the first integer.
3842  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3843  SKIP_DIGITS(next);
3844  start = __kmp_str_to_int(scan, *next);
3845  KMP_ASSERT2(start >= 0, "bad explicit proc list");
3846  SKIP_WS(next);
3847 
3848  // If this isn't a range, then add a mask to the list and go on.
3849  if (*next != '-') {
3850  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3851 
3852  // Skip optional comma.
3853  if (*next == ',') {
3854  next++;
3855  }
3856  scan = next;
3857  continue;
3858  }
3859 
3860  // This is a range. Skip over the '-' and read in the 2nd int.
3861  next++; // skip '-'
3862  SKIP_WS(next);
3863  scan = next;
3864  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3865  SKIP_DIGITS(next);
3866  end = __kmp_str_to_int(scan, *next);
3867  KMP_ASSERT2(end >= 0, "bad explicit proc list");
3868 
3869  // Check for a stride parameter
3870  stride = 1;
3871  SKIP_WS(next);
3872  if (*next == ':') {
3873  // A stride is specified. Skip over the ':' and read the 3rd int.
3874  int sign = +1;
3875  next++; // skip ':'
3876  SKIP_WS(next);
3877  scan = next;
3878  if (*next == '-') {
3879  sign = -1;
3880  next++;
3881  SKIP_WS(next);
3882  scan = next;
3883  }
3884  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3885  SKIP_DIGITS(next);
3886  stride = __kmp_str_to_int(scan, *next);
3887  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3888  stride *= sign;
3889  }
3890 
3891  // Do some range checks.
3892  KMP_ASSERT2(stride != 0, "bad explicit proc list");
3893  if (stride > 0) {
3894  KMP_ASSERT2(start <= end, "bad explicit proc list");
3895  } else {
3896  KMP_ASSERT2(start >= end, "bad explicit proc list");
3897  }
3898  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3899 
3900  // Add the mask for each OS proc # to the list.
3901  if (stride > 0) {
3902  do {
3903  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3904  start += stride;
3905  } while (start <= end);
3906  } else {
3907  do {
3908  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3909  start += stride;
3910  } while (start >= end);
3911  }
3912 
3913  // Skip optional comma.
3914  SKIP_WS(next);
3915  if (*next == ',') {
3916  next++;
3917  }
3918  scan = next;
3919  }
3920 
3921  *out_numMasks = nextNewMask;
3922  if (nextNewMask == 0) {
3923  *out_masks = NULL;
3924  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3925  return;
3926  }
3927  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3928  for (i = 0; i < nextNewMask; i++) {
3929  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3930  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3931  KMP_CPU_COPY(dest, src);
3932  }
3933  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3934  KMP_CPU_FREE(sumMask);
3935 }
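// Hedged example of the proclist syntax handled above (proc numbers invented):
// a list such as "0,2-6:2,{7,8}" yields the masks {0}, {2}, {4}, {6} and
// {7,8}; the braced set becomes a single place, while the range with stride 2
// expands to one single-proc mask per matching OS proc.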
3936 
3937 /*-----------------------------------------------------------------------------
3938 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3939 places. Again, here is the grammar:
3940 
3941 place_list := place
3942 place_list := place , place_list
3943 place := num
3944 place := place : num
3945 place := place : num : signed
3946 place := { subplace_list }
3947 place := ! place // (lowest priority)
3948 subplace_list := subplace
3949 subplace_list := subplace , subplace_list
3950 subplace := num
3951 subplace := num : num
3952 subplace := num : num : signed
3953 signed := num
3954 signed := + signed
3955 signed := - signed
3956 -----------------------------------------------------------------------------*/
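// A few hedged examples of the grammar above (OS proc numbers are
// hypothetical):
//   "{0,1,2,3},{4,5,6,7}"  two places of four procs each
//   "{0:4}"                subplace num:num, i.e. procs 0-3
//   "{0:4:2}"              subplace num:num:signed, i.e. procs 0,2,4,6
//   "!{0}"                 the complement of proc 0 within the mask space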
3957 static void __kmp_process_subplace_list(const char **scan,
3958  kmp_affinity_t &affinity, int maxOsId,
3959  kmp_affin_mask_t *tempMask,
3960  int *setSize) {
3961  const char *next;
3962  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3963 
3964  for (;;) {
3965  int start, count, stride, i;
3966 
3967  // Read in the starting proc id
3968  SKIP_WS(*scan);
3969  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3970  next = *scan;
3971  SKIP_DIGITS(next);
3972  start = __kmp_str_to_int(*scan, *next);
3973  KMP_ASSERT(start >= 0);
3974  *scan = next;
3975 
3976  // valid follow sets are ',' ':' and '}'
3977  SKIP_WS(*scan);
3978  if (**scan == '}' || **scan == ',') {
3979  if ((start > maxOsId) ||
3980  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3981  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3982  } else {
3983  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3984  (*setSize)++;
3985  }
3986  if (**scan == '}') {
3987  break;
3988  }
3989  (*scan)++; // skip ','
3990  continue;
3991  }
3992  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3993  (*scan)++; // skip ':'
3994 
3995  // Read count parameter
3996  SKIP_WS(*scan);
3997  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3998  next = *scan;
3999  SKIP_DIGITS(next);
4000  count = __kmp_str_to_int(*scan, *next);
4001  KMP_ASSERT(count >= 0);
4002  *scan = next;
4003 
4004  // valid follow sets are ',' ':' and '}'
4005  SKIP_WS(*scan);
4006  if (**scan == '}' || **scan == ',') {
4007  for (i = 0; i < count; i++) {
4008  if ((start > maxOsId) ||
4009  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
4010  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
4011  break; // don't proliferate warnings for large count
4012  } else {
4013  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
4014  start++;
4015  (*setSize)++;
4016  }
4017  }
4018  if (**scan == '}') {
4019  break;
4020  }
4021  (*scan)++; // skip ','
4022  continue;
4023  }
4024  KMP_ASSERT2(**scan == ':', "bad explicit places list");
4025  (*scan)++; // skip ':'
4026 
4027  // Read stride parameter
4028  int sign = +1;
4029  for (;;) {
4030  SKIP_WS(*scan);
4031  if (**scan == '+') {
4032  (*scan)++; // skip '+'
4033  continue;
4034  }
4035  if (**scan == '-') {
4036  sign *= -1;
4037  (*scan)++; // skip '-'
4038  continue;
4039  }
4040  break;
4041  }
4042  SKIP_WS(*scan);
4043  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
4044  next = *scan;
4045  SKIP_DIGITS(next);
4046  stride = __kmp_str_to_int(*scan, *next);
4047  KMP_ASSERT(stride >= 0);
4048  *scan = next;
4049  stride *= sign;
4050 
4051  // valid follow sets are ',' and '}'
4052  SKIP_WS(*scan);
4053  if (**scan == '}' || **scan == ',') {
4054  for (i = 0; i < count; i++) {
4055  if ((start > maxOsId) ||
4056  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
4057  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
4058  break; // don't proliferate warnings for large count
4059  } else {
4060  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
4061  start += stride;
4062  (*setSize)++;
4063  }
4064  }
4065  if (**scan == '}') {
4066  break;
4067  }
4068  (*scan)++; // skip ','
4069  continue;
4070  }
4071 
4072  KMP_ASSERT2(0, "bad explicit places list");
4073  }
4074 }
4075 
4076 static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
4077  int maxOsId, kmp_affin_mask_t *tempMask,
4078  int *setSize) {
4079  const char *next;
4080  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
4081 
4082  // valid follow sets are '{' '!' and num
4083  SKIP_WS(*scan);
4084  if (**scan == '{') {
4085  (*scan)++; // skip '{'
4086  __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
4087  KMP_ASSERT2(**scan == '}', "bad explicit places list");
4088  (*scan)++; // skip '}'
4089  } else if (**scan == '!') {
4090  (*scan)++; // skip '!'
4091  __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
4092  KMP_CPU_COMPLEMENT(maxOsId, tempMask);
4093  } else if ((**scan >= '0') && (**scan <= '9')) {
4094  next = *scan;
4095  SKIP_DIGITS(next);
4096  int num = __kmp_str_to_int(*scan, *next);
4097  KMP_ASSERT(num >= 0);
4098  if ((num > maxOsId) ||
4099  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
4100  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
4101  } else {
4102  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
4103  (*setSize)++;
4104  }
4105  *scan = next; // skip num
4106  } else {
4107  KMP_ASSERT2(0, "bad explicit places list");
4108  }
4109 }
4110 
4111 // static void
4112 void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
4113  int i, j, count, stride, sign;
4114  kmp_affin_mask_t **out_masks = &affinity.masks;
4115  unsigned *out_numMasks = &affinity.num_masks;
4116  const char *placelist = affinity.proclist;
4117  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
4118  int maxOsId = affinity.num_os_id_masks - 1;
4119  const char *scan = placelist;
4120  const char *next = placelist;
4121 
4122  numNewMasks = 2;
4123  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
4124  nextNewMask = 0;
4125 
4126  // tempMask is modified based on the previous or initial
4127  // place to form the current place
4128  // previousMask contains the previous place
4129  kmp_affin_mask_t *tempMask;
4130  kmp_affin_mask_t *previousMask;
4131  KMP_CPU_ALLOC(tempMask);
4132  KMP_CPU_ZERO(tempMask);
4133  KMP_CPU_ALLOC(previousMask);
4134  KMP_CPU_ZERO(previousMask);
4135  int setSize = 0;
4136 
4137  for (;;) {
4138  __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);
4139 
4140  // valid follow sets are ',' ':' and EOL
4141  SKIP_WS(scan);
4142  if (*scan == '\0' || *scan == ',') {
4143  if (setSize > 0) {
4144  ADD_MASK(tempMask);
4145  }
4146  KMP_CPU_ZERO(tempMask);
4147  setSize = 0;
4148  if (*scan == '\0') {
4149  break;
4150  }
4151  scan++; // skip ','
4152  continue;
4153  }
4154 
4155  KMP_ASSERT2(*scan == ':', "bad explicit places list");
4156  scan++; // skip ':'
4157 
4158  // Read count parameter
4159  SKIP_WS(scan);
4160  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4161  next = scan;
4162  SKIP_DIGITS(next);
4163  count = __kmp_str_to_int(scan, *next);
4164  KMP_ASSERT(count >= 0);
4165  scan = next;
4166 
4167  // valid follow sets are ',' ':' and EOL
4168  SKIP_WS(scan);
4169  if (*scan == '\0' || *scan == ',') {
4170  stride = +1;
4171  } else {
4172  KMP_ASSERT2(*scan == ':', "bad explicit places list");
4173  scan++; // skip ':'
4174 
4175  // Read stride parameter
4176  sign = +1;
4177  for (;;) {
4178  SKIP_WS(scan);
4179  if (*scan == '+') {
4180  scan++; // skip '+'
4181  continue;
4182  }
4183  if (*scan == '-') {
4184  sign *= -1;
4185  scan++; // skip '-'
4186  continue;
4187  }
4188  break;
4189  }
4190  SKIP_WS(scan);
4191  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4192  next = scan;
4193  SKIP_DIGITS(next);
4194  stride = __kmp_str_to_int(scan, *next);
4195  KMP_DEBUG_ASSERT(stride >= 0);
4196  scan = next;
4197  stride *= sign;
4198  }
4199 
4200  // Add places determined by initial_place : count : stride
4201  for (i = 0; i < count; i++) {
4202  if (setSize == 0) {
4203  break;
4204  }
4205  // Add the current place, then build the next place (tempMask) from that
4206  KMP_CPU_COPY(previousMask, tempMask);
4207  ADD_MASK(previousMask);
4208  KMP_CPU_ZERO(tempMask);
4209  setSize = 0;
4210  KMP_CPU_SET_ITERATE(j, previousMask) {
4211  if (!KMP_CPU_ISSET(j, previousMask)) {
4212  continue;
4213  }
4214  if ((j + stride > maxOsId) || (j + stride < 0) ||
4215  (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
4216  (!KMP_CPU_ISSET(j + stride,
4217  KMP_CPU_INDEX(osId2Mask, j + stride)))) {
4218  if (i < count - 1) {
4219  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
4220  }
4221  continue;
4222  }
4223  KMP_CPU_SET(j + stride, tempMask);
4224  setSize++;
4225  }
4226  }
4227  KMP_CPU_ZERO(tempMask);
4228  setSize = 0;
4229 
4230  // valid follow sets are ',' and EOL
4231  SKIP_WS(scan);
4232  if (*scan == '\0') {
4233  break;
4234  }
4235  if (*scan == ',') {
4236  scan++; // skip ','
4237  continue;
4238  }
4239 
4240  KMP_ASSERT2(0, "bad explicit places list");
4241  }
4242 
4243  *out_numMasks = nextNewMask;
4244  if (nextNewMask == 0) {
4245  *out_masks = NULL;
4246  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4247  return;
4248  }
4249  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
4250  KMP_CPU_FREE(tempMask);
4251  KMP_CPU_FREE(previousMask);
4252  for (i = 0; i < nextNewMask; i++) {
4253  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
4254  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
4255  KMP_CPU_COPY(dest, src);
4256  }
4257  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4258 }
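// Illustration of the place : count : stride handling above (numbers are
// hypothetical): OMP_PLACES="{0,1}:4:2" starts from the place {0,1} and builds
// three more places by repeatedly shifting every OS proc by the stride, giving
// {0,1}, {2,3}, {4,5}, {6,7}, provided the shifted procs exist in osId2Mask.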
4259 
4260 #undef ADD_MASK
4261 #undef ADD_MASK_OSID
4262 
4263 // This function figures out the deepest level at which there is at least one
4264 // cluster/core with more than one processing unit bound to it.
4265 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
4266  int core_level = 0;
4267 
4268  for (int i = 0; i < nprocs; i++) {
4269  const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
4270  for (int j = bottom_level; j > 0; j--) {
4271  if (hw_thread.ids[j] > 0) {
4272  if (core_level < (j - 1)) {
4273  core_level = j - 1;
4274  }
4275  }
4276  }
4277  }
4278  return core_level;
4279 }
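// Rough example (assumed 3-level socket/core/thread topology): with
// bottom_level == 2 and at least one core holding two hardware threads, some
// hw_thread.ids[2] is nonzero, so core_level settles at 1 (the core layer);
// with one thread per core it stays at a shallower level instead.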
4280 
4281 // This function counts the number of clusters/cores at a given level.
4282 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
4283  int core_level) {
4284  return __kmp_topology->get_count(core_level);
4285 }
4286 // This function finds to which cluster/core a given processing unit is bound.
4287 static int __kmp_affinity_find_core(int proc, int bottom_level,
4288  int core_level) {
4289  int core = 0;
4290  KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
4291  for (int i = 0; i <= proc; ++i) {
4292  if (i + 1 <= proc) {
4293  for (int j = 0; j <= core_level; ++j) {
4294  if (__kmp_topology->at(i + 1).sub_ids[j] !=
4295  __kmp_topology->at(i).sub_ids[j]) {
4296  core++;
4297  break;
4298  }
4299  }
4300  }
4301  }
4302  return core;
4303 }
4304 
4305 // This function finds the maximal number of processing units bound to a
4306 // cluster/core at a given level.
4307 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
4308  int core_level) {
4309  if (core_level >= bottom_level)
4310  return 1;
4311  int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
4312  return __kmp_topology->calculate_ratio(thread_level, core_level);
4313 }
4314 
4315 static int *procarr = NULL;
4316 static int __kmp_aff_depth = 0;
4317 static int *__kmp_osid_to_hwthread_map = NULL;
4318 
4319 static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
4320  kmp_affinity_ids_t &ids,
4321  kmp_affinity_attrs_t &attrs) {
4322  if (!KMP_AFFINITY_CAPABLE())
4323  return;
4324 
4325  // Initialize ids and attrs thread data
4326  for (int i = 0; i < KMP_HW_LAST; ++i)
4327  ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
4328  attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4329 
4330  // Iterate through each os id within the mask and determine
4331  // the topology id and attribute information
4332  int cpu;
4333  int depth = __kmp_topology->get_depth();
4334  KMP_CPU_SET_ITERATE(cpu, mask) {
4335  int osid_idx = __kmp_osid_to_hwthread_map[cpu];
4336  ids.os_id = cpu;
4337  const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
4338  for (int level = 0; level < depth; ++level) {
4339  kmp_hw_t type = __kmp_topology->get_type(level);
4340  int id = hw_thread.sub_ids[level];
4341  if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
4342  ids.ids[type] = id;
4343  } else {
4344  // This mask spans across multiple topology units, set it as such
4345  // and mark every level below as such as well.
4346  ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4347  for (; level < depth; ++level) {
4348  kmp_hw_t type = __kmp_topology->get_type(level);
4349  ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4350  }
4351  }
4352  }
4353  if (!attrs.valid) {
4354  attrs.core_type = hw_thread.attrs.get_core_type();
4355  attrs.core_eff = hw_thread.attrs.get_core_eff();
4356  attrs.valid = 1;
4357  } else {
4358  // This mask spans across multiple attributes, set it as such
4359  if (attrs.core_type != hw_thread.attrs.get_core_type())
4360  attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
4361  if (attrs.core_eff != hw_thread.attrs.get_core_eff())
4362  attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
4363  }
4364  }
4365 }
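// For instance (hypothetical place): if a mask covers OS procs on two
// different cores of one socket, the socket id is kept but the core and thread
// entries become MULTIPLE_ID, and mixed core types likewise degrade attrs to
// KMP_HW_CORE_TYPE_UNKNOWN / UNKNOWN_CORE_EFF.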
4366 
4367 static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
4368  if (!KMP_AFFINITY_CAPABLE())
4369  return;
4370  const kmp_affin_mask_t *mask = th->th.th_affin_mask;
4371  kmp_affinity_ids_t &ids = th->th.th_topology_ids;
4372  kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
4373  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4374 }
4375 
4376 // Assign the topology information to each place in the place list.
4377 // A thread can then grab not only its affinity mask, but also the topology
4378 // information associated with that mask, e.g., which socket the thread is on.
4379 static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
4380  if (!KMP_AFFINITY_CAPABLE())
4381  return;
4382  if (affinity.type != affinity_none) {
4383  KMP_ASSERT(affinity.num_os_id_masks);
4384  KMP_ASSERT(affinity.os_id_masks);
4385  }
4386  KMP_ASSERT(affinity.num_masks);
4387  KMP_ASSERT(affinity.masks);
4388  KMP_ASSERT(__kmp_affin_fullMask);
4389 
4390  int max_cpu = __kmp_affin_fullMask->get_max_cpu();
4391  int num_hw_threads = __kmp_topology->get_num_hw_threads();
4392 
4393  // Allocate thread topology information
4394  if (!affinity.ids) {
4395  affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
4396  sizeof(kmp_affinity_ids_t) * affinity.num_masks);
4397  }
4398  if (!affinity.attrs) {
4399  affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
4400  sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
4401  }
4402  if (!__kmp_osid_to_hwthread_map) {
4403  // Want the +1 because max_cpu should be a valid index into the map
4404  __kmp_osid_to_hwthread_map =
4405  (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
4406  }
4407 
4408  // Create the OS proc to hardware thread map
4409  for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
4410  int os_id = __kmp_topology->at(hw_thread).os_id;
4411  if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
4412  __kmp_osid_to_hwthread_map[os_id] = hw_thread;
4413  }
4414 
4415  for (unsigned i = 0; i < affinity.num_masks; ++i) {
4416  kmp_affinity_ids_t &ids = affinity.ids[i];
4417  kmp_affinity_attrs_t &attrs = affinity.attrs[i];
4418  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
4419  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4420  }
4421 }
4422 
4423 // Called when __kmp_topology is ready
4424 static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
4425  // Initialize other data structures which depend on the topology
4426  if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
4427  machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
4428  __kmp_affinity_get_topology_info(affinity);
4429 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
4430  __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
4431 #endif
4432  }
4433 }
4434 
4435 // Create a one element mask array (set of places) which only contains the
4436 // initial process's affinity mask
4437 static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
4438  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4439  KMP_ASSERT(affinity.type == affinity_none);
4440  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4441  affinity.num_masks = 1;
4442  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4443  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
4444  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4445  __kmp_aux_affinity_initialize_other_data(affinity);
4446 }
4447 
4448 static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
4449  // Create the "full" mask - this defines all of the processors that we
4450  // consider to be in the machine model. If respect is set, then it is the
4451  // initialization thread's affinity mask. Otherwise, it is all processors that
4452  // we know about on the machine.
4453  int verbose = affinity.flags.verbose;
4454  const char *env_var = affinity.env_var;
4455 
4456  // Already initialized
4457  if (__kmp_affin_fullMask && __kmp_affin_origMask)
4458  return;
4459 
4460  if (__kmp_affin_fullMask == NULL) {
4461  KMP_CPU_ALLOC(__kmp_affin_fullMask);
4462  }
4463  if (__kmp_affin_origMask == NULL) {
4464  KMP_CPU_ALLOC(__kmp_affin_origMask);
4465  }
4466  if (KMP_AFFINITY_CAPABLE()) {
4467  __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4468  // Make a copy before possibly expanding to the entire machine mask
4469  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4470  if (affinity.flags.respect) {
4471  // Count the number of available processors.
4472  unsigned i;
4473  __kmp_avail_proc = 0;
4474  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4475  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4476  continue;
4477  }
4478  __kmp_avail_proc++;
4479  }
4480  if (__kmp_avail_proc > __kmp_xproc) {
4481  KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4482  affinity.type = affinity_none;
4483  KMP_AFFINITY_DISABLE();
4484  return;
4485  }
4486 
4487  if (verbose) {
4488  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4489  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4490  __kmp_affin_fullMask);
4491  KMP_INFORM(InitOSProcSetRespect, env_var, buf);
4492  }
4493  } else {
4494  if (verbose) {
4495  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4496  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4497  __kmp_affin_fullMask);
4498  KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
4499  }
4500  __kmp_avail_proc =
4501  __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4502 #if KMP_OS_WINDOWS
4503  if (__kmp_num_proc_groups <= 1) {
4504  // Copy expanded full mask if topology has single processor group
4505  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4506  }
4507  // Set the process affinity mask since threads' affinity
4508  // masks must be a subset of the process mask in Windows* OS
4509  __kmp_affin_fullMask->set_process_affinity(true);
4510 #endif
4511  }
4512  }
4513 }
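// Example of the respect flag above (numbers are made up): if the process was
// launched restricted to 4 of 16 logical CPUs, the respect setting (the usual
// default) keeps the 4-CPU mask and __kmp_avail_proc becomes 4, whereas
// norespect widens __kmp_affin_fullMask to the whole machine before topology
// detection.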
4514 
4515 static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
4516  bool success = false;
4517  const char *env_var = affinity.env_var;
4518  kmp_i18n_id_t msg_id = kmp_i18n_null;
4519  int verbose = affinity.flags.verbose;
4520 
4521  // For backward compatibility, setting KMP_CPUINFO_FILE =>
4522  // KMP_TOPOLOGY_METHOD=cpuinfo
4523  if ((__kmp_cpuinfo_file != NULL) &&
4524  (__kmp_affinity_top_method == affinity_top_method_all)) {
4525  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4526  }
4527 
4528  if (__kmp_affinity_top_method == affinity_top_method_all) {
4529 // In the default code path, errors are not fatal - we just try using
4530 // another method. We only emit a warning message if affinity is on, or the
4531 // verbose flag is set, and the nowarnings flag was not set.
4532 #if KMP_USE_HWLOC
4533  if (!success &&
4534  __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4535  if (!__kmp_hwloc_error) {
4536  success = __kmp_affinity_create_hwloc_map(&msg_id);
4537  if (!success && verbose) {
4538  KMP_INFORM(AffIgnoringHwloc, env_var);
4539  }
4540  } else if (verbose) {
4541  KMP_INFORM(AffIgnoringHwloc, env_var);
4542  }
4543  }
4544 #endif
4545 
4546 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4547  if (!success) {
4548  success = __kmp_affinity_create_x2apicid_map(&msg_id);
4549  if (!success && verbose && msg_id != kmp_i18n_null) {
4550  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4551  }
4552  }
4553  if (!success) {
4554  success = __kmp_affinity_create_apicid_map(&msg_id);
4555  if (!success && verbose && msg_id != kmp_i18n_null) {
4556  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4557  }
4558  }
4559 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4560 
4561 #if KMP_OS_LINUX || KMP_OS_AIX
4562  if (!success) {
4563  int line = 0;
4564  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4565  if (!success && verbose && msg_id != kmp_i18n_null) {
4566  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4567  }
4568  }
4569 #endif /* KMP_OS_LINUX || KMP_OS_AIX */
4570 
4571 #if KMP_GROUP_AFFINITY
4572  if (!success && (__kmp_num_proc_groups > 1)) {
4573  success = __kmp_affinity_create_proc_group_map(&msg_id);
4574  if (!success && verbose && msg_id != kmp_i18n_null) {
4575  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4576  }
4577  }
4578 #endif /* KMP_GROUP_AFFINITY */
4579 
4580  if (!success) {
4581  success = __kmp_affinity_create_flat_map(&msg_id);
4582  if (!success && verbose && msg_id != kmp_i18n_null) {
4583  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4584  }
4585  KMP_ASSERT(success);
4586  }
4587  }
4588 
4589 // If the user has specified that a particular topology discovery method is to be
4590 // used, then we abort if that method fails. The exception is group affinity,
4591 // which might have been implicitly set.
4592 #if KMP_USE_HWLOC
4593  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4594  KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4595  success = __kmp_affinity_create_hwloc_map(&msg_id);
4596  if (!success) {
4597  KMP_ASSERT(msg_id != kmp_i18n_null);
4598  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4599  }
4600  }
4601 #endif // KMP_USE_HWLOC
4602 
4603 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4604  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
4605  __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
4606  success = __kmp_affinity_create_x2apicid_map(&msg_id);
4607  if (!success) {
4608  KMP_ASSERT(msg_id != kmp_i18n_null);
4609  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4610  }
4611  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4612  success = __kmp_affinity_create_apicid_map(&msg_id);
4613  if (!success) {
4614  KMP_ASSERT(msg_id != kmp_i18n_null);
4615  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4616  }
4617  }
4618 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4619 
4620  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4621  int line = 0;
4622  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4623  if (!success) {
4624  KMP_ASSERT(msg_id != kmp_i18n_null);
4625  const char *filename = __kmp_cpuinfo_get_filename();
4626  if (line > 0) {
4627  KMP_FATAL(FileLineMsgExiting, filename, line,
4628  __kmp_i18n_catgets(msg_id));
4629  } else {
4630  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4631  }
4632  }
4633  }
4634 
4635 #if KMP_GROUP_AFFINITY
4636  else if (__kmp_affinity_top_method == affinity_top_method_group) {
4637  success = __kmp_affinity_create_proc_group_map(&msg_id);
4638  KMP_ASSERT(success);
4639  if (!success) {
4640  KMP_ASSERT(msg_id != kmp_i18n_null);
4641  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4642  }
4643  }
4644 #endif /* KMP_GROUP_AFFINITY */
4645 
4646  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4647  success = __kmp_affinity_create_flat_map(&msg_id);
4648  // should not fail
4649  KMP_ASSERT(success);
4650  }
4651 
4652  // Early exit if topology could not be created
4653  if (!__kmp_topology) {
4654  if (KMP_AFFINITY_CAPABLE()) {
4655  KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4656  }
4657  if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
4658  __kmp_ncores > 0) {
4659  __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
4660  __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
4661  __kmp_nThreadsPerCore, __kmp_ncores);
4662  if (verbose) {
4663  __kmp_topology->print(env_var);
4664  }
4665  }
4666  return false;
4667  }
4668 
4669  // Canonicalize, print (if requested), apply KMP_HW_SUBSET
4670  __kmp_topology->canonicalize();
4671  if (verbose)
4672  __kmp_topology->print(env_var);
4673  bool filtered = __kmp_topology->filter_hw_subset();
4674  if (filtered && verbose)
4675  __kmp_topology->print("KMP_HW_SUBSET");
4676  return success;
4677 }
4678 
4679 static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
4680  bool is_regular_affinity = (&affinity == &__kmp_affinity);
4681  bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
4682  const char *env_var = __kmp_get_affinity_env_var(affinity);
4683 
4684  if (affinity.flags.initialized) {
4685  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4686  return;
4687  }
4688 
4689  if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
4690  __kmp_aux_affinity_initialize_masks(affinity);
4691 
4692  if (is_regular_affinity && !__kmp_topology) {
4693  bool success = __kmp_aux_affinity_initialize_topology(affinity);
4694  if (success) {
4695  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4696  } else {
4697  affinity.type = affinity_none;
4698  KMP_AFFINITY_DISABLE();
4699  }
4700  }
4701 
4702  // If KMP_AFFINITY=none, then only create the single "none" place
4703  // which is the process's initial affinity mask or the number of
4704  // hardware threads depending on respect/norespect.
4705  if (affinity.type == affinity_none) {
4706  __kmp_create_affinity_none_places(affinity);
4707 #if KMP_USE_HIER_SCHED
4708  __kmp_dispatch_set_hierarchy_values();
4709 #endif
4710  affinity.flags.initialized = TRUE;
4711  return;
4712  }
4713 
4714  __kmp_topology->set_granularity(affinity);
4715  int depth = __kmp_topology->get_depth();
4716 
4717  // Create the table of masks, indexed by thread Id.
4718  unsigned numUnique;
4719  int numAddrs = __kmp_topology->get_num_hw_threads();
4720  // If OMP_PLACES=cores:<attribute> is specified, then attempt
4721  // to make the OS Id mask table using those attributes.
4722  if (affinity.core_attr_gran.valid) {
4723  __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
4724  KMP_ASSERT(idx >= -1);
4725  for (int i = idx + 1; i < numAddrs; ++i)
4726  if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
4727  return i;
4728  return numAddrs;
4729  });
4730  if (!affinity.os_id_masks) {
4731  const char *core_attribute;
4732  if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
4733  core_attribute = "core_efficiency";
4734  else
4735  core_attribute = "core_type";
4736  KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
4737  core_attribute,
4738  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true))
4739  }
4740  }
4741  // If core attributes did not work, or none were specified,
4742  // then make the OS Id mask table in the typical incremental way.
4743  if (!affinity.os_id_masks) {
4744  __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
4745  KMP_ASSERT(idx >= -1);
4746  return idx + 1;
4747  });
4748  }
4749  if (affinity.gran_levels == 0) {
4750  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4751  }
4752 
4753  switch (affinity.type) {
4754 
4755  case affinity_explicit:
4756  KMP_DEBUG_ASSERT(affinity.proclist != NULL);
4757  if (is_hidden_helper_affinity ||
4758  __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
4759  __kmp_affinity_process_proclist(affinity);
4760  } else {
4761  __kmp_affinity_process_placelist(affinity);
4762  }
4763  if (affinity.num_masks == 0) {
4764  KMP_AFF_WARNING(affinity, AffNoValidProcID);
4765  affinity.type = affinity_none;
4766  __kmp_create_affinity_none_places(affinity);
4767  affinity.flags.initialized = TRUE;
4768  return;
4769  }
4770  break;
4771 
4772  // The other affinity types rely on sorting the hardware threads according to
4773  // some permutation of the machine topology tree. Set affinity.compact
4774  // and affinity.offset appropriately, then jump to a common code
4775  // fragment to do the sort and create the array of affinity masks.
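  // Worked example (assumed depth of 3: socket/core/thread): "compact" leaves
  // affinity.compact as given (clipped to depth - 1), so consecutive masks
  // fill neighboring hardware threads, while "scatter" maps a value of 0 to
  // depth - 1 - 0 == 2, inverting the sort so that consecutive masks land on
  // different sockets first.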
4776  case affinity_logical:
4777  affinity.compact = 0;
4778  if (affinity.offset) {
4779  affinity.offset =
4780  __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4781  }
4782  goto sortTopology;
4783 
4784  case affinity_physical:
4785  if (__kmp_nThreadsPerCore > 1) {
4786  affinity.compact = 1;
4787  if (affinity.compact >= depth) {
4788  affinity.compact = 0;
4789  }
4790  } else {
4791  affinity.compact = 0;
4792  }
4793  if (affinity.offset) {
4794  affinity.offset =
4795  __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4796  }
4797  goto sortTopology;
4798 
4799  case affinity_scatter:
4800  if (affinity.compact >= depth) {
4801  affinity.compact = 0;
4802  } else {
4803  affinity.compact = depth - 1 - affinity.compact;
4804  }
4805  goto sortTopology;
4806 
4807  case affinity_compact:
4808  if (affinity.compact >= depth) {
4809  affinity.compact = depth - 1;
4810  }
4811  goto sortTopology;
4812 
4813  case affinity_balanced:
4814  if (depth <= 1 || is_hidden_helper_affinity) {
4815  KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4816  affinity.type = affinity_none;
4817  __kmp_create_affinity_none_places(affinity);
4818  affinity.flags.initialized = TRUE;
4819  return;
4820  } else if (!__kmp_topology->is_uniform()) {
4821  // Save the depth for further usage
4822  __kmp_aff_depth = depth;
4823 
4824  int core_level =
4825  __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
4826  int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
4827  core_level);
4828  int maxprocpercore = __kmp_affinity_max_proc_per_core(
4829  __kmp_avail_proc, depth - 1, core_level);
4830 
4831  int nproc = ncores * maxprocpercore;
4832  if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4833  KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4834  affinity.type = affinity_none;
4835  __kmp_create_affinity_none_places(affinity);
4836  affinity.flags.initialized = TRUE;
4837  return;
4838  }
4839 
4840  procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4841  for (int i = 0; i < nproc; i++) {
4842  procarr[i] = -1;
4843  }
4844 
4845  int lastcore = -1;
4846  int inlastcore = 0;
4847  for (int i = 0; i < __kmp_avail_proc; i++) {
4848  int proc = __kmp_topology->at(i).os_id;
4849  int core = __kmp_affinity_find_core(i, depth - 1, core_level);
4850 
4851  if (core == lastcore) {
4852  inlastcore++;
4853  } else {
4854  inlastcore = 0;
4855  }
4856  lastcore = core;
4857 
4858  procarr[core * maxprocpercore + inlastcore] = proc;
4859  }
4860  }
4861  if (affinity.compact >= depth) {
4862  affinity.compact = depth - 1;
4863  }
4864 
4865  sortTopology:
4866  // Allocate the gtid->affinity mask table.
4867  if (affinity.flags.dups) {
4868  affinity.num_masks = __kmp_avail_proc;
4869  } else {
4870  affinity.num_masks = numUnique;
4871  }
4872 
4873  if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4874  (__kmp_affinity_num_places > 0) &&
4875  ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
4876  !is_hidden_helper_affinity) {
4877  affinity.num_masks = __kmp_affinity_num_places;
4878  }
4879 
4880  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4881 
4882  // Sort the topology table according to the current setting of
4883  // affinity.compact, then fill out affinity.masks.
4884  __kmp_topology->sort_compact(affinity);
4885  {
4886  int i;
4887  unsigned j;
4888  int num_hw_threads = __kmp_topology->get_num_hw_threads();
4889  kmp_full_mask_modifier_t full_mask;
4890  for (i = 0, j = 0; i < num_hw_threads; i++) {
4891  if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
4892  continue;
4893  }
4894  int osId = __kmp_topology->at(i).os_id;
4895 
4896  kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
4897  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
4898  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4899  KMP_CPU_COPY(dest, src);
4900  full_mask.include(src);
4901  if (++j >= affinity.num_masks) {
4902  break;
4903  }
4904  }
4905  KMP_DEBUG_ASSERT(j == affinity.num_masks);
4906  // See if the places list further restricts or changes the full mask
4907  if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
4908  __kmp_topology->print(env_var);
4909  }
4910  }
4911  // Sort the topology back using ids
4912  __kmp_topology->sort_ids();
4913  break;
4914 
4915  default:
4916  KMP_ASSERT2(0, "Unexpected affinity setting");
4917  }
4918  __kmp_aux_affinity_initialize_other_data(affinity);
4919  affinity.flags.initialized = TRUE;
4920 }
4921 
4922 void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
4923  // Much of the code above was written assuming that if a machine was not
4924  // affinity capable, then affinity type == affinity_none.
4925  // We now explicitly represent this as affinity type == affinity_disabled.
4926  // There are too many checks for affinity type == affinity_none in this code.
4927  // Instead of trying to change them all, check if
4928  // affinity type == affinity_disabled, and if so, slam it with affinity_none,
4929  // call the real initialization routine, then restore affinity type to
4930  // affinity_disabled.
4931  int disabled = (affinity.type == affinity_disabled);
4932  if (!KMP_AFFINITY_CAPABLE())
4933  KMP_ASSERT(disabled);
4934  if (disabled)
4935  affinity.type = affinity_none;
4936  __kmp_aux_affinity_initialize(affinity);
4937  if (disabled)
4938  affinity.type = affinity_disabled;
4939 }
4940 
4941 void __kmp_affinity_uninitialize(void) {
4942  for (kmp_affinity_t *affinity : __kmp_affinities) {
4943  if (affinity->masks != NULL)
4944  KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
4945  if (affinity->os_id_masks != NULL)
4946  KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
4947  if (affinity->proclist != NULL)
4948  __kmp_free(affinity->proclist);
4949  if (affinity->ids != NULL)
4950  __kmp_free(affinity->ids);
4951  if (affinity->attrs != NULL)
4952  __kmp_free(affinity->attrs);
4953  *affinity = KMP_AFFINITY_INIT(affinity->env_var);
4954  }
4955  if (__kmp_affin_origMask != NULL) {
4956  if (KMP_AFFINITY_CAPABLE()) {
4957 #if KMP_OS_AIX
4958  // Uninitialize by unbinding the thread.
4959  bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
4960 #else
4961  __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
4962 #endif
4963  }
4964  KMP_CPU_FREE(__kmp_affin_origMask);
4965  __kmp_affin_origMask = NULL;
4966  }
4967  __kmp_affinity_num_places = 0;
4968  if (procarr != NULL) {
4969  __kmp_free(procarr);
4970  procarr = NULL;
4971  }
4972  if (__kmp_osid_to_hwthread_map) {
4973  __kmp_free(__kmp_osid_to_hwthread_map);
4974  __kmp_osid_to_hwthread_map = NULL;
4975  }
4976 #if KMP_USE_HWLOC
4977  if (__kmp_hwloc_topology != NULL) {
4978  hwloc_topology_destroy(__kmp_hwloc_topology);
4979  __kmp_hwloc_topology = NULL;
4980  }
4981 #endif
4982  if (__kmp_hw_subset) {
4983  kmp_hw_subset_t::deallocate(__kmp_hw_subset);
4984  __kmp_hw_subset = nullptr;
4985  }
4986  if (__kmp_topology) {
4987  kmp_topology_t::deallocate(__kmp_topology);
4988  __kmp_topology = nullptr;
4989  }
4990  KMPAffinity::destroy_api();
4991 }
4992 
4993 static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
4994  int *place, kmp_affin_mask_t **mask) {
4995  int mask_idx;
4996  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4997  if (is_hidden_helper)
4998  // The first gtid is the regular primary thread; the second gtid is the main
4999  // thread of the hidden team, which does not participate in task execution.
5000  mask_idx = gtid - 2;
5001  else
5002  mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
5003  KMP_DEBUG_ASSERT(affinity->num_masks > 0);
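  // Distribute consecutive thread ids round-robin over the place list,
  // starting at the configured offset. For example, with 4 places and
  // offset 1, mask indices 0,1,2,3,4 map to places 1,2,3,0,1.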
5004  *place = (mask_idx + affinity->offset) % affinity->num_masks;
5005  *mask = KMP_CPU_INDEX(affinity->masks, *place);
5006 }
5007 
5008 // This function initializes the per-thread data concerning affinity including
5009 // the mask and topology information
5010 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
5011 
5012  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5013 
5014  // Set the thread topology information to default of unknown
5015  for (int id = 0; id < KMP_HW_LAST; ++id)
5016  th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
5017  th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
5018 
5019  if (!KMP_AFFINITY_CAPABLE()) {
5020  return;
5021  }
5022 
5023  if (th->th.th_affin_mask == NULL) {
5024  KMP_CPU_ALLOC(th->th.th_affin_mask);
5025  } else {
5026  KMP_CPU_ZERO(th->th.th_affin_mask);
5027  }
5028 
5029  // Copy the thread mask to the kmp_info_t structure. If
5030  // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e. the
5031  // one that has all of the OS proc ids set. If
5032  // __kmp_affinity.flags.respect is set, the full mask is the same as the
5033  // mask of the initialization thread.
5034  kmp_affin_mask_t *mask;
5035  int i;
5036  const kmp_affinity_t *affinity;
5037  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
5038 
5039  if (is_hidden_helper)
5040  affinity = &__kmp_hh_affinity;
5041  else
5042  affinity = &__kmp_affinity;
5043 
5044  if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
5045  if ((affinity->type == affinity_none) ||
5046  (affinity->type == affinity_balanced) ||
5047  KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
5048 #if KMP_GROUP_AFFINITY
5049  if (__kmp_num_proc_groups > 1) {
5050  return;
5051  }
5052 #endif
5053  KMP_ASSERT(__kmp_affin_fullMask != NULL);
5054  i = 0;
5055  mask = __kmp_affin_fullMask;
5056  } else {
5057  __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
5058  }
5059  } else {
5060  if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
5061 #if KMP_GROUP_AFFINITY
5062  if (__kmp_num_proc_groups > 1) {
5063  return;
5064  }
5065 #endif
5066  KMP_ASSERT(__kmp_affin_fullMask != NULL);
5067  i = KMP_PLACE_ALL;
5068  mask = __kmp_affin_fullMask;
5069  } else {
5070  __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
5071  }
5072  }
5073 
5074  th->th.th_current_place = i;
5075  if (isa_root && !is_hidden_helper) {
5076  th->th.th_new_place = i;
5077  th->th.th_first_place = 0;
5078  th->th.th_last_place = affinity->num_masks - 1;
5079  } else if (KMP_AFFINITY_NON_PROC_BIND) {
5080  // When using a Non-OMP_PROC_BIND affinity method,
5081  // set all threads' place-partition-var to the entire place list
5082  th->th.th_first_place = 0;
5083  th->th.th_last_place = affinity->num_masks - 1;
5084  }
5085  // Copy topology information associated with the place
5086  if (i >= 0) {
5087  th->th.th_topology_ids = __kmp_affinity.ids[i];
5088  th->th.th_topology_attrs = __kmp_affinity.attrs[i];
5089  }
5090 
5091  if (i == KMP_PLACE_ALL) {
5092  KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
5093  gtid));
5094  } else {
5095  KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
5096  gtid, i));
5097  }
5098 
5099  KMP_CPU_COPY(th->th.th_affin_mask, mask);
5100 }
5101 
5102 void __kmp_affinity_bind_init_mask(int gtid) {
5103  if (!KMP_AFFINITY_CAPABLE()) {
5104  return;
5105  }
5106  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5107  const kmp_affinity_t *affinity;
5108  const char *env_var;
5109  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
5110 
5111  if (is_hidden_helper)
5112  affinity = &__kmp_hh_affinity;
5113  else
5114  affinity = &__kmp_affinity;
5115  env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
5116  /* to avoid duplicate printing (will be correctly printed on barrier) */
5117  if (affinity->flags.verbose && (affinity->type == affinity_none ||
5118  (th->th.th_current_place != KMP_PLACE_ALL &&
5119  affinity->type != affinity_balanced)) &&
5120  !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
5121  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5122  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5123  th->th.th_affin_mask);
5124  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5125  gtid, buf);
5126  }
5127 
5128 #if KMP_OS_WINDOWS
5129  // On Windows* OS, the process affinity mask might have changed. If the user
5130  // didn't request affinity and this call fails, just continue silently.
5131  // See CQ171393.
5132  if (affinity->type == affinity_none) {
5133  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
5134  } else
5135 #endif
5136 #ifndef KMP_OS_AIX
5137  // Do not set the full mask as the init mask on AIX.
5138  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5139 #endif
5140 }
5141 
5142 void __kmp_affinity_bind_place(int gtid) {
5143  // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
5144  if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
5145  return;
5146  }
5147 
5148  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5149 
5150  KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
5151  "place = %d)\n",
5152  gtid, th->th.th_new_place, th->th.th_current_place));
5153 
5154  // Check that the new place is within this thread's partition.
5155  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5156  KMP_ASSERT(th->th.th_new_place >= 0);
5157  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
5158  if (th->th.th_first_place <= th->th.th_last_place) {
5159  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
5160  (th->th.th_new_place <= th->th.th_last_place));
5161  } else {
5162  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
5163  (th->th.th_new_place >= th->th.th_last_place));
5164  }
5165 
5166  // Copy the thread mask to the kmp_info_t structure,
5167  // and set this thread's affinity.
5168  kmp_affin_mask_t *mask =
5169  KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
5170  KMP_CPU_COPY(th->th.th_affin_mask, mask);
5171  th->th.th_current_place = th->th.th_new_place;
5172 
5173  if (__kmp_affinity.flags.verbose) {
5174  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5175  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5176  th->th.th_affin_mask);
5177  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
5178  __kmp_gettid(), gtid, buf);
5179  }
5180  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5181 }
5182 
5183 int __kmp_aux_set_affinity(void **mask) {
5184  int gtid;
5185  kmp_info_t *th;
5186  int retval;
5187 
5188  if (!KMP_AFFINITY_CAPABLE()) {
5189  return -1;
5190  }
5191 
5192  gtid = __kmp_entry_gtid();
5193  KA_TRACE(
5194  1000, (""); {
5195  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5196  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5197  (kmp_affin_mask_t *)(*mask));
5198  __kmp_debug_printf(
5199  "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
5200  gtid, buf);
5201  });
5202 
5203  if (__kmp_env_consistency_check) {
5204  if ((mask == NULL) || (*mask == NULL)) {
5205  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5206  } else {
5207  unsigned proc;
5208  int num_procs = 0;
5209 
5210  KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
5211  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5212  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5213  }
5214  if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
5215  continue;
5216  }
5217  num_procs++;
5218  }
5219  if (num_procs == 0) {
5220  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5221  }
5222 
5223 #if KMP_GROUP_AFFINITY
5224  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
5225  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5226  }
5227 #endif /* KMP_GROUP_AFFINITY */
5228  }
5229  }
5230 
5231  th = __kmp_threads[gtid];
5232  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5233  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5234  if (retval == 0) {
5235  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
5236  }
5237 
5238  th->th.th_current_place = KMP_PLACE_UNDEFINED;
5239  th->th.th_new_place = KMP_PLACE_UNDEFINED;
5240  th->th.th_first_place = 0;
5241  th->th.th_last_place = __kmp_affinity.num_masks - 1;
5242 
5243  // Turn off 4.0 affinity for the current thread at this parallel level.
5244  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
5245 
5246  return retval;
5247 }
5248 
5249 int __kmp_aux_get_affinity(void **mask) {
5250  int gtid;
5251  int retval;
5252 #if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
5253  kmp_info_t *th;
5254 #endif
5255  if (!KMP_AFFINITY_CAPABLE()) {
5256  return -1;
5257  }
5258 
5259  gtid = __kmp_entry_gtid();
5260 #if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
5261  th = __kmp_threads[gtid];
5262 #else
5263  (void)gtid; // unused variable
5264 #endif
5265  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5266 
5267  KA_TRACE(
5268  1000, (""); {
5269  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5270  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5271  th->th.th_affin_mask);
5272  __kmp_printf(
5273  "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
5274  buf);
5275  });
5276 
5277  if (__kmp_env_consistency_check) {
5278  if ((mask == NULL) || (*mask == NULL)) {
5279  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
5280  }
5281  }
5282 
5283 #if !KMP_OS_WINDOWS && !KMP_OS_AIX
5284 
5285  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5286  KA_TRACE(
5287  1000, (""); {
5288  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5289  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5290  (kmp_affin_mask_t *)(*mask));
5291  __kmp_printf(
5292  "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
5293  buf);
5294  });
5295  return retval;
5296 
5297 #else
5298  (void)retval;
5299 
5300  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
5301  return 0;
5302 
5303 #endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */
5304 }
5305 
5306 int __kmp_aux_get_affinity_max_proc() {
5307  if (!KMP_AFFINITY_CAPABLE()) {
5308  return 0;
5309  }
5310 #if KMP_GROUP_AFFINITY
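  // With multiple Windows processor groups, the cap is the combined bit
  // capacity of all groups: each group contributes one DWORD_PTR worth of
  // mask bits (sizeof(DWORD_PTR) * CHAR_BIT).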
5311  if (__kmp_num_proc_groups > 1) {
5312  return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
5313  }
5314 #endif
5315  return __kmp_xproc;
5316 }
5317 
5318 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
5319  if (!KMP_AFFINITY_CAPABLE()) {
5320  return -1;
5321  }
5322 
5323  KA_TRACE(
5324  1000, (""); {
5325  int gtid = __kmp_entry_gtid();
5326  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5327  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5328  (kmp_affin_mask_t *)(*mask));
5329  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
5330  "affinity mask for thread %d = %s\n",
5331  proc, gtid, buf);
5332  });
5333 
5334  if (__kmp_env_consistency_check) {
5335  if ((mask == NULL) || (*mask == NULL)) {
5336  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
5337  }
5338  }
5339 
5340  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5341  return -1;
5342  }
5343  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5344  return -2;
5345  }
5346 
5347  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
5348  return 0;
5349 }
5350 
5351 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
5352  if (!KMP_AFFINITY_CAPABLE()) {
5353  return -1;
5354  }
5355 
5356  KA_TRACE(
5357  1000, (""); {
5358  int gtid = __kmp_entry_gtid();
5359  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5360  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5361  (kmp_affin_mask_t *)(*mask));
5362  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
5363  "affinity mask for thread %d = %s\n",
5364  proc, gtid, buf);
5365  });
5366 
5367  if (__kmp_env_consistency_check) {
5368  if ((mask == NULL) || (*mask == NULL)) {
5369  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
5370  }
5371  }
5372 
5373  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5374  return -1;
5375  }
5376  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5377  return -2;
5378  }
5379 
5380  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5381  return 0;
5382 }
5383 
5384 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5385  if (!KMP_AFFINITY_CAPABLE()) {
5386  return -1;
5387  }
5388 
5389  KA_TRACE(
5390  1000, (""); {
5391  int gtid = __kmp_entry_gtid();
5392  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5393  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5394  (kmp_affin_mask_t *)(*mask));
5395  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5396  "affinity mask for thread %d = %s\n",
5397  proc, gtid, buf);
5398  });
5399 
5400  if (__kmp_env_consistency_check) {
5401  if ((mask == NULL) || (*mask == NULL)) {
5402  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5403  }
5404  }
5405 
5406  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5407  return -1;
5408  }
5409  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5410  return 0;
5411  }
5412 
5413  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5414 }
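// Illustrative only: the __kmp_aux_* routines above back the user-visible
// kmp_* affinity API. A minimal caller-side sketch, assuming the
// declarations from omp.h, might look like:
//
//   kmp_affinity_mask_t mask;
//   kmp_create_affinity_mask(&mask);
//   if (kmp_set_affinity_mask_proc(3, &mask) == 0 &&
//       kmp_set_affinity(&mask) == 0) {
//     // The calling thread is now bound to OS proc 3.
//   }
//   kmp_destroy_affinity_mask(&mask);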
5415 
5416 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
5417 // Returns first os proc id with ATOM core
5418 int __kmp_get_first_osid_with_ecore(void) {
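  // Binary search over the topology table; this relies on the table being
  // sorted so that all KMP_HW_CORE_TYPE_CORE entries precede the
  // KMP_HW_CORE_TYPE_ATOM (E-core) entries. Returns -1 when no ATOM entry
  // is found.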
5419  int low = 0;
5420  int high = __kmp_topology->get_num_hw_threads() - 1;
5421  int mid = 0;
5422  while (high - low > 1) {
5423  mid = (high + low) / 2;
5424  if (__kmp_topology->at(mid).attrs.get_core_type() ==
5425  KMP_HW_CORE_TYPE_CORE) {
5426  low = mid + 1;
5427  } else {
5428  high = mid;
5429  }
5430  }
5431  if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
5432  return mid;
5433  }
5434  return -1;
5435 }
5436 #endif
5437 
5438 // Dynamic affinity settings - Affinity balanced
5439 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
5440  KMP_DEBUG_ASSERT(th);
5441  bool fine_gran = true;
5442  int tid = th->th.th_info.ds.ds_tid;
5443  const char *env_var = "KMP_AFFINITY";
5444 
5445  // Do not perform balanced affinity for the hidden helper threads
5446  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
5447  return;
5448 
5449  switch (__kmp_affinity.gran) {
5450  case KMP_HW_THREAD:
5451  break;
5452  case KMP_HW_CORE:
5453  if (__kmp_nThreadsPerCore > 1) {
5454  fine_gran = false;
5455  }
5456  break;
5457  case KMP_HW_SOCKET:
5458  if (nCoresPerPkg > 1) {
5459  fine_gran = false;
5460  }
5461  break;
5462  default:
5463  fine_gran = false;
5464  }
5465 
5466  if (__kmp_topology->is_uniform()) {
5467  int coreID;
5468  int threadID;
5469  // Number of hyper-threads per core on an HT machine
5470  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5471  // Number of cores
5472  int ncores = __kmp_ncores;
5473  if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5474  __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5475  ncores = nPackages;
5476  }
5477  // How many threads will be bound to each core
5478  int chunk = nthreads / ncores;
5479  // How many cores will have an additional thread bound to them - "big cores"
5480  int big_cores = nthreads % ncores;
5481  // Number of threads on the big cores
5482  int big_nth = (chunk + 1) * big_cores;
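  // Worked example: nthreads = 10, ncores = 4 gives chunk = 2, big_cores = 2,
  // big_nth = 6; tids 0-5 land on cores 0-1 (3 threads each) and tids 6-9
  // land on cores 2-3 (2 threads each).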
5483  if (tid < big_nth) {
5484  coreID = tid / (chunk + 1);
5485  threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5486  } else { // tid >= big_nth
5487  coreID = (tid - big_cores) / chunk;
5488  threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5489  }
5490  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5491  "Illegal set affinity operation when not capable");
5492 
5493  kmp_affin_mask_t *mask = th->th.th_affin_mask;
5494  KMP_CPU_ZERO(mask);
5495 
5496  if (fine_gran) {
5497  int osID =
5498  __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
5499  KMP_CPU_SET(osID, mask);
5500  } else {
5501  for (int i = 0; i < __kmp_nth_per_core; i++) {
5502  int osID;
5503  osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
5504  KMP_CPU_SET(osID, mask);
5505  }
5506  }
5507  if (__kmp_affinity.flags.verbose) {
5508  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5509  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5510  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5511  tid, buf);
5512  }
5513  __kmp_affinity_get_thread_topology_info(th);
5514  __kmp_set_system_affinity(mask, TRUE);
5515  } else { // Non-uniform topology
5516 
5517  kmp_affin_mask_t *mask = th->th.th_affin_mask;
5518  KMP_CPU_ZERO(mask);
5519 
5520  int core_level =
5521  __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
5522  int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
5523  __kmp_aff_depth - 1, core_level);
5524  int nth_per_core = __kmp_affinity_max_proc_per_core(
5525  __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5526 
5527  // As a performance optimization, handle the special case nthreads ==
5528  // __kmp_avail_proc
5529  if (nthreads == __kmp_avail_proc) {
5530  if (fine_gran) {
5531  int osID = __kmp_topology->at(tid).os_id;
5532  KMP_CPU_SET(osID, mask);
5533  } else {
5534  int core =
5535  __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
5536  for (int i = 0; i < __kmp_avail_proc; i++) {
5537  int osID = __kmp_topology->at(i).os_id;
5538  if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
5539  core) {
5540  KMP_CPU_SET(osID, mask);
5541  }
5542  }
5543  }
5544  } else if (nthreads <= ncores) {
5545 
5546  int core = 0;
5547  for (int i = 0; i < ncores; i++) {
5548  // Check if this core from procarr[] is in the mask
5549  int in_mask = 0;
5550  for (int j = 0; j < nth_per_core; j++) {
5551  if (procarr[i * nth_per_core + j] != -1) {
5552  in_mask = 1;
5553  break;
5554  }
5555  }
5556  if (in_mask) {
5557  if (tid == core) {
5558  for (int j = 0; j < nth_per_core; j++) {
5559  int osID = procarr[i * nth_per_core + j];
5560  if (osID != -1) {
5561  KMP_CPU_SET(osID, mask);
5562  // For fine granularity it is enough to set the first available
5563  // osID for this core
5564  if (fine_gran) {
5565  break;
5566  }
5567  }
5568  }
5569  break;
5570  } else {
5571  core++;
5572  }
5573  }
5574  }
5575  } else { // nthreads > ncores
5576  // Array to save the number of processors at each core
5577  int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5578  // Array to save the number of cores with "x" available processors
5579  int *ncores_with_x_procs =
5580  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5581  // Array to save the number of cores with # procs from x to nth_per_core
5582  int *ncores_with_x_to_max_procs =
5583  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5584 
5585  for (int i = 0; i <= nth_per_core; i++) {
5586  ncores_with_x_procs[i] = 0;
5587  ncores_with_x_to_max_procs[i] = 0;
5588  }
5589 
5590  for (int i = 0; i < ncores; i++) {
5591  int cnt = 0;
5592  for (int j = 0; j < nth_per_core; j++) {
5593  if (procarr[i * nth_per_core + j] != -1) {
5594  cnt++;
5595  }
5596  }
5597  nproc_at_core[i] = cnt;
5598  ncores_with_x_procs[cnt]++;
5599  }
5600 
5601  for (int i = 0; i <= nth_per_core; i++) {
5602  for (int j = i; j <= nth_per_core; j++) {
5603  ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5604  }
5605  }
5606 
5607  // Max number of processors
5608  int nproc = nth_per_core * ncores;
5609  // An array to keep the number of threads in each context
5610  int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5611  for (int i = 0; i < nproc; i++) {
5612  newarr[i] = 0;
5613  }
5614 
5615  int nth = nthreads;
5616  int flag = 0;
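  // First sweep (flag == 0): each available hardware context receives at
  // most one thread. Once flag is set to 1, the remaining threads are
  // stacked onto contexts that already have one, until all nthreads are
  // placed.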
5617  while (nth > 0) {
5618  for (int j = 1; j <= nth_per_core; j++) {
5619  int cnt = ncores_with_x_to_max_procs[j];
5620  for (int i = 0; i < ncores; i++) {
5621  // Skip cores with 0 processors
5622  if (nproc_at_core[i] == 0) {
5623  continue;
5624  }
5625  for (int k = 0; k < nth_per_core; k++) {
5626  if (procarr[i * nth_per_core + k] != -1) {
5627  if (newarr[i * nth_per_core + k] == 0) {
5628  newarr[i * nth_per_core + k] = 1;
5629  cnt--;
5630  nth--;
5631  break;
5632  } else {
5633  if (flag != 0) {
5634  newarr[i * nth_per_core + k]++;
5635  cnt--;
5636  nth--;
5637  break;
5638  }
5639  }
5640  }
5641  }
5642  if (cnt == 0 || nth == 0) {
5643  break;
5644  }
5645  }
5646  if (nth == 0) {
5647  break;
5648  }
5649  }
5650  flag = 1;
5651  }
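  // Locate this tid's context via a prefix sum over the per-context thread
  // counts; the first index where the running sum exceeds tid is the
  // context (and hence core) this thread binds to.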
5652  int sum = 0;
5653  for (int i = 0; i < nproc; i++) {
5654  sum += newarr[i];
5655  if (sum > tid) {
5656  if (fine_gran) {
5657  int osID = procarr[i];
5658  KMP_CPU_SET(osID, mask);
5659  } else {
5660  int coreID = i / nth_per_core;
5661  for (int ii = 0; ii < nth_per_core; ii++) {
5662  int osID = procarr[coreID * nth_per_core + ii];
5663  if (osID != -1) {
5664  KMP_CPU_SET(osID, mask);
5665  }
5666  }
5667  }
5668  break;
5669  }
5670  }
5671  __kmp_free(newarr);
5672  }
5673 
5674  if (__kmp_affinity.flags.verbose) {
5675  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5676  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5677  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5678  tid, buf);
5679  }
5680  __kmp_affinity_get_thread_topology_info(th);
5681  __kmp_set_system_affinity(mask, TRUE);
5682  }
5683 }
5684 
5685 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
5686  KMP_OS_AIX
5687 // We don't need this entry for Windows because
5688  // there is the GetProcessAffinityMask() API
5689 //
5690 // The intended usage is indicated by these steps:
5691 // 1) The user gets the current affinity mask
5692 // 2) Then sets the affinity by calling this function
5693 // 3) Error check the return value
5694 // 4) Use non-OpenMP parallelization
5695 // 5) Reset the affinity to what was stored in step 1)
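//
// A minimal caller-side sketch of those steps, assuming Linux and eliding
// error handling (run_non_openmp_work() is a hypothetical placeholder):
//
//   cpu_set_t saved;
//   sched_getaffinity(0, sizeof(saved), &saved);        // step 1
//   if (kmp_set_thread_affinity_mask_initial() == 0) {  // steps 2-3
//     run_non_openmp_work();                            // step 4
//   }
//   sched_setaffinity(0, sizeof(saved), &saved);        // step 5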
5696 #ifdef __cplusplus
5697 extern "C"
5698 #endif
5699  int
5700  kmp_set_thread_affinity_mask_initial()
5701 // the function returns 0 on success,
5702 // -1 if we cannot bind the thread
5703 // >0 (errno) if an error happened during binding
5704 {
5705  int gtid = __kmp_get_gtid();
5706  if (gtid < 0) {
5707  // Do not touch non-omp threads
5708  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5709  "non-omp thread, returning\n"));
5710  return -1;
5711  }
5712  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5713  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5714  "affinity not initialized, returning\n"));
5715  return -1;
5716  }
5717  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5718  "set full mask for thread %d\n",
5719  gtid));
5720  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5721 #if KMP_OS_AIX
5722  return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
5723 #else
5724  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5725 #endif
5726 }
5727 #endif
5728 
5729 #endif // KMP_AFFINITY_SUPPORTED