LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #include "kmp_utils.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 #if OMPD_SUPPORT
36 #include "ompd-specific.h"
37 #endif
38 
39 #if OMP_PROFILING_SUPPORT
40 #include "llvm/Support/TimeProfiler.h"
41 static char *ProfileTraceFile = nullptr;
42 #endif
43 
44 /* these are temporary issues to be dealt with */
45 #define KMP_USE_PRCTL 0
46 
47 #if KMP_OS_WINDOWS
48 #include <process.h>
49 #endif
50 
51 #ifndef KMP_USE_SHM
52 // Windows and WASI do not need these include files as they don't use shared
53 // memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63  KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71  KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87  int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89  kmp_internal_control_t *new_icvs,
90  ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93  int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99  kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113  int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115 
116 static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117  int level) {
118  kmp_nested_nthreads_t *new_nested_nth =
119  (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120  sizeof(kmp_nested_nthreads_t));
121  int new_size = level + thr->th.th_set_nested_nth_sz;
122  new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123  for (int i = 0; i < level + 1; ++i)
124  new_nested_nth->nth[i] = 0;
125  for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126  new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127  new_nested_nth->size = new_nested_nth->used = new_size;
128  return new_nested_nth;
129 }
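
As an aside, a worked illustration of the layout built by __kmp_override_nested_nth above (values here are hypothetical): indices 0..level are zero-filled, and the remaining slots are copied from the thread's nested-nth list starting at its second entry.

// Hypothetical input:   level = 2, thr->th.th_set_nested_nth = {8, 4, 2},
//                       thr->th.th_set_nested_nth_sz = 3
// Then:                 new_size = 2 + 3 = 5
// Resulting nth array:  {0, 0, 0, 4, 2}   (indices 0..2 zero-filled; indices
//                       3..4 copied from entries 1..2 of the thread's list)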
130 
131 /* Calculate the identifier of the current thread */
132 /* fast (and somewhat portable) way to get unique identifier of executing
133  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
134 int __kmp_get_global_thread_id() {
135  int i;
136  kmp_info_t **other_threads;
137  size_t stack_data;
138  char *stack_addr;
139  size_t stack_size;
140  char *stack_base;
141 
142  KA_TRACE(
143  1000,
144  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
145  __kmp_nth, __kmp_all_nth));
146 
147  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
148  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
149  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
150  __kmp_init_gtid for this to work. */
151 
152  if (!TCR_4(__kmp_init_gtid))
153  return KMP_GTID_DNE;
154 
155 #ifdef KMP_TDATA_GTID
156  if (TCR_4(__kmp_gtid_mode) >= 3) {
157  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158  return __kmp_gtid;
159  }
160 #endif
161  if (TCR_4(__kmp_gtid_mode) >= 2) {
162  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163  return __kmp_gtid_get_specific();
164  }
165  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166 
167  stack_addr = (char *)&stack_data;
168  other_threads = __kmp_threads;
169 
170  /* ATT: The code below is a source of potential bugs due to unsynchronized
171  access to __kmp_threads array. For example:
172  1. Current thread loads other_threads[i] to thr and checks it, it is
173  non-NULL.
174  2. Current thread is suspended by OS.
175  3. Another thread unregisters and finishes (debug versions of free()
176  may fill memory with something like 0xEF).
177  4. Current thread is resumed.
178  5. Current thread reads junk from *thr.
179  TODO: Fix it. --ln */
180 
181  for (i = 0; i < __kmp_threads_capacity; i++) {
182 
183  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184  if (!thr)
185  continue;
186 
187  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189 
190  /* stack grows down -- search through all of the active threads */
191 
192  if (stack_addr <= stack_base) {
193  size_t stack_diff = stack_base - stack_addr;
194 
195  if (stack_diff <= stack_size) {
196  /* The only way we can be closer than the allocated */
197  /* stack size is if we are running on this thread. */
198  // __kmp_gtid_get_specific can return negative value because this
199  // function can be called by thread destructor. However, before the
200  // thread destructor is called, the value of the corresponding
201  // thread-specific data will be reset to NULL.
202  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203  __kmp_gtid_get_specific() == i);
204  return i;
205  }
206  }
207  }
208 
209  /* get specific to try and determine our gtid */
210  KA_TRACE(1000,
211  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212  "thread, using TLS\n"));
213  i = __kmp_gtid_get_specific();
214 
215  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
216 
217  /* if we haven't been assigned a gtid, then return the code */
218  if (i < 0)
219  return i;
220 
221  // other_threads[i] can be nullptr at this point because the corresponding
222  // thread could have already been destroyed. This can happen when this
223  // function is called from the library shutdown routine.
224  if (!TCR_SYNC_PTR(other_threads[i]))
225  return i;
226 
227  /* dynamically updated stack window for uber threads to avoid get_specific
228  call */
229  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230  KMP_FATAL(StackOverflow, i);
231  }
232 
233  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234  if (stack_addr > stack_base) {
235  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238  stack_base);
239  } else {
240  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241  stack_base - stack_addr);
242  }
243 
244  /* Reprint stack bounds for ubermaster since they have been refined */
245  if (__kmp_storage_map) {
246  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
248  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
249  other_threads[i]->th.th_info.ds.ds_stacksize,
250  "th_%d stack (refinement)", i);
251  }
252  return i;
253 }
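
The loop above identifies the calling thread by testing whether the current stack address falls inside a registered thread's stack. A minimal standalone sketch of that containment test, assuming a downward-growing stack and hypothetical helper names (not part of the runtime):

#include <cstddef>

// Does 'addr' lie within a downward-growing stack whose top is 'stack_base'
// and whose allocated extent is 'stack_size' bytes?
static bool addr_on_stack(const char *addr, const char *stack_base,
                          std::size_t stack_size) {
  if (addr > stack_base)
    return false; // above the top of this thread's stack
  std::size_t diff = (std::size_t)(stack_base - addr);
  return diff <= stack_size; // inside [stack_base - stack_size, stack_base]
}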
254 
255 int __kmp_get_global_thread_id_reg() {
256  int gtid;
257 
258  if (!__kmp_init_serial) {
259  gtid = KMP_GTID_DNE;
260  } else
261 #ifdef KMP_TDATA_GTID
262  if (TCR_4(__kmp_gtid_mode) >= 3) {
263  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264  gtid = __kmp_gtid;
265  } else
266 #endif
267  if (TCR_4(__kmp_gtid_mode) >= 2) {
268  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269  gtid = __kmp_gtid_get_specific();
270  } else {
271  KA_TRACE(1000,
272  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273  gtid = __kmp_get_global_thread_id();
274  }
275 
276  /* we must be a new uber master sibling thread */
277  if (gtid == KMP_GTID_DNE) {
278  KA_TRACE(10,
279  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280  "Registering a new gtid.\n"));
281  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
282  if (!__kmp_init_serial) {
283  __kmp_do_serial_initialize();
284  gtid = __kmp_gtid_get_specific();
285  } else {
286  gtid = __kmp_register_root(FALSE);
287  }
288  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
289  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290  }
291 
292  KMP_DEBUG_ASSERT(gtid >= 0);
293 
294  return gtid;
295 }
296 
297 /* caller must hold forkjoin_lock */
298 void __kmp_check_stack_overlap(kmp_info_t *th) {
299  int f;
300  char *stack_beg = NULL;
301  char *stack_end = NULL;
302  int gtid;
303 
304  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305  if (__kmp_storage_map) {
306  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308 
309  gtid = __kmp_gtid_from_thread(th);
310 
311  if (gtid == KMP_GTID_MONITOR) {
312  __kmp_print_storage_map_gtid(
313  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
314  "th_%s stack (%s)", "mon",
315  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
316  } else {
317  __kmp_print_storage_map_gtid(
318  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
319  "th_%d stack (%s)", gtid,
320  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321  }
322  }
323 
324  /* No point in checking ubermaster threads since they use refinement and
325  * cannot overlap */
326  gtid = __kmp_gtid_from_thread(th);
327  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328  KA_TRACE(10,
329  ("__kmp_check_stack_overlap: performing extensive checking\n"));
330  if (stack_beg == NULL) {
331  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333  }
334 
335  for (f = 0; f < __kmp_threads_capacity; f++) {
336  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337 
338  if (f_th && f_th != th) {
339  char *other_stack_end =
340  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341  char *other_stack_beg =
342  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345 
346  /* Print the other stack values before the abort */
347  if (__kmp_storage_map)
348  __kmp_print_storage_map_gtid(
349  -1, other_stack_beg, other_stack_end,
350  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
351  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
352 
353  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354  __kmp_msg_null);
355  }
356  }
357  }
358  }
359  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360 }
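
The check above tests whether either endpoint of the current stack falls strictly inside another thread's stack range. For reference, a hedged sketch of the general half-open interval overlap test that this specializes (hypothetical helper names):

// Do two address ranges [beg1, end1) and [beg2, end2) overlap? The runtime's
// check above compares the individual endpoints instead.
static bool ranges_overlap(const char *beg1, const char *end1,
                           const char *beg2, const char *end2) {
  return beg1 < end2 && beg2 < end1;
}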
361 
362 /* ------------------------------------------------------------------------ */
363 
364 void __kmp_infinite_loop(void) {
365  static int done = FALSE;
366 
367  while (!done) {
368  KMP_YIELD(TRUE);
369  }
370 }
371 
372 #define MAX_MESSAGE 512
373 
374 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375  char const *format, ...) {
376  char buffer[MAX_MESSAGE];
377  va_list ap;
378 
379  va_start(ap, format);
380  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
381  p2, (unsigned long)size, format);
382  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
383  __kmp_vprintf(kmp_err, buffer, ap);
384 #if KMP_PRINT_DATA_PLACEMENT
385  int node;
386  if (gtid >= 0) {
387  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388  if (__kmp_storage_map_verbose) {
389  node = __kmp_get_host_node(p1);
390  if (node < 0) /* doesn't work, so don't try this next time */
391  __kmp_storage_map_verbose = FALSE;
392  else {
393  char *last;
394  int lastNode;
395  int localProc = __kmp_get_cpu_from_gtid(gtid);
396 
397  const int page_size = KMP_GET_PAGE_SIZE();
398 
399  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401  if (localProc >= 0)
402  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
403  localProc >> 1);
404  else
405  __kmp_printf_no_lock(" GTID %d\n", gtid);
406 #if KMP_USE_PRCTL
407  /* The more elaborate format is disabled for now because of the prctl
408  * hanging bug. */
409  do {
410  last = p1;
411  lastNode = node;
412  /* This loop collates adjacent pages with the same host node. */
413  do {
414  (char *)p1 += page_size;
415  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
417  lastNode);
418  } while (p1 <= p2);
419 #else
420  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
421  (char *)p1 + (page_size - 1),
422  __kmp_get_host_node(p1));
423  if (p1 < p2) {
424  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
425  (char *)p2 + (page_size - 1),
426  __kmp_get_host_node(p2));
427  }
428 #endif
429  }
430  }
431  } else
432  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
433  }
434 #endif /* KMP_PRINT_DATA_PLACEMENT */
435  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
436 
437  va_end(ap);
438 }
439 
440 void __kmp_warn(char const *format, ...) {
441  char buffer[MAX_MESSAGE];
442  va_list ap;
443 
444  if (__kmp_generate_warnings == kmp_warnings_off) {
445  return;
446  }
447 
448  va_start(ap, format);
449 
450  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
451  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
452  __kmp_vprintf(kmp_err, buffer, ap);
453  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
454 
455  va_end(ap);
456 }
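
Both printers above follow the same pattern: compose a prefixed format string first, then forward the caller's va_list to a vprintf-style sink under the stdio bootstrap lock. A minimal standalone sketch of that pattern, using standard C I/O and hypothetical names instead of the runtime's locked printer:

#include <cstdarg>
#include <cstdio>

// Prepend a fixed prefix to the caller's format, then forward the varargs.
static void warn_example(const char *format, ...) {
  char buffer[512];
  va_list ap;
  va_start(ap, format);
  std::snprintf(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  std::vfprintf(stderr, buffer, ap);
  va_end(ap);
}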
457 
458 void __kmp_abort_process() {
459  // Later threads may stall here, but that's ok because abort() will kill them.
460  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
461 
462  if (__kmp_debug_buf) {
463  __kmp_dump_debug_buffer();
464  }
465 
466 #if KMP_OS_WINDOWS
467  // Let other threads know of abnormal termination and prevent deadlock
468  // if abort happened during library initialization or shutdown
469  __kmp_global.g.g_abort = SIGABRT;
470 
471  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
472  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
473  boxes. _set_abort_behavior() works well, but this function is not
474  available in VS7 (this is not a problem for the DLL, but it is a problem for
475  the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does not
476  help, at least in some versions of the MS C RTL.
477 
478  It seems the following sequence is the only way to simulate abort() and
479  avoid the pop-up error box. */
480  raise(SIGABRT);
481  _exit(3); // Just in case, if signal ignored, exit anyway.
482 #else
483  __kmp_unregister_library();
484  abort();
485 #endif
486 
487  __kmp_infinite_loop();
488  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
489 
490 } // __kmp_abort_process
491 
492 void __kmp_abort_thread(void) {
493  // TODO: Eliminate g_abort global variable and this function.
494  // In case of abort just call abort(), it will kill all the threads.
495  __kmp_infinite_loop();
496 } // __kmp_abort_thread
497 
498 /* Print out the storage map for the major kmp_info_t thread data structures
499  that are allocated together. */
500 
501 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
502  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
503  gtid);
504 
505  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
506  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
507 
508  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
509  sizeof(kmp_local_t), "th_%d.th_local", gtid);
510 
511  __kmp_print_storage_map_gtid(
512  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
513  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
514 
515  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
516  &thr->th.th_bar[bs_plain_barrier + 1],
517  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
518  gtid);
519 
520  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
521  &thr->th.th_bar[bs_forkjoin_barrier + 1],
522  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
523  gtid);
524 
525 #if KMP_FAST_REDUCTION_BARRIER
526  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
527  &thr->th.th_bar[bs_reduction_barrier + 1],
528  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
529  gtid);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 }
532 
533 /* Print out the storage map for the major kmp_team_t team data structures
534  that are allocated together. */
535 
536 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537  int team_id, int num_thr) {
538  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
539  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
540  header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
543  &team->t.t_bar[bs_last_barrier],
544  sizeof(kmp_balign_team_t) * bs_last_barrier,
545  "%s_%d.t_bar", header, team_id);
546 
547  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
548  &team->t.t_bar[bs_plain_barrier + 1],
549  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
550  header, team_id);
551 
552  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
553  &team->t.t_bar[bs_forkjoin_barrier + 1],
554  sizeof(kmp_balign_team_t),
555  "%s_%d.t_bar[forkjoin]", header, team_id);
556 
557 #if KMP_FAST_REDUCTION_BARRIER
558  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
559  &team->t.t_bar[bs_reduction_barrier + 1],
560  sizeof(kmp_balign_team_t),
561  "%s_%d.t_bar[reduction]", header, team_id);
562 #endif // KMP_FAST_REDUCTION_BARRIER
563 
564  __kmp_print_storage_map_gtid(
565  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
566  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
567 
568  __kmp_print_storage_map_gtid(
569  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
570  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
571 
572  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
573  &team->t.t_disp_buffer[num_disp_buff],
574  sizeof(dispatch_shared_info_t) * num_disp_buff,
575  "%s_%d.t_disp_buffer", header, team_id);
576 }
577 
578 static void __kmp_init_allocator() {
579  __kmp_init_memkind();
580  __kmp_init_target_mem();
581 }
582 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
583 
584 /* ------------------------------------------------------------------------ */
585 
586 #if ENABLE_LIBOMPTARGET
587 static void __kmp_init_omptarget() {
588  __kmp_init_target_task();
589 }
590 #endif
591 
592 /* ------------------------------------------------------------------------ */
593 
594 #if KMP_DYNAMIC_LIB
595 #if KMP_OS_WINDOWS
596 
597 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
598  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
599 
600  switch (fdwReason) {
601 
602  case DLL_PROCESS_ATTACH:
603  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
604 
605  return TRUE;
606 
607  case DLL_PROCESS_DETACH:
608  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
609 
610  // According to Windows* documentation for DllMain entry point:
611  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
612  // lpReserved == NULL when FreeLibrary() is called,
613  // lpReserved != NULL when the process is terminated.
614  // When FreeLibrary() is called, worker threads remain alive. So the
615  // runtime's state is consistent and executing proper shutdown is OK.
616  // When the process is terminated, worker threads have exited or been
617  // forcefully terminated by the OS and only the shutdown thread remains.
618  // This can leave the runtime in an inconsistent state.
619  // Hence, only attempt proper cleanup when FreeLibrary() is called.
620  // Otherwise, rely on OS to reclaim resources.
621  if (lpReserved == NULL)
622  __kmp_internal_end_library(__kmp_gtid_get_specific());
623 
624  return TRUE;
625 
626  case DLL_THREAD_ATTACH:
627  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
628 
629  /* if we want to register new siblings all the time here call
630  * __kmp_get_gtid(); */
631  return TRUE;
632 
633  case DLL_THREAD_DETACH:
634  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
635 
636  __kmp_internal_end_thread(__kmp_gtid_get_specific());
637  return TRUE;
638  }
639 
640  return TRUE;
641 }
642 
643 #endif /* KMP_OS_WINDOWS */
644 #endif /* KMP_DYNAMIC_LIB */
645 
646 /* __kmp_parallel_deo -- Wait until it's our turn. */
647 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
648  int gtid = *gtid_ref;
649 #ifdef BUILD_PARALLEL_ORDERED
650  kmp_team_t *team = __kmp_team_from_gtid(gtid);
651 #endif /* BUILD_PARALLEL_ORDERED */
652 
653  if (__kmp_env_consistency_check) {
654  if (__kmp_threads[gtid]->th.th_root->r.r_active)
655 #if KMP_USE_DYNAMIC_LOCK
656  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
657 #else
658  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
659 #endif
660  }
661 #ifdef BUILD_PARALLEL_ORDERED
662  if (!team->t.t_serialized) {
663  KMP_MB();
664  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
665  NULL);
666  KMP_MB();
667  }
668 #endif /* BUILD_PARALLEL_ORDERED */
669 }
670 
671 /* __kmp_parallel_dxo -- Signal the next task. */
672 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
673  int gtid = *gtid_ref;
674 #ifdef BUILD_PARALLEL_ORDERED
675  int tid = __kmp_tid_from_gtid(gtid);
676  kmp_team_t *team = __kmp_team_from_gtid(gtid);
677 #endif /* BUILD_PARALLEL_ORDERED */
678 
679  if (__kmp_env_consistency_check) {
680  if (__kmp_threads[gtid]->th.th_root->r.r_active)
681  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
682  }
683 #ifdef BUILD_PARALLEL_ORDERED
684  if (!team->t.t_serialized) {
685  KMP_MB(); /* Flush all pending memory write invalidates. */
686 
687  /* use the tid of the next thread in this team */
688  /* TODO replace with general release procedure */
689  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
690 
691  KMP_MB(); /* Flush all pending memory write invalidates. */
692  }
693 #endif /* BUILD_PARALLEL_ORDERED */
694 }
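
Taken together, __kmp_parallel_deo and __kmp_parallel_dxo implement ordered turn-taking: a thread waits until the team's ordered counter equals its own tid, runs the ordered work, then hands the turn to (tid + 1) % nproc. A minimal std::atomic sketch of that protocol, with hypothetical names and a raw spin in place of the runtime's KMP_WAIT machinery:

#include <atomic>

static std::atomic<int> turn{0}; // shared per-team "whose turn is it" counter

void ordered_section(int tid, int nproc, void (*work)(int)) {
  // Wait until it is our turn (deo).
  while (turn.load(std::memory_order_acquire) != tid) {
  }
  work(tid);
  // Signal the next thread in the team (dxo).
  turn.store((tid + 1) % nproc, std::memory_order_release);
}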
695 
696 /* ------------------------------------------------------------------------ */
697 /* The BARRIER for a SINGLE process section is always explicit */
698 
699 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
700  int status;
701  kmp_info_t *th;
702  kmp_team_t *team;
703 
704  if (!TCR_4(__kmp_init_parallel))
705  __kmp_parallel_initialize();
706  __kmp_resume_if_soft_paused();
707 
708  th = __kmp_threads[gtid];
709  team = th->th.th_team;
710  status = 0;
711 
712  th->th.th_ident = id_ref;
713 
714  if (team->t.t_serialized) {
715  status = 1;
716  } else {
717  kmp_int32 old_this = th->th.th_local.this_construct;
718 
719  ++th->th.th_local.this_construct;
720  /* try to set team count to thread count--success means thread got the
721  single block */
722  /* TODO: Should this be acquire or release? */
723  if (team->t.t_construct == old_this) {
724  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
725  th->th.th_local.this_construct);
726  }
727 #if USE_ITT_BUILD
728  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
729  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
730  team->t.t_active_level == 1) {
731  // Only report metadata by primary thread of active team at level 1
732  __kmp_itt_metadata_single(id_ref);
733  }
734 #endif /* USE_ITT_BUILD */
735  }
736 
737  if (__kmp_env_consistency_check) {
738  if (status && push_ws) {
739  __kmp_push_workshare(gtid, ct_psingle, id_ref);
740  } else {
741  __kmp_check_workshare(gtid, ct_psingle, id_ref);
742  }
743  }
744 #if USE_ITT_BUILD
745  if (status) {
746  __kmp_itt_single_start(gtid);
747  }
748 #endif /* USE_ITT_BUILD */
749  return status;
750 }
751 
752 void __kmp_exit_single(int gtid) {
753 #if USE_ITT_BUILD
754  __kmp_itt_single_end(gtid);
755 #endif /* USE_ITT_BUILD */
756  if (__kmp_env_consistency_check)
757  __kmp_pop_workshare(gtid, ct_psingle, NULL);
758 }
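
In __kmp_enter_single above, the winner of a single construct is picked by a compare-and-swap race: each thread advances its private construct count and tries to move the team's shared counter from the old value to the new one, and exactly one thread per construct succeeds. A minimal sketch of that idea with std::atomic and hypothetical names:

#include <atomic>

static std::atomic<int> team_construct{0}; // shared per-team construct counter

static bool try_enter_single(int &my_construct) {
  int old_val = my_construct++; // private count before this construct
  int expected = old_val;
  // Only the first thread to see the matching old value wins the single block.
  return team_construct.compare_exchange_strong(expected, my_construct,
                                                std::memory_order_acquire);
}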
759 
760 /* determine if we can go parallel or must use a serialized parallel region and
761  * how many threads we can use
762  * set_nthreads is the number of threads requested for the team
763  * returns 0 if we should serialize or only use one thread,
764  * otherwise the number of threads to use
765  * The forkjoin lock is held by the caller. */
766 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
767  int master_tid, int set_nthreads,
768  int enter_teams) {
769  int capacity;
770  int new_nthreads;
771  KMP_DEBUG_ASSERT(__kmp_init_serial);
772  KMP_DEBUG_ASSERT(root && parent_team);
773  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
774 
775  // If dyn-var is set, dynamically adjust the number of desired threads,
776  // according to the method specified by dynamic_mode.
777  new_nthreads = set_nthreads;
778  if (!get__dynamic_2(parent_team, master_tid)) {
779  ;
780  }
781 #ifdef USE_LOAD_BALANCE
782  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
783  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
784  if (new_nthreads == 1) {
785  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
786  "reservation to 1 thread\n",
787  master_tid));
788  return 1;
789  }
790  if (new_nthreads < set_nthreads) {
791  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
792  "reservation to %d threads\n",
793  master_tid, new_nthreads));
794  }
795  }
796 #endif /* USE_LOAD_BALANCE */
797  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
798  new_nthreads = __kmp_avail_proc - __kmp_nth +
799  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
800  if (new_nthreads <= 1) {
801  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
802  "reservation to 1 thread\n",
803  master_tid));
804  return 1;
805  }
806  if (new_nthreads < set_nthreads) {
807  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
808  "reservation to %d threads\n",
809  master_tid, new_nthreads));
810  } else {
811  new_nthreads = set_nthreads;
812  }
813  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
814  if (set_nthreads > 2) {
815  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
816  new_nthreads = (new_nthreads % set_nthreads) + 1;
817  if (new_nthreads == 1) {
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
819  "reservation to 1 thread\n",
820  master_tid));
821  return 1;
822  }
823  if (new_nthreads < set_nthreads) {
824  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
825  "reservation to %d threads\n",
826  master_tid, new_nthreads));
827  }
828  }
829  } else {
830  KMP_ASSERT(0);
831  }
832 
833  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
834  if (__kmp_nth + new_nthreads -
835  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
836  __kmp_max_nth) {
837  int tl_nthreads = __kmp_max_nth - __kmp_nth +
838  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839  if (tl_nthreads <= 0) {
840  tl_nthreads = 1;
841  }
842 
843  // If dyn-var is false, emit a 1-time warning.
844  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
845  __kmp_reserve_warn = 1;
846  __kmp_msg(kmp_ms_warning,
847  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
848  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
849  }
850  if (tl_nthreads == 1) {
851  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
852  "reduced reservation to 1 thread\n",
853  master_tid));
854  return 1;
855  }
856  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
857  "reservation to %d threads\n",
858  master_tid, tl_nthreads));
859  new_nthreads = tl_nthreads;
860  }
861 
862  // Respect OMP_THREAD_LIMIT
863  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
864  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
865  if (cg_nthreads + new_nthreads -
866  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
867  max_cg_threads) {
868  int tl_nthreads = max_cg_threads - cg_nthreads +
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
870  if (tl_nthreads <= 0) {
871  tl_nthreads = 1;
872  }
873 
874  // If dyn-var is false, emit a 1-time warning.
875  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
876  __kmp_reserve_warn = 1;
877  __kmp_msg(kmp_ms_warning,
878  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
879  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
880  }
881  if (tl_nthreads == 1) {
882  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
883  "reduced reservation to 1 thread\n",
884  master_tid));
885  return 1;
886  }
887  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
888  "reservation to %d threads\n",
889  master_tid, tl_nthreads));
890  new_nthreads = tl_nthreads;
891  }
892 
893  // Check if the threads array is large enough, or needs expanding.
894  // See comment in __kmp_register_root() about the adjustment if
895  // __kmp_threads[0] == NULL.
896  capacity = __kmp_threads_capacity;
897  if (TCR_PTR(__kmp_threads[0]) == NULL) {
898  --capacity;
899  }
900  // If it is not for initializing the hidden helper team, we need to take
901  // __kmp_hidden_helper_threads_num out of the capacity because it is included
902  // in __kmp_threads_capacity.
903  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
904  capacity -= __kmp_hidden_helper_threads_num;
905  }
906  if (__kmp_nth + new_nthreads -
907  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
908  capacity) {
909  // Expand the threads array.
910  int slotsRequired = __kmp_nth + new_nthreads -
911  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
912  capacity;
913  int slotsAdded = __kmp_expand_threads(slotsRequired);
914  if (slotsAdded < slotsRequired) {
915  // The threads array was not expanded enough.
916  new_nthreads -= (slotsRequired - slotsAdded);
917  KMP_ASSERT(new_nthreads >= 1);
918 
919  // If dyn-var is false, emit a 1-time warning.
920  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
921  __kmp_reserve_warn = 1;
922  if (__kmp_tp_cached) {
923  __kmp_msg(kmp_ms_warning,
924  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
925  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
926  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
927  } else {
928  __kmp_msg(kmp_ms_warning,
929  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
930  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
931  }
932  }
933  }
934  }
935 
936 #ifdef KMP_DEBUG
937  if (new_nthreads == 1) {
938  KC_TRACE(10,
939  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
940  "dead roots and rechecking; requested %d threads\n",
941  __kmp_get_gtid(), set_nthreads));
942  } else {
943  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
944  " %d threads\n",
945  __kmp_get_gtid(), new_nthreads, set_nthreads));
946  }
947 #endif // KMP_DEBUG
948 
949  if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
950  __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
951  this_thr->th.th_nt_msg);
952  }
953  return new_nthreads;
954 }
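
Each of the limits applied above (the device thread limit, the contention-group limit, and the threads-array capacity) follows the same clamping pattern: grant the requested count only up to what remains of a budget once currently running threads are charged against it, crediting back the threads the root already contributes, and never report fewer than one. A hedged sketch of that arithmetic with hypothetical names:

// Clamp a requested thread count against a budget (e.g. a thread limit),
// given how many threads are already in use and how many of those the root
// contributes back to the new team. (Illustrative helper only.)
static int clamp_to_budget(int requested, int budget, int already_used,
                           int root_contribution) {
  int remaining = budget - already_used + root_contribution;
  if (remaining < 1)
    remaining = 1; // never grant fewer than one thread
  return requested < remaining ? requested : remaining;
}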
955 
956 /* Allocate threads from the thread pool and assign them to the new team. We are
957  assured that there are enough threads available, because we checked on that
958  earlier within the forkjoin critical section. */
959 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
960  kmp_info_t *master_th, int master_gtid,
961  int fork_teams_workers) {
962  int i;
963  int use_hot_team;
964 
965  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
966  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
967  KMP_MB();
968 
969  /* first, let's setup the primary thread */
970  master_th->th.th_info.ds.ds_tid = 0;
971  master_th->th.th_team = team;
972  master_th->th.th_team_nproc = team->t.t_nproc;
973  master_th->th.th_team_master = master_th;
974  master_th->th.th_team_serialized = FALSE;
975  master_th->th.th_dispatch = &team->t.t_dispatch[0];
976 
977 /* make sure we are not the optimized hot team */
978 #if KMP_NESTED_HOT_TEAMS
979  use_hot_team = 0;
980  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
981  if (hot_teams) { // hot teams array is not allocated if
982  // KMP_HOT_TEAMS_MAX_LEVEL=0
983  int level = team->t.t_active_level - 1; // index in array of hot teams
984  if (master_th->th.th_teams_microtask) { // are we inside the teams?
985  if (master_th->th.th_teams_size.nteams > 1) {
986  ++level; // level was not increased in teams construct for
987  // team_of_masters
988  }
989  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
990  master_th->th.th_teams_level == team->t.t_level) {
991  ++level; // level was not increased in teams construct for
992  // team_of_workers before the parallel
993  } // team->t.t_level will be increased inside parallel
994  }
995  if (level < __kmp_hot_teams_max_level) {
996  if (hot_teams[level].hot_team) {
997  // hot team has already been allocated for given level
998  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
999  use_hot_team = 1; // the team is ready to use
1000  } else {
1001  use_hot_team = 0; // AC: threads are not allocated yet
1002  hot_teams[level].hot_team = team; // remember new hot team
1003  hot_teams[level].hot_team_nth = team->t.t_nproc;
1004  }
1005  } else {
1006  use_hot_team = 0;
1007  }
1008  }
1009 #else
1010  use_hot_team = team == root->r.r_hot_team;
1011 #endif
1012  if (!use_hot_team) {
1013 
1014  /* install the primary thread */
1015  team->t.t_threads[0] = master_th;
1016  __kmp_initialize_info(master_th, team, 0, master_gtid);
1017 
1018  /* now, install the worker threads */
1019  for (i = 1; i < team->t.t_nproc; i++) {
1020 
1021  /* fork or reallocate a new thread and install it in team */
1022  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1023  team->t.t_threads[i] = thr;
1024  KMP_DEBUG_ASSERT(thr);
1025  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1026  /* align team and thread arrived states */
1027  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1028  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1029  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1030  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1031  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1032  team->t.t_bar[bs_plain_barrier].b_arrived));
1033  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1034  thr->th.th_teams_level = master_th->th.th_teams_level;
1035  thr->th.th_teams_size = master_th->th.th_teams_size;
1036  { // Initialize threads' barrier data.
1037  int b;
1038  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1039  for (b = 0; b < bs_last_barrier; ++b) {
1040  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1041  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1042 #if USE_DEBUGGER
1043  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1044 #endif
1045  }
1046  }
1047  }
1048 
1049 #if KMP_AFFINITY_SUPPORTED
1050  // Do not partition the places list for teams construct workers who
1051  // haven't actually been forked to do real work yet. This partitioning
1052  // will take place in the parallel region nested within the teams construct.
1053  if (!fork_teams_workers) {
1054  __kmp_partition_places(team);
1055  }
1056 #endif
1057 
1058  if (team->t.t_nproc > 1 &&
1059  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1060  team->t.b->update_num_threads(team->t.t_nproc);
1061  __kmp_add_threads_to_team(team, team->t.t_nproc);
1062  }
1063  }
1064 
1065  // Take care of primary thread's task state
1066  if (__kmp_tasking_mode != tskm_immediate_exec) {
1067  if (use_hot_team) {
1068  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1069  KA_TRACE(
1070  20,
1071  ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1072  "%p, new task_team %p / team %p\n",
1073  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1074  team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1075  team));
1076 
1077  // Store primary thread's current task state on new team
1078  KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1079  master_th->th.th_task_state);
1080 
1081  // Restore primary thread's task state to hot team's state
1082  // by using thread 1's task state
1083  if (team->t.t_nproc > 1) {
1084  KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1085  team->t.t_threads[1]->th.th_task_state == 1);
1086  KMP_CHECK_UPDATE(master_th->th.th_task_state,
1087  team->t.t_threads[1]->th.th_task_state);
1088  } else {
1089  master_th->th.th_task_state = 0;
1090  }
1091  } else {
1092  // Store primary thread's current task_state on new team
1093  KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1094  master_th->th.th_task_state);
1095  // Not using a hot team, so set the task state to 0.
1096  master_th->th.th_task_state = 0;
1097  }
1098  }
1099 
1100  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1101  for (i = 0; i < team->t.t_nproc; i++) {
1102  kmp_info_t *thr = team->t.t_threads[i];
1103  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1104  thr->th.th_prev_level != team->t.t_level) {
1105  team->t.t_display_affinity = 1;
1106  break;
1107  }
1108  }
1109  }
1110 
1111  KMP_MB();
1112 }
1113 
1114 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1115 // Propagate any changes to the floating point control registers out to the team.
1116 // We try to avoid unnecessary writes to the relevant cache line in the team
1117 // structure, so we don't make changes unless they are needed.
1118 inline static void propagateFPControl(kmp_team_t *team) {
1119  if (__kmp_inherit_fp_control) {
1120  kmp_int16 x87_fpu_control_word;
1121  kmp_uint32 mxcsr;
1122 
1123  // Get primary thread's values of FPU control flags (both X87 and vector)
1124  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1125  __kmp_store_mxcsr(&mxcsr);
1126  mxcsr &= KMP_X86_MXCSR_MASK;
1127 
1128  // There is no point looking at t_fp_control_saved here.
1129  // If it is TRUE, we still have to update the values if they are different
1130  // from those we now have. If it is FALSE we didn't save anything yet, but
1131  // our objective is the same. We have to ensure that the values in the team
1132  // are the same as those we have.
1133  // So, this code achieves what we need whether or not t_fp_control_saved is
1134  // true. By checking whether the value needs updating we avoid unnecessary
1135  // writes that would put the cache-line into a written state, causing all
1136  // threads in the team to have to read it again.
1137  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1138  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1139  // Although we don't use this value, other code in the runtime wants to know
1140  // whether it should restore them. So we must ensure it is correct.
1141  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1142  } else {
1143  // Similarly here. Don't write to this cache-line in the team structure
1144  // unless we have to.
1145  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1146  }
1147 }
1148 
1149 // Do the opposite, setting the hardware registers to the updated values from
1150 // the team.
1151 inline static void updateHWFPControl(kmp_team_t *team) {
1152  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1153  // Only reset the fp control regs if they differ from the values saved in
1154  // the team for the parallel region that we are exiting.
1155  kmp_int16 x87_fpu_control_word;
1156  kmp_uint32 mxcsr;
1157  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1158  __kmp_store_mxcsr(&mxcsr);
1159  mxcsr &= KMP_X86_MXCSR_MASK;
1160 
1161  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1162  __kmp_clear_x87_fpu_status_word();
1163  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1164  }
1165 
1166  if (team->t.t_mxcsr != mxcsr) {
1167  __kmp_load_mxcsr(&team->t.t_mxcsr);
1168  }
1169  }
1170 }
1171 #else
1172 #define propagateFPControl(x) ((void)0)
1173 #define updateHWFPControl(x) ((void)0)
1174 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
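
The KMP_CHECK_UPDATE uses above rely on a simple idiom: only store to the shared team field when the value actually differs, so an unchanged cache line is never dirtied and re-read by the whole team. A hedged sketch of that idiom as a generic helper (not the runtime's macro):

// Write 'src' into 'dst' only when it differs, avoiding needless stores to a
// potentially shared cache line. (Illustrative helper under assumed semantics.)
template <typename T> static inline void check_update(T &dst, const T &src) {
  if (dst != src)
    dst = src;
}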
1175 
1176 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1177  int realloc); // forward declaration
1178 
1179 /* Run a parallel region that has been serialized, so runs only in a team of the
1180  single primary thread. */
1181 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1182  kmp_info_t *this_thr;
1183  kmp_team_t *serial_team;
1184 
1185  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1186 
1187  /* Skip all this code for autopar serialized loops since it results in
1188  unacceptable overhead */
1189  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1190  return;
1191 
1192  if (!TCR_4(__kmp_init_parallel))
1193  __kmp_parallel_initialize();
1194  __kmp_resume_if_soft_paused();
1195 
1196  this_thr = __kmp_threads[global_tid];
1197  serial_team = this_thr->th.th_serial_team;
1198 
1199  /* utilize the serialized team held by this thread */
1200  KMP_DEBUG_ASSERT(serial_team);
1201  KMP_MB();
1202 
1203  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1204  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1205  proc_bind = proc_bind_false;
1206  } else if (proc_bind == proc_bind_default) {
1207  // No proc_bind clause was specified, so use the current value
1208  // of proc-bind-var for this parallel region.
1209  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1210  }
1211  // Reset for next parallel region
1212  this_thr->th.th_set_proc_bind = proc_bind_default;
1213 
1214  // Reset num_threads for next parallel region
1215  this_thr->th.th_set_nproc = 0;
1216 
1217 #if OMPT_SUPPORT
1218  ompt_data_t ompt_parallel_data = ompt_data_none;
1219  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1220  if (ompt_enabled.enabled &&
1221  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1222 
1223  ompt_task_info_t *parent_task_info;
1224  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1225 
1226  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1227  if (ompt_enabled.ompt_callback_parallel_begin) {
1228  int team_size = 1;
1229 
1230  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1231  &(parent_task_info->task_data), &(parent_task_info->frame),
1232  &ompt_parallel_data, team_size,
1233  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1234  }
1235  }
1236 #endif // OMPT_SUPPORT
1237 
1238  if (this_thr->th.th_team != serial_team) {
1239  // Nested level will be an index in the nested nthreads array
1240  int level = this_thr->th.th_team->t.t_level;
1241 
1242  if (serial_team->t.t_serialized) {
1243  /* this serial team was already used
1244  TODO: increase performance by making these locks more specific */
1245  kmp_team_t *new_team;
1246 
1247  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1248 
1249  new_team =
1250  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1251 #if OMPT_SUPPORT
1252  ompt_parallel_data,
1253 #endif
1254  proc_bind, &this_thr->th.th_current_task->td_icvs,
1255  0 USE_NESTED_HOT_ARG(NULL));
1256  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1257  KMP_ASSERT(new_team);
1258 
1259  /* setup new serialized team and install it */
1260  new_team->t.t_threads[0] = this_thr;
1261  new_team->t.t_parent = this_thr->th.th_team;
1262  serial_team = new_team;
1263  this_thr->th.th_serial_team = serial_team;
1264 
1265  KF_TRACE(
1266  10,
1267  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1268  global_tid, serial_team));
1269 
1270  /* TODO the above breaks the requirement that if we run out of resources,
1271  then we can still guarantee that serialized teams are ok, since we may
1272  need to allocate a new one */
1273  } else {
1274  KF_TRACE(
1275  10,
1276  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1277  global_tid, serial_team));
1278  }
1279 
1280  /* we have to initialize this serial team */
1281  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1282  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1283  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1284  serial_team->t.t_ident = loc;
1285  serial_team->t.t_serialized = 1;
1286  serial_team->t.t_nproc = 1;
1287  serial_team->t.t_parent = this_thr->th.th_team;
1288  if (this_thr->th.th_team->t.t_nested_nth)
1289  serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1290  else
1291  serial_team->t.t_nested_nth = &__kmp_nested_nth;
1292  // Save previous team's task state on serial team structure
1293  serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1294  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1295  this_thr->th.th_team = serial_team;
1296  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1297 
1298  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1299  this_thr->th.th_current_task));
1300  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1301  this_thr->th.th_current_task->td_flags.executing = 0;
1302 
1303  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1304 
1305  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1306  implicit task for each serialized task represented by
1307  team->t.t_serialized? */
1308  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1309  &this_thr->th.th_current_task->td_parent->td_icvs);
1310 
1311  // Thread value exists in the nested nthreads array for the next nested
1312  // level
1313  kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1314  if (this_thr->th.th_team->t.t_nested_nth)
1315  nested_nth = this_thr->th.th_team->t.t_nested_nth;
1316  if (nested_nth->used && (level + 1 < nested_nth->used)) {
1317  this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1318  }
1319 
1320  if (__kmp_nested_proc_bind.used &&
1321  (level + 1 < __kmp_nested_proc_bind.used)) {
1322  this_thr->th.th_current_task->td_icvs.proc_bind =
1323  __kmp_nested_proc_bind.bind_types[level + 1];
1324  }
1325 
1326 #if USE_DEBUGGER
1327  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1328 #endif
1329  this_thr->th.th_info.ds.ds_tid = 0;
1330 
1331  /* set thread cache values */
1332  this_thr->th.th_team_nproc = 1;
1333  this_thr->th.th_team_master = this_thr;
1334  this_thr->th.th_team_serialized = 1;
1335  this_thr->th.th_task_team = NULL;
1336  this_thr->th.th_task_state = 0;
1337 
1338  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1339  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1340  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1341 
1342  propagateFPControl(serial_team);
1343 
1344  /* check if we need to allocate dispatch buffers stack */
1345  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1346  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1347  serial_team->t.t_dispatch->th_disp_buffer =
1348  (dispatch_private_info_t *)__kmp_allocate(
1349  sizeof(dispatch_private_info_t));
1350  }
1351  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1352 
1353  KMP_MB();
1354 
1355  } else {
1356  /* this serialized team is already being used,
1357  * that's fine, just add another nested level */
1358  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1359  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1360  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1361  ++serial_team->t.t_serialized;
1362  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1363 
1364  // Nested level will be an index in the nested nthreads array
1365  int level = this_thr->th.th_team->t.t_level;
1366  // Thread value exists in the nested nthreads array for the next nested
1367  // level
1368 
1369  kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1370  if (serial_team->t.t_nested_nth)
1371  nested_nth = serial_team->t.t_nested_nth;
1372  if (nested_nth->used && (level + 1 < nested_nth->used)) {
1373  this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1374  }
1375 
1376  serial_team->t.t_level++;
1377  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1378  "of serial team %p to %d\n",
1379  global_tid, serial_team, serial_team->t.t_level));
1380 
1381  /* allocate/push dispatch buffers stack */
1382  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1383  {
1384  dispatch_private_info_t *disp_buffer =
1385  (dispatch_private_info_t *)__kmp_allocate(
1386  sizeof(dispatch_private_info_t));
1387  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1388  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1389  }
1390  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1391 
1392  /* allocate/push task team stack */
1393  __kmp_push_task_team_node(this_thr, serial_team);
1394 
1395  KMP_MB();
1396  }
1397  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1398 
1399  // Perform the display affinity functionality for
1400  // serialized parallel regions
1401  if (__kmp_display_affinity) {
1402  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1403  this_thr->th.th_prev_num_threads != 1) {
1404  // NULL means use the affinity-format-var ICV
1405  __kmp_aux_display_affinity(global_tid, NULL);
1406  this_thr->th.th_prev_level = serial_team->t.t_level;
1407  this_thr->th.th_prev_num_threads = 1;
1408  }
1409  }
1410 
1411  if (__kmp_env_consistency_check)
1412  __kmp_push_parallel(global_tid, NULL);
1413 #if OMPT_SUPPORT
1414  serial_team->t.ompt_team_info.master_return_address = codeptr;
1415  if (ompt_enabled.enabled &&
1416  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1417  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1418  OMPT_GET_FRAME_ADDRESS(0);
1419 
1420  ompt_lw_taskteam_t lw_taskteam;
1421  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1422  &ompt_parallel_data, codeptr);
1423 
1424  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1425  // Don't use lw_taskteam after linking. Content was swapped.
1426 
1427  /* OMPT implicit task begin */
1428  if (ompt_enabled.ompt_callback_implicit_task) {
1429  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1430  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1431  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1432  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1433  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1434  __kmp_tid_from_gtid(global_tid);
1435  }
1436 
1437  /* OMPT state */
1438  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1439  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1440  OMPT_GET_FRAME_ADDRESS(0);
1441  }
1442 #endif
1443 }
1444 
1445 // Test if this fork is for a team closely nested in a teams construct
1446 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1447  microtask_t microtask, int level,
1448  int teams_level, kmp_va_list ap) {
1449  return (master_th->th.th_teams_microtask && ap &&
1450  microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1451 }
1452 
1453 // Test if this fork is for the teams construct, i.e. to form the outer league
1454 // of teams
1455 static inline bool __kmp_is_entering_teams(int active_level, int level,
1456  int teams_level, kmp_va_list ap) {
1457  return ((ap == NULL && active_level == 0) ||
1458  (ap && teams_level > 0 && teams_level == level));
1459 }
1460 
1461 // AC: This is the start of a parallel region nested inside a teams construct.
1462 // The team is actual (hot); all workers are ready at the fork barrier.
1463 // No lock is needed to initialize the team a bit, then free the workers.
1464 static inline int
1465 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1466  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1467  enum fork_context_e call_context, microtask_t microtask,
1468  launch_t invoker, int master_set_numthreads, int level,
1469 #if OMPT_SUPPORT
1470  ompt_data_t ompt_parallel_data, void *return_address,
1471 #endif
1472  kmp_va_list ap) {
1473  void **argv;
1474  int i;
1475 
1476  parent_team->t.t_ident = loc;
1477  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1478  parent_team->t.t_argc = argc;
1479  argv = (void **)parent_team->t.t_argv;
1480  for (i = argc - 1; i >= 0; --i) {
1481  *argv++ = va_arg(kmp_va_deref(ap), void *);
1482  }
1483  // Increment our nested depth level, but do not increase the serialization
1484  if (parent_team == master_th->th.th_serial_team) {
1485  // AC: we are in serialized parallel
1486  __kmpc_serialized_parallel(loc, gtid);
1487  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1488 
1489  if (call_context == fork_context_gnu) {
1490  // AC: need to decrement t_serialized for enquiry functions to work
1491  // correctly, will restore at join time
1492  parent_team->t.t_serialized--;
1493  return TRUE;
1494  }
1495 
1496 #if OMPD_SUPPORT
1497  parent_team->t.t_pkfn = microtask;
1498 #endif
1499 
1500 #if OMPT_SUPPORT
1501  void *dummy;
1502  void **exit_frame_p;
1503  ompt_data_t *implicit_task_data;
1504  ompt_lw_taskteam_t lw_taskteam;
1505 
1506  if (ompt_enabled.enabled) {
1507  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1508  &ompt_parallel_data, return_address);
1509  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1510 
1511  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1512  // Don't use lw_taskteam after linking. Content was swapped.
1513 
1514  /* OMPT implicit task begin */
1515  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1516  if (ompt_enabled.ompt_callback_implicit_task) {
1517  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1518  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1519  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1520  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1521  }
1522 
1523  /* OMPT state */
1524  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1525  } else {
1526  exit_frame_p = &dummy;
1527  }
1528 #endif
1529 
1530  // AC: need to decrement t_serialized for enquiry functions to work
1531  // correctly, will restore at join time
1532  parent_team->t.t_serialized--;
1533 
1534  {
1535  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1536  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1537  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1538 #if OMPT_SUPPORT
1539  ,
1540  exit_frame_p
1541 #endif
1542  );
1543  }
1544 
1545 #if OMPT_SUPPORT
1546  if (ompt_enabled.enabled) {
1547  *exit_frame_p = NULL;
1548  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1549  if (ompt_enabled.ompt_callback_implicit_task) {
1550  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1551  ompt_scope_end, NULL, implicit_task_data, 1,
1552  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553  }
1554  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1555  __ompt_lw_taskteam_unlink(master_th);
1556  if (ompt_enabled.ompt_callback_parallel_end) {
1557  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1558  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1559  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1560  }
1561  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1562  }
1563 #endif
1564  return TRUE;
1565  }
1566 
1567  parent_team->t.t_pkfn = microtask;
1568  parent_team->t.t_invoke = invoker;
1569  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1570  parent_team->t.t_active_level++;
1571  parent_team->t.t_level++;
1572  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1573 
1574  // If the threads allocated to the team are less than the thread limit, update
1575  // the thread limit here. th_teams_size.nth is specific to this team nested
1576  // in a teams construct, the team is fully created, and we're about to do
1577  // the actual fork. Best to do this here so that the subsequent uses below
1578  // and in the join have the correct value.
1579  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1580 
1581 #if OMPT_SUPPORT
1582  if (ompt_enabled.enabled) {
1583  ompt_lw_taskteam_t lw_taskteam;
1584  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1585  return_address);
1586  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1587  }
1588 #endif
1589 
1590  /* Change number of threads in the team if requested */
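// Hedged sketch of the clause handling below (exact counts also depend on
// thread_limit and other ICVs): within a teams construct, num_threads can only
// shrink the already-forked team, never grow it, e.g.
//
//   #pragma omp teams thread_limit(8)
//   #pragma omp parallel num_threads(4)    // 4 <= th_teams_size.nth: honored
//   #pragma omp parallel num_threads(16)   // larger request: team size is kept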
1591  if (master_set_numthreads) { // The parallel has num_threads clause
1592  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1593  // AC: we can only reduce the number of threads dynamically, not increase them
1594  kmp_info_t **other_threads = parent_team->t.t_threads;
1595  // NOTE: if using distributed barrier, we need to run this code block
1596  // even when the team size appears not to have changed from the max.
1597  int old_proc = master_th->th.th_teams_size.nth;
1598  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1599  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1600  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1601  }
1602  parent_team->t.t_nproc = master_set_numthreads;
1603  for (i = 0; i < master_set_numthreads; ++i) {
1604  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1605  }
1606  }
1607  // Keep extra threads hot in the team for possible next parallels
1608  master_th->th.th_set_nproc = 0;
1609  }
1610 
1611 #if USE_DEBUGGER
1612  if (__kmp_debugging) { // Let debugger override number of threads.
1613  int nth = __kmp_omp_num_threads(loc);
1614  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1615  master_set_numthreads = nth;
1616  }
1617  }
1618 #endif
1619 
1620  // Figure out the proc_bind policy for the nested parallel within teams
1621  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1622  // proc_bind_default means don't update
1623  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1624  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1625  proc_bind = proc_bind_false;
1626  } else {
1627  // No proc_bind clause specified; use current proc-bind-var
1628  if (proc_bind == proc_bind_default) {
1629  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1630  }
1631  /* else: The proc_bind policy was specified explicitly on parallel clause.
1632  This overrides proc-bind-var for this parallel region, but does not
1633  change proc-bind-var. */
1634  // Figure the value of proc-bind-var for the child threads.
1635  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1636  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1637  master_th->th.th_current_task->td_icvs.proc_bind)) {
1638  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1639  }
1640  }
1641  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1642  // Need to change the bind-var ICV to correct value for each implicit task
1643  if (proc_bind_icv != proc_bind_default &&
1644  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1645  kmp_info_t **other_threads = parent_team->t.t_threads;
1646  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1647  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1648  }
1649  }
1650  // Reset for next parallel region
1651  master_th->th.th_set_proc_bind = proc_bind_default;
1652 
1653 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1654  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1655  KMP_ITT_DEBUG) &&
1656  __kmp_forkjoin_frames_mode == 3 &&
1657  parent_team->t.t_active_level == 1 // only report frames at level 1
1658  && master_th->th.th_teams_size.nteams == 1) {
1659  kmp_uint64 tmp_time = __itt_get_timestamp();
1660  master_th->th.th_frame_time = tmp_time;
1661  parent_team->t.t_region_time = tmp_time;
1662  }
1663  if (__itt_stack_caller_create_ptr) {
1664  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1665  // create new stack stitching id before entering fork barrier
1666  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1667  }
1668 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1669 #if KMP_AFFINITY_SUPPORTED
1670  __kmp_partition_places(parent_team);
1671 #endif
1672 
1673  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1674  "master_th=%p, gtid=%d\n",
1675  root, parent_team, master_th, gtid));
1676  __kmp_internal_fork(loc, gtid, parent_team);
1677  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1678  "master_th=%p, gtid=%d\n",
1679  root, parent_team, master_th, gtid));
1680 
1681  if (call_context == fork_context_gnu)
1682  return TRUE;
1683 
1684  /* Invoke microtask for PRIMARY thread */
1685  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1686  parent_team->t.t_id, parent_team->t.t_pkfn));
1687 
1688  if (!parent_team->t.t_invoke(gtid)) {
1689  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1690  }
1691  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1692  parent_team->t.t_id, parent_team->t.t_pkfn));
1693  KMP_MB(); /* Flush all pending memory write invalidates. */
1694 
1695  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1696 
1697  return TRUE;
1698 }
1699 
1700 // Create a serialized parallel region
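// Hedged illustration: a fork typically reaches this path when the computed
// thread count is 1, for example
//
//   #pragma omp parallel num_threads(1)
//   { /* ... */ }
//
// or a nested parallel encountered after max-active-levels-var has been
// reached; the region still executes, just on the encountering thread's
// serial team.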
1701 static inline int
1702 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1703  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1704  kmp_info_t *master_th, kmp_team_t *parent_team,
1705 #if OMPT_SUPPORT
1706  ompt_data_t *ompt_parallel_data, void **return_address,
1707  ompt_data_t **parent_task_data,
1708 #endif
1709  kmp_va_list ap) {
1710  kmp_team_t *team;
1711  int i;
1712  void **argv;
1713 
1714 /* josh todo: hypothetical question: what do we do for OS X*? */
1715 #if KMP_OS_LINUX && \
1716  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1717  SimpleVLA<void *> args(argc);
1718 #else
1719  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1720 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1721  KMP_ARCH_AARCH64) */
1722 
1723  KA_TRACE(
1724  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1725 
1726  __kmpc_serialized_parallel(loc, gtid);
1727 
1728 #if OMPD_SUPPORT
1729  master_th->th.th_serial_team->t.t_pkfn = microtask;
1730 #endif
1731 
1732  if (call_context == fork_context_intel) {
1733  /* TODO this sucks, use the compiler itself to pass args! :) */
1734  master_th->th.th_serial_team->t.t_ident = loc;
1735  if (!ap) {
1736  // revert change made in __kmpc_serialized_parallel()
1737  master_th->th.th_serial_team->t.t_level--;
1738 // Get args from parent team for teams construct
1739 
1740 #if OMPT_SUPPORT
1741  void *dummy;
1742  void **exit_frame_p;
1743  ompt_task_info_t *task_info;
1744  ompt_lw_taskteam_t lw_taskteam;
1745 
1746  if (ompt_enabled.enabled) {
1747  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1748  ompt_parallel_data, *return_address);
1749 
1750  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1751  // Don't use lw_taskteam after linking. Content was swapped.
1752  task_info = OMPT_CUR_TASK_INFO(master_th);
1753  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1754  if (ompt_enabled.ompt_callback_implicit_task) {
1755  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1756  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1757  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1758  &(task_info->task_data), 1,
1759  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1760  }
1761 
1762  /* OMPT state */
1763  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1764  } else {
1765  exit_frame_p = &dummy;
1766  }
1767 #endif
1768 
1769  {
1770  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1771  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1772  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1773 #if OMPT_SUPPORT
1774  ,
1775  exit_frame_p
1776 #endif
1777  );
1778  }
1779 
1780 #if OMPT_SUPPORT
1781  if (ompt_enabled.enabled) {
1782  *exit_frame_p = NULL;
1783  if (ompt_enabled.ompt_callback_implicit_task) {
1784  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1785  ompt_scope_end, NULL, &(task_info->task_data), 1,
1786  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1787  }
1788  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1789  __ompt_lw_taskteam_unlink(master_th);
1790  if (ompt_enabled.ompt_callback_parallel_end) {
1791  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1792  ompt_parallel_data, *parent_task_data,
1793  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1794  }
1795  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1796  }
1797 #endif
1798  } else if (microtask == (microtask_t)__kmp_teams_master) {
1799  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1800  team = master_th->th.th_team;
1801  // team->t.t_pkfn = microtask;
1802  team->t.t_invoke = invoker;
1803  __kmp_alloc_argv_entries(argc, team, TRUE);
1804  team->t.t_argc = argc;
1805  argv = (void **)team->t.t_argv;
1806  for (i = argc - 1; i >= 0; --i)
1807  *argv++ = va_arg(kmp_va_deref(ap), void *);
1808  // AC: revert change made in __kmpc_serialized_parallel()
1809  // because initial code in teams should have level=0
1810  team->t.t_level--;
1811  // AC: call special invoker for outer "parallel" of teams construct
1812  invoker(gtid);
1813 #if OMPT_SUPPORT
1814  if (ompt_enabled.enabled) {
1815  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1816  if (ompt_enabled.ompt_callback_implicit_task) {
1817  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1818  ompt_scope_end, NULL, &(task_info->task_data), 0,
1819  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1820  }
1821  if (ompt_enabled.ompt_callback_parallel_end) {
1822  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1823  ompt_parallel_data, *parent_task_data,
1824  OMPT_INVOKER(call_context) | ompt_parallel_league,
1825  *return_address);
1826  }
1827  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1828  }
1829 #endif
1830  } else {
1831  argv = args;
1832  for (i = argc - 1; i >= 0; --i)
1833  *argv++ = va_arg(kmp_va_deref(ap), void *);
1834  KMP_MB();
1835 
1836 #if OMPT_SUPPORT
1837  void *dummy;
1838  void **exit_frame_p;
1839  ompt_task_info_t *task_info;
1840  ompt_lw_taskteam_t lw_taskteam;
1841  ompt_data_t *implicit_task_data;
1842 
1843  if (ompt_enabled.enabled) {
1844  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1845  ompt_parallel_data, *return_address);
1846  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1847  // Don't use lw_taskteam after linking. Content was swapped.
1848  task_info = OMPT_CUR_TASK_INFO(master_th);
1849  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1850 
1851  /* OMPT implicit task begin */
1852  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1853  if (ompt_enabled.ompt_callback_implicit_task) {
1854  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1855  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1856  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1857  ompt_task_implicit);
1858  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1859  }
1860 
1861  /* OMPT state */
1862  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1863  } else {
1864  exit_frame_p = &dummy;
1865  }
1866 #endif
1867 
1868  {
1869  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1870  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1871  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1872 #if OMPT_SUPPORT
1873  ,
1874  exit_frame_p
1875 #endif
1876  );
1877  }
1878 
1879 #if OMPT_SUPPORT
1880  if (ompt_enabled.enabled) {
1881  *exit_frame_p = NULL;
1882  if (ompt_enabled.ompt_callback_implicit_task) {
1883  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884  ompt_scope_end, NULL, &(task_info->task_data), 1,
1885  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1886  }
1887 
1888  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1889  __ompt_lw_taskteam_unlink(master_th);
1890  if (ompt_enabled.ompt_callback_parallel_end) {
1891  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1892  ompt_parallel_data, *parent_task_data,
1893  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1894  }
1895  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1896  }
1897 #endif
1898  }
1899  } else if (call_context == fork_context_gnu) {
1900 #if OMPT_SUPPORT
1901  if (ompt_enabled.enabled) {
1902  ompt_lw_taskteam_t lwt;
1903  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1904  *return_address);
1905 
1906  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1907  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1908  }
1909 // Don't use lw_taskteam after linking. Content was swapped.
1910 #endif
1911 
1912  // we were called from GNU native code
1913  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1914  return FALSE;
1915  } else {
1916  KMP_ASSERT2(call_context < fork_context_last,
1917  "__kmp_serial_fork_call: unknown fork_context parameter");
1918  }
1919 
1920  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1921  KMP_MB();
1922  return FALSE;
1923 }
1924 
1925 /* most of the work for a fork */
1926 /* return true if we really went parallel, false if serialized */
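// Hedged sketch of how this entry point is reached: the compiler outlines the
// parallel region body into a microtask and the __kmpc_fork_call entry hands
// it to this routine, roughly
//
//   #pragma omp parallel
//   { body(); }
//
// lowering to something like __kmpc_fork_call(&loc, argc, outlined_microtask, ...).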
1927 int __kmp_fork_call(ident_t *loc, int gtid,
1928  enum fork_context_e call_context, // Intel, GNU, ...
1929  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1930  kmp_va_list ap) {
1931  void **argv;
1932  int i;
1933  int master_tid;
1934  int master_this_cons;
1935  kmp_team_t *team;
1936  kmp_team_t *parent_team;
1937  kmp_info_t *master_th;
1938  kmp_root_t *root;
1939  int nthreads;
1940  int master_active;
1941  int master_set_numthreads;
1942  int task_thread_limit = 0;
1943  int level;
1944  int active_level;
1945  int teams_level;
1946 #if KMP_NESTED_HOT_TEAMS
1947  kmp_hot_team_ptr_t **p_hot_teams;
1948 #endif
1949  { // KMP_TIME_BLOCK
1950  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1951  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1952 
1953  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1954  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1955  /* Some systems prefer the stack for the root thread(s) to start with */
1956  /* some gap from the parent stack to prevent false sharing. */
1957  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1958  /* These 2 lines below are so this does not get optimized out */
1959  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1960  __kmp_stkpadding += (short)((kmp_int64)dummy);
1961  }
1962 
1963  /* initialize if needed */
1964  KMP_DEBUG_ASSERT(
1965  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1966  if (!TCR_4(__kmp_init_parallel))
1967  __kmp_parallel_initialize();
1968  __kmp_resume_if_soft_paused();
1969 
1970  /* setup current data */
1971  // AC: potentially unsafe, not in sync with library shutdown,
1972  // __kmp_threads can be freed
1973  master_th = __kmp_threads[gtid];
1974 
1975  parent_team = master_th->th.th_team;
1976  master_tid = master_th->th.th_info.ds.ds_tid;
1977  master_this_cons = master_th->th.th_local.this_construct;
1978  root = master_th->th.th_root;
1979  master_active = root->r.r_active;
1980  master_set_numthreads = master_th->th.th_set_nproc;
1981  task_thread_limit =
1982  master_th->th.th_current_task->td_icvs.task_thread_limit;
1983 
1984 #if OMPT_SUPPORT
1985  ompt_data_t ompt_parallel_data = ompt_data_none;
1986  ompt_data_t *parent_task_data;
1987  ompt_frame_t *ompt_frame;
1988  void *return_address = NULL;
1989 
1990  if (ompt_enabled.enabled) {
1991  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1992  NULL, NULL);
1993  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1994  }
1995 #endif
1996 
1997  // Assign affinity to root thread if it hasn't happened yet
1998  __kmp_assign_root_init_mask();
1999 
2000  // Nested level will be an index in the nested nthreads array
2001  level = parent_team->t.t_level;
2002  // used to launch non-serial teams even if nesting is not allowed
2003  active_level = parent_team->t.t_active_level;
2004  // needed to check nesting inside the teams
2005  teams_level = master_th->th.th_teams_level;
2006 #if KMP_NESTED_HOT_TEAMS
2007  p_hot_teams = &master_th->th.th_hot_teams;
2008  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2009  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2010  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2011  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2012  // it is either actual or not needed (when active_level > 0)
2013  (*p_hot_teams)[0].hot_team_nth = 1;
2014  }
2015 #endif
2016 
2017 #if OMPT_SUPPORT
2018  if (ompt_enabled.enabled) {
2019  if (ompt_enabled.ompt_callback_parallel_begin) {
2020  int team_size = master_set_numthreads
2021  ? master_set_numthreads
2022  : get__nproc_2(parent_team, master_tid);
2023  int flags = OMPT_INVOKER(call_context) |
2024  ((microtask == (microtask_t)__kmp_teams_master)
2025  ? ompt_parallel_league
2026  : ompt_parallel_team);
2027  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2028  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2029  return_address);
2030  }
2031  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2032  }
2033 #endif
2034 
2035  master_th->th.th_ident = loc;
2036 
2037  // Parallel closely nested in teams construct:
2038  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2039  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2040  call_context, microtask, invoker,
2041  master_set_numthreads, level,
2042 #if OMPT_SUPPORT
2043  ompt_parallel_data, return_address,
2044 #endif
2045  ap);
2046  } // End parallel closely nested in teams construct
2047 
2048  // Need this to happen before we determine the number of threads, not while
2049  // we are allocating the team
2050  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2051 
2052  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2053 
2054  // Determine the number of threads
2055  int enter_teams =
2056  __kmp_is_entering_teams(active_level, level, teams_level, ap);
2057  if ((!enter_teams &&
2058  (parent_team->t.t_active_level >=
2059  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2060  (__kmp_library == library_serial)) {
2061  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2062  nthreads = 1;
2063  } else {
2064  nthreads = master_set_numthreads
2065  ? master_set_numthreads
2066  // TODO: get nproc directly from current task
2067  : get__nproc_2(parent_team, master_tid);
2068  // Use the thread_limit set for the current target task if exists, else go
2069  // with the deduced nthreads
2070  nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2071  ? task_thread_limit
2072  : nthreads;
2073  // Check whether we need to take the forkjoin lock (not needed for a
2074  // serialized parallel outside of a teams construct).
2075  if (nthreads > 1) {
2076  /* determine how many new threads we can use */
2077  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2078  /* AC: If we execute teams from parallel region (on host), then teams
2079  should be created but each can only have 1 thread if nesting is
2080  disabled. If teams called from serial region, then teams and their
2081  threads should be created regardless of the nesting setting. */
2082  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2083  nthreads, enter_teams);
2084  if (nthreads == 1) {
2085  // Free lock for single thread execution here; for multi-thread
2086  // execution it will be freed later after team of threads created
2087  // and initialized
2088  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2089  }
2090  }
2091  }
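// Worked example of the selection above (a sketch; __kmp_reserve_threads may
// still reduce the result): with nproc-var = 8, a num_threads(6) clause, and a
// target-task thread_limit of 4, the clause wins over the ICV (6), then the
// task thread limit clamps it, so nthreads == 4 at this point.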
2092  KMP_DEBUG_ASSERT(nthreads > 0);
2093 
2094  // If we temporarily changed the set number of threads then restore it now
2095  master_th->th.th_set_nproc = 0;
2096 
2097  if (nthreads == 1) {
2098  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2099  invoker, master_th, parent_team,
2100 #if OMPT_SUPPORT
2101  &ompt_parallel_data, &return_address,
2102  &parent_task_data,
2103 #endif
2104  ap);
2105  } // if (nthreads == 1)
2106 
2107  // GEH: only modify the executing flag in the case when not serialized;
2108  // the serialized case is handled in __kmpc_serialized_parallel
2109  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2110  "curtask=%p, curtask_max_aclevel=%d\n",
2111  parent_team->t.t_active_level, master_th,
2112  master_th->th.th_current_task,
2113  master_th->th.th_current_task->td_icvs.max_active_levels));
2114  // TODO: GEH - cannot do this assertion because root thread not set up as
2115  // executing
2116  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2117  master_th->th.th_current_task->td_flags.executing = 0;
2118 
2119  if (!master_th->th.th_teams_microtask || level > teams_level) {
2120  /* Increment our nested depth level */
2121  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2122  }
2123 
2124  // See if we need to make a copy of the ICVs.
2125  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2126  kmp_nested_nthreads_t *nested_nth = NULL;
2127  if (!master_th->th.th_set_nested_nth &&
2128  (level + 1 < parent_team->t.t_nested_nth->used) &&
2129  (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2130  nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2131  } else if (master_th->th.th_set_nested_nth) {
2132  nested_nth = __kmp_override_nested_nth(master_th, level);
2133  if ((level + 1 < nested_nth->used) &&
2134  (nested_nth->nth[level + 1] != nthreads_icv))
2135  nthreads_icv = nested_nth->nth[level + 1];
2136  else
2137  nthreads_icv = 0; // don't update
2138  } else {
2139  nthreads_icv = 0; // don't update
2140  }
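// Hedged example of the nested nthreads lookup above: if OMP_NUM_THREADS=8,2
// populated the parent team's t_nested_nth as {8, 2}, a fork at level 0 sees
// nth[1] == 2 != nproc-var, so nthreads_icv becomes 2 and the new team's
// implicit tasks carry nproc-var = 2, unless th_set_nested_nth overrides the
// list for this thread.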
2141 
2142  // Figure out the proc_bind policy for the new team.
2143  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2144  // proc_bind_default means don't update
2145  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2146  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2147  proc_bind = proc_bind_false;
2148  } else {
2149  // No proc_bind clause specified; use current proc-bind-var for this
2150  // parallel region
2151  if (proc_bind == proc_bind_default) {
2152  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2153  }
2154  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2155  if (master_th->th.th_teams_microtask &&
2156  microtask == (microtask_t)__kmp_teams_master) {
2157  proc_bind = __kmp_teams_proc_bind;
2158  }
2159  /* else: The proc_bind policy was specified explicitly on parallel clause.
2160  This overrides proc-bind-var for this parallel region, but does not
2161  change proc-bind-var. */
2162  // Figure the value of proc-bind-var for the child threads.
2163  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2164  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2165  master_th->th.th_current_task->td_icvs.proc_bind)) {
2166  // Do not modify the proc_bind ICV for the two teams construct forks;
2167  // they just let the proc_bind ICV pass through.
2168  if (!master_th->th.th_teams_microtask ||
2169  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2170  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2171  }
2172  }
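// Hedged example of the bind-var handling above: with OMP_PROC_BIND=spread,close
// (assuming that is what populated __kmp_nested_proc_bind) and no proc_bind
// clause, this region binds spread while proc_bind_icv becomes close, which the
// new team's implicit tasks inherit as their proc-bind-var for the next level.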
2173 
2174  // Reset for next parallel region
2175  master_th->th.th_set_proc_bind = proc_bind_default;
2176 
2177  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2178  kmp_internal_control_t new_icvs;
2179  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2180  new_icvs.next = NULL;
2181  if (nthreads_icv > 0) {
2182  new_icvs.nproc = nthreads_icv;
2183  }
2184  if (proc_bind_icv != proc_bind_default) {
2185  new_icvs.proc_bind = proc_bind_icv;
2186  }
2187 
2188  /* allocate a new parallel team */
2189  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2190  team = __kmp_allocate_team(root, nthreads, nthreads,
2191 #if OMPT_SUPPORT
2192  ompt_parallel_data,
2193 #endif
2194  proc_bind, &new_icvs,
2195  argc USE_NESTED_HOT_ARG(master_th));
2196  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2197  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2198  } else {
2199  /* allocate a new parallel team */
2200  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2201  team = __kmp_allocate_team(root, nthreads, nthreads,
2202 #if OMPT_SUPPORT
2203  ompt_parallel_data,
2204 #endif
2205  proc_bind,
2206  &master_th->th.th_current_task->td_icvs,
2207  argc USE_NESTED_HOT_ARG(master_th));
2208  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2209  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2210  &master_th->th.th_current_task->td_icvs);
2211  }
2212  KF_TRACE(
2213  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2214 
2215  /* setup the new team */
2216  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2217  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2218  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2219  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2220  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2221 #if OMPT_SUPPORT
2222  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2223  return_address);
2224 #endif
2225  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2226  // TODO: parent_team->t.t_level == INT_MAX ???
2227  if (!master_th->th.th_teams_microtask || level > teams_level) {
2228  int new_level = parent_team->t.t_level + 1;
2229  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2230  new_level = parent_team->t.t_active_level + 1;
2231  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2232  } else {
2233  // AC: Do not increase parallel level at start of the teams construct
2234  int new_level = parent_team->t.t_level;
2235  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2236  new_level = parent_team->t.t_active_level;
2237  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2238  }
2239  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2240  // set primary thread's schedule as new run-time schedule
2241  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2242 
2243  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2244  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2245 
2246  // Check if hot team has potentially outdated list, and if so, free it
2247  if (team->t.t_nested_nth &&
2248  team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2249  KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2250  KMP_INTERNAL_FREE(team->t.t_nested_nth);
2251  team->t.t_nested_nth = NULL;
2252  }
2253  team->t.t_nested_nth = parent_team->t.t_nested_nth;
2254  if (master_th->th.th_set_nested_nth) {
2255  if (!nested_nth)
2256  nested_nth = __kmp_override_nested_nth(master_th, level);
2257  team->t.t_nested_nth = nested_nth;
2258  KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2259  master_th->th.th_set_nested_nth = NULL;
2260  master_th->th.th_set_nested_nth_sz = 0;
2261  master_th->th.th_nt_strict = false;
2262  }
2263 
2264  // Update the floating point rounding in the team if required.
2265  propagateFPControl(team);
2266 #if OMPD_SUPPORT
2267  if (ompd_state & OMPD_ENABLE_BP)
2268  ompd_bp_parallel_begin();
2269 #endif
2270 
2271  KA_TRACE(
2272  20,
2273  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2274  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2275  team->t.t_nproc));
2276  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2277  (team->t.t_master_tid == 0 &&
2278  (team->t.t_parent == root->r.r_root_team ||
2279  team->t.t_parent->t.t_serialized)));
2280  KMP_MB();
2281 
2282  /* now, setup the arguments */
2283  argv = (void **)team->t.t_argv;
2284  if (ap) {
2285  for (i = argc - 1; i >= 0; --i) {
2286  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2287  KMP_CHECK_UPDATE(*argv, new_argv);
2288  argv++;
2289  }
2290  } else {
2291  for (i = 0; i < argc; ++i) {
2292  // Get args from parent team for teams construct
2293  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2294  }
2295  }
2296 
2297  /* now actually fork the threads */
2298  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2299  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2300  root->r.r_active = TRUE;
2301 
2302  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2303  __kmp_setup_icv_copy(team, nthreads,
2304  &master_th->th.th_current_task->td_icvs, loc);
2305 
2306 #if OMPT_SUPPORT
2307  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2308 #endif
2309 
2310  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2311 
2312 #if USE_ITT_BUILD
2313  if (team->t.t_active_level == 1 // only report frames at level 1
2314  && !master_th->th.th_teams_microtask) { // not in teams construct
2315 #if USE_ITT_NOTIFY
2316  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2317  (__kmp_forkjoin_frames_mode == 3 ||
2318  __kmp_forkjoin_frames_mode == 1)) {
2319  kmp_uint64 tmp_time = 0;
2320  if (__itt_get_timestamp_ptr)
2321  tmp_time = __itt_get_timestamp();
2322  // Internal fork - report frame begin
2323  master_th->th.th_frame_time = tmp_time;
2324  if (__kmp_forkjoin_frames_mode == 3)
2325  team->t.t_region_time = tmp_time;
2326  } else
2327 // only one notification scheme (either "submit" or "forking/joined", not both)
2328 #endif /* USE_ITT_NOTIFY */
2329  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2330  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2331  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2332  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2333  }
2334  }
2335 #endif /* USE_ITT_BUILD */
2336 
2337  /* now go on and do the work */
2338  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2339  KMP_MB();
2340  KF_TRACE(10,
2341  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2342  root, team, master_th, gtid));
2343 
2344 #if USE_ITT_BUILD
2345  if (__itt_stack_caller_create_ptr) {
2346  // create new stack stitching id before entering fork barrier
2347  if (!enter_teams) {
2348  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2349  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2350  } else if (parent_team->t.t_serialized) {
2351  // keep stack stitching id in the serialized parent_team;
2352  // current team will be used for parallel inside the teams;
2353  // if parent_team is active, then it already keeps stack stitching id
2354  // for the league of teams
2355  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2356  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2357  }
2358  }
2359 #endif /* USE_ITT_BUILD */
2360 
2361  // AC: skip __kmp_internal_fork at teams construct, let only primary
2362  // threads execute
2363  if (ap) {
2364  __kmp_internal_fork(loc, gtid, team);
2365  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2366  "master_th=%p, gtid=%d\n",
2367  root, team, master_th, gtid));
2368  }
2369 
2370  if (call_context == fork_context_gnu) {
2371  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2372  return TRUE;
2373  }
2374 
2375  /* Invoke microtask for PRIMARY thread */
2376  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2377  team->t.t_id, team->t.t_pkfn));
2378  } // END of timer KMP_fork_call block
2379 
2380 #if KMP_STATS_ENABLED
2381  // If beginning a teams construct, then change thread state
2382  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2383  if (!ap) {
2384  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2385  }
2386 #endif
2387 
2388  if (!team->t.t_invoke(gtid)) {
2389  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2390  }
2391 
2392 #if KMP_STATS_ENABLED
2393  // If was beginning of a teams construct, then reset thread state
2394  if (!ap) {
2395  KMP_SET_THREAD_STATE(previous_state);
2396  }
2397 #endif
2398 
2399  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2400  team->t.t_id, team->t.t_pkfn));
2401  KMP_MB(); /* Flush all pending memory write invalidates. */
2402 
2403  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2404 #if OMPT_SUPPORT
2405  if (ompt_enabled.enabled) {
2406  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2407  }
2408 #endif
2409 
2410  return TRUE;
2411 }
2412 
2413 #if OMPT_SUPPORT
2414 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2415  kmp_team_t *team) {
2416  // restore state outside the region
2417  thread->th.ompt_thread_info.state =
2418  ((team->t.t_serialized) ? ompt_state_work_serial
2419  : ompt_state_work_parallel);
2420 }
2421 
2422 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2423  kmp_team_t *team, ompt_data_t *parallel_data,
2424  int flags, void *codeptr) {
2425  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426  if (ompt_enabled.ompt_callback_parallel_end) {
2427  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2428  parallel_data, &(task_info->task_data), flags, codeptr);
2429  }
2430 
2431  task_info->frame.enter_frame = ompt_data_none;
2432  __kmp_join_restore_state(thread, team);
2433 }
2434 #endif
2435 
2436 void __kmp_join_call(ident_t *loc, int gtid
2437 #if OMPT_SUPPORT
2438  ,
2439  enum fork_context_e fork_context
2440 #endif
2441  ,
2442  int exit_teams) {
2443  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2444  kmp_team_t *team;
2445  kmp_team_t *parent_team;
2446  kmp_info_t *master_th;
2447  kmp_root_t *root;
2448  int master_active;
2449 
2450  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2451 
2452  /* setup current data */
2453  master_th = __kmp_threads[gtid];
2454  root = master_th->th.th_root;
2455  team = master_th->th.th_team;
2456  parent_team = team->t.t_parent;
2457 
2458  master_th->th.th_ident = loc;
2459 
2460 #if OMPT_SUPPORT
2461  void *team_microtask = (void *)team->t.t_pkfn;
2462  // For GOMP interface with serialized parallel, need the
2463  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2464  // and end-parallel events.
2465  if (ompt_enabled.enabled &&
2466  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2467  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2468  }
2469 #endif
2470 
2471 #if KMP_DEBUG
2472  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2473  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2474  "th_task_team = %p\n",
2475  __kmp_gtid_from_thread(master_th), team,
2476  team->t.t_task_team[master_th->th.th_task_state],
2477  master_th->th.th_task_team));
2478  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2479  }
2480 #endif
2481 
2482  if (team->t.t_serialized) {
2483  if (master_th->th.th_teams_microtask) {
2484  // We are in teams construct
2485  int level = team->t.t_level;
2486  int tlevel = master_th->th.th_teams_level;
2487  if (level == tlevel) {
2488  // AC: we haven't incremented it earlier at start of teams construct,
2489  // so do it here - at the end of teams construct
2490  team->t.t_level++;
2491  } else if (level == tlevel + 1) {
2492  // AC: we are exiting parallel inside teams, need to increment
2493  // serialization in order to restore it in the next call to
2494  // __kmpc_end_serialized_parallel
2495  team->t.t_serialized++;
2496  }
2497  }
2498  __kmpc_end_serialized_parallel(loc, gtid);
2499 
2500 #if OMPT_SUPPORT
2501  if (ompt_enabled.enabled) {
2502  if (fork_context == fork_context_gnu) {
2503  __ompt_lw_taskteam_unlink(master_th);
2504  }
2505  __kmp_join_restore_state(master_th, parent_team);
2506  }
2507 #endif
2508 
2509  return;
2510  }
2511 
2512  master_active = team->t.t_master_active;
2513 
2514  if (!exit_teams) {
2515  // AC: No barrier for internal teams at exit from teams construct.
2516  // But there is barrier for external team (league).
2517  __kmp_internal_join(loc, gtid, team);
2518 #if USE_ITT_BUILD
2519  if (__itt_stack_caller_create_ptr) {
2520  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2521  // destroy the stack stitching id after join barrier
2522  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2523  team->t.t_stack_id = NULL;
2524  }
2525 #endif
2526  } else {
2527  master_th->th.th_task_state =
2528  0; // AC: no tasking in teams (outside of any parallel region)
2529 #if USE_ITT_BUILD
2530  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2531  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2532  // destroy the stack stitching id on exit from the teams construct
2533  // if parent_team is active, then the id will be destroyed later on
2534  // by master of the league of teams
2535  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2536  parent_team->t.t_stack_id = NULL;
2537  }
2538 #endif
2539  }
2540 
2541  KMP_MB();
2542 
2543 #if OMPT_SUPPORT
2544  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2545  void *codeptr = team->t.ompt_team_info.master_return_address;
2546 #endif
2547 
2548 #if USE_ITT_BUILD
2549  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2550  if (team->t.t_active_level == 1 &&
2551  (!master_th->th.th_teams_microtask || /* not in teams construct */
2552  master_th->th.th_teams_size.nteams == 1)) {
2553  master_th->th.th_ident = loc;
2554  // only one notification scheme (either "submit" or "forking/joined", not
2555  // both)
2556  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2557  __kmp_forkjoin_frames_mode == 3)
2558  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2559  master_th->th.th_frame_time, 0, loc,
2560  master_th->th.th_team_nproc, 1);
2561  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2562  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2563  __kmp_itt_region_joined(gtid);
2564  } // active_level == 1
2565 #endif /* USE_ITT_BUILD */
2566 
2567 #if KMP_AFFINITY_SUPPORTED
2568  if (!exit_teams) {
2569  // Restore master thread's partition.
2570  master_th->th.th_first_place = team->t.t_first_place;
2571  master_th->th.th_last_place = team->t.t_last_place;
2572  }
2573 #endif // KMP_AFFINITY_SUPPORTED
2574 
2575  if (master_th->th.th_teams_microtask && !exit_teams &&
2576  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2577  team->t.t_level == master_th->th.th_teams_level + 1) {
2578 // AC: We need to leave the team structure intact at the end of parallel
2579 // inside the teams construct, so that at the next parallel same (hot) team
2580 // works, only adjust nesting levels
2581 #if OMPT_SUPPORT
2582  ompt_data_t ompt_parallel_data = ompt_data_none;
2583  if (ompt_enabled.enabled) {
2584  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2585  if (ompt_enabled.ompt_callback_implicit_task) {
2586  int ompt_team_size = team->t.t_nproc;
2587  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2588  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2589  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2590  }
2591  task_info->frame.exit_frame = ompt_data_none;
2592  task_info->task_data = ompt_data_none;
2593  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2594  __ompt_lw_taskteam_unlink(master_th);
2595  }
2596 #endif
2597  /* Decrement our nested depth level */
2598  team->t.t_level--;
2599  team->t.t_active_level--;
2600  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2601 
2602  // Restore number of threads in the team if needed. This code relies on
2603  // the proper adjustment of th_teams_size.nth after the fork in
2604  // __kmp_teams_master on each teams primary thread in the case that
2605  // __kmp_reserve_threads reduced it.
2606  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2607  int old_num = master_th->th.th_team_nproc;
2608  int new_num = master_th->th.th_teams_size.nth;
2609  kmp_info_t **other_threads = team->t.t_threads;
2610  team->t.t_nproc = new_num;
2611  for (int i = 0; i < old_num; ++i) {
2612  other_threads[i]->th.th_team_nproc = new_num;
2613  }
2614  // Adjust states of non-used threads of the team
2615  for (int i = old_num; i < new_num; ++i) {
2616  // Re-initialize thread's barrier data.
2617  KMP_DEBUG_ASSERT(other_threads[i]);
2618  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2619  for (int b = 0; b < bs_last_barrier; ++b) {
2620  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2621  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2622 #if USE_DEBUGGER
2623  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2624 #endif
2625  }
2626  if (__kmp_tasking_mode != tskm_immediate_exec) {
2627  // Synchronize thread's task state
2628  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2629  }
2630  }
2631  }
2632 
2633 #if OMPT_SUPPORT
2634  if (ompt_enabled.enabled) {
2635  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2636  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2637  }
2638 #endif
2639 
2640  return;
2641  }
2642 
2643  /* do cleanup and restore the parent team */
2644  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2645  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2646 
2647  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2648 
2649  /* jc: The following lock has instructions with REL and ACQ semantics,
2650  separating the parallel user code called in this parallel region
2651  from the serial user code called after this function returns. */
2652  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2653 
2654  if (!master_th->th.th_teams_microtask ||
2655  team->t.t_level > master_th->th.th_teams_level) {
2656  /* Decrement our nested depth level */
2657  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2658  }
2659  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2660 
2661 #if OMPT_SUPPORT
2662  if (ompt_enabled.enabled) {
2663  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2664  if (ompt_enabled.ompt_callback_implicit_task) {
2665  int flags = (team_microtask == (void *)__kmp_teams_master)
2666  ? ompt_task_initial
2667  : ompt_task_implicit;
2668  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2669  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2670  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2671  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2672  }
2673  task_info->frame.exit_frame = ompt_data_none;
2674  task_info->task_data = ompt_data_none;
2675  }
2676 #endif
2677 
2678  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2679  master_th, team));
2680  __kmp_pop_current_task_from_thread(master_th);
2681 
2682  master_th->th.th_def_allocator = team->t.t_def_allocator;
2683 
2684 #if OMPD_SUPPORT
2685  if (ompd_state & OMPD_ENABLE_BP)
2686  ompd_bp_parallel_end();
2687 #endif
2688  updateHWFPControl(team);
2689 
2690  if (root->r.r_active != master_active)
2691  root->r.r_active = master_active;
2692 
2693  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2694  master_th)); // this will free worker threads
2695 
2696  /* This race was fun to find. Make sure the following is inside the critical
2697  region, otherwise assertions may fail occasionally since the old team may be
2698  reallocated and the hierarchy appears inconsistent. It is actually safe to
2699  run and won't cause any bugs, but will cause those assertion failures. It's
2700  only one deref&assign, so we might as well put this in the critical region. */
2701  master_th->th.th_team = parent_team;
2702  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2703  master_th->th.th_team_master = parent_team->t.t_threads[0];
2704  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2705 
2706  /* restore serialized team, if need be */
2707  if (parent_team->t.t_serialized &&
2708  parent_team != master_th->th.th_serial_team &&
2709  parent_team != root->r.r_root_team) {
2710  __kmp_free_team(root,
2711  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2712  master_th->th.th_serial_team = parent_team;
2713  }
2714 
2715  if (__kmp_tasking_mode != tskm_immediate_exec) {
2716  // Restore primary thread's task state from team structure
2717  KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2718  team->t.t_primary_task_state == 1);
2719  master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2720 
2721  // Copy the task team from the parent team to the primary thread
2722  master_th->th.th_task_team =
2723  parent_team->t.t_task_team[master_th->th.th_task_state];
2724  KA_TRACE(20,
2725  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2726  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2727  parent_team));
2728  }
2729 
2730  // TODO: GEH - cannot do this assertion because root thread not set up as
2731  // executing
2732  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2733  master_th->th.th_current_task->td_flags.executing = 1;
2734 
2735  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2736 
2737 #if KMP_AFFINITY_SUPPORTED
2738  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2739  __kmp_reset_root_init_mask(gtid);
2740  }
2741 #endif
2742 #if OMPT_SUPPORT
2743  int flags =
2744  OMPT_INVOKER(fork_context) |
2745  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2746  : ompt_parallel_team);
2747  if (ompt_enabled.enabled) {
2748  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2749  codeptr);
2750  }
2751 #endif
2752 
2753  KMP_MB();
2754  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2755 }
2756 
2757 /* Check whether we should push an internal control record onto the
2758  serial team stack. If so, do it. */
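// Hedged illustration of why this is needed: an ICV change made inside a
// serialized (nested) parallel region must not leak out when that region ends.
// A rough sketch, assuming both regions serialize onto the serial team:
//
//   #pragma omp parallel num_threads(1)
//   #pragma omp parallel num_threads(1)
//   { omp_set_num_threads(2); }   // pre-change ICVs pushed on the stack here
//   // enclosing levels see their own settings restored on exit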
2759 void __kmp_save_internal_controls(kmp_info_t *thread) {
2760 
2761  if (thread->th.th_team != thread->th.th_serial_team) {
2762  return;
2763  }
2764  if (thread->th.th_team->t.t_serialized > 1) {
2765  int push = 0;
2766 
2767  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2768  push = 1;
2769  } else {
2770  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2771  thread->th.th_team->t.t_serialized) {
2772  push = 1;
2773  }
2774  }
2775  if (push) { /* push a record on the serial team's stack */
2776  kmp_internal_control_t *control =
2777  (kmp_internal_control_t *)__kmp_allocate(
2778  sizeof(kmp_internal_control_t));
2779 
2780  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2781 
2782  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2783 
2784  control->next = thread->th.th_team->t.t_control_stack_top;
2785  thread->th.th_team->t.t_control_stack_top = control;
2786  }
2787  }
2788 }
2789 
2790 /* Changes set_nproc */
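// Hedged usage sketch (this is typically the path behind omp_set_num_threads()):
//
//   omp_set_num_threads(2);   // updates nproc-var; may also shrink an idle
//   #pragma omp parallel      // hot team right away, see below
//   { /* runs with 2 threads unless a clause overrides it */ }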
2791 void __kmp_set_num_threads(int new_nth, int gtid) {
2792  kmp_info_t *thread;
2793  kmp_root_t *root;
2794 
2795  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2796  KMP_DEBUG_ASSERT(__kmp_init_serial);
2797 
2798  if (new_nth < 1)
2799  new_nth = 1;
2800  else if (new_nth > __kmp_max_nth)
2801  new_nth = __kmp_max_nth;
2802 
2803  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2804  thread = __kmp_threads[gtid];
2805  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2806  return; // nothing to do
2807 
2808  __kmp_save_internal_controls(thread);
2809 
2810  set__nproc(thread, new_nth);
2811 
2812  // If this omp_set_num_threads() call will cause the hot team size to be
2813  // reduced (in the absence of a num_threads clause), then reduce it now,
2814  // rather than waiting for the next parallel region.
2815  root = thread->th.th_root;
2816  if (__kmp_init_parallel && (!root->r.r_active) &&
2817  (root->r.r_hot_team->t.t_nproc > new_nth)
2818 #if KMP_NESTED_HOT_TEAMS
2819  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2820 #endif
2821  ) {
2822  kmp_team_t *hot_team = root->r.r_hot_team;
2823  int f;
2824 
2825  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2826 
2827  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2828  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2829  }
2830  // Release the extra threads we don't need any more.
2831  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2832  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2833  if (__kmp_tasking_mode != tskm_immediate_exec) {
2834  // When decreasing the team size, threads no longer in the team should
2835  // release their reference to the task team.
2836  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2837  }
2838  __kmp_free_thread(hot_team->t.t_threads[f]);
2839  hot_team->t.t_threads[f] = NULL;
2840  }
2841  hot_team->t.t_nproc = new_nth;
2842 #if KMP_NESTED_HOT_TEAMS
2843  if (thread->th.th_hot_teams) {
2844  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2845  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2846  }
2847 #endif
2848 
2849  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2850  hot_team->t.b->update_num_threads(new_nth);
2851  __kmp_add_threads_to_team(hot_team, new_nth);
2852  }
2853 
2854  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2855 
2856  // Update the t_nproc field in the threads that are still active.
2857  for (f = 0; f < new_nth; f++) {
2858  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2859  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2860  }
2861  // Special flag to indicate the size change came from an omp_set_num_threads() call
2862  hot_team->t.t_size_changed = -1;
2863  }
2864 }
2865 
2866 /* Changes max_active_levels */
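// Hedged usage sketch (typically reached via omp_set_max_active_levels()):
//
//   omp_set_max_active_levels(1);
//   #pragma omp parallel        // active level 1
//   #pragma omp parallel        // would exceed the limit -> serialized
//   { /* ... */ }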
2867 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2868  kmp_info_t *thread;
2869 
2870  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2871  "%d = (%d)\n",
2872  gtid, max_active_levels));
2873  KMP_DEBUG_ASSERT(__kmp_init_serial);
2874 
2875  // validate max_active_levels
2876  if (max_active_levels < 0) {
2877  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2878  // We ignore this call if the user has specified a negative value.
2879  // The current setting won't be changed. The last valid setting will be
2880  // used. A warning will be issued (if warnings are allowed as controlled by
2881  // the KMP_WARNINGS env var).
2882  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2883  "max_active_levels for thread %d = (%d)\n",
2884  gtid, max_active_levels));
2885  return;
2886  }
2887  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2888  // it's OK, the max_active_levels is within the valid range: [ 0;
2889  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2890  // We allow a zero value. (implementation defined behavior)
2891  } else {
2892  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2893  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2894  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2895  // Current upper limit is MAX_INT. (implementation defined behavior)
2896  // If the input exceeds the upper limit, we correct the input to be the
2897  // upper limit. (implementation defined behavior)
2898  // Actually, the flow should never get here as long as the limit is MAX_INT.
2899  }
2900  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2901  "max_active_levels for thread %d = (%d)\n",
2902  gtid, max_active_levels));
2903 
2904  thread = __kmp_threads[gtid];
2905 
2906  __kmp_save_internal_controls(thread);
2907 
2908  set__max_active_levels(thread, max_active_levels);
2909 }
2910 
2911 /* Gets max_active_levels */
2912 int __kmp_get_max_active_levels(int gtid) {
2913  kmp_info_t *thread;
2914 
2915  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2916  KMP_DEBUG_ASSERT(__kmp_init_serial);
2917 
2918  thread = __kmp_threads[gtid];
2919  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2920  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2921  "curtask_maxaclevel=%d\n",
2922  gtid, thread->th.th_current_task,
2923  thread->th.th_current_task->td_icvs.max_active_levels));
2924  return thread->th.th_current_task->td_icvs.max_active_levels;
2925 }
2926 
2927 // nteams-var per-device ICV
2928 void __kmp_set_num_teams(int num_teams) {
2929  if (num_teams > 0)
2930  __kmp_nteams = num_teams;
2931 }
2932 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2933 // teams-thread-limit-var per-device ICV
2934 void __kmp_set_teams_thread_limit(int limit) {
2935  if (limit > 0)
2936  __kmp_teams_thread_limit = limit;
2937 }
2938 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
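// Hedged usage sketch: the setters above back the OpenMP 5.1 calls
// omp_set_num_teams() and omp_set_teams_thread_limit(); a later teams construct
// without clauses would pick the values up, e.g.
//
//   omp_set_num_teams(4);
//   omp_set_teams_thread_limit(8);
//   #pragma omp teams          // up to 4 teams, each limited to 8 threads
//   { /* ... */ }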
2939 
2940 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2941 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2942 
2943 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
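// Hedged usage sketch (typically reached via omp_set_schedule()): the pair
// (kind, chunk) becomes the run-sched-var that schedule(runtime) loops consume:
//
//   omp_set_schedule(omp_sched_dynamic, 4);
//   #pragma omp parallel for schedule(runtime)   // behaves as dynamic,4
//   for (int i = 0; i < n; ++i) { /* ... */ }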
2944 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2945  kmp_info_t *thread;
2946  kmp_sched_t orig_kind;
2947  // kmp_team_t *team;
2948 
2949  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2950  gtid, (int)kind, chunk));
2951  KMP_DEBUG_ASSERT(__kmp_init_serial);
2952 
2953  // Check if the kind parameter is valid, correct if needed.
2954  // Valid parameters should fit in one of two intervals - standard or extended:
2955  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2956  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2957  orig_kind = kind;
2958  kind = __kmp_sched_without_mods(kind);
2959 
2960  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2961  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2962  // TODO: Hint needs attention in case we change the default schedule.
2963  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2964  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2965  __kmp_msg_null);
2966  kind = kmp_sched_default;
2967  chunk = 0; // ignore chunk value in case of bad kind
2968  }
2969 
2970  thread = __kmp_threads[gtid];
2971 
2972  __kmp_save_internal_controls(thread);
2973 
2974  if (kind < kmp_sched_upper_std) {
2975  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2976  // differentiate static chunked vs. unchunked: chunk should be invalid to
2977  // indicate an unchunked schedule (which is the default)
2978  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2979  } else {
2980  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981  __kmp_sch_map[kind - kmp_sched_lower - 1];
2982  }
2983  } else {
2984  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2985  // kmp_sched_lower - 2 ];
2986  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2987  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2988  kmp_sched_lower - 2];
2989  }
2990  __kmp_sched_apply_mods_intkind(
2991  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2992  if (kind == kmp_sched_auto || chunk < 1) {
2993  // ignore parameter chunk for schedule auto
2994  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2995  } else {
2996  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2997  }
2998 }
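// Illustrative sketch (not part of the runtime; kept under #if 0 so it never
// compiles): the kind-validation above restated with plain integers. The
// numeric bounds mirror the 2008-01-25 comment (standard kinds 1..4, extended
// kinds 101..102); the real code uses the kmp_sched_t enumerators, so treat
// these constants as assumptions for illustration only.
#if 0
static int sketch_sched_kind_is_valid(int kind) {
  const int lower = 0, upper_std = 5;     // standard interval is (0, 5)
  const int lower_ext = 100, upper = 103; // extended interval is (100, 103)
  int in_std = (kind > lower) && (kind < upper_std);
  int in_ext = (kind > lower_ext) && (kind < upper);
  return in_std || in_ext; // anything else falls back to kmp_sched_default
}
#endif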
2999 
3000 /* Gets def_sched_var ICV values */
3001 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3002  kmp_info_t *thread;
3003  enum sched_type th_type;
3004 
3005  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3006  KMP_DEBUG_ASSERT(__kmp_init_serial);
3007 
3008  thread = __kmp_threads[gtid];
3009 
3010  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3011  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3012  case kmp_sch_static:
3013  case kmp_sch_static_greedy:
3014  case kmp_sch_static_balanced:
3015  *kind = kmp_sched_static;
3016  __kmp_sched_apply_mods_stdkind(kind, th_type);
3017  *chunk = 0; // chunk was not set, try to show this fact via zero value
3018  return;
3019  case kmp_sch_static_chunked:
3020  *kind = kmp_sched_static;
3021  break;
3022  case kmp_sch_dynamic_chunked:
3023  *kind = kmp_sched_dynamic;
3024  break;
3025  case kmp_sch_guided_chunked:
3026  case kmp_sch_guided_iterative_chunked:
3027  case kmp_sch_guided_analytical_chunked:
3028  *kind = kmp_sched_guided;
3029  break;
3030  case kmp_sch_auto:
3031  *kind = kmp_sched_auto;
3032  break;
3033  case kmp_sch_trapezoidal:
3034  *kind = kmp_sched_trapezoidal;
3035  break;
3036 #if KMP_STATIC_STEAL_ENABLED
3037  case kmp_sch_static_steal:
3038  *kind = kmp_sched_static_steal;
3039  break;
3040 #endif
3041  default:
3042  KMP_FATAL(UnknownSchedulingType, th_type);
3043  }
3044 
3045  __kmp_sched_apply_mods_stdkind(kind, th_type);
3046  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3047 }
3048 
3049 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3050 
3051  int ii, dd;
3052  kmp_team_t *team;
3053  kmp_info_t *thr;
3054 
3055  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3056  KMP_DEBUG_ASSERT(__kmp_init_serial);
3057 
3058  // validate level
3059  if (level == 0)
3060  return 0;
3061  if (level < 0)
3062  return -1;
3063  thr = __kmp_threads[gtid];
3064  team = thr->th.th_team;
3065  ii = team->t.t_level;
3066  if (level > ii)
3067  return -1;
3068 
3069  if (thr->th.th_teams_microtask) {
3070  // AC: we are in a teams region where multiple nested teams have the same level
3071  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3072  if (level <=
3073  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3074  KMP_DEBUG_ASSERT(ii >= tlevel);
3075  // AC: As we need to pass by the teams league, we need to artificially
3076  // increase ii
3077  if (ii == tlevel) {
3078  ii += 2; // three teams have same level
3079  } else {
3080  ii++; // two teams have same level
3081  }
3082  }
3083  }
3084 
3085  if (ii == level)
3086  return __kmp_tid_from_gtid(gtid);
3087 
3088  dd = team->t.t_serialized;
3089  level++;
3090  while (ii > level) {
3091  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3092  }
3093  if ((team->t.t_serialized) && (!dd)) {
3094  team = team->t.t_parent;
3095  continue;
3096  }
3097  if (ii > level) {
3098  team = team->t.t_parent;
3099  dd = team->t.t_serialized;
3100  ii--;
3101  }
3102  }
3103 
3104  return (dd > 1) ? (0) : (team->t.t_master_tid);
3105 }
3106 
3107 int __kmp_get_team_size(int gtid, int level) {
3108 
3109  int ii, dd;
3110  kmp_team_t *team;
3111  kmp_info_t *thr;
3112 
3113  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3114  KMP_DEBUG_ASSERT(__kmp_init_serial);
3115 
3116  // validate level
3117  if (level == 0)
3118  return 1;
3119  if (level < 0)
3120  return -1;
3121  thr = __kmp_threads[gtid];
3122  team = thr->th.th_team;
3123  ii = team->t.t_level;
3124  if (level > ii)
3125  return -1;
3126 
3127  if (thr->th.th_teams_microtask) {
3128  // AC: we are in a teams region where multiple nested teams have the same level
3129  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3130  if (level <=
3131  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3132  KMP_DEBUG_ASSERT(ii >= tlevel);
3133  // AC: As we need to pass by the teams league, we need to artificially
3134  // increase ii
3135  if (ii == tlevel) {
3136  ii += 2; // three teams have same level
3137  } else {
3138  ii++; // two teams have same level
3139  }
3140  }
3141  }
3142 
3143  while (ii > level) {
3144  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3145  }
3146  if (team->t.t_serialized && (!dd)) {
3147  team = team->t.t_parent;
3148  continue;
3149  }
3150  if (ii > level) {
3151  team = team->t.t_parent;
3152  ii--;
3153  }
3154  }
3155 
3156  return team->t.t_nproc;
3157 }
3158 
3159 kmp_r_sched_t __kmp_get_schedule_global() {
3160  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3161  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3162  // independently, so the updated schedule can be obtained here.
3163 
3164  kmp_r_sched_t r_sched;
3165 
3166  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3167  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3168  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3169  // different roots (even in OMP 2.5)
3170  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3171  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3172  if (s == kmp_sch_static) {
3173  // replace STATIC with more detailed schedule (balanced or greedy)
3174  r_sched.r_sched_type = __kmp_static;
3175  } else if (s == kmp_sch_guided_chunked) {
3176  // replace GUIDED with more detailed schedule (iterative or analytical)
3177  r_sched.r_sched_type = __kmp_guided;
3178  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3179  r_sched.r_sched_type = __kmp_sched;
3180  }
3181  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3182 
3183  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3184  // __kmp_chunk may be wrong here (if it was not ever set)
3185  r_sched.chunk = KMP_DEFAULT_CHUNK;
3186  } else {
3187  r_sched.chunk = __kmp_chunk;
3188  }
3189 
3190  return r_sched;
3191 }
3192 
3193 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3194  at least argc number of *t_argv entries for the requested team. */
3195 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3196 
3197  KMP_DEBUG_ASSERT(team);
3198  if (!realloc || argc > team->t.t_max_argc) {
3199 
3200  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3201  "current entries=%d\n",
3202  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3203  /* if previously allocated heap space for args, free them */
3204  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3205  __kmp_free((void *)team->t.t_argv);
3206 
3207  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3208  /* use unused space in the cache line for arguments */
3209  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3210  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3211  "argv entries\n",
3212  team->t.t_id, team->t.t_max_argc));
3213  team->t.t_argv = &team->t.t_inline_argv[0];
3214  if (__kmp_storage_map) {
3215  __kmp_print_storage_map_gtid(
3216  -1, &team->t.t_inline_argv[0],
3217  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3218  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3219  team->t.t_id);
3220  }
3221  } else {
3222  /* allocate space for arguments in the heap */
3223  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3224  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3225  : 2 * argc;
3226  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3227  "argv entries\n",
3228  team->t.t_id, team->t.t_max_argc));
3229  team->t.t_argv =
3230  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3231  if (__kmp_storage_map) {
3232  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3233  &team->t.t_argv[team->t.t_max_argc],
3234  sizeof(void *) * team->t.t_max_argc,
3235  "team_%d.t_argv", team->t.t_id);
3236  }
3237  }
3238  }
3239 }
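// Illustrative sketch (not part of the runtime; kept under #if 0 so it never
// compiles): the argv capacity policy above as a pure function. The two
// parameters stand in for KMP_INLINE_ARGV_ENTRIES and
// KMP_MIN_MALLOC_ARGV_ENTRIES; their actual values are configuration
// dependent, so this is a sketch of the policy only.
#if 0
static int sketch_argv_capacity(int argc, int inline_entries,
                                int min_malloc_entries) {
  if (argc <= inline_entries)
    return inline_entries; // reuse the inline array in the team structure
  // heap allocation: at least min_malloc_entries, otherwise twice the request
  return (argc <= (min_malloc_entries >> 1)) ? min_malloc_entries : 2 * argc;
}
#endif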
3240 
3241 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3242  int i;
3243  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3244  team->t.t_threads =
3245  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3246  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3247  sizeof(dispatch_shared_info_t) * num_disp_buff);
3248  team->t.t_dispatch =
3249  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3250  team->t.t_implicit_task_taskdata =
3251  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3252  team->t.t_max_nproc = max_nth;
3253 
3254  /* setup dispatch buffers */
3255  for (i = 0; i < num_disp_buff; ++i) {
3256  team->t.t_disp_buffer[i].buffer_index = i;
3257  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3258  }
3259 }
3260 
3261 static void __kmp_free_team_arrays(kmp_team_t *team) {
3262  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3263  int i;
3264  for (i = 0; i < team->t.t_max_nproc; ++i) {
3265  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3266  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3267  team->t.t_dispatch[i].th_disp_buffer = NULL;
3268  }
3269  }
3270 #if KMP_USE_HIER_SCHED
3271  __kmp_dispatch_free_hierarchies(team);
3272 #endif
3273  __kmp_free(team->t.t_threads);
3274  __kmp_free(team->t.t_disp_buffer);
3275  __kmp_free(team->t.t_dispatch);
3276  __kmp_free(team->t.t_implicit_task_taskdata);
3277  team->t.t_threads = NULL;
3278  team->t.t_disp_buffer = NULL;
3279  team->t.t_dispatch = NULL;
3280  team->t.t_implicit_task_taskdata = 0;
3281 }
3282 
3283 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3284  kmp_info_t **oldThreads = team->t.t_threads;
3285 
3286  __kmp_free(team->t.t_disp_buffer);
3287  __kmp_free(team->t.t_dispatch);
3288  __kmp_free(team->t.t_implicit_task_taskdata);
3289  __kmp_allocate_team_arrays(team, max_nth);
3290 
3291  KMP_MEMCPY(team->t.t_threads, oldThreads,
3292  team->t.t_nproc * sizeof(kmp_info_t *));
3293 
3294  __kmp_free(oldThreads);
3295 }
3296 
3297 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3298 
3299  kmp_r_sched_t r_sched =
3300  __kmp_get_schedule_global(); // get current state of scheduling globals
3301 
3302  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3303 
3304  kmp_internal_control_t g_icvs = {
3305  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3306  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3307  // adjustment of threads (per thread)
3308  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3309  // whether blocktime is explicitly set
3310  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3311 #if KMP_USE_MONITOR
3312  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3313 // intervals
3314 #endif
3315  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3316  // next parallel region (per thread)
3317  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3318  __kmp_cg_max_nth, // int thread_limit;
3319  __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3320  // on task. This is used in the case of target thread_limit
3321  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3322  // for max_active_levels
3323  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3324  // {sched,chunk} pair
3325  __kmp_nested_proc_bind.bind_types[0],
3326  __kmp_default_device,
3327  NULL // struct kmp_internal_control *next;
3328  };
3329 
3330  return g_icvs;
3331 }
3332 
3333 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3334 
3335  kmp_internal_control_t gx_icvs;
3336  gx_icvs.serial_nesting_level =
3337  0; // probably =team->t.t_serial like in save_inter_controls
3338  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3339  gx_icvs.next = NULL;
3340 
3341  return gx_icvs;
3342 }
3343 
3344 static void __kmp_initialize_root(kmp_root_t *root) {
3345  int f;
3346  kmp_team_t *root_team;
3347  kmp_team_t *hot_team;
3348  int hot_team_max_nth;
3349  kmp_r_sched_t r_sched =
3350  __kmp_get_schedule_global(); // get current state of scheduling globals
3351  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3352  KMP_DEBUG_ASSERT(root);
3353  KMP_ASSERT(!root->r.r_begin);
3354 
3355  /* setup the root state structure */
3356  __kmp_init_lock(&root->r.r_begin_lock);
3357  root->r.r_begin = FALSE;
3358  root->r.r_active = FALSE;
3359  root->r.r_in_parallel = 0;
3360  root->r.r_blocktime = __kmp_dflt_blocktime;
3361 #if KMP_AFFINITY_SUPPORTED
3362  root->r.r_affinity_assigned = FALSE;
3363 #endif
3364 
3365  /* setup the root team for this task */
3366  /* allocate the root team structure */
3367  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3368 
3369  root_team =
3370  __kmp_allocate_team(root,
3371  1, // new_nproc
3372  1, // max_nproc
3373 #if OMPT_SUPPORT
3374  ompt_data_none, // root parallel id
3375 #endif
3376  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3377  0 // argc
3378  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3379  );
3380 #if USE_DEBUGGER
3381  // Non-NULL value should be assigned to make the debugger display the root
3382  // team.
3383  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3384 #endif
3385 
3386  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3387 
3388  root->r.r_root_team = root_team;
3389  root_team->t.t_control_stack_top = NULL;
3390 
3391  /* initialize root team */
3392  root_team->t.t_threads[0] = NULL;
3393  root_team->t.t_nproc = 1;
3394  root_team->t.t_serialized = 1;
3395  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3396  root_team->t.t_sched.sched = r_sched.sched;
3397  root_team->t.t_nested_nth = &__kmp_nested_nth;
3398  KA_TRACE(
3399  20,
3400  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3401  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3402 
3403  /* setup the hot team for this task */
3404  /* allocate the hot team structure */
3405  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3406 
3407  hot_team =
3408  __kmp_allocate_team(root,
3409  1, // new_nproc
3410  __kmp_dflt_team_nth_ub * 2, // max_nproc
3411 #if OMPT_SUPPORT
3412  ompt_data_none, // root parallel id
3413 #endif
3414  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3415  0 // argc
3416  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3417  );
3418  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3419 
3420  root->r.r_hot_team = hot_team;
3421  root_team->t.t_control_stack_top = NULL;
3422 
3423  /* first-time initialization */
3424  hot_team->t.t_parent = root_team;
3425 
3426  /* initialize hot team */
3427  hot_team_max_nth = hot_team->t.t_max_nproc;
3428  for (f = 0; f < hot_team_max_nth; ++f) {
3429  hot_team->t.t_threads[f] = NULL;
3430  }
3431  hot_team->t.t_nproc = 1;
3432  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3433  hot_team->t.t_sched.sched = r_sched.sched;
3434  hot_team->t.t_size_changed = 0;
3435  hot_team->t.t_nested_nth = &__kmp_nested_nth;
3436 }
3437 
3438 #ifdef KMP_DEBUG
3439 
3440 typedef struct kmp_team_list_item {
3441  kmp_team_p const *entry;
3442  struct kmp_team_list_item *next;
3443 } kmp_team_list_item_t;
3444 typedef kmp_team_list_item_t *kmp_team_list_t;
3445 
3446 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3447  kmp_team_list_t list, // List of teams.
3448  kmp_team_p const *team // Team to add.
3449 ) {
3450 
3451  // List must terminate with item where both entry and next are NULL.
3452  // Team is added to the list only once.
3453  // List is sorted in ascending order by team id.
3454  // Team id is *not* a key.
3455 
3456  kmp_team_list_t l;
3457 
3458  KMP_DEBUG_ASSERT(list != NULL);
3459  if (team == NULL) {
3460  return;
3461  }
3462 
3463  __kmp_print_structure_team_accum(list, team->t.t_parent);
3464  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3465 
3466  // Search list for the team.
3467  l = list;
3468  while (l->next != NULL && l->entry != team) {
3469  l = l->next;
3470  }
3471  if (l->next != NULL) {
3472  return; // Team has been added before, exit.
3473  }
3474 
3475  // Team is not found. Search list again for insertion point.
3476  l = list;
3477  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3478  l = l->next;
3479  }
3480 
3481  // Insert team.
3482  {
3483  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3484  sizeof(kmp_team_list_item_t));
3485  *item = *l;
3486  l->entry = team;
3487  l->next = item;
3488  }
3489 }
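// Illustrative sketch (not part of the runtime; kept under #if 0 so it never
// compiles): the sentinel-based insertion used above, on a plain int list.
// The list starts as a single sentinel node {0, NULL}; inserting before node
// l is done by cloning l into a fresh node and overwriting l in place, so no
// "previous" pointer is needed. Names with the sketch_ prefix are
// hypothetical.
#if 0
#include <stdlib.h>
typedef struct sketch_item {
  int value;
  struct sketch_item *next;
} sketch_item_t;
static void sketch_sorted_insert(sketch_item_t *list, int value) {
  sketch_item_t *l = list;
  while (l->next != NULL && l->value <= value)
    l = l->next; // stop at the first larger value or at the sentinel
  sketch_item_t *item = (sketch_item_t *)malloc(sizeof(sketch_item_t));
  *item = *l;       // clone the current node (possibly the sentinel)
  l->value = value; // overwrite the current node with the new entry
  l->next = item;
}
#endif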
3490 
3491 static void __kmp_print_structure_team(char const *title,
3492  kmp_team_p const *team) {
3493 
3494  __kmp_printf("%s", title);
3495  if (team != NULL) {
3496  __kmp_printf("%2x %p\n", team->t.t_id, team);
3497  } else {
3498  __kmp_printf(" - (nil)\n");
3499  }
3500 }
3501 
3502 static void __kmp_print_structure_thread(char const *title,
3503  kmp_info_p const *thread) {
3504  __kmp_printf("%s", title);
3505  if (thread != NULL) {
3506  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3507  } else {
3508  __kmp_printf(" - (nil)\n");
3509  }
3510 }
3511 
3512 void __kmp_print_structure(void) {
3513 
3514  kmp_team_list_t list;
3515 
3516  // Initialize list of teams.
3517  list =
3518  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3519  list->entry = NULL;
3520  list->next = NULL;
3521 
3522  __kmp_printf("\n------------------------------\nGlobal Thread "
3523  "Table\n------------------------------\n");
3524  {
3525  int gtid;
3526  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3527  __kmp_printf("%2d", gtid);
3528  if (__kmp_threads != NULL) {
3529  __kmp_printf(" %p", __kmp_threads[gtid]);
3530  }
3531  if (__kmp_root != NULL) {
3532  __kmp_printf(" %p", __kmp_root[gtid]);
3533  }
3534  __kmp_printf("\n");
3535  }
3536  }
3537 
3538  // Print out __kmp_threads array.
3539  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3540  "----------\n");
3541  if (__kmp_threads != NULL) {
3542  int gtid;
3543  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3544  kmp_info_t const *thread = __kmp_threads[gtid];
3545  if (thread != NULL) {
3546  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3547  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3548  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3549  __kmp_print_structure_team(" Serial Team: ",
3550  thread->th.th_serial_team);
3551  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3552  __kmp_print_structure_thread(" Primary: ",
3553  thread->th.th_team_master);
3554  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3555  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3556  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3557  __kmp_print_structure_thread(" Next in pool: ",
3558  thread->th.th_next_pool);
3559  __kmp_printf("\n");
3560  __kmp_print_structure_team_accum(list, thread->th.th_team);
3561  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3562  }
3563  }
3564  } else {
3565  __kmp_printf("Threads array is not allocated.\n");
3566  }
3567 
3568  // Print out __kmp_root array.
3569  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3570  "--------\n");
3571  if (__kmp_root != NULL) {
3572  int gtid;
3573  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3574  kmp_root_t const *root = __kmp_root[gtid];
3575  if (root != NULL) {
3576  __kmp_printf("GTID %2d %p:\n", gtid, root);
3577  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3578  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3579  __kmp_print_structure_thread(" Uber Thread: ",
3580  root->r.r_uber_thread);
3581  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3582  __kmp_printf(" In Parallel: %2d\n",
3583  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3584  __kmp_printf("\n");
3585  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3586  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3587  }
3588  }
3589  } else {
3590  __kmp_printf("Ubers array is not allocated.\n");
3591  }
3592 
3593  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3594  "--------\n");
3595  while (list->next != NULL) {
3596  kmp_team_p const *team = list->entry;
3597  int i;
3598  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3599  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3600  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3601  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3602  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3603  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3604  for (i = 0; i < team->t.t_nproc; ++i) {
3605  __kmp_printf(" Thread %2d: ", i);
3606  __kmp_print_structure_thread("", team->t.t_threads[i]);
3607  }
3608  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3609  __kmp_printf("\n");
3610  list = list->next;
3611  }
3612 
3613  // Print out __kmp_thread_pool and __kmp_team_pool.
3614  __kmp_printf("\n------------------------------\nPools\n----------------------"
3615  "--------\n");
3616  __kmp_print_structure_thread("Thread pool: ",
3617  CCAST(kmp_info_t *, __kmp_thread_pool));
3618  __kmp_print_structure_team("Team pool: ",
3619  CCAST(kmp_team_t *, __kmp_team_pool));
3620  __kmp_printf("\n");
3621 
3622  // Free team list.
3623  while (list != NULL) {
3624  kmp_team_list_item_t *item = list;
3625  list = list->next;
3626  KMP_INTERNAL_FREE(item);
3627  }
3628 }
3629 
3630 #endif
3631 
3632 //---------------------------------------------------------------------------
3633 // Stuff for per-thread fast random number generator
3634 // Table of primes
3635 static const unsigned __kmp_primes[] = {
3636  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3637  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3638  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3639  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3640  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3641  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3642  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3643  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3644  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3645  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3646  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3647 
3648 //---------------------------------------------------------------------------
3649 // __kmp_get_random: Get a random number using a linear congruential method.
3650 unsigned short __kmp_get_random(kmp_info_t *thread) {
3651  unsigned x = thread->th.th_x;
3652  unsigned short r = (unsigned short)(x >> 16);
3653 
3654  thread->th.th_x = x * thread->th.th_a + 1;
3655 
3656  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3657  thread->th.th_info.ds.ds_tid, r));
3658 
3659  return r;
3660 }
3661 //--------------------------------------------------------
3662 // __kmp_init_random: Initialize a random number generator
3663 void __kmp_init_random(kmp_info_t *thread) {
3664  unsigned seed = thread->th.th_info.ds.ds_tid;
3665 
3666  thread->th.th_a =
3667  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3668  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3669  KA_TRACE(30,
3670  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3671 }
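// Illustrative sketch (not part of the runtime; kept under #if 0 so it never
// compiles): the two routines above implement a 32-bit linear congruential
// generator, x(n+1) = a * x(n) + 1, with a per-thread prime multiplier taken
// from __kmp_primes and only the high 16 bits of the state returned. A
// standalone, single-"thread" version seeded with the first prime in the
// table:
#if 0
#include <stdio.h>
static unsigned sketch_x; // generator state (th_x in the runtime)
static unsigned sketch_a; // prime multiplier (th_a in the runtime)
static void sketch_seed(unsigned tid, unsigned prime) {
  sketch_a = prime; // e.g. __kmp_primes[tid % 64]
  sketch_x = (tid + 1) * sketch_a + 1;
}
static unsigned short sketch_next(void) {
  unsigned short r = (unsigned short)(sketch_x >> 16); // high 16 bits
  sketch_x = sketch_x * sketch_a + 1;                  // advance the LCG
  return r;
}
int main(void) {
  sketch_seed(0, 0x9e3779b1u);
  for (int i = 0; i < 4; ++i)
    printf("%hu\n", sketch_next());
  return 0;
}
#endif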
3672 
3673 #if KMP_OS_WINDOWS
3674 /* reclaim array entries for root threads that are already dead, returns number
3675  * reclaimed */
3676 static int __kmp_reclaim_dead_roots(void) {
3677  int i, r = 0;
3678 
3679  for (i = 0; i < __kmp_threads_capacity; ++i) {
3680  if (KMP_UBER_GTID(i) &&
3681  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3682  !__kmp_root[i]
3683  ->r.r_active) { // AC: reclaim only roots died in non-active state
3684  r += __kmp_unregister_root_other_thread(i);
3685  }
3686  }
3687  return r;
3688 }
3689 #endif
3690 
3691 /* This function attempts to create free entries in __kmp_threads and
3692  __kmp_root, and returns the number of free entries generated.
3693 
3694  For Windows* OS static library, the first mechanism used is to reclaim array
3695  entries for root threads that are already dead.
3696 
3697  On all platforms, expansion is attempted on the arrays __kmp_threads and
3698  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3699  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3700  threadprivate cache array has been created. Synchronization with
3701  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3702 
3703  After any dead root reclamation, if the clipping value allows array expansion
3704  to result in the generation of a total of nNeed free slots, the function does
3705  that expansion. If not, nothing is done beyond the possible initial root
3706  thread reclamation.
3707 
3708  If any argument is negative, the behavior is undefined. */
3709 static int __kmp_expand_threads(int nNeed) {
3710  int added = 0;
3711  int minimumRequiredCapacity;
3712  int newCapacity;
3713  kmp_info_t **newThreads;
3714  kmp_root_t **newRoot;
3715 
3716  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3717  // resizing __kmp_threads does not need additional protection if foreign
3718  // threads are present
3719 
3720 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3721  /* only for Windows static library */
3722  /* reclaim array entries for root threads that are already dead */
3723  added = __kmp_reclaim_dead_roots();
3724 
3725  if (nNeed) {
3726  nNeed -= added;
3727  if (nNeed < 0)
3728  nNeed = 0;
3729  }
3730 #endif
3731  if (nNeed <= 0)
3732  return added;
3733 
3734  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3735  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3736  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3737  // > __kmp_max_nth in one of two ways:
3738  //
3739  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3740  // may not be reused by another thread, so we may need to increase
3741  // __kmp_threads_capacity to __kmp_max_nth + 1.
3742  //
3743  // 2) New foreign root(s) are encountered. We always register new foreign
3744  // roots. This may cause a smaller # of threads to be allocated at
3745  // subsequent parallel regions, but the worker threads hang around (and
3746  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3747  //
3748  // Anyway, that is the reason for moving the check to see if
3749  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3750  // instead of having it performed here. -BB
3751 
3752  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3753 
3754  /* compute expansion headroom to check if we can expand */
3755  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3756  /* possible expansion too small -- give up */
3757  return added;
3758  }
3759  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3760 
3761  newCapacity = __kmp_threads_capacity;
3762  do {
3763  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3764  : __kmp_sys_max_nth;
3765  } while (newCapacity < minimumRequiredCapacity);
3766  newThreads = (kmp_info_t **)__kmp_allocate(
3767  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3768  newRoot =
3769  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3770  KMP_MEMCPY(newThreads, __kmp_threads,
3771  __kmp_threads_capacity * sizeof(kmp_info_t *));
3772  KMP_MEMCPY(newRoot, __kmp_root,
3773  __kmp_threads_capacity * sizeof(kmp_root_t *));
3774  // Put old __kmp_threads array on a list. Any ongoing references to the old
3775  // list will be valid. This list is cleaned up at library shutdown.
3776  kmp_old_threads_list_t *node =
3777  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3778  node->threads = __kmp_threads;
3779  node->next = __kmp_old_threads_list;
3780  __kmp_old_threads_list = node;
3781 
3782  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3783  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3784  added += newCapacity - __kmp_threads_capacity;
3785  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3786 
3787  if (newCapacity > __kmp_tp_capacity) {
3788  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3789  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3790  __kmp_threadprivate_resize_cache(newCapacity);
3791  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3792  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3793  }
3794  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3795  }
3796 
3797  return added;
3798 }
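// Illustrative sketch (not part of the runtime; kept under #if 0 so it never
// compiles): the capacity-growth rule used above, in isolation. Starting from
// the current capacity (assumed > 0, as in the runtime), the size is
// repeatedly doubled, clipped to the system maximum, until the requested
// headroom fits; if even the maximum cannot provide that headroom, the
// capacity is left unchanged.
#if 0
static int sketch_new_capacity(int current, int needed, int sys_max) {
  if (sys_max - current < needed)
    return current; // possible expansion too small -- give up
  int required = current + needed;
  int capacity = current;
  do {
    capacity = capacity <= (sys_max >> 1) ? (capacity << 1) : sys_max;
  } while (capacity < required);
  return capacity;
}
#endif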
3799 
3800 /* Register the current thread as a root thread and obtain our gtid. We must
3801  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3802  thread that calls from __kmp_do_serial_initialize() */
3803 int __kmp_register_root(int initial_thread) {
3804  kmp_info_t *root_thread;
3805  kmp_root_t *root;
3806  int gtid;
3807  int capacity;
3808  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3809  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3810  KMP_MB();
3811 
3812  /* 2007-03-02:
3813  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3814  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3815  condition does not work as expected -- it may return false (meaning there
3816  is at least one empty slot in the __kmp_threads array), but it is possible
3817  that the only free slot is #0, which is reserved for the initial thread and
3818  so cannot be used for this one. The following code works around this bug.
3819 
3820  However, the right solution seems to be not reserving slot #0 for the
3821  initial thread, because:
3822  (1) there is no magic in slot #0,
3823  (2) we cannot detect the initial thread reliably (the first thread that
3824  does serial initialization may not be the real initial thread).
3825  */
3826  capacity = __kmp_threads_capacity;
3827  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3828  --capacity;
3829  }
3830 
3831  // If it is not for initializing the hidden helper team, we need to take
3832  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3833  // in __kmp_threads_capacity.
3834  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3835  capacity -= __kmp_hidden_helper_threads_num;
3836  }
3837 
3838  /* see if there are too many threads */
3839  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3840  if (__kmp_tp_cached) {
3841  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3842  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3843  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3844  } else {
3845  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3846  __kmp_msg_null);
3847  }
3848  }
3849 
3850  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3851  // 0: initial thread, also a regular OpenMP thread.
3852  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3853  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3854  // regular OpenMP threads.
3855  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3856  // Find an available thread slot for hidden helper thread. Slots for hidden
3857  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3858  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3859  gtid <= __kmp_hidden_helper_threads_num;
3860  gtid++)
3861  ;
3862  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3863  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3864  "hidden helper thread: T#%d\n",
3865  gtid));
3866  } else {
3867  /* find an available thread slot */
3868  // Don't reassign the zero slot since we need that to only be used by
3869  // initial thread. Slots for hidden helper threads should also be skipped.
3870  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3871  gtid = 0;
3872  } else {
3873  for (gtid = __kmp_hidden_helper_threads_num + 1;
3874  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3875  ;
3876  }
3877  KA_TRACE(
3878  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3879  KMP_ASSERT(gtid < __kmp_threads_capacity);
3880  }
3881 
3882  /* update global accounting */
3883  __kmp_all_nth++;
3884  TCW_4(__kmp_nth, __kmp_nth + 1);
3885 
3886  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3887  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3888  if (__kmp_adjust_gtid_mode) {
3889  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3890  if (TCR_4(__kmp_gtid_mode) != 2) {
3891  TCW_4(__kmp_gtid_mode, 2);
3892  }
3893  } else {
3894  if (TCR_4(__kmp_gtid_mode) != 1) {
3895  TCW_4(__kmp_gtid_mode, 1);
3896  }
3897  }
3898  }
3899 
3900 #ifdef KMP_ADJUST_BLOCKTIME
3901  /* Adjust blocktime to zero if necessary */
3902  /* Middle initialization might not have occurred yet */
3903  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3904  if (__kmp_nth > __kmp_avail_proc) {
3905  __kmp_zero_bt = TRUE;
3906  }
3907  }
3908 #endif /* KMP_ADJUST_BLOCKTIME */
3909 
3910  /* setup this new hierarchy */
3911  if (!(root = __kmp_root[gtid])) {
3912  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3913  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3914  }
3915 
3916 #if KMP_STATS_ENABLED
3917  // Initialize stats as soon as possible (right after gtid assignment).
3918  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3919  __kmp_stats_thread_ptr->startLife();
3920  KMP_SET_THREAD_STATE(SERIAL_REGION);
3921  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3922 #endif
3923  __kmp_initialize_root(root);
3924 
3925  /* setup new root thread structure */
3926  if (root->r.r_uber_thread) {
3927  root_thread = root->r.r_uber_thread;
3928  } else {
3929  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3930  if (__kmp_storage_map) {
3931  __kmp_print_thread_storage_map(root_thread, gtid);
3932  }
3933  root_thread->th.th_info.ds.ds_gtid = gtid;
3934 #if OMPT_SUPPORT
3935  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3936 #endif
3937  root_thread->th.th_root = root;
3938  if (__kmp_env_consistency_check) {
3939  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3940  }
3941 #if USE_FAST_MEMORY
3942  __kmp_initialize_fast_memory(root_thread);
3943 #endif /* USE_FAST_MEMORY */
3944 
3945 #if KMP_USE_BGET
3946  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3947  __kmp_initialize_bget(root_thread);
3948 #endif
3949  __kmp_init_random(root_thread); // Initialize random number generator
3950  }
3951 
3952  /* setup the serial team held in reserve by the root thread */
3953  if (!root_thread->th.th_serial_team) {
3954  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3955  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3956  root_thread->th.th_serial_team = __kmp_allocate_team(
3957  root, 1, 1,
3958 #if OMPT_SUPPORT
3959  ompt_data_none, // root parallel id
3960 #endif
3961  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3962  }
3963  KMP_ASSERT(root_thread->th.th_serial_team);
3964  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3965  root_thread->th.th_serial_team));
3966 
3967  /* drop root_thread into place */
3968  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3969 
3970  root->r.r_root_team->t.t_threads[0] = root_thread;
3971  root->r.r_hot_team->t.t_threads[0] = root_thread;
3972  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3973  // AC: the team created in reserve, not for execution (it is unused for now).
3974  root_thread->th.th_serial_team->t.t_serialized = 0;
3975  root->r.r_uber_thread = root_thread;
3976 
3977  /* initialize the thread, get it ready to go */
3978  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3979  TCW_4(__kmp_init_gtid, TRUE);
3980 
3981  /* prepare the primary thread for get_gtid() */
3982  __kmp_gtid_set_specific(gtid);
3983 
3984 #if USE_ITT_BUILD
3985  __kmp_itt_thread_name(gtid);
3986 #endif /* USE_ITT_BUILD */
3987 
3988 #ifdef KMP_TDATA_GTID
3989  __kmp_gtid = gtid;
3990 #endif
3991  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3992  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3993 
3994  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3995  "plain=%u\n",
3996  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3997  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3998  KMP_INIT_BARRIER_STATE));
3999  { // Initialize barrier data.
4000  int b;
4001  for (b = 0; b < bs_last_barrier; ++b) {
4002  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4003 #if USE_DEBUGGER
4004  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
4005 #endif
4006  }
4007  }
4008  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4009  KMP_INIT_BARRIER_STATE);
4010 
4011 #if KMP_AFFINITY_SUPPORTED
4012  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4013  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4014  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4015  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4016 #endif /* KMP_AFFINITY_SUPPORTED */
4017  root_thread->th.th_def_allocator = __kmp_def_allocator;
4018  root_thread->th.th_prev_level = 0;
4019  root_thread->th.th_prev_num_threads = 1;
4020 
4021  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4022  tmp->cg_root = root_thread;
4023  tmp->cg_thread_limit = __kmp_cg_max_nth;
4024  tmp->cg_nthreads = 1;
4025  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4026  " cg_nthreads init to 1\n",
4027  root_thread, tmp));
4028  tmp->up = NULL;
4029  root_thread->th.th_cg_roots = tmp;
4030 
4031  __kmp_root_counter++;
4032 
4033 #if OMPT_SUPPORT
4034  if (ompt_enabled.enabled) {
4035 
4036  kmp_info_t *root_thread = ompt_get_thread();
4037 
4038  ompt_set_thread_state(root_thread, ompt_state_overhead);
4039 
4040  if (ompt_enabled.ompt_callback_thread_begin) {
4041  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4042  ompt_thread_initial, __ompt_get_thread_data_internal());
4043  }
4044  ompt_data_t *task_data;
4045  ompt_data_t *parallel_data;
4046  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4047  NULL);
4048  if (ompt_enabled.ompt_callback_implicit_task) {
4049  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4050  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4051  }
4052 
4053  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4054  }
4055 #endif
4056 #if OMPD_SUPPORT
4057  if (ompd_state & OMPD_ENABLE_BP)
4058  ompd_bp_thread_begin();
4059 #endif
4060 
4061  KMP_MB();
4062  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4063 
4064  return gtid;
4065 }
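// Illustrative sketch (not part of the runtime; kept under #if 0 so it never
// compiles): the slot-selection policy used above for a new root, given the
// layout described in the comments (slot 0 = initial thread, slots 1..H =
// hidden helper threads, slots H+1.. = regular roots). 'slots' is a
// hypothetical stand-in for the __kmp_threads array; expansion and locking
// are omitted.
#if 0
#include <stddef.h>
static int sketch_find_root_slot(void **slots, int capacity, int hidden_helpers,
                                 int initial_thread, int registering_hidden) {
  if (registering_hidden) {
    for (int gtid = 1; gtid <= hidden_helpers; ++gtid)
      if (slots[gtid] == NULL)
        return gtid; // hidden helper roots occupy slots 1..hidden_helpers
    return -1;
  }
  if (initial_thread && slots[0] == NULL)
    return 0; // slot 0 is reserved for the initial thread
  for (int gtid = hidden_helpers + 1; gtid < capacity; ++gtid)
    if (slots[gtid] == NULL)
      return gtid;
  return -1; // caller must expand the arrays first
}
#endif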
4066 
4067 #if KMP_NESTED_HOT_TEAMS
4068 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4069  const int max_level) {
4070  int i, n, nth;
4071  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4072  if (!hot_teams || !hot_teams[level].hot_team) {
4073  return 0;
4074  }
4075  KMP_DEBUG_ASSERT(level < max_level);
4076  kmp_team_t *team = hot_teams[level].hot_team;
4077  nth = hot_teams[level].hot_team_nth;
4078  n = nth - 1; // primary thread is not freed
4079  if (level < max_level - 1) {
4080  for (i = 0; i < nth; ++i) {
4081  kmp_info_t *th = team->t.t_threads[i];
4082  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4083  if (i > 0 && th->th.th_hot_teams) {
4084  __kmp_free(th->th.th_hot_teams);
4085  th->th.th_hot_teams = NULL;
4086  }
4087  }
4088  }
4089  __kmp_free_team(root, team, NULL);
4090  return n;
4091 }
4092 #endif
4093 
4094 // Resets a root thread and clears its root and hot teams.
4095 // Returns the number of __kmp_threads entries directly and indirectly freed.
4096 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4097  kmp_team_t *root_team = root->r.r_root_team;
4098  kmp_team_t *hot_team = root->r.r_hot_team;
4099  int n = hot_team->t.t_nproc;
4100  int i;
4101 
4102  KMP_DEBUG_ASSERT(!root->r.r_active);
4103 
4104  root->r.r_root_team = NULL;
4105  root->r.r_hot_team = NULL;
4106  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4107  // before call to __kmp_free_team().
4108  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4109 #if KMP_NESTED_HOT_TEAMS
4110  if (__kmp_hot_teams_max_level >
4111  0) { // need to free nested hot teams and their threads if any
4112  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4113  kmp_info_t *th = hot_team->t.t_threads[i];
4114  if (__kmp_hot_teams_max_level > 1) {
4115  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4116  }
4117  if (th->th.th_hot_teams) {
4118  __kmp_free(th->th.th_hot_teams);
4119  th->th.th_hot_teams = NULL;
4120  }
4121  }
4122  }
4123 #endif
4124  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4125 
4126  // Before we can reap the thread, we need to make certain that all other
4127  // threads in the teams that had this root as ancestor have stopped trying to
4128  // steal tasks.
4129  if (__kmp_tasking_mode != tskm_immediate_exec) {
4130  __kmp_wait_to_unref_task_teams();
4131  }
4132 
4133 #if KMP_OS_WINDOWS
4134  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4135  KA_TRACE(
4136  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4137  "\n",
4138  (LPVOID) & (root->r.r_uber_thread->th),
4139  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4140  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4141 #endif /* KMP_OS_WINDOWS */
4142 
4143 #if OMPD_SUPPORT
4144  if (ompd_state & OMPD_ENABLE_BP)
4145  ompd_bp_thread_end();
4146 #endif
4147 
4148 #if OMPT_SUPPORT
4149  ompt_data_t *task_data;
4150  ompt_data_t *parallel_data;
4151  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4152  NULL);
4153  if (ompt_enabled.ompt_callback_implicit_task) {
4154  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4155  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4156  }
4157  if (ompt_enabled.ompt_callback_thread_end) {
4158  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4159  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4160  }
4161 #endif
4162 
4163  TCW_4(__kmp_nth,
4164  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4165  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4166  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4167  " to %d\n",
4168  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4169  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4170  if (i == 1) {
4171  // need to free contention group structure
4172  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4173  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4174  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4175  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4176  root->r.r_uber_thread->th.th_cg_roots = NULL;
4177  }
4178  __kmp_reap_thread(root->r.r_uber_thread, 1);
4179 
4180  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4181  // it instead of freeing it.
4182  root->r.r_uber_thread = NULL;
4183  /* mark root as no longer in use */
4184  root->r.r_begin = FALSE;
4185 
4186  return n;
4187 }
4188 
4189 void __kmp_unregister_root_current_thread(int gtid) {
4190  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4191  /* this lock should be ok, since unregister_root_current_thread is never
4192  called during an abort, only during a normal close. furthermore, if you
4193  have the forkjoin lock, you should never try to get the initz lock */
4194  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4195  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4196  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4197  "exiting T#%d\n",
4198  gtid));
4199  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4200  return;
4201  }
4202  kmp_root_t *root = __kmp_root[gtid];
4203 
4204  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4205  KMP_ASSERT(KMP_UBER_GTID(gtid));
4206  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4207  KMP_ASSERT(root->r.r_active == FALSE);
4208 
4209  KMP_MB();
4210 
4211  kmp_info_t *thread = __kmp_threads[gtid];
4212  kmp_team_t *team = thread->th.th_team;
4213  kmp_task_team_t *task_team = thread->th.th_task_team;
4214 
4215  // we need to wait for the proxy tasks before finishing the thread
4216  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4217  task_team->tt.tt_hidden_helper_task_encountered)) {
4218 #if OMPT_SUPPORT
4219  // the runtime is shutting down so we won't report any events
4220  thread->th.ompt_thread_info.state = ompt_state_undefined;
4221 #endif
4222  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4223  }
4224 
4225  __kmp_reset_root(gtid, root);
4226 
4227  KMP_MB();
4228  KC_TRACE(10,
4229  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4230 
4231  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4232 }
4233 
4234 #if KMP_OS_WINDOWS
4235 /* __kmp_forkjoin_lock must be already held
4236  Unregisters a root thread that is not the current thread. Returns the number
4237  of __kmp_threads entries freed as a result. */
4238 static int __kmp_unregister_root_other_thread(int gtid) {
4239  kmp_root_t *root = __kmp_root[gtid];
4240  int r;
4241 
4242  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4243  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4244  KMP_ASSERT(KMP_UBER_GTID(gtid));
4245  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4246  KMP_ASSERT(root->r.r_active == FALSE);
4247 
4248  r = __kmp_reset_root(gtid, root);
4249  KC_TRACE(10,
4250  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4251  return r;
4252 }
4253 #endif
4254 
4255 #if KMP_DEBUG
4256 void __kmp_task_info() {
4257 
4258  kmp_int32 gtid = __kmp_entry_gtid();
4259  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4260  kmp_info_t *this_thr = __kmp_threads[gtid];
4261  kmp_team_t *steam = this_thr->th.th_serial_team;
4262  kmp_team_t *team = this_thr->th.th_team;
4263 
4264  __kmp_printf(
4265  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4266  "ptask=%p\n",
4267  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4268  team->t.t_implicit_task_taskdata[tid].td_parent);
4269 }
4270 #endif // KMP_DEBUG
4271 
4272 /* TODO optimize with one big memclr, take out what isn't needed, split
4273  responsibility to workers as much as possible, and delay initialization of
4274  features as much as possible */
4275 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4276  int tid, int gtid) {
4277  /* this_thr->th.th_info.ds.ds_gtid is set up in
4278  kmp_allocate_thread/create_worker.
4279  this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4280  KMP_DEBUG_ASSERT(this_thr != NULL);
4281  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4282  KMP_DEBUG_ASSERT(team);
4283  KMP_DEBUG_ASSERT(team->t.t_threads);
4284  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4285  kmp_info_t *master = team->t.t_threads[0];
4286  KMP_DEBUG_ASSERT(master);
4287  KMP_DEBUG_ASSERT(master->th.th_root);
4288 
4289  KMP_MB();
4290 
4291  TCW_SYNC_PTR(this_thr->th.th_team, team);
4292 
4293  this_thr->th.th_info.ds.ds_tid = tid;
4294  this_thr->th.th_set_nproc = 0;
4295  if (__kmp_tasking_mode != tskm_immediate_exec)
4296  // When tasking is possible, threads are not safe to reap until they are
4297  // done tasking; this will be set when tasking code is exited in wait
4298  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4299  else // no tasking --> always safe to reap
4300  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4301  this_thr->th.th_set_proc_bind = proc_bind_default;
4302 
4303 #if KMP_AFFINITY_SUPPORTED
4304  this_thr->th.th_new_place = this_thr->th.th_current_place;
4305 #endif
4306  this_thr->th.th_root = master->th.th_root;
4307 
4308  /* setup the thread's cache of the team structure */
4309  this_thr->th.th_team_nproc = team->t.t_nproc;
4310  this_thr->th.th_team_master = master;
4311  this_thr->th.th_team_serialized = team->t.t_serialized;
4312 
4313  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4314 
4315  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4316  tid, gtid, this_thr, this_thr->th.th_current_task));
4317 
4318  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4319  team, tid, TRUE);
4320 
4321  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4322  tid, gtid, this_thr, this_thr->th.th_current_task));
4323  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4324  // __kmp_initialize_team()?
4325 
4326  /* TODO no worksharing in speculative threads */
4327  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4328 
4329  this_thr->th.th_local.this_construct = 0;
4330 
4331  if (!this_thr->th.th_pri_common) {
4332  this_thr->th.th_pri_common =
4333  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4334  if (__kmp_storage_map) {
4335  __kmp_print_storage_map_gtid(
4336  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4337  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4338  }
4339  this_thr->th.th_pri_head = NULL;
4340  }
4341 
4342  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4343  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4344  // Make new thread's CG root same as primary thread's
4345  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4346  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4347  if (tmp) {
4348  // worker changes CG, need to check if old CG should be freed
4349  int i = tmp->cg_nthreads--;
4350  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4351  " on node %p of thread %p to %d\n",
4352  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4353  if (i == 1) {
4354  __kmp_free(tmp); // last thread left CG --> free it
4355  }
4356  }
4357  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4358  // Increment new thread's CG root's counter to add the new thread
4359  this_thr->th.th_cg_roots->cg_nthreads++;
4360  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4361  " node %p of thread %p to %d\n",
4362  this_thr, this_thr->th.th_cg_roots,
4363  this_thr->th.th_cg_roots->cg_root,
4364  this_thr->th.th_cg_roots->cg_nthreads));
4365  this_thr->th.th_current_task->td_icvs.thread_limit =
4366  this_thr->th.th_cg_roots->cg_thread_limit;
4367  }
4368 
4369  /* Initialize dynamic dispatch */
4370  {
4371  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4372  // Use team max_nproc since this will never change for the team.
4373  size_t disp_size =
4374  sizeof(dispatch_private_info_t) *
4375  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4376  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4377  team->t.t_max_nproc));
4378  KMP_ASSERT(dispatch);
4379  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4380  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4381 
4382  dispatch->th_disp_index = 0;
4383  dispatch->th_doacross_buf_idx = 0;
4384  if (!dispatch->th_disp_buffer) {
4385  dispatch->th_disp_buffer =
4386  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4387 
4388  if (__kmp_storage_map) {
4389  __kmp_print_storage_map_gtid(
4390  gtid, &dispatch->th_disp_buffer[0],
4391  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4392  ? 1
4393  : __kmp_dispatch_num_buffers],
4394  disp_size,
4395  "th_%d.th_dispatch.th_disp_buffer "
4396  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4397  gtid, team->t.t_id, gtid);
4398  }
4399  } else {
4400  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4401  }
4402 
4403  dispatch->th_dispatch_pr_current = 0;
4404  dispatch->th_dispatch_sh_current = 0;
4405 
4406  dispatch->th_deo_fcn = 0; /* ORDERED */
4407  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4408  }
4409 
4410  this_thr->th.th_next_pool = NULL;
4411 
4412  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4413  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4414 
4415  KMP_MB();
4416 }
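// Illustrative sketch (not part of the runtime; kept under #if 0 so it never
// compiles): the contention-group bookkeeping above in isolation. A worker
// leaving its old CG decrements that group's count (freeing the group when it
// was the last member), then adopts the primary thread's CG and increments
// its count. sketch_cg_t is a hypothetical stand-in for kmp_cg_root_t.
#if 0
#include <stdlib.h>
typedef struct sketch_cg {
  int nthreads;
} sketch_cg_t;
static void sketch_switch_cg(sketch_cg_t **worker_cg, sketch_cg_t *primary_cg) {
  sketch_cg_t *old_cg = *worker_cg;
  if (old_cg != NULL && old_cg != primary_cg) {
    if (old_cg->nthreads-- == 1)
      free(old_cg); // the last member just left the old group
  }
  *worker_cg = primary_cg; // adopt the primary thread's group
  primary_cg->nthreads++;  // count this worker in the new group
}
#endif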
4417 
4418 /* allocate a new thread for the requesting team. this is only called from
4419  within a forkjoin critical section. we will first try to get an available
4420  thread from the thread pool. if none is available, we will fork a new one
4421  assuming we are able to create a new one. this should be assured, as the
4422  caller should check on this first. */
4423 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4424  int new_tid) {
4425  kmp_team_t *serial_team;
4426  kmp_info_t *new_thr;
4427  int new_gtid;
4428 
4429  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4430  KMP_DEBUG_ASSERT(root && team);
4431 #if !KMP_NESTED_HOT_TEAMS
4432  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4433 #endif
4434  KMP_MB();
4435 
4436  /* first, try to get one from the thread pool unless the allocating thread
4437  * is the main hidden helper thread. The hidden helper team should always
4438  * allocate new OS threads. */
4439  if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4440  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4441  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4442  if (new_thr == __kmp_thread_pool_insert_pt) {
4443  __kmp_thread_pool_insert_pt = NULL;
4444  }
4445  TCW_4(new_thr->th.th_in_pool, FALSE);
4446  __kmp_suspend_initialize_thread(new_thr);
4447  __kmp_lock_suspend_mx(new_thr);
4448  if (new_thr->th.th_active_in_pool == TRUE) {
4449  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4450  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4451  new_thr->th.th_active_in_pool = FALSE;
4452  }
4453  __kmp_unlock_suspend_mx(new_thr);
4454 
4455  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4456  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4457  KMP_ASSERT(!new_thr->th.th_team);
4458  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4459 
4460  /* setup the thread structure */
4461  __kmp_initialize_info(new_thr, team, new_tid,
4462  new_thr->th.th_info.ds.ds_gtid);
4463  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4464 
4465  TCW_4(__kmp_nth, __kmp_nth + 1);
4466 
4467  new_thr->th.th_task_state = 0;
4468 
4469  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4470  // Make sure pool thread has transitioned to waiting on own thread struct
4471  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4472  // Thread activated in __kmp_allocate_team when increasing team size
4473  }
4474 
4475 #ifdef KMP_ADJUST_BLOCKTIME
4476  /* Adjust blocktime back to zero if necessary */
4477  /* Middle initialization might not have occurred yet */
4478  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4479  if (__kmp_nth > __kmp_avail_proc) {
4480  __kmp_zero_bt = TRUE;
4481  }
4482  }
4483 #endif /* KMP_ADJUST_BLOCKTIME */
4484 
4485 #if KMP_DEBUG
4486  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4487  // KMP_BARRIER_PARENT_FLAG.
4488  int b;
4489  kmp_balign_t *balign = new_thr->th.th_bar;
4490  for (b = 0; b < bs_last_barrier; ++b)
4491  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4492 #endif
4493 
4494  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4495  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4496 
4497  KMP_MB();
4498  return new_thr;
4499  }
4500 
4501  /* no, we'll fork a new one */
4502  KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4503  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4504 
4505 #if KMP_USE_MONITOR
4506  // If this is the first worker thread the RTL is creating, then also
4507  // launch the monitor thread. We try to do this as early as possible.
4508  if (!TCR_4(__kmp_init_monitor)) {
4509  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4510  if (!TCR_4(__kmp_init_monitor)) {
4511  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4512  TCW_4(__kmp_init_monitor, 1);
4513  __kmp_create_monitor(&__kmp_monitor);
4514  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4515 #if KMP_OS_WINDOWS
4516  // AC: wait until monitor has started. This is a fix for CQ232808.
4517  // The reason is that if the library is loaded/unloaded in a loop with
4518  // small (parallel) work in between, then there is a high probability that
4519  // the monitor thread starts after the library has shut down. At shutdown it is
4520  // too late to cope with the problem, because when the primary thread is
4521  // in DllMain (process detach) the monitor has no chance to start (it is
4522  // blocked), and the primary thread has no means to inform the monitor that
4523  // the library has gone, because all the memory which the monitor can
4524  // access is going to be released/reset.
4525  while (TCR_4(__kmp_init_monitor) < 2) {
4526  KMP_YIELD(TRUE);
4527  }
4528  KF_TRACE(10, ("after monitor thread has started\n"));
4529 #endif
4530  }
4531  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4532  }
4533 #endif
4534 
4535  KMP_MB();
4536 
4537  {
4538  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4539  ? 1
4540  : __kmp_hidden_helper_threads_num + 1;
4541 
4542  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4543  ++new_gtid) {
4544  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4545  }
4546 
4547  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4548  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4549  }
4550  }
4551 
4552  /* allocate space for it. */
4553  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4554 
4555  new_thr->th.th_nt_strict = false;
4556  new_thr->th.th_nt_loc = NULL;
4557  new_thr->th.th_nt_sev = severity_fatal;
4558  new_thr->th.th_nt_msg = NULL;
4559 
4560  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4561 
4562 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4563  // suppress race condition detection on synchronization flags in debug mode;
4564  // this helps to analyze library internals by eliminating false positives
4565  __itt_suppress_mark_range(
4566  __itt_suppress_range, __itt_suppress_threading_errors,
4567  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4568  __itt_suppress_mark_range(
4569  __itt_suppress_range, __itt_suppress_threading_errors,
4570  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4571 #if KMP_OS_WINDOWS
4572  __itt_suppress_mark_range(
4573  __itt_suppress_range, __itt_suppress_threading_errors,
4574  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4575 #else
4576  __itt_suppress_mark_range(__itt_suppress_range,
4577  __itt_suppress_threading_errors,
4578  &new_thr->th.th_suspend_init_count,
4579  sizeof(new_thr->th.th_suspend_init_count));
4580 #endif
4581  // TODO: check if we need to also suppress b_arrived flags
4582  __itt_suppress_mark_range(__itt_suppress_range,
4583  __itt_suppress_threading_errors,
4584  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4585  sizeof(new_thr->th.th_bar[0].bb.b_go));
4586  __itt_suppress_mark_range(__itt_suppress_range,
4587  __itt_suppress_threading_errors,
4588  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4589  sizeof(new_thr->th.th_bar[1].bb.b_go));
4590  __itt_suppress_mark_range(__itt_suppress_range,
4591  __itt_suppress_threading_errors,
4592  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4593  sizeof(new_thr->th.th_bar[2].bb.b_go));
4594 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4595  if (__kmp_storage_map) {
4596  __kmp_print_thread_storage_map(new_thr, new_gtid);
4597  }
4598 
4599  // add the reserve serialized team, initialized from the team's primary thread
4600  {
4601  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4602  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4603  new_thr->th.th_serial_team = serial_team =
4604  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4605 #if OMPT_SUPPORT
4606  ompt_data_none, // root parallel id
4607 #endif
4608  proc_bind_default, &r_icvs,
4609  0 USE_NESTED_HOT_ARG(NULL));
4610  }
4611  KMP_ASSERT(serial_team);
4612  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for
4613  // execution (it is unused for now).
4614  serial_team->t.t_threads[0] = new_thr;
4615  KF_TRACE(10,
4616  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4617  new_thr));
4618 
4619  /* setup the thread structures */
4620  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4621 
4622 #if USE_FAST_MEMORY
4623  __kmp_initialize_fast_memory(new_thr);
4624 #endif /* USE_FAST_MEMORY */
4625 
4626 #if KMP_USE_BGET
4627  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4628  __kmp_initialize_bget(new_thr);
4629 #endif
4630 
4631  __kmp_init_random(new_thr); // Initialize random number generator
4632 
4633  /* Initialize these only once when thread is grabbed for a team allocation */
4634  KA_TRACE(20,
4635  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4636  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4637 
4638  int b;
4639  kmp_balign_t *balign = new_thr->th.th_bar;
4640  for (b = 0; b < bs_last_barrier; ++b) {
4641  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4642  balign[b].bb.team = NULL;
4643  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4644  balign[b].bb.use_oncore_barrier = 0;
4645  }
4646 
4647  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4648  new_thr->th.th_sleep_loc_type = flag_unset;
4649 
4650  new_thr->th.th_spin_here = FALSE;
4651  new_thr->th.th_next_waiting = 0;
4652 #if KMP_OS_UNIX
4653  new_thr->th.th_blocking = false;
4654 #endif
4655 
4656 #if KMP_AFFINITY_SUPPORTED
4657  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4658  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4659  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4660  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4661 #endif
4662  new_thr->th.th_def_allocator = __kmp_def_allocator;
4663  new_thr->th.th_prev_level = 0;
4664  new_thr->th.th_prev_num_threads = 1;
4665 
4666  TCW_4(new_thr->th.th_in_pool, FALSE);
4667  new_thr->th.th_active_in_pool = FALSE;
4668  TCW_4(new_thr->th.th_active, TRUE);
4669 
4670  new_thr->th.th_set_nested_nth = NULL;
4671  new_thr->th.th_set_nested_nth_sz = 0;
4672 
4673  /* adjust the global counters */
4674  __kmp_all_nth++;
4675  __kmp_nth++;
4676 
4677  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4678  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4679  if (__kmp_adjust_gtid_mode) {
4680  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4681  if (TCR_4(__kmp_gtid_mode) != 2) {
4682  TCW_4(__kmp_gtid_mode, 2);
4683  }
4684  } else {
4685  if (TCR_4(__kmp_gtid_mode) != 1) {
4686  TCW_4(__kmp_gtid_mode, 1);
4687  }
4688  }
4689  }
4690 
4691 #ifdef KMP_ADJUST_BLOCKTIME
4692  /* Adjust blocktime back to zero if necessary */
4693  /* Middle initialization might not have occurred yet */
4694  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4695  if (__kmp_nth > __kmp_avail_proc) {
4696  __kmp_zero_bt = TRUE;
4697  }
4698  }
4699 #endif /* KMP_ADJUST_BLOCKTIME */
4700 
4701 #if KMP_AFFINITY_SUPPORTED
4702  // Set the affinity and topology information for new thread
4703  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4704 #endif
4705 
4706  /* actually fork it and create the new worker thread */
4707  KF_TRACE(
4708  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4709  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4710  KF_TRACE(10,
4711  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4712 
4713  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4714  new_gtid));
4715  KMP_MB();
4716  return new_thr;
4717 }
4718 
4719 /* Reinitialize team for reuse.
4720  The hot team code calls this routine at every fork barrier, so the EPCC barrier
4721  tests are extremely sensitive to changes in it, esp. writes to the team
4722  struct, which cause a cache invalidation in all threads.
4723  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4724 static void __kmp_reinitialize_team(kmp_team_t *team,
4725  kmp_internal_control_t *new_icvs,
4726  ident_t *loc) {
4727  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4728  team->t.t_threads[0], team));
4729  KMP_DEBUG_ASSERT(team && new_icvs);
4730  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4731  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4732 
4733  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4734  // Copy ICVs to the primary thread's implicit taskdata
4735  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4736  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4737 
4738  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4739  team->t.t_threads[0], team));
4740 }
4741 
4742 /* Initialize the team data structure.
4743  This assumes the t_threads and t_max_nproc are already set.
4744  Also, we don't touch the arguments */
4745 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4746  kmp_internal_control_t *new_icvs,
4747  ident_t *loc) {
4748  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4749 
4750  /* verify */
4751  KMP_DEBUG_ASSERT(team);
4752  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4753  KMP_DEBUG_ASSERT(team->t.t_threads);
4754  KMP_MB();
4755 
4756  team->t.t_master_tid = 0; /* not needed */
4757  /* team->t.t_master_bar; not needed */
4758  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4759  team->t.t_nproc = new_nproc;
4760 
4761  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4762  team->t.t_next_pool = NULL;
4763  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4764  * up hot team */
4765 
4766  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4767  team->t.t_invoke = NULL; /* not needed */
4768 
4769  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4770  team->t.t_sched.sched = new_icvs->sched.sched;
4771 
4772 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4773  team->t.t_fp_control_saved = FALSE; /* not needed */
4774  team->t.t_x87_fpu_control_word = 0; /* not needed */
4775  team->t.t_mxcsr = 0; /* not needed */
4776 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4777 
4778  team->t.t_construct = 0;
4779 
4780  team->t.t_ordered.dt.t_value = 0;
4781  team->t.t_master_active = FALSE;
4782 
4783 #ifdef KMP_DEBUG
4784  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4785 #endif
4786 #if KMP_OS_WINDOWS
4787  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4788 #endif
4789 
4790  team->t.t_control_stack_top = NULL;
4791 
4792  __kmp_reinitialize_team(team, new_icvs, loc);
4793 
4794  KMP_MB();
4795  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4796 }
4797 
4798 #if KMP_AFFINITY_SUPPORTED
4799 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4800  int first, int last, int newp) {
4801  th->th.th_first_place = first;
4802  th->th.th_last_place = last;
4803  th->th.th_new_place = newp;
4804  if (newp != th->th.th_current_place) {
4805  if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4806  team->t.t_display_affinity = 1;
4807  // Copy topology information associated with the new place
4808  th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4809  th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4810  }
4811 }
4812 
4813 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4814 // It calculates the worker + primary thread's partition based upon the parent
4815 // thread's partition, and binds each worker to a place in its partition.
4816 // The primary thread's partition should already include its current binding.
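// As a concrete illustration (not from the source) of the cases handled below,
// suppose the primary thread's partition is places [0,7] (8 places), it is
// currently bound to place 2, and the team has 4 threads:
//   proc_bind(primary): workers 1..3 are all bound to place 2;
//   proc_bind(close):   workers 1..3 are bound to places 3, 4 and 5, and every
//                       thread keeps the full partition [0,7];
//   proc_bind(spread):  each thread gets its own sub-partition of ~2 places and
//                       is bound to the first place of that sub-partition.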
4817 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4818  // Do not partition places for the hidden helper team
4819  if (KMP_HIDDEN_HELPER_TEAM(team))
4820  return;
4821  // Copy the primary thread's place partition to the team struct
4822  kmp_info_t *master_th = team->t.t_threads[0];
4823  KMP_DEBUG_ASSERT(master_th != NULL);
4824  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4825  int first_place = master_th->th.th_first_place;
4826  int last_place = master_th->th.th_last_place;
4827  int masters_place = master_th->th.th_current_place;
4828  int num_masks = __kmp_affinity.num_masks;
4829  team->t.t_first_place = first_place;
4830  team->t.t_last_place = last_place;
4831 
4832  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4833  "bound to place %d partition = [%d,%d]\n",
4834  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4835  team->t.t_id, masters_place, first_place, last_place));
4836 
4837  switch (proc_bind) {
4838 
4839  case proc_bind_default:
4840  // Serial teams might have the proc_bind policy set to proc_bind_default.
4841  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4842  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4843  break;
4844 
4845  case proc_bind_primary: {
4846  int f;
4847  int n_th = team->t.t_nproc;
4848  for (f = 1; f < n_th; f++) {
4849  kmp_info_t *th = team->t.t_threads[f];
4850  KMP_DEBUG_ASSERT(th != NULL);
4851  __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4852 
4853  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4854  "partition = [%d,%d]\n",
4855  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4856  f, masters_place, first_place, last_place));
4857  }
4858  } break;
4859 
4860  case proc_bind_close: {
4861  int f;
4862  int n_th = team->t.t_nproc;
4863  int n_places;
4864  if (first_place <= last_place) {
4865  n_places = last_place - first_place + 1;
4866  } else {
4867  n_places = num_masks - first_place + last_place + 1;
4868  }
4869  if (n_th <= n_places) {
4870  int place = masters_place;
4871  for (f = 1; f < n_th; f++) {
4872  kmp_info_t *th = team->t.t_threads[f];
4873  KMP_DEBUG_ASSERT(th != NULL);
4874 
4875  if (place == last_place) {
4876  place = first_place;
4877  } else if (place == (num_masks - 1)) {
4878  place = 0;
4879  } else {
4880  place++;
4881  }
4882  __kmp_set_thread_place(team, th, first_place, last_place, place);
4883 
4884  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4885  "partition = [%d,%d]\n",
4886  __kmp_gtid_from_thread(team->t.t_threads[f]),
4887  team->t.t_id, f, place, first_place, last_place));
4888  }
4889  } else {
4890  int S, rem, gap, s_count;
4891  S = n_th / n_places;
4892  s_count = 0;
4893  rem = n_th - (S * n_places);
4894  gap = rem > 0 ? n_places / rem : n_places;
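// Worked example (illustrative only): with n_th = 10 threads over n_places = 4
// places and the primary bound to the first place of its partition, S = 2,
// rem = 2, gap = 2, and the loop below assigns 3, 2, 3, 2 threads to the four
// places in order, finishing with place == masters_place as asserted at the
// end of this block.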
4895  int place = masters_place;
4896  int gap_ct = gap;
4897  for (f = 0; f < n_th; f++) {
4898  kmp_info_t *th = team->t.t_threads[f];
4899  KMP_DEBUG_ASSERT(th != NULL);
4900 
4901  __kmp_set_thread_place(team, th, first_place, last_place, place);
4902  s_count++;
4903 
4904  if ((s_count == S) && rem && (gap_ct == gap)) {
4905  // do nothing, add an extra thread to place on next iteration
4906  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4907  // we added an extra thread to this place; move to next place
4908  if (place == last_place) {
4909  place = first_place;
4910  } else if (place == (num_masks - 1)) {
4911  place = 0;
4912  } else {
4913  place++;
4914  }
4915  s_count = 0;
4916  gap_ct = 1;
4917  rem--;
4918  } else if (s_count == S) { // place full; don't add extra
4919  if (place == last_place) {
4920  place = first_place;
4921  } else if (place == (num_masks - 1)) {
4922  place = 0;
4923  } else {
4924  place++;
4925  }
4926  gap_ct++;
4927  s_count = 0;
4928  }
4929 
4930  KA_TRACE(100,
4931  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4932  "partition = [%d,%d]\n",
4933  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4934  th->th.th_new_place, first_place, last_place));
4935  }
4936  KMP_DEBUG_ASSERT(place == masters_place);
4937  }
4938  } break;
4939 
4940  case proc_bind_spread: {
4941  int f;
4942  int n_th = team->t.t_nproc;
4943  int n_places;
4944  int thidx;
4945  if (first_place <= last_place) {
4946  n_places = last_place - first_place + 1;
4947  } else {
4948  n_places = num_masks - first_place + last_place + 1;
4949  }
4950  if (n_th <= n_places) {
4951  int place = -1;
4952 
4953  if (n_places != num_masks) {
4954  int S = n_places / n_th;
4955  int s_count, rem, gap, gap_ct;
4956 
4957  place = masters_place;
4958  rem = n_places - n_th * S;
4959  gap = rem ? n_th / rem : 1;
4960  gap_ct = gap;
4961  thidx = n_th;
4962  if (update_master_only == 1)
4963  thidx = 1;
4964  for (f = 0; f < thidx; f++) {
4965  kmp_info_t *th = team->t.t_threads[f];
4966  KMP_DEBUG_ASSERT(th != NULL);
4967 
4968  int fplace = place, nplace = place;
4969  s_count = 1;
4970  while (s_count < S) {
4971  if (place == last_place) {
4972  place = first_place;
4973  } else if (place == (num_masks - 1)) {
4974  place = 0;
4975  } else {
4976  place++;
4977  }
4978  s_count++;
4979  }
4980  if (rem && (gap_ct == gap)) {
4981  if (place == last_place) {
4982  place = first_place;
4983  } else if (place == (num_masks - 1)) {
4984  place = 0;
4985  } else {
4986  place++;
4987  }
4988  rem--;
4989  gap_ct = 0;
4990  }
4991  __kmp_set_thread_place(team, th, fplace, place, nplace);
4992  gap_ct++;
4993 
4994  if (place == last_place) {
4995  place = first_place;
4996  } else if (place == (num_masks - 1)) {
4997  place = 0;
4998  } else {
4999  place++;
5000  }
5001 
5002  KA_TRACE(100,
5003  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5004  "partition = [%d,%d], num_masks: %u\n",
5005  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5006  f, th->th.th_new_place, th->th.th_first_place,
5007  th->th.th_last_place, num_masks));
5008  }
5009  } else {
5010  /* Having a uniform space of available computation places, we can create
5011  T partitions of roughly P/T places each and put each thread into the first
5012  place of its partition. */
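// Worked example (illustrative only): with n_places == num_masks == 8,
// n_th = 4 and the primary bound to place 0, spacing = 9/4 = 2.25, so the
// loop below produces the partitions [0,1], [2,3], [4,5], [6,7], each thread
// bound to the first place of its partition.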
5013  double current = static_cast<double>(masters_place);
5014  double spacing =
5015  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5016  int first, last;
5017  kmp_info_t *th;
5018 
5019  thidx = n_th + 1;
5020  if (update_master_only == 1)
5021  thidx = 1;
5022  for (f = 0; f < thidx; f++) {
5023  first = static_cast<int>(current);
5024  last = static_cast<int>(current + spacing) - 1;
5025  KMP_DEBUG_ASSERT(last >= first);
5026  if (first >= n_places) {
5027  if (masters_place) {
5028  first -= n_places;
5029  last -= n_places;
5030  if (first == (masters_place + 1)) {
5031  KMP_DEBUG_ASSERT(f == n_th);
5032  first--;
5033  }
5034  if (last == masters_place) {
5035  KMP_DEBUG_ASSERT(f == (n_th - 1));
5036  last--;
5037  }
5038  } else {
5039  KMP_DEBUG_ASSERT(f == n_th);
5040  first = 0;
5041  last = 0;
5042  }
5043  }
5044  if (last >= n_places) {
5045  last = (n_places - 1);
5046  }
5047  place = first;
5048  current += spacing;
5049  if (f < n_th) {
5050  KMP_DEBUG_ASSERT(0 <= first);
5051  KMP_DEBUG_ASSERT(n_places > first);
5052  KMP_DEBUG_ASSERT(0 <= last);
5053  KMP_DEBUG_ASSERT(n_places > last);
5054  KMP_DEBUG_ASSERT(last_place >= first_place);
5055  th = team->t.t_threads[f];
5056  KMP_DEBUG_ASSERT(th);
5057  __kmp_set_thread_place(team, th, first, last, place);
5058  KA_TRACE(100,
5059  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5060  "partition = [%d,%d], spacing = %.4f\n",
5061  __kmp_gtid_from_thread(team->t.t_threads[f]),
5062  team->t.t_id, f, th->th.th_new_place,
5063  th->th.th_first_place, th->th.th_last_place, spacing));
5064  }
5065  }
5066  }
5067  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5068  } else {
5069  int S, rem, gap, s_count;
5070  S = n_th / n_places;
5071  s_count = 0;
5072  rem = n_th - (S * n_places);
5073  gap = rem > 0 ? n_places / rem : n_places;
5074  int place = masters_place;
5075  int gap_ct = gap;
5076  thidx = n_th;
5077  if (update_master_only == 1)
5078  thidx = 1;
5079  for (f = 0; f < thidx; f++) {
5080  kmp_info_t *th = team->t.t_threads[f];
5081  KMP_DEBUG_ASSERT(th != NULL);
5082 
5083  __kmp_set_thread_place(team, th, place, place, place);
5084  s_count++;
5085 
5086  if ((s_count == S) && rem && (gap_ct == gap)) {
5087  // do nothing, add an extra thread to place on next iteration
5088  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5089  // we added an extra thread to this place; move on to next place
5090  if (place == last_place) {
5091  place = first_place;
5092  } else if (place == (num_masks - 1)) {
5093  place = 0;
5094  } else {
5095  place++;
5096  }
5097  s_count = 0;
5098  gap_ct = 1;
5099  rem--;
5100  } else if (s_count == S) { // place is full; don't add extra thread
5101  if (place == last_place) {
5102  place = first_place;
5103  } else if (place == (num_masks - 1)) {
5104  place = 0;
5105  } else {
5106  place++;
5107  }
5108  gap_ct++;
5109  s_count = 0;
5110  }
5111 
5112  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5113  "partition = [%d,%d]\n",
5114  __kmp_gtid_from_thread(team->t.t_threads[f]),
5115  team->t.t_id, f, th->th.th_new_place,
5116  th->th.th_first_place, th->th.th_last_place));
5117  }
5118  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5119  }
5120  } break;
5121 
5122  default:
5123  break;
5124  }
5125 
5126  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5127 }
5128 
5129 #endif // KMP_AFFINITY_SUPPORTED
5130 
5131 /* allocate a new team data structure to use. take one off of the free pool if
5132  available */
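/* An outline (illustrative only) of the three sources tried below, in order:
     1. reuse the root's (or nested) "hot" team, resizing it if the requested
        number of threads changed;
     2. otherwise take a team with t_max_nproc >= max_nproc from
        __kmp_team_pool;
     3. otherwise __kmp_allocate a fresh kmp_team_t and its arrays. */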
5133 kmp_team_t *
5134 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5135 #if OMPT_SUPPORT
5136  ompt_data_t ompt_parallel_data,
5137 #endif
5138  kmp_proc_bind_t new_proc_bind,
5139  kmp_internal_control_t *new_icvs,
5140  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5141  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5142  int f;
5143  kmp_team_t *team;
5144  int use_hot_team = !root->r.r_active;
5145  int level = 0;
5146  int do_place_partition = 1;
5147 
5148  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5149  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5150  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5151  KMP_MB();
5152 
5153 #if KMP_NESTED_HOT_TEAMS
5154  kmp_hot_team_ptr_t *hot_teams;
5155  if (master) {
5156  team = master->th.th_team;
5157  level = team->t.t_active_level;
5158  if (master->th.th_teams_microtask) { // in teams construct?
5159  if (master->th.th_teams_size.nteams > 1 &&
5160  ( // #teams > 1
5161  team->t.t_pkfn ==
5162  (microtask_t)__kmp_teams_master || // inner fork of the teams
5163  master->th.th_teams_level <
5164  team->t.t_level)) { // or nested parallel inside the teams
5165  ++level; // do not increment if #teams==1 or for the outer fork of the teams;
5166  // increment otherwise
5167  }
5168  // Do not perform the place partition for the inner fork of the teams;
5169  // wait until a nested parallel region is encountered inside the teams construct
5170  if ((master->th.th_teams_size.nteams == 1 &&
5171  master->th.th_teams_level >= team->t.t_level) ||
5172  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5173  do_place_partition = 0;
5174  }
5175  hot_teams = master->th.th_hot_teams;
5176  if (level < __kmp_hot_teams_max_level && hot_teams &&
5177  hot_teams[level].hot_team) {
5178  // hot team has already been allocated for given level
5179  use_hot_team = 1;
5180  } else {
5181  use_hot_team = 0;
5182  }
5183  } else {
5184  // check we won't access uninitialized hot_teams, just in case
5185  KMP_DEBUG_ASSERT(new_nproc == 1);
5186  }
5187 #endif
5188  // Optimization to use a "hot" team
5189  if (use_hot_team && new_nproc > 1) {
5190  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5191 #if KMP_NESTED_HOT_TEAMS
5192  team = hot_teams[level].hot_team;
5193 #else
5194  team = root->r.r_hot_team;
5195 #endif
5196 #if KMP_DEBUG
5197  if (__kmp_tasking_mode != tskm_immediate_exec) {
5198  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5199  "task_team[1] = %p before reinit\n",
5200  team->t.t_task_team[0], team->t.t_task_team[1]));
5201  }
5202 #endif
5203 
5204  if (team->t.t_nproc != new_nproc &&
5205  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5206  // Distributed barrier may need a resize
5207  int old_nthr = team->t.t_nproc;
5208  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5209  }
5210 
5211  // If not doing the place partition, then reset the team's proc bind
5212  // to indicate that partitioning of all threads still needs to take place
5213  if (do_place_partition == 0)
5214  team->t.t_proc_bind = proc_bind_default;
5215  // Has the number of threads changed?
5216  /* Let's assume the most common case is that the number of threads is
5217  unchanged, and put that case first. */
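// In outline (illustrative only), the three branches that follow are:
//   t_nproc == new_nproc : refresh ICVs/schedule and re-partition places only
//                          if the bind policy or team size actually changed;
//   t_nproc >  new_nproc : release (or, in hot-teams mode 1, park) the surplus
//                          threads and shrink the team;
//   t_nproc <  new_nproc : pull in any reserved threads, allocate new workers,
//                          grow the team arrays if needed, then reinitialize
//                          every thread's info.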
5218  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5219  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5220  // This case can mean that omp_set_num_threads() was called and the hot
5221  // team size was already reduced, so we check the special flag
5222  if (team->t.t_size_changed == -1) {
5223  team->t.t_size_changed = 1;
5224  } else {
5225  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5226  }
5227 
5228  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5229  kmp_r_sched_t new_sched = new_icvs->sched;
5230  // set primary thread's schedule as new run-time schedule
5231  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5232 
5233  __kmp_reinitialize_team(team, new_icvs,
5234  root->r.r_uber_thread->th.th_ident);
5235 
5236  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5237  team->t.t_threads[0], team));
5238  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5239 
5240 #if KMP_AFFINITY_SUPPORTED
5241  if ((team->t.t_size_changed == 0) &&
5242  (team->t.t_proc_bind == new_proc_bind)) {
5243  if (new_proc_bind == proc_bind_spread) {
5244  if (do_place_partition) {
5245  // add flag to update only master for spread
5246  __kmp_partition_places(team, 1);
5247  }
5248  }
5249  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5250  "proc_bind = %d, partition = [%d,%d]\n",
5251  team->t.t_id, new_proc_bind, team->t.t_first_place,
5252  team->t.t_last_place));
5253  } else {
5254  if (do_place_partition) {
5255  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5256  __kmp_partition_places(team);
5257  }
5258  }
5259 #else
5260  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5261 #endif /* KMP_AFFINITY_SUPPORTED */
5262  } else if (team->t.t_nproc > new_nproc) {
5263  KA_TRACE(20,
5264  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5265  new_nproc));
5266 
5267  team->t.t_size_changed = 1;
5268  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5269  // Barrier size already reduced earlier in this function
5270  // Activate team threads via th_used_in_team
5271  __kmp_add_threads_to_team(team, new_nproc);
5272  }
5273  // When decreasing team size, threads no longer in the team should
5274  // unref task team.
5275  if (__kmp_tasking_mode != tskm_immediate_exec) {
5276  for (f = new_nproc; f < team->t.t_nproc; f++) {
5277  kmp_info_t *th = team->t.t_threads[f];
5278  KMP_DEBUG_ASSERT(th);
5279  th->th.th_task_team = NULL;
5280  }
5281  }
5282 #if KMP_NESTED_HOT_TEAMS
5283  if (__kmp_hot_teams_mode == 0) {
5284  // AC: saved number of threads should correspond to the team's value in this
5285  // mode; it can be bigger in mode 1, when the hot team has threads in reserve
5286  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5287  hot_teams[level].hot_team_nth = new_nproc;
5288 #endif // KMP_NESTED_HOT_TEAMS
5289  /* release the extra threads we don't need any more */
5290  for (f = new_nproc; f < team->t.t_nproc; f++) {
5291  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292  __kmp_free_thread(team->t.t_threads[f]);
5293  team->t.t_threads[f] = NULL;
5294  }
5295 #if KMP_NESTED_HOT_TEAMS
5296  } // (__kmp_hot_teams_mode == 0)
5297  else {
5298  // When keeping extra threads in team, switch threads to wait on own
5299  // b_go flag
5300  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5301  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5302  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5303  for (int b = 0; b < bs_last_barrier; ++b) {
5304  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5305  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5306  }
5307  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5308  }
5309  }
5310  }
5311 #endif // KMP_NESTED_HOT_TEAMS
5312  team->t.t_nproc = new_nproc;
5313  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5314  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5315  __kmp_reinitialize_team(team, new_icvs,
5316  root->r.r_uber_thread->th.th_ident);
5317 
5318  // Update remaining threads
5319  for (f = 0; f < new_nproc; ++f) {
5320  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5321  }
5322 
5323  // restore the current task state of the primary thread: should be the
5324  // implicit task
5325  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5326  team->t.t_threads[0], team));
5327 
5328  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5329 
5330 #ifdef KMP_DEBUG
5331  for (f = 0; f < team->t.t_nproc; f++) {
5332  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5333  team->t.t_threads[f]->th.th_team_nproc ==
5334  team->t.t_nproc);
5335  }
5336 #endif
5337 
5338  if (do_place_partition) {
5339  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5340 #if KMP_AFFINITY_SUPPORTED
5341  __kmp_partition_places(team);
5342 #endif
5343  }
5344  } else { // team->t.t_nproc < new_nproc
5345 
5346  KA_TRACE(20,
5347  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5348  new_nproc));
5349  int old_nproc = team->t.t_nproc; // saved to later update only the new threads
5350  team->t.t_size_changed = 1;
5351 
5352 #if KMP_NESTED_HOT_TEAMS
5353  int avail_threads = hot_teams[level].hot_team_nth;
5354  if (new_nproc < avail_threads)
5355  avail_threads = new_nproc;
5356  kmp_info_t **other_threads = team->t.t_threads;
5357  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5358  // Adjust barrier data of reserved threads (if any) of the team
5359  // Other data will be set in __kmp_initialize_info() below.
5360  int b;
5361  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5362  for (b = 0; b < bs_last_barrier; ++b) {
5363  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5364  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5365 #if USE_DEBUGGER
5366  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5367 #endif
5368  }
5369  }
5370  if (hot_teams[level].hot_team_nth >= new_nproc) {
5371  // we have all needed threads in reserve, no need to allocate any
5372  // this is only possible in mode 1; we cannot have reserved threads in mode 0
5373  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5374  team->t.t_nproc = new_nproc; // just get reserved threads involved
5375  } else {
5376  // We may have some threads in reserve, but not enough;
5377  // get reserved threads involved if any.
5378  team->t.t_nproc = hot_teams[level].hot_team_nth;
5379  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5380 #endif // KMP_NESTED_HOT_TEAMS
5381  if (team->t.t_max_nproc < new_nproc) {
5382  /* reallocate larger arrays */
5383  __kmp_reallocate_team_arrays(team, new_nproc);
5384  __kmp_reinitialize_team(team, new_icvs, NULL);
5385  }
5386 
5387 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5388  KMP_AFFINITY_SUPPORTED
5389  /* Temporarily set full mask for primary thread before creation of
5390  workers. The reason is that workers inherit the affinity from the
5391  primary thread, so if a lot of workers are created quickly on a single
5392  core, they don't get a chance to set their own affinity for
5393  a long time. */
5394  kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5395 #endif
5396 
5397  /* allocate new threads for the hot team */
5398  for (f = team->t.t_nproc; f < new_nproc; f++) {
5399  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5400  KMP_DEBUG_ASSERT(new_worker);
5401  team->t.t_threads[f] = new_worker;
5402 
5403  KA_TRACE(20,
5404  ("__kmp_allocate_team: team %d init T#%d arrived: "
5405  "join=%llu, plain=%llu\n",
5406  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5407  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5408  team->t.t_bar[bs_plain_barrier].b_arrived));
5409 
5410  { // Initialize barrier data for new threads.
5411  int b;
5412  kmp_balign_t *balign = new_worker->th.th_bar;
5413  for (b = 0; b < bs_last_barrier; ++b) {
5414  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5415  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5416  KMP_BARRIER_PARENT_FLAG);
5417 #if USE_DEBUGGER
5418  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5419 #endif
5420  }
5421  }
5422  }
5423 
5424 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5425  KMP_AFFINITY_SUPPORTED
5426  /* Restore initial primary thread's affinity mask */
5427  new_temp_affinity.restore();
5428 #endif
5429 #if KMP_NESTED_HOT_TEAMS
5430  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5431 #endif // KMP_NESTED_HOT_TEAMS
5432  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5433  // Barrier size already increased earlier in this function
5434  // Activate team threads via th_used_in_team
5435  __kmp_add_threads_to_team(team, new_nproc);
5436  }
5437  /* make sure everyone is synchronized */
5438  // new threads below
5439  __kmp_initialize_team(team, new_nproc, new_icvs,
5440  root->r.r_uber_thread->th.th_ident);
5441 
5442  /* reinitialize the threads */
5443  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5444  for (f = 0; f < team->t.t_nproc; ++f)
5445  __kmp_initialize_info(team->t.t_threads[f], team, f,
5446  __kmp_gtid_from_tid(f, team));
5447 
5448  // set th_task_state for new threads in hot team with older thread's state
5449  kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5450  for (f = old_nproc; f < team->t.t_nproc; ++f)
5451  team->t.t_threads[f]->th.th_task_state = old_state;
5452 
5453 #ifdef KMP_DEBUG
5454  for (f = 0; f < team->t.t_nproc; ++f) {
5455  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5456  team->t.t_threads[f]->th.th_team_nproc ==
5457  team->t.t_nproc);
5458  }
5459 #endif
5460 
5461  if (do_place_partition) {
5462  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5463 #if KMP_AFFINITY_SUPPORTED
5464  __kmp_partition_places(team);
5465 #endif
5466  }
5467  } // Check changes in number of threads
5468 
5469  if (master->th.th_teams_microtask) {
5470  for (f = 1; f < new_nproc; ++f) {
5471  // propagate teams construct specific info to workers
5472  kmp_info_t *thr = team->t.t_threads[f];
5473  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5474  thr->th.th_teams_level = master->th.th_teams_level;
5475  thr->th.th_teams_size = master->th.th_teams_size;
5476  }
5477  }
5478 #if KMP_NESTED_HOT_TEAMS
5479  if (level) {
5480  // Sync barrier state for nested hot teams, not needed for outermost hot
5481  // team.
5482  for (f = 1; f < new_nproc; ++f) {
5483  kmp_info_t *thr = team->t.t_threads[f];
5484  int b;
5485  kmp_balign_t *balign = thr->th.th_bar;
5486  for (b = 0; b < bs_last_barrier; ++b) {
5487  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5488  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5489 #if USE_DEBUGGER
5490  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5491 #endif
5492  }
5493  }
5494  }
5495 #endif // KMP_NESTED_HOT_TEAMS
5496 
5497  /* reallocate space for arguments if necessary */
5498  __kmp_alloc_argv_entries(argc, team, TRUE);
5499  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500  // The hot team re-uses the previous task team,
5501  // if untouched during the previous release->gather phase.
5502 
5503  KF_TRACE(10, (" hot_team = %p\n", team));
5504 
5505 #if KMP_DEBUG
5506  if (__kmp_tasking_mode != tskm_immediate_exec) {
5507  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5508  "task_team[1] = %p after reinit\n",
5509  team->t.t_task_team[0], team->t.t_task_team[1]));
5510  }
5511 #endif
5512 
5513 #if OMPT_SUPPORT
5514  __ompt_team_assign_id(team, ompt_parallel_data);
5515 #endif
5516 
5517  KMP_MB();
5518 
5519  return team;
5520  }
5521 
5522  /* next, let's try to take one from the team pool */
5523  KMP_MB();
5524  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5525  /* TODO: consider resizing undersized teams instead of reaping them, now
5526  that we have a resizing mechanism */
5527  if (team->t.t_max_nproc >= max_nproc) {
5528  /* take this team from the team pool */
5529  __kmp_team_pool = team->t.t_next_pool;
5530 
5531  if (max_nproc > 1 &&
5532  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5533  if (!team->t.b) { // Allocate barrier structure
5534  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5535  }
5536  }
5537 
5538  /* setup the team for fresh use */
5539  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5540 
5541  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5542  "task_team[1] %p to NULL\n",
5543  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5544  team->t.t_task_team[0] = NULL;
5545  team->t.t_task_team[1] = NULL;
5546 
5547  /* reallocate space for arguments if necessary */
5548  __kmp_alloc_argv_entries(argc, team, TRUE);
5549  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5550 
5551  KA_TRACE(
5552  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5553  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5554  { // Initialize barrier data.
5555  int b;
5556  for (b = 0; b < bs_last_barrier; ++b) {
5557  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5558 #if USE_DEBUGGER
5559  team->t.t_bar[b].b_master_arrived = 0;
5560  team->t.t_bar[b].b_team_arrived = 0;
5561 #endif
5562  }
5563  }
5564 
5565  team->t.t_proc_bind = new_proc_bind;
5566 
5567  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5568  team->t.t_id));
5569 
5570 #if OMPT_SUPPORT
5571  __ompt_team_assign_id(team, ompt_parallel_data);
5572 #endif
5573 
5574  team->t.t_nested_nth = NULL;
5575 
5576  KMP_MB();
5577 
5578  return team;
5579  }
5580 
5581  /* reap team if it is too small, then loop back and check the next one */
5582  // not sure if this is wise, but it will be redone during the hot-teams
5583  // rewrite.
5584  /* TODO: Use technique to find the right size hot-team, don't reap them */
5585  team = __kmp_reap_team(team);
5586  __kmp_team_pool = team;
5587  }
5588 
5589  /* nothing available in the pool, no matter, make a new team! */
5590  KMP_MB();
5591  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5592 
5593  /* and set it up */
5594  team->t.t_max_nproc = max_nproc;
5595  if (max_nproc > 1 &&
5596  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5597  // Allocate barrier structure
5598  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5599  }
5600 
5601  /* NOTE well, for some reason allocating one big buffer and dividing it up
5602  seems to really hurt performance a lot on the P4, so, let's not use this */
5603  __kmp_allocate_team_arrays(team, max_nproc);
5604 
5605  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5606  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5607 
5608  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5609  "%p to NULL\n",
5610  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5611  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5612  // memory, no need to duplicate
5613  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5614  // memory, no need to duplicate
5615 
5616  if (__kmp_storage_map) {
5617  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5618  }
5619 
5620  /* allocate space for arguments */
5621  __kmp_alloc_argv_entries(argc, team, FALSE);
5622  team->t.t_argc = argc;
5623 
5624  KA_TRACE(20,
5625  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5626  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5627  { // Initialize barrier data.
5628  int b;
5629  for (b = 0; b < bs_last_barrier; ++b) {
5630  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5631 #if USE_DEBUGGER
5632  team->t.t_bar[b].b_master_arrived = 0;
5633  team->t.t_bar[b].b_team_arrived = 0;
5634 #endif
5635  }
5636  }
5637 
5638  team->t.t_proc_bind = new_proc_bind;
5639 
5640 #if OMPT_SUPPORT
5641  __ompt_team_assign_id(team, ompt_parallel_data);
5642  team->t.ompt_serialized_team_info = NULL;
5643 #endif
5644 
5645  KMP_MB();
5646 
5647  team->t.t_nested_nth = NULL;
5648 
5649  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5650  team->t.t_id));
5651 
5652  return team;
5653 }
5654 
5655 /* TODO implement hot-teams at all levels */
5656 /* TODO implement lazy thread release on demand (disband request) */
5657 
5658 /* free the team. return it to the team pool. release all the threads
5659  * associated with it */
5660 void __kmp_free_team(kmp_root_t *root,
5661  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5662  int f;
5663  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5664  team->t.t_id));
5665 
5666  /* verify state */
5667  KMP_DEBUG_ASSERT(root);
5668  KMP_DEBUG_ASSERT(team);
5669  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5670  KMP_DEBUG_ASSERT(team->t.t_threads);
5671 
5672  int use_hot_team = team == root->r.r_hot_team;
5673 #if KMP_NESTED_HOT_TEAMS
5674  int level;
5675  if (master) {
5676  level = team->t.t_active_level - 1;
5677  if (master->th.th_teams_microtask) { // in teams construct?
5678  if (master->th.th_teams_size.nteams > 1) {
5679  ++level; // level was not increased in teams construct for
5680  // team_of_masters
5681  }
5682  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5683  master->th.th_teams_level == team->t.t_level) {
5684  ++level; // level was not increased in teams construct for
5685  // team_of_workers before the parallel
5686  } // team->t.t_level will be increased inside parallel
5687  }
5688 #if KMP_DEBUG
5689  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5690 #endif
5691  if (level < __kmp_hot_teams_max_level) {
5692  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5693  use_hot_team = 1;
5694  }
5695  }
5696 #endif // KMP_NESTED_HOT_TEAMS
5697 
5698  /* team is done working */
5699  TCW_SYNC_PTR(team->t.t_pkfn,
5700  NULL); // Important for Debugging Support Library.
5701 #if KMP_OS_WINDOWS
5702  team->t.t_copyin_counter = 0; // init counter for possible reuse
5703 #endif
5704  // Do not reset pointer to parent team to NULL for hot teams.
5705 
5706  /* if we are non-hot team, release our threads */
5707  if (!use_hot_team) {
5708  if (__kmp_tasking_mode != tskm_immediate_exec) {
5709  // Wait for threads to reach reapable state
5710  for (f = 1; f < team->t.t_nproc; ++f) {
5711  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5712  kmp_info_t *th = team->t.t_threads[f];
5713  volatile kmp_uint32 *state = &th->th.th_reap_state;
5714  while (*state != KMP_SAFE_TO_REAP) {
5715 #if KMP_OS_WINDOWS
5716  // On Windows a thread can be killed at any time, check this
5717  DWORD ecode;
5718  if (!__kmp_is_thread_alive(th, &ecode)) {
5719  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5720  break;
5721  }
5722 #endif
5723  // first check if thread is sleeping
5724  if (th->th.th_sleep_loc)
5725  __kmp_null_resume_wrapper(th);
5726  KMP_CPU_PAUSE();
5727  }
5728  }
5729 
5730  // Delete task teams
5731  int tt_idx;
5732  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5733  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5734  if (task_team != NULL) {
5735  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5736  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5737  team->t.t_threads[f]->th.th_task_team = NULL;
5738  }
5739  KA_TRACE(
5740  20,
5741  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5742  __kmp_get_gtid(), task_team, team->t.t_id));
5743 #if KMP_NESTED_HOT_TEAMS
5744  __kmp_free_task_team(master, task_team);
5745 #endif
5746  team->t.t_task_team[tt_idx] = NULL;
5747  }
5748  }
5749  }
5750 
5751  // Before clearing parent pointer, check if nested_nth list should be freed
5752  if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5753  team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5754  KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5755  KMP_INTERNAL_FREE(team->t.t_nested_nth);
5756  }
5757  team->t.t_nested_nth = NULL;
5758 
5759  // Reset pointer to parent team only for non-hot teams.
5760  team->t.t_parent = NULL;
5761  team->t.t_level = 0;
5762  team->t.t_active_level = 0;
5763 
5764  /* free the worker threads */
5765  for (f = 1; f < team->t.t_nproc; ++f) {
5766  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769  1, 2);
5770  }
5771  __kmp_free_thread(team->t.t_threads[f]);
5772  }
5773 
5774  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775  if (team->t.b) {
5776  // wake up thread at old location
5777  team->t.b->go_release();
5778  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779  for (f = 1; f < team->t.t_nproc; ++f) {
5780  if (team->t.b->sleep[f].sleep) {
5781  __kmp_atomic_resume_64(
5782  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783  (kmp_atomic_flag_64<> *)NULL);
5784  }
5785  }
5786  }
5787  // Wait for threads to be removed from team
5788  for (int f = 1; f < team->t.t_nproc; ++f) {
5789  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790  KMP_CPU_PAUSE();
5791  }
5792  }
5793  }
5794 
5795  for (f = 1; f < team->t.t_nproc; ++f) {
5796  team->t.t_threads[f] = NULL;
5797  }
5798 
5799  if (team->t.t_max_nproc > 1 &&
5800  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801  distributedBarrier::deallocate(team->t.b);
5802  team->t.b = NULL;
5803  }
5804  /* put the team back in the team pool */
5805  /* TODO limit size of team pool, call reap_team if pool too large */
5806  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807  __kmp_team_pool = (volatile kmp_team_t *)team;
5808  } else { // Check if team was created for primary threads in teams construct
5809  // See if first worker is a CG root
5810  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811  team->t.t_threads[1]->th.th_cg_roots);
5812  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813  // Clean up the CG root nodes on workers so that this team can be re-used
5814  for (f = 1; f < team->t.t_nproc; ++f) {
5815  kmp_info_t *thr = team->t.t_threads[f];
5816  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817  thr->th.th_cg_roots->cg_root == thr);
5818  // Pop current CG root off list
5819  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820  thr->th.th_cg_roots = tmp->up;
5821  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822  " up to node %p. cg_nthreads was %d\n",
5823  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824  int i = tmp->cg_nthreads--;
5825  if (i == 1) {
5826  __kmp_free(tmp); // free CG if we are the last thread in it
5827  }
5828  // Restore current task's thread_limit from CG root
5829  if (thr->th.th_cg_roots)
5830  thr->th.th_current_task->td_icvs.thread_limit =
5831  thr->th.th_cg_roots->cg_thread_limit;
5832  }
5833  }
5834  }
5835 
5836  KMP_MB();
5837 }
5838 
5839 /* reap the team. destroy it, reclaim all its resources and free its memory */
5840 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841  kmp_team_t *next_pool = team->t.t_next_pool;
5842 
5843  KMP_DEBUG_ASSERT(team);
5844  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5845  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5846  KMP_DEBUG_ASSERT(team->t.t_threads);
5847  KMP_DEBUG_ASSERT(team->t.t_argv);
5848 
5849  /* TODO clean the threads that are a part of this? */
5850 
5851  /* free stuff */
5852  __kmp_free_team_arrays(team);
5853  if (team->t.t_argv != &team->t.t_inline_argv[0])
5854  __kmp_free((void *)team->t.t_argv);
5855  __kmp_free(team);
5856 
5857  KMP_MB();
5858  return next_pool;
5859 }
5860 
5861 // Free the thread. Don't reap it, just place it on the pool of available
5862 // threads.
5863 //
5864 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865 // binding for the affinity mechanism to be useful.
5866 //
5867 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868 // However, we want to avoid a potential performance problem by always
5869 // scanning through the list to find the correct point at which to insert
5870 // the thread (potential N**2 behavior). To do this we keep track of the
5871 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872 // With single-level parallelism, threads will always be added to the tail
5873 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5874 // parallelism, all bets are off and we may need to scan through the entire
5875 // free list.
5876 //
5877 // This change also has a potentially large performance benefit, for some
5878 // applications. Previously, as threads were freed from the hot team, they
5879 // would be placed back on the free list in inverse order. If the hot team
5880 // grew back to its original size, then the freed threads would be placed
5881 // back on the hot team in reverse order. This could cause bad cache
5882 // locality problems on programs where the size of the hot team regularly
5883 // grew and shrank.
5884 //
5885 // Now, for single-level parallelism, the OMP tid is always == gtid.
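/* The insertion below is an ordered singly-linked-list insert with a cached
   hint. A standalone model (illustrative only, using a hypothetical simplified
   node type; the real pool links kmp_info_t via th.th_next_pool):

     struct pool_node { int gtid; pool_node *next; };

     static void pool_insert_sorted(pool_node **head, pool_node *n) {
       pool_node **scan = head;              // or resume from the cached hint
       while (*scan && (*scan)->gtid < n->gtid)
         scan = &(*scan)->next;              // find first node with gtid >= n's
       n->next = *scan;                      // splice in, keeping gtid order
       *scan = n;                            // the hint would be updated to n
     }
*/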
5886 void __kmp_free_thread(kmp_info_t *this_th) {
5887  int gtid;
5888  kmp_info_t **scan;
5889 
5890  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892 
5893  KMP_DEBUG_ASSERT(this_th);
5894 
5895  // When moving the thread to the pool, switch it to wait on its own b_go
5896  // flag, and leave it with an uninitialized (NULL) team.
5897  int b;
5898  kmp_balign_t *balign = this_th->th.th_bar;
5899  for (b = 0; b < bs_last_barrier; ++b) {
5900  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902  balign[b].bb.team = NULL;
5903  balign[b].bb.leaf_kids = 0;
5904  }
5905  this_th->th.th_task_state = 0;
5906  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907 
5908  /* put thread back on the free pool */
5909  TCW_PTR(this_th->th.th_team, NULL);
5910  TCW_PTR(this_th->th.th_root, NULL);
5911  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912 
5913  while (this_th->th.th_cg_roots) {
5914  this_th->th.th_cg_roots->cg_nthreads--;
5915  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916  " %p of thread %p to %d\n",
5917  this_th, this_th->th.th_cg_roots,
5918  this_th->th.th_cg_roots->cg_root,
5919  this_th->th.th_cg_roots->cg_nthreads));
5920  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921  if (tmp->cg_root == this_th) { // Thread is a cg_root
5922  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923  KA_TRACE(
5924  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925  this_th->th.th_cg_roots = tmp->up;
5926  __kmp_free(tmp);
5927  } else { // Worker thread
5928  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929  __kmp_free(tmp);
5930  }
5931  this_th->th.th_cg_roots = NULL;
5932  break;
5933  }
5934  }
5935 
5936  /* If the implicit task assigned to this thread can be used by other threads
5937  * -> multiple threads can share the data and try to free the task at
5938  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5939  * with higher probability when the hot team is disabled but can occur even when
5940  * the hot team is enabled */
5941  __kmp_free_implicit_task(this_th);
5942  this_th->th.th_current_task = NULL;
5943 
5944  // If the __kmp_thread_pool_insert_pt is already past the new insert
5945  // point, then we need to re-scan the entire list.
5946  gtid = this_th->th.th_info.ds.ds_gtid;
5947  if (__kmp_thread_pool_insert_pt != NULL) {
5948  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950  __kmp_thread_pool_insert_pt = NULL;
5951  }
5952  }
5953 
5954  // Scan down the list to find the place to insert the thread.
5955  // scan is the address of a link in the list, possibly the address of
5956  // __kmp_thread_pool itself.
5957  //
5958  // In the absence of nested parallelism, the for loop will have 0 iterations.
5959  if (__kmp_thread_pool_insert_pt != NULL) {
5960  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961  } else {
5962  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963  }
5964  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965  scan = &((*scan)->th.th_next_pool))
5966  ;
5967 
5968  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969  // to its address.
5970  TCW_PTR(this_th->th.th_next_pool, *scan);
5971  __kmp_thread_pool_insert_pt = *scan = this_th;
5972  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973  (this_th->th.th_info.ds.ds_gtid <
5974  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975  TCW_4(this_th->th.th_in_pool, TRUE);
5976  __kmp_suspend_initialize_thread(this_th);
5977  __kmp_lock_suspend_mx(this_th);
5978  if (this_th->th.th_active == TRUE) {
5979  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980  this_th->th.th_active_in_pool = TRUE;
5981  }
5982 #if KMP_DEBUG
5983  else {
5984  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985  }
5986 #endif
5987  __kmp_unlock_suspend_mx(this_th);
5988 
5989  TCW_4(__kmp_nth, __kmp_nth - 1);
5990 
5991 #ifdef KMP_ADJUST_BLOCKTIME
5992  /* Adjust blocktime back to user setting or default if necessary */
5993  /* Middle initialization might never have occurred */
5994  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996  if (__kmp_nth <= __kmp_avail_proc) {
5997  __kmp_zero_bt = FALSE;
5998  }
5999  }
6000 #endif /* KMP_ADJUST_BLOCKTIME */
6001 
6002  KMP_MB();
6003 }
6004 
6005 /* ------------------------------------------------------------------------ */
6006 
6007 void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008 #if OMP_PROFILING_SUPPORT
6009  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010  // TODO: add a configuration option for time granularity
6011  if (ProfileTraceFile)
6012  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013 #endif
6014 
6015  int gtid = this_thr->th.th_info.ds.ds_gtid;
6016  /* void *stack_data;*/
6017  kmp_team_t **volatile pteam;
6018 
6019  KMP_MB();
6020  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021 
6022  if (__kmp_env_consistency_check) {
6023  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024  }
6025 
6026 #if OMPD_SUPPORT
6027  if (ompd_state & OMPD_ENABLE_BP)
6028  ompd_bp_thread_begin();
6029 #endif
6030 
6031 #if OMPT_SUPPORT
6032  ompt_data_t *thread_data = nullptr;
6033  if (ompt_enabled.enabled) {
6034  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035  *thread_data = ompt_data_none;
6036 
6037  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038  this_thr->th.ompt_thread_info.wait_id = 0;
6039  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040  this_thr->th.ompt_thread_info.parallel_flags = 0;
6041  if (ompt_enabled.ompt_callback_thread_begin) {
6042  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043  ompt_thread_worker, thread_data);
6044  }
6045  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046  }
6047 #endif
6048 
6049  /* This is the place where threads wait for work */
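  // Life cycle of a worker, as implemented by the loop below: sleep in the
  // fork barrier until a primary thread hands out work, run the team's
  // microtask via t_invoke, meet the team in the join barrier, and go back to
  // waiting. A minimal user-side program that exercises this path
  // (illustrative only, plain OpenMP API):
  //
  //   #include <omp.h>
  //   #include <stdio.h>
  //   int main(void) {
  //   #pragma omp parallel num_threads(4)
  //     printf("hello from thread %d\n", omp_get_thread_num());
  //     return 0;
  //   }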
6050  while (!TCR_4(__kmp_global.g.g_done)) {
6051  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052  KMP_MB();
6053 
6054  /* wait for work to do */
6055  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056 
6057  /* No tid yet since not part of a team */
6058  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059 
6060 #if OMPT_SUPPORT
6061  if (ompt_enabled.enabled) {
6062  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063  }
6064 #endif
6065 
6066  pteam = &this_thr->th.th_team;
6067 
6068  /* have we been allocated? */
6069  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070  /* we were just woken up, so run our new task */
6071  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072  int rc;
6073  KA_TRACE(20,
6074  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076  (*pteam)->t.t_pkfn));
6077 
6078  updateHWFPControl(*pteam);
6079 
6080 #if OMPT_SUPPORT
6081  if (ompt_enabled.enabled) {
6082  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083  }
6084 #endif
6085 
6086  rc = (*pteam)->t.t_invoke(gtid);
6087  KMP_ASSERT(rc);
6088 
6089  KMP_MB();
6090  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092  (*pteam)->t.t_pkfn));
6093  }
6094 #if OMPT_SUPPORT
6095  if (ompt_enabled.enabled) {
6096  /* no frame set while outside task */
6097  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098 
6099  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100  }
6101 #endif
6102  /* join barrier after parallel region */
6103  __kmp_join_barrier(gtid);
6104  }
6105  }
6106 
6107 #if OMPD_SUPPORT
6108  if (ompd_state & OMPD_ENABLE_BP)
6109  ompd_bp_thread_end();
6110 #endif
6111 
6112 #if OMPT_SUPPORT
6113  if (ompt_enabled.ompt_callback_thread_end) {
6114  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6115  }
6116 #endif
6117 
6118  this_thr->th.th_task_team = NULL;
6119  /* run the destructors for the threadprivate data for this thread */
6120  __kmp_common_destroy_gtid(gtid);
6121 
6122  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6123  KMP_MB();
6124 
6125 #if OMP_PROFILING_SUPPORT
6126  llvm::timeTraceProfilerFinishThread();
6127 #endif
6128  return this_thr;
6129 }
6130 
6131 /* ------------------------------------------------------------------------ */
6132 
6133 void __kmp_internal_end_dest(void *specific_gtid) {
6134  // Make sure no significant bits are lost
6135  int gtid;
6136  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6137 
6138  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6139  /* NOTE: the gtid is stored as gtid+1 in thread-local storage
6140  * this is because 0 is reserved for the nothing-stored case */
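  // Example of the encoding: a thread with gtid 5 stores the raw value 6 in
  // its thread-specific slot, and a raw value of 0 means "no gtid stored",
  // which is why 1 is subtracted above before the conversion.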
6141 
6142  __kmp_internal_end_thread(gtid);
6143 }
6144 
6145 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6146 
6147 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6148  __kmp_internal_end_atexit();
6149 }
6150 
6151 #endif
6152 
6153 /* [Windows] josh: when the atexit handler is called, there may still be more
6154  than one thread alive */
6155 void __kmp_internal_end_atexit(void) {
6156  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6157  /* [Windows]
6158  josh: ideally, we want to completely shutdown the library in this atexit
6159  handler, but stat code that depends on thread specific data for gtid fails
6160  because that data becomes unavailable at some point during the shutdown, so
6161  we call __kmp_internal_end_thread instead. We should eventually remove the
6162  dependency on __kmp_get_specific_gtid in the stat code and use
6163  __kmp_internal_end_library to cleanly shutdown the library.
6164 
6165  // TODO: Can some of this comment about GVS be removed?
6166  I suspect that the offending stat code is executed when the calling thread
6167  tries to clean up a dead root thread's data structures, resulting in GVS
6168  code trying to close the GVS structures for that thread, but since the stat
6169  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6170  the calling thread is cleaning up itself instead of another thread, it gets
6171  confused. This happens because allowing a thread to unregister and clean up
6172  another thread is a recent modification for addressing an issue.
6173  Based on the current design (20050722), a thread may end up
6174  trying to unregister another thread only if thread death does not trigger
6175  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6176  thread specific data destructor function to detect thread death. For
6177  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6178  is nothing. Thus, the workaround is applicable only for Windows static
6179  stat library. */
6180  __kmp_internal_end_library(-1);
6181 #if KMP_OS_WINDOWS
6182  __kmp_close_console();
6183 #endif
6184 }
6185 
6186 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6187  // It is assumed __kmp_forkjoin_lock is acquired.
6188 
6189  int gtid;
6190 
6191  KMP_DEBUG_ASSERT(thread != NULL);
6192 
6193  gtid = thread->th.th_info.ds.ds_gtid;
6194 
6195  if (!is_root) {
6196  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6197  /* Assume the threads are at the fork barrier here */
6198  KA_TRACE(
6199  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6200  gtid));
6201  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6202  while (
6203  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6204  KMP_CPU_PAUSE();
6205  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6206  } else {
6207  /* Need release fence here to prevent seg faults for tree forkjoin
6208  barrier (GEH) */
6209  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6210  thread);
6211  __kmp_release_64(&flag);
6212  }
6213  }
6214 
6215  // Terminate OS thread.
6216  __kmp_reap_worker(thread);
6217 
6218  // The thread was killed asynchronously. If it was actively
6219  // spinning in the thread pool, decrement the global count.
6220  //
6221  // There is a small timing hole here - if the worker thread was just waking
6222  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6223  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6224  // the global counter might not get updated.
6225  //
6226  // Currently, this can only happen as the library is unloaded,
6227  // so there are no harmful side effects.
6228  if (thread->th.th_active_in_pool) {
6229  thread->th.th_active_in_pool = FALSE;
6230  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6231  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6232  }
6233  }
6234 
6235  __kmp_free_implicit_task(thread);
6236 
6237 // Free the fast memory for tasking
6238 #if USE_FAST_MEMORY
6239  __kmp_free_fast_memory(thread);
6240 #endif /* USE_FAST_MEMORY */
6241 
6242  __kmp_suspend_uninitialize_thread(thread);
6243 
6244  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6245  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6246 
6247  --__kmp_all_nth;
6248  // __kmp_nth was decremented when thread is added to the pool.
6249 
6250 #ifdef KMP_ADJUST_BLOCKTIME
6251  /* Adjust blocktime back to user setting or default if necessary */
6252  /* Middle initialization might never have occurred */
6253  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6254  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6255  if (__kmp_nth <= __kmp_avail_proc) {
6256  __kmp_zero_bt = FALSE;
6257  }
6258  }
6259 #endif /* KMP_ADJUST_BLOCKTIME */
6260 
6261  /* free the memory being used */
6262  if (__kmp_env_consistency_check) {
6263  if (thread->th.th_cons) {
6264  __kmp_free_cons_stack(thread->th.th_cons);
6265  thread->th.th_cons = NULL;
6266  }
6267  }
6268 
6269  if (thread->th.th_pri_common != NULL) {
6270  __kmp_free(thread->th.th_pri_common);
6271  thread->th.th_pri_common = NULL;
6272  }
6273 
6274 #if KMP_USE_BGET
6275  if (thread->th.th_local.bget_data != NULL) {
6276  __kmp_finalize_bget(thread);
6277  }
6278 #endif
6279 
6280 #if KMP_AFFINITY_SUPPORTED
6281  if (thread->th.th_affin_mask != NULL) {
6282  KMP_CPU_FREE(thread->th.th_affin_mask);
6283  thread->th.th_affin_mask = NULL;
6284  }
6285 #endif /* KMP_AFFINITY_SUPPORTED */
6286 
6287 #if KMP_USE_HIER_SCHED
6288  if (thread->th.th_hier_bar_data != NULL) {
6289  __kmp_free(thread->th.th_hier_bar_data);
6290  thread->th.th_hier_bar_data = NULL;
6291  }
6292 #endif
6293 
6294  __kmp_reap_team(thread->th.th_serial_team);
6295  thread->th.th_serial_team = NULL;
6296  __kmp_free(thread);
6297 
6298  KMP_MB();
6299 
6300 } // __kmp_reap_thread
6301 
6302 static void __kmp_itthash_clean(kmp_info_t *th) {
6303 #if USE_ITT_NOTIFY
6304  if (__kmp_itt_region_domains.count > 0) {
6305  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6306  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6307  while (bucket) {
6308  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6309  __kmp_thread_free(th, bucket);
6310  bucket = next;
6311  }
6312  }
6313  }
6314  if (__kmp_itt_barrier_domains.count > 0) {
6315  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6316  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6317  while (bucket) {
6318  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6319  __kmp_thread_free(th, bucket);
6320  bucket = next;
6321  }
6322  }
6323  }
6324 #endif
6325 }
6326 
6327 static void __kmp_internal_end(void) {
6328  int i;
6329 
6330  /* First, unregister the library */
6331  __kmp_unregister_library();
6332 
6333 #if KMP_OS_WINDOWS
6334  /* In Win static library, we can't tell when a root actually dies, so we
6335  reclaim the data structures for any root threads that have died but not
6336  unregistered themselves, in order to shut down cleanly.
6337  In Win dynamic library we also can't tell when a thread dies. */
6338  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6339 // dead roots
6340 #endif
6341 
6342  for (i = 0; i < __kmp_threads_capacity; i++)
6343  if (__kmp_root[i])
6344  if (__kmp_root[i]->r.r_active)
6345  break;
6346  KMP_MB(); /* Flush all pending memory write invalidates. */
6347  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6348 
6349  if (i < __kmp_threads_capacity) {
6350 #if KMP_USE_MONITOR
6351  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6352  KMP_MB(); /* Flush all pending memory write invalidates. */
6353 
6354  // Need to check that monitor was initialized before reaping it. If we are
6355  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6356  // __kmp_monitor will appear to contain valid data, but it is only valid in
6357  // the parent process, not the child.
6358  // New behavior (201008): instead of keying off of the flag
6359  // __kmp_init_parallel, the monitor thread creation is keyed off
6360  // of the new flag __kmp_init_monitor.
6361  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6362  if (TCR_4(__kmp_init_monitor)) {
6363  __kmp_reap_monitor(&__kmp_monitor);
6364  TCW_4(__kmp_init_monitor, 0);
6365  }
6366  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6367  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6368 #endif // KMP_USE_MONITOR
6369  } else {
6370 /* TODO move this to cleanup code */
6371 #ifdef KMP_DEBUG
6372  /* make sure that everything has properly ended */
6373  for (i = 0; i < __kmp_threads_capacity; i++) {
6374  if (__kmp_root[i]) {
6375  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6376  // there can be uber threads alive here
6377  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6378  }
6379  }
6380 #endif
6381 
6382  KMP_MB();
6383 
6384  // Reap the worker threads.
6385  // This is valid for now, but be careful if threads are reaped sooner.
6386  while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6387  // Get the next thread from the pool.
6388  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6389  __kmp_thread_pool = thread->th.th_next_pool;
6390  // Reap it.
6391  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6392  thread->th.th_next_pool = NULL;
6393  thread->th.th_in_pool = FALSE;
6394  __kmp_reap_thread(thread, 0);
6395  }
6396  __kmp_thread_pool_insert_pt = NULL;
6397 
6398  // Reap teams.
6399  while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6400  // Get the next team from the pool.
6401  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6402  __kmp_team_pool = team->t.t_next_pool;
6403  // Reap it.
6404  team->t.t_next_pool = NULL;
6405  __kmp_reap_team(team);
6406  }
6407 
6408  __kmp_reap_task_teams();
6409 
6410 #if KMP_OS_UNIX
6411  // Threads that are not reaped should not access any resources since they
6412  // are going to be deallocated soon, so the shutdown sequence should wait
6413  // until all threads either exit the final spin-waiting loop or begin
6414  // sleeping after the given blocktime.
6415  for (i = 0; i < __kmp_threads_capacity; i++) {
6416  kmp_info_t *thr = __kmp_threads[i];
6417  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6418  KMP_CPU_PAUSE();
6419  }
6420 #endif
6421 
6422  for (i = 0; i < __kmp_threads_capacity; ++i) {
6423  // TBD: Add some checking...
6424  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6425  }
6426 
6427  /* Make sure all threadprivate destructors get run by joining with all
6428  worker threads before resetting this flag */
6429  TCW_SYNC_4(__kmp_init_common, FALSE);
6430 
6431  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6432  KMP_MB();
6433 
6434 #if KMP_USE_MONITOR
6435  // See note above: One of the possible fixes for CQ138434 / CQ140126
6436  //
6437  // FIXME: push both code fragments down and CSE them?
6438  // push them into __kmp_cleanup() ?
6439  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6440  if (TCR_4(__kmp_init_monitor)) {
6441  __kmp_reap_monitor(&__kmp_monitor);
6442  TCW_4(__kmp_init_monitor, 0);
6443  }
6444  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6445  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6446 #endif
6447  } /* else !__kmp_global.t_active */
6448  TCW_4(__kmp_init_gtid, FALSE);
6449  KMP_MB(); /* Flush all pending memory write invalidates. */
6450 
6451  __kmp_cleanup();
6452 #if OMPT_SUPPORT
6453  ompt_fini();
6454 #endif
6455 }
6456 
6457 void __kmp_internal_end_library(int gtid_req) {
6458  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6459  /* this shouldn't be a race condition because __kmp_internal_end() is the
6460  only place to clear __kmp_serial_init */
6461  /* we'll check this later too, after we get the lock */
6462  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6463  // redundant, because the next check will work in any case.
6464  if (__kmp_global.g.g_abort) {
6465  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6466  /* TODO abort? */
6467  return;
6468  }
6469  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6470  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6471  return;
6472  }
6473 
6474  // If hidden helper team has been initialized, we need to deinit it
6475  if (TCR_4(__kmp_init_hidden_helper) &&
6476  !TCR_4(__kmp_hidden_helper_team_done)) {
6477  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6478  // First release the main thread to let it continue its work
6479  __kmp_hidden_helper_main_thread_release();
6480  // Wait until the hidden helper team has been destroyed
6481  __kmp_hidden_helper_threads_deinitz_wait();
6482  }
6483 
6484  KMP_MB(); /* Flush all pending memory write invalidates. */
6485  /* find out who we are and what we should do */
6486  {
6487  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6488  KA_TRACE(
6489  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6490  if (gtid == KMP_GTID_SHUTDOWN) {
6491  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6492  "already shutdown\n"));
6493  return;
6494  } else if (gtid == KMP_GTID_MONITOR) {
6495  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6496  "registered, or system shutdown\n"));
6497  return;
6498  } else if (gtid == KMP_GTID_DNE) {
6499  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6500  "shutdown\n"));
6501  /* we don't know who we are, but we may still shutdown the library */
6502  } else if (KMP_UBER_GTID(gtid)) {
6503  /* unregister ourselves as an uber thread. gtid is no longer valid */
6504  if (__kmp_root[gtid]->r.r_active) {
6505  __kmp_global.g.g_abort = -1;
6506  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6507  __kmp_unregister_library();
6508  KA_TRACE(10,
6509  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6510  gtid));
6511  return;
6512  } else {
6513  __kmp_itthash_clean(__kmp_threads[gtid]);
6514  KA_TRACE(
6515  10,
6516  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6517  __kmp_unregister_root_current_thread(gtid);
6518  }
6519  } else {
6520 /* worker threads may call this function through the atexit handler, if they
6521  * call exit() */
6522 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6523  TODO: do a thorough shutdown instead */
6524 #ifdef DUMP_DEBUG_ON_EXIT
6525  if (__kmp_debug_buf)
6526  __kmp_dump_debug_buffer();
6527 #endif
6528  // The unregister-library call was added here when the runtime switched to
6529  // shared-memory (shm) registration on Linux; without it, stale files would
6530  // pile up in /dev/shm. Clean up the shared memory file before exiting.
6531  __kmp_unregister_library();
6532  return;
6533  }
6534  }
6535  /* synchronize the termination process */
6536  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6537 
6538  /* have we already finished */
6539  if (__kmp_global.g.g_abort) {
6540  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6541  /* TODO abort? */
6542  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6543  return;
6544  }
6545  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6546  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547  return;
6548  }
6549 
6550  /* We need this lock to enforce mutex between this reading of
6551  __kmp_threads_capacity and the writing by __kmp_register_root.
6552  Alternatively, we can use a counter of roots that is atomically updated by
6553  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6554  __kmp_internal_end_*. */
6555  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6556 
6557  /* now we can safely conduct the actual termination */
6558  __kmp_internal_end();
6559 
6560  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6561  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6562 
6563  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6564 
6565 #ifdef DUMP_DEBUG_ON_EXIT
6566  if (__kmp_debug_buf)
6567  __kmp_dump_debug_buffer();
6568 #endif
6569 
6570 #if KMP_OS_WINDOWS
6571  __kmp_close_console();
6572 #endif
6573 
6574  __kmp_fini_allocator();
6575 
6576 } // __kmp_internal_end_library
6577 
6578 void __kmp_internal_end_thread(int gtid_req) {
6579  int i;
6580 
6581  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6582  /* this shouldn't be a race condition because __kmp_internal_end() is the
6583  * only place to clear __kmp_serial_init */
6584  /* we'll check this later too, after we get the lock */
6585  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6586  // redundant, because the next check will work in any case.
6587  if (__kmp_global.g.g_abort) {
6588  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6589  /* TODO abort? */
6590  return;
6591  }
6592  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6593  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6594  return;
6595  }
6596 
6597  // If hidden helper team has been initialized, we need to deinit it
6598  if (TCR_4(__kmp_init_hidden_helper) &&
6599  !TCR_4(__kmp_hidden_helper_team_done)) {
6600  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6601  // First release the main thread to let it continue its work
6602  __kmp_hidden_helper_main_thread_release();
6603  // Wait until the hidden helper team has been destroyed
6604  __kmp_hidden_helper_threads_deinitz_wait();
6605  }
6606 
6607  KMP_MB(); /* Flush all pending memory write invalidates. */
6608 
6609  /* find out who we are and what we should do */
6610  {
6611  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6612  KA_TRACE(10,
6613  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6614  if (gtid == KMP_GTID_SHUTDOWN) {
6615  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6616  "already shutdown\n"));
6617  return;
6618  } else if (gtid == KMP_GTID_MONITOR) {
6619  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6620  "registered, or system shutdown\n"));
6621  return;
6622  } else if (gtid == KMP_GTID_DNE) {
6623  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6624  "shutdown\n"));
6625  return;
6626  /* we don't know who we are */
6627  } else if (KMP_UBER_GTID(gtid)) {
6628  /* unregister ourselves as an uber thread. gtid is no longer valid */
6629  if (__kmp_root[gtid]->r.r_active) {
6630  __kmp_global.g.g_abort = -1;
6631  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6632  KA_TRACE(10,
6633  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6634  gtid));
6635  return;
6636  } else {
6637  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6638  gtid));
6639  __kmp_unregister_root_current_thread(gtid);
6640  }
6641  } else {
6642  /* just a worker thread, let's leave */
6643  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6644 
6645  if (gtid >= 0) {
6646  __kmp_threads[gtid]->th.th_task_team = NULL;
6647  }
6648 
6649  KA_TRACE(10,
6650  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6651  gtid));
6652  return;
6653  }
6654  }
6655 #if KMP_DYNAMIC_LIB
6656  if (__kmp_pause_status != kmp_hard_paused)
6657  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6658  // because we can shut down more cleanly later in the library destructor.
6659  {
6660  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6661  return;
6662  }
6663 #endif
6664  /* synchronize the termination process */
6665  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6666 
6667  /* have we already finished */
6668  if (__kmp_global.g.g_abort) {
6669  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6670  /* TODO abort? */
6671  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6672  return;
6673  }
6674  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6675  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6676  return;
6677  }
6678 
6679  /* We need this lock to enforce mutex between this reading of
6680  __kmp_threads_capacity and the writing by __kmp_register_root.
6681  Alternatively, we can use a counter of roots that is atomically updated by
6682  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6683  __kmp_internal_end_*. */
6684 
6685  /* should we finish the run-time? are all siblings done? */
6686  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6687 
6688  for (i = 0; i < __kmp_threads_capacity; ++i) {
6689  if (KMP_UBER_GTID(i)) {
6690  KA_TRACE(
6691  10,
6692  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6693  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6694  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6695  return;
6696  }
6697  }
6698 
6699  /* now we can safely conduct the actual termination */
6700 
6701  __kmp_internal_end();
6702 
6703  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6704  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6705 
6706  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6707 
6708 #ifdef DUMP_DEBUG_ON_EXIT
6709  if (__kmp_debug_buf)
6710  __kmp_dump_debug_buffer();
6711 #endif
6712 } // __kmp_internal_end_thread
6713 
6714 // -----------------------------------------------------------------------------
6715 // Library registration stuff.
6716 
6717 static long __kmp_registration_flag = 0;
6718 // Random value used to indicate library initialization.
6719 static char *__kmp_registration_str = NULL;
6720 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6721 
6722 static inline char *__kmp_reg_status_name() {
6723 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6724  each thread. If registration and unregistration go in different threads
6725  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6726  env var cannot be found, because the name will contain a different pid. */
6727 // macOS* complains about name being too long with additional getuid()
6728 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6729  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6730  (int)getuid());
6731 #else
6732  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6733 #endif
6734 } // __kmp_reg_status_name
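// Illustrative result (made-up pid/uid): on a Linux dynamic-library build the
// name comes out as "__KMP_REGISTERED_LIB_12345_1000" (pid 12345, uid 1000);
// on the other paths it is just "__KMP_REGISTERED_LIB_12345".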
6735 
6736 #if defined(KMP_USE_SHM)
6737 bool __kmp_shm_available = false;
6738 bool __kmp_tmp_available = false;
6739 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6740 char *temp_reg_status_file_name = nullptr;
6741 #endif
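// Registration storage is tried in this order by the code below: a file in
// /dev/shm (via shm_open, so the name above typically becomes
// /dev/shm/__KMP_REGISTERED_LIB_<pid>_<uid> on Linux -- that mapping is an
// assumption about the platform, not something this file spells out), then a
// file under /tmp, and finally a plain environment variable.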
6742 
6743 void __kmp_register_library_startup(void) {
6744 
6745  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6746  int done = 0;
6747  union {
6748  double dtime;
6749  long ltime;
6750  } time;
6751 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6752  __kmp_initialize_system_tick();
6753 #endif
6754  __kmp_read_system_time(&time.dtime);
6755  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6756  __kmp_registration_str =
6757  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6758  __kmp_registration_flag, KMP_LIBRARY_FILE);
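  // Shape of the string built above (illustrative values only): the
  // "%p-%lx-%s" format yields something like
  // "0x7f3e2c001234-cafe1a2b-libomp.so", i.e. the address of
  // __kmp_registration_flag, its value in hex, and the library file name,
  // which the parsing code further down splits back apart on '-'.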
6759 
6760  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6761  __kmp_registration_str));
6762 
6763  while (!done) {
6764 
6765  char *value = NULL; // Actual value of the environment variable.
6766 
6767 #if defined(KMP_USE_SHM)
6768  char *shm_name = nullptr;
6769  char *data1 = nullptr;
6770  __kmp_shm_available = __kmp_detect_shm();
6771  if (__kmp_shm_available) {
6772  int fd1 = -1;
6773  shm_name = __kmp_str_format("/%s", name);
6774  int shm_preexist = 0;
6775  fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6776  if ((fd1 == -1) && (errno == EEXIST)) {
6777  // file didn't open because it already exists.
6778  // try opening existing file
6779  fd1 = shm_open(shm_name, O_RDWR, 0600);
6780  if (fd1 == -1) { // file didn't open
6781  KMP_WARNING(FunctionError, "Can't open SHM");
6782  __kmp_shm_available = false;
6783  } else { // able to open existing file
6784  shm_preexist = 1;
6785  }
6786  }
6787  if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6788  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6789  KMP_WARNING(FunctionError, "Can't set size of SHM");
6790  __kmp_shm_available = false;
6791  }
6792  }
6793  if (__kmp_shm_available) { // SHM exists, now map it
6794  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6795  fd1, 0);
6796  if (data1 == MAP_FAILED) { // failed to map shared memory
6797  KMP_WARNING(FunctionError, "Can't map SHM");
6798  __kmp_shm_available = false;
6799  }
6800  }
6801  if (__kmp_shm_available) { // SHM mapped
6802  if (shm_preexist == 0) { // set data to SHM, set value
6803  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6804  }
6805  // Read value from either what we just wrote or existing file.
6806  value = __kmp_str_format("%s", data1); // read value from SHM
6807  munmap(data1, SHM_SIZE);
6808  }
6809  if (fd1 != -1)
6810  close(fd1);
6811  }
6812  if (!__kmp_shm_available)
6813  __kmp_tmp_available = __kmp_detect_tmp();
6814  if (!__kmp_shm_available && __kmp_tmp_available) {
6815  // SHM did not work for some reason other than the file already existing.
6816  // Try to create a temp file under /tmp instead.
6817  // If /tmp isn't accessible, fall back to using environment variable.
6818  // TODO: /tmp might not always be the temporary directory. For now we will
6819  // not consider TMPDIR.
6820  int fd1 = -1;
6821  temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6822  int tmp_preexist = 0;
6823  fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6824  if ((fd1 == -1) && (errno == EEXIST)) {
6825  // file didn't open because it already exists.
6826  // try opening existing file
6827  fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6828  if (fd1 == -1) { // file didn't open
6829  KMP_WARNING(FunctionError, "Can't open TEMP");
6830  __kmp_tmp_available = false;
6831  } else {
6832  tmp_preexist = 1;
6833  }
6834  }
6835  if (__kmp_tmp_available && tmp_preexist == 0) {
6836  // we created /tmp file now set size
6837  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6838  KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6839  __kmp_tmp_available = false;
6840  }
6841  }
6842  if (__kmp_tmp_available) {
6843  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6844  fd1, 0);
6845  if (data1 == MAP_FAILED) { // failed to map /tmp
6846  KMP_WARNING(FunctionError, "Can't map /tmp");
6847  __kmp_tmp_available = false;
6848  }
6849  }
6850  if (__kmp_tmp_available) {
6851  if (tmp_preexist == 0) { // set data to TMP, set value
6852  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6853  }
6854  // Read value from either what we just wrote or existing file.
6855  value = __kmp_str_format("%s", data1); // read value from the /tmp file
6856  munmap(data1, SHM_SIZE);
6857  }
6858  if (fd1 != -1)
6859  close(fd1);
6860  }
6861  if (!__kmp_shm_available && !__kmp_tmp_available) {
6862  // no /dev/shm and no /tmp -- fall back to environment variable
6863  // Set environment variable, but do not overwrite if it exists.
6864  __kmp_env_set(name, __kmp_registration_str, 0);
6865  // read value to see if it got set
6866  value = __kmp_env_get(name);
6867  }
6868 #else // Windows and unix with static library
6869  // Set environment variable, but do not overwrite if it exists.
6870  __kmp_env_set(name, __kmp_registration_str, 0);
6871  // read value to see if it got set
6872  value = __kmp_env_get(name);
6873 #endif
6874 
6875  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6876  done = 1; // Ok, environment variable set successfully, exit the loop.
6877  } else {
6878  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6879  // Check whether it is alive or dead.
6880  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6881  char *tail = value;
6882  char *flag_addr_str = NULL;
6883  char *flag_val_str = NULL;
6884  char const *file_name = NULL;
6885  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6886  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6887  file_name = tail;
6888  if (tail != NULL) {
6889  unsigned long *flag_addr = 0;
6890  unsigned long flag_val = 0;
6891  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6892  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6893  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6894  // First, check whether environment-encoded address is mapped into
6895  // addr space.
6896  // If so, dereference it to see if it still has the right value.
6897  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6898  neighbor = 1;
6899  } else {
6900  // If not, then we know the other copy of the library is no longer
6901  // running.
6902  neighbor = 2;
6903  }
6904  }
6905  }
6906  switch (neighbor) {
6907  case 0: // Cannot parse environment variable -- neighbor status unknown.
6908  // Assume it is the incompatible format of a future version of the
6909  // library. Assume the other library is alive.
6910  // WARN( ... ); // TODO: Issue a warning.
6911  file_name = "unknown library";
6912  KMP_FALLTHROUGH();
6913  // Attention! Falling through to the next case. That's intentional.
6914  case 1: { // Neighbor is alive.
6915  // Check it is allowed.
6916  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6917  if (!__kmp_str_match_true(duplicate_ok)) {
6918  // That's not allowed. Issue fatal error.
6919  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6920  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6921  }
6922  KMP_INTERNAL_FREE(duplicate_ok);
6923  __kmp_duplicate_library_ok = 1;
6924  done = 1; // Exit the loop.
6925  } break;
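  // Note on the case above: KMP_DUPLICATE_LIB_OK is read from the
  // environment; a "true" value downgrades the duplicate-runtime situation
  // from a fatal error to "carry on with __kmp_duplicate_library_ok set".
  // Whether running two runtimes is actually safe for a given application is
  // outside the scope of this code.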
6926  case 2: { // Neighbor is dead.
6927 
6928 #if defined(KMP_USE_SHM)
6929  if (__kmp_shm_available) { // close shared memory.
6930  shm_unlink(shm_name); // this removes file in /dev/shm
6931  } else if (__kmp_tmp_available) {
6932  unlink(temp_reg_status_file_name); // this removes the temp file
6933  } else {
6934  // Clear the variable and try to register library again.
6935  __kmp_env_unset(name);
6936  }
6937 #else
6938  // Clear the variable and try to register library again.
6939  __kmp_env_unset(name);
6940 #endif
6941  } break;
6942  default: {
6943  KMP_DEBUG_ASSERT(0);
6944  } break;
6945  }
6946  }
6947  KMP_INTERNAL_FREE((void *)value);
6948 #if defined(KMP_USE_SHM)
6949  if (shm_name)
6950  KMP_INTERNAL_FREE((void *)shm_name);
6951 #endif
6952  } // while
6953  KMP_INTERNAL_FREE((void *)name);
6954 
6955 } // func __kmp_register_library_startup
6956 
6957 void __kmp_unregister_library(void) {
6958 
6959  char *name = __kmp_reg_status_name();
6960  char *value = NULL;
6961 
6962 #if defined(KMP_USE_SHM)
6963  char *shm_name = nullptr;
6964  int fd1;
6965  if (__kmp_shm_available) {
6966  shm_name = __kmp_str_format("/%s", name);
6967  fd1 = shm_open(shm_name, O_RDONLY, 0600);
6968  if (fd1 != -1) { // File opened successfully
6969  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6970  if (data1 != MAP_FAILED) {
6971  value = __kmp_str_format("%s", data1); // read value from SHM
6972  munmap(data1, SHM_SIZE);
6973  }
6974  close(fd1);
6975  }
6976  } else if (__kmp_tmp_available) { // try /tmp
6977  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6978  if (fd1 != -1) { // File opened successfully
6979  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6980  if (data1 != MAP_FAILED) {
6981  value = __kmp_str_format("%s", data1); // read value from /tmp
6982  munmap(data1, SHM_SIZE);
6983  }
6984  close(fd1);
6985  }
6986  } else { // fall back to the environment variable
6987  value = __kmp_env_get(name);
6988  }
6989 #else
6990  value = __kmp_env_get(name);
6991 #endif
6992 
6993  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6994  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6995  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6996 // Ok, this is our variable. Delete it.
6997 #if defined(KMP_USE_SHM)
6998  if (__kmp_shm_available) {
6999  shm_unlink(shm_name); // this removes file in /dev/shm
7000  } else if (__kmp_tmp_available) {
7001  unlink(temp_reg_status_file_name); // this removes the temp file
7002  } else {
7003  __kmp_env_unset(name);
7004  }
7005 #else
7006  __kmp_env_unset(name);
7007 #endif
7008  }
7009 
7010 #if defined(KMP_USE_SHM)
7011  if (shm_name)
7012  KMP_INTERNAL_FREE(shm_name);
7013  if (temp_reg_status_file_name)
7014  KMP_INTERNAL_FREE(temp_reg_status_file_name);
7015 #endif
7016 
7017  KMP_INTERNAL_FREE(__kmp_registration_str);
7018  KMP_INTERNAL_FREE(value);
7019  KMP_INTERNAL_FREE(name);
7020 
7021  __kmp_registration_flag = 0;
7022  __kmp_registration_str = NULL;
7023 
7024 } // __kmp_unregister_library
7025 
7026 // End of Library registration stuff.
7027 // -----------------------------------------------------------------------------
7028 
7029 #if KMP_MIC_SUPPORTED
7030 
7031 static void __kmp_check_mic_type() {
7032  kmp_cpuid_t cpuid_state = {0};
7033  kmp_cpuid_t *cs_p = &cpuid_state;
7034  __kmp_x86_cpuid(1, 0, cs_p);
7035  // We don't support mic1 at the moment
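  // The masks below test family/model bits of EAX from CPUID leaf 1; the
  // 0xB10 signature is understood to correspond to KNC (mic2) and 0x50670 to
  // KNL (mic3). Treat that decoding as informational; only the comparisons
  // below are authoritative here.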
7036  if ((cs_p->eax & 0xff0) == 0xB10) {
7037  __kmp_mic_type = mic2;
7038  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7039  __kmp_mic_type = mic3;
7040  } else {
7041  __kmp_mic_type = non_mic;
7042  }
7043 }
7044 
7045 #endif /* KMP_MIC_SUPPORTED */
7046 
7047 #if KMP_HAVE_UMWAIT
7048 static void __kmp_user_level_mwait_init() {
7049  struct kmp_cpuid buf;
7050  __kmp_x86_cpuid(7, 0, &buf);
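  // CPUID leaf 7, subleaf 0: bit 5 of ECX is the WAITPKG feature flag
  // (umwait/umonitor/tpause), which is what the shift-and-mask below extracts.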
7051  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7052  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7053  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7054  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7055  __kmp_umwait_enabled));
7056 }
7057 #elif KMP_HAVE_MWAIT
7058 #ifndef AT_INTELPHIUSERMWAIT
7059 // Spurious, non-existent value that should always fail to return anything.
7060 // Will be replaced with the correct value when we know that.
7061 #define AT_INTELPHIUSERMWAIT 10000
7062 #endif
7063  // The getauxval() function is available in RHEL7 and SLES12. If a system with
7064  // an
7064 // earlier OS is used to build the RTL, we'll use the following internal
7065 // function when the entry is not found.
7066 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7067 unsigned long getauxval(unsigned long) { return 0; }
7068 
7069 static void __kmp_user_level_mwait_init() {
7070  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7071  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7072  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7073  // KMP_USER_LEVEL_MWAIT was set to TRUE.
7074  if (__kmp_mic_type == mic3) {
7075  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7076  if ((res & 0x1) || __kmp_user_level_mwait) {
7077  __kmp_mwait_enabled = TRUE;
7078  if (__kmp_user_level_mwait) {
7079  KMP_INFORM(EnvMwaitWarn);
7080  }
7081  } else {
7082  __kmp_mwait_enabled = FALSE;
7083  }
7084  }
7085  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7086  "__kmp_mwait_enabled = %d\n",
7087  __kmp_mic_type, __kmp_mwait_enabled));
7088 }
7089 #endif /* KMP_HAVE_UMWAIT */
7090 
7091 static void __kmp_do_serial_initialize(void) {
7092  int i, gtid;
7093  size_t size;
7094 
7095  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7096 
7097  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7098  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7099  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7100  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7101  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7102 
7103 #if OMPT_SUPPORT
7104  ompt_pre_init();
7105 #endif
7106 #if OMPD_SUPPORT
7107  __kmp_env_dump();
7108  ompd_init();
7109 #endif
7110 
7111  __kmp_validate_locks();
7112 
7113 #if ENABLE_LIBOMPTARGET
7114  /* Initialize functions from libomptarget */
7115  __kmp_init_omptarget();
7116 #endif
7117 
7118  /* Initialize internal memory allocator */
7119  __kmp_init_allocator();
7120 
7121  /* Register the library startup via an environment variable or via mapped
7122  shared memory file and check to see whether another copy of the library is
7123  already registered. Since a forked child process is often terminated, we
7124  postpone the registration until middle initialization in the child. */
7125  if (__kmp_need_register_serial)
7126  __kmp_register_library_startup();
7127 
7128  /* TODO reinitialization of library */
7129  if (TCR_4(__kmp_global.g.g_done)) {
7130  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7131  }
7132 
7133  __kmp_global.g.g_abort = 0;
7134  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7135 
7136 /* initialize the locks */
7137 #if KMP_USE_ADAPTIVE_LOCKS
7138 #if KMP_DEBUG_ADAPTIVE_LOCKS
7139  __kmp_init_speculative_stats();
7140 #endif
7141 #endif
7142 #if KMP_STATS_ENABLED
7143  __kmp_stats_init();
7144 #endif
7145  __kmp_init_lock(&__kmp_global_lock);
7146  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7147  __kmp_init_lock(&__kmp_debug_lock);
7148  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7149  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7150  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7151  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7152  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7153  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7154  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7155  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7156  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7157  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7158  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7159  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7160  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7161  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7162  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7163 #if KMP_USE_MONITOR
7164  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7165 #endif
7166  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7167 
7168  /* conduct initialization and initial setup of configuration */
7169 
7170  __kmp_runtime_initialize();
7171 
7172 #if KMP_MIC_SUPPORTED
7173  __kmp_check_mic_type();
7174 #endif
7175 
7176 // Some global variable initialization moved here from kmp_env_initialize()
7177 #ifdef KMP_DEBUG
7178  kmp_diag = 0;
7179 #endif
7180  __kmp_abort_delay = 0;
7181 
7182  // From __kmp_init_dflt_team_nth()
7183  /* assume the entire machine will be used */
7184  __kmp_dflt_team_nth_ub = __kmp_xproc;
7185  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7186  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7187  }
7188  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7189  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7190  }
7191  __kmp_max_nth = __kmp_sys_max_nth;
7192  __kmp_cg_max_nth = __kmp_sys_max_nth;
7193  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7194  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7195  __kmp_teams_max_nth = __kmp_sys_max_nth;
7196  }
7197 
7198  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7199  // part
7200  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7201 #if KMP_USE_MONITOR
7202  __kmp_monitor_wakeups =
7203  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7204  __kmp_bt_intervals =
7205  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7206 #endif
7207  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7208  __kmp_library = library_throughput;
7209  // From KMP_SCHEDULE initialization
7210  __kmp_static = kmp_sch_static_balanced;
7211 // AC: do not use analytical here, because it is non-monotonous
7212 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7213 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7214 // need to repeat assignment
7215 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7216 // bit control and barrier method control parts
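// A branch-bits value of b selects a fan-in/fan-out of 2^b for the tree and
// hyper barrier algorithms, so e.g. the KNC tuning further down (plain gather
// branch bits = 3) corresponds to an 8-way gather. This reading of the
// parameter is inferred from its use elsewhere in the runtime, not stated
// here.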
7217 #if KMP_FAST_REDUCTION_BARRIER
7218 #define kmp_reduction_barrier_gather_bb ((int)1)
7219 #define kmp_reduction_barrier_release_bb ((int)1)
7220 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7221 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7222 #endif // KMP_FAST_REDUCTION_BARRIER
7223  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7224  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7225  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7226  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7227  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7228 #if KMP_FAST_REDUCTION_BARRIER
7229  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7230  // lin_64 ): hyper,1
7231  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7232  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7233  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7234  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7235  }
7236 #endif // KMP_FAST_REDUCTION_BARRIER
7237  }
7238 #if KMP_FAST_REDUCTION_BARRIER
7239 #undef kmp_reduction_barrier_release_pat
7240 #undef kmp_reduction_barrier_gather_pat
7241 #undef kmp_reduction_barrier_release_bb
7242 #undef kmp_reduction_barrier_gather_bb
7243 #endif // KMP_FAST_REDUCTION_BARRIER
7244 #if KMP_MIC_SUPPORTED
7245  if (__kmp_mic_type == mic2) { // KNC
7246  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7247  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7248  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7249  1; // forkjoin release
7250  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7251  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7252  }
7253 #if KMP_FAST_REDUCTION_BARRIER
7254  if (__kmp_mic_type == mic2) { // KNC
7255  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7256  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7257  }
7258 #endif // KMP_FAST_REDUCTION_BARRIER
7259 #endif // KMP_MIC_SUPPORTED
7260 
7261 // From KMP_CHECKS initialization
7262 #ifdef KMP_DEBUG
7263  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7264 #else
7265  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7266 #endif
7267 
7268  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7269  __kmp_foreign_tp = TRUE;
7270 
7271  __kmp_global.g.g_dynamic = FALSE;
7272  __kmp_global.g.g_dynamic_mode = dynamic_default;
7273 
7274  __kmp_init_nesting_mode();
7275 
7276  __kmp_env_initialize(NULL);
7277 
7278 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7279  __kmp_user_level_mwait_init();
7280 #endif
7281 // Print all messages in message catalog for testing purposes.
7282 #ifdef KMP_DEBUG
7283  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7284  if (__kmp_str_match_true(val)) {
7285  kmp_str_buf_t buffer;
7286  __kmp_str_buf_init(&buffer);
7287  __kmp_i18n_dump_catalog(&buffer);
7288  __kmp_printf("%s", buffer.str);
7289  __kmp_str_buf_free(&buffer);
7290  }
7291  __kmp_env_free(&val);
7292 #endif
7293 
7294  __kmp_threads_capacity =
7295  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7296  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7297  __kmp_tp_capacity = __kmp_default_tp_capacity(
7298  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7299 
7300  // If the library was shut down properly, these pools must be NULL. Just in
7301  // case, set them to NULL -- some memory may leak, but subsequent code will
7302  // work even if the pools are not freed.
7303  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7304  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7305  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7306  __kmp_thread_pool = NULL;
7307  __kmp_thread_pool_insert_pt = NULL;
7308  __kmp_team_pool = NULL;
7309 
7310  /* Allocate all of the variable sized records */
7311  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7312  * expandable */
7313  /* Since allocation is cache-aligned, just add extra padding at the end */
7314  size =
7315  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7316  CACHE_LINE;
7317  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7318  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7319  sizeof(kmp_info_t *) * __kmp_threads_capacity);
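  // Layout of the single allocation above (worked example with assumed
  // values): with __kmp_threads_capacity == 64, 8-byte pointers and a 64-byte
  // CACHE_LINE, size = 64 * (8 + 8) + 64 = 1088 bytes; __kmp_threads gets the
  // first 64 pointer slots, __kmp_root starts right after them, and the extra
  // CACHE_LINE bytes are left as padding at the end.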
7320 
7321  /* init thread counts */
7322  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7323  0); // Asserts fail if the library is reinitializing and
7324  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7325  __kmp_all_nth = 0;
7326  __kmp_nth = 0;
7327 
7328  /* setup the uber master thread and hierarchy */
7329  gtid = __kmp_register_root(TRUE);
7330  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7331  KMP_ASSERT(KMP_UBER_GTID(gtid));
7332  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7333 
7334  KMP_MB(); /* Flush all pending memory write invalidates. */
7335 
7336  __kmp_common_initialize();
7337 
7338 #if KMP_OS_UNIX
7339  /* invoke the child fork handler */
7340  __kmp_register_atfork();
7341 #endif
7342 
7343 #if !KMP_DYNAMIC_LIB || \
7344  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7345  {
7346  /* Invoke the exit handler when the program finishes, only for static
7347  library and macOS* dynamic. For other dynamic libraries, we already
7348  have _fini and DllMain. */
7349  int rc = atexit(__kmp_internal_end_atexit);
7350  if (rc != 0) {
7351  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7352  __kmp_msg_null);
7353  }
7354  }
7355 #endif
7356 
7357 #if KMP_HANDLE_SIGNALS
7358 #if KMP_OS_UNIX
7359  /* NOTE: make sure that this is called before the user installs their own
7360  signal handlers so that the user handlers are called first. This way they
7361  can return false, not call our handler, avoid terminating the library, and
7362  continue execution where they left off. */
7363  __kmp_install_signals(FALSE);
7364 #endif /* KMP_OS_UNIX */
7365 #if KMP_OS_WINDOWS
7366  __kmp_install_signals(TRUE);
7367 #endif /* KMP_OS_WINDOWS */
7368 #endif
7369 
7370  /* we have finished the serial initialization */
7371  __kmp_init_counter++;
7372 
7373  __kmp_init_serial = TRUE;
7374 
7375  if (__kmp_version) {
7376  __kmp_print_version_1();
7377  }
7378 
7379  if (__kmp_settings) {
7380  __kmp_env_print();
7381  }
7382 
7383  if (__kmp_display_env || __kmp_display_env_verbose) {
7384  __kmp_env_print_2();
7385  }
7386 
7387 #if OMPT_SUPPORT
7388  ompt_post_init();
7389 #endif
7390 
7391  KMP_MB();
7392 
7393  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7394 }
7395 
7396 void __kmp_serial_initialize(void) {
7397  if (__kmp_init_serial) {
7398  return;
7399  }
7400  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7401  if (__kmp_init_serial) {
7402  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7403  return;
7404  }
7405  __kmp_do_serial_initialize();
7406  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7407 }
7408 
7409 static void __kmp_do_middle_initialize(void) {
7410  int i, j;
7411  int prev_dflt_team_nth;
7412 
7413  if (!__kmp_init_serial) {
7414  __kmp_do_serial_initialize();
7415  }
7416 
7417  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7418 
7419  if (UNLIKELY(!__kmp_need_register_serial)) {
7420  // We are in a forked child process. The registration was skipped during
7421  // serial initialization in __kmp_atfork_child handler. Do it here.
7422  __kmp_register_library_startup();
7423  }
7424 
7425  // Save the previous value for the __kmp_dflt_team_nth so that
7426  // we can avoid some reinitialization if it hasn't changed.
7427  prev_dflt_team_nth = __kmp_dflt_team_nth;
7428 
7429 #if KMP_AFFINITY_SUPPORTED
7430  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7431  // number of cores on the machine.
7432  __kmp_affinity_initialize(__kmp_affinity);
7433 
7434 #endif /* KMP_AFFINITY_SUPPORTED */
7435 
7436  KMP_ASSERT(__kmp_xproc > 0);
7437  if (__kmp_avail_proc == 0) {
7438  __kmp_avail_proc = __kmp_xproc;
7439  }
7440 
7441  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7442  // correct them now
7443  j = 0;
7444  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7445  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7446  __kmp_avail_proc;
7447  j++;
7448  }
7449 
7450  if (__kmp_dflt_team_nth == 0) {
7451 #ifdef KMP_DFLT_NTH_CORES
7452  // Default #threads = #cores
7453  __kmp_dflt_team_nth = __kmp_ncores;
7454  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7455  "__kmp_ncores (%d)\n",
7456  __kmp_dflt_team_nth));
7457 #else
7458  // Default #threads = #available OS procs
7459  __kmp_dflt_team_nth = __kmp_avail_proc;
7460  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7461  "__kmp_avail_proc(%d)\n",
7462  __kmp_dflt_team_nth));
7463 #endif /* KMP_DFLT_NTH_CORES */
7464  }
7465 
7466  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7467  __kmp_dflt_team_nth = KMP_MIN_NTH;
7468  }
7469  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7470  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7471  }
7472 
7473  if (__kmp_nesting_mode > 0)
7474  __kmp_set_nesting_mode_threads();
7475 
7476  // There's no harm in continuing if the following check fails,
7477  // but it indicates an error in the previous logic.
7478  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7479 
7480  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7481  // Run through the __kmp_threads array and set the num threads icv for each
7482  // root thread that is currently registered with the RTL (which has not
7483  // already explicitly set its nthreads-var with a call to
7484  // omp_set_num_threads()).
7485  for (i = 0; i < __kmp_threads_capacity; i++) {
7486  kmp_info_t *thread = __kmp_threads[i];
7487  if (thread == NULL)
7488  continue;
7489  if (thread->th.th_current_task->td_icvs.nproc != 0)
7490  continue;
7491 
7492  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7493  }
7494  }
7495  KA_TRACE(
7496  20,
7497  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7498  __kmp_dflt_team_nth));
7499 
7500 #ifdef KMP_ADJUST_BLOCKTIME
7501  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7502  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7503  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7504  if (__kmp_nth > __kmp_avail_proc) {
7505  __kmp_zero_bt = TRUE;
7506  }
7507  }
7508 #endif /* KMP_ADJUST_BLOCKTIME */
7509 
7510  /* we have finished middle initialization */
7511  TCW_SYNC_4(__kmp_init_middle, TRUE);
7512 
7513  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7514 }
7515 
7516 void __kmp_middle_initialize(void) {
7517  if (__kmp_init_middle) {
7518  return;
7519  }
7520  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521  if (__kmp_init_middle) {
7522  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523  return;
7524  }
7525  __kmp_do_middle_initialize();
7526  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7527 }
7528 
7529 void __kmp_parallel_initialize(void) {
7530  int gtid = __kmp_entry_gtid(); // this might be a new root
7531 
7532  /* synchronize parallel initialization (for sibling) */
7533  if (TCR_4(__kmp_init_parallel))
7534  return;
7535  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7536  if (TCR_4(__kmp_init_parallel)) {
7537  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7538  return;
7539  }
7540 
7541  /* TODO reinitialization after we have already shut down */
7542  if (TCR_4(__kmp_global.g.g_done)) {
7543  KA_TRACE(
7544  10,
7545  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7546  __kmp_infinite_loop();
7547  }
7548 
7549  /* jc: The lock __kmp_initz_lock is already held, so calling
7550  __kmp_serial_initialize would cause a deadlock. So we call
7551  __kmp_do_serial_initialize directly. */
7552  if (!__kmp_init_middle) {
7553  __kmp_do_middle_initialize();
7554  }
7555  __kmp_assign_root_init_mask();
7556  __kmp_resume_if_hard_paused();
7557 
7558  /* begin initialization */
7559  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7560  KMP_ASSERT(KMP_UBER_GTID(gtid));
7561 
7562 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7563  // Save the FP control regs.
7564  // Worker threads will set theirs to these values at thread startup.
7565  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7566  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7567  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7568 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7569 
7570 #if KMP_OS_UNIX
7571 #if KMP_HANDLE_SIGNALS
7572  /* must be after __kmp_serial_initialize */
7573  __kmp_install_signals(TRUE);
7574 #endif
7575 #endif
7576 
7577  __kmp_suspend_initialize();
7578 
7579 #if defined(USE_LOAD_BALANCE)
7580  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7581  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7582  }
7583 #else
7584  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7585  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7586  }
7587 #endif
7588 
7589  if (__kmp_version) {
7590  __kmp_print_version_2();
7591  }
7592 
7593  /* we have finished parallel initialization */
7594  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7595 
7596  KMP_MB();
7597  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7598 
7599  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600 }
7601 
7602 void __kmp_hidden_helper_initialize() {
7603  if (TCR_4(__kmp_init_hidden_helper))
7604  return;
7605 
7606  // __kmp_parallel_initialize is required before we initialize hidden helper
7607  if (!TCR_4(__kmp_init_parallel))
7608  __kmp_parallel_initialize();
7609 
7610  // Double check. Note that this double check should not be placed before
7611  // __kmp_parallel_initialize as that would cause a deadlock.
7612  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7613  if (TCR_4(__kmp_init_hidden_helper)) {
7614  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7615  return;
7616  }
7617 
7618 #if KMP_AFFINITY_SUPPORTED
7619  // Initialize hidden helper affinity settings.
7620  // The above __kmp_parallel_initialize() will initialize
7621  // regular affinity (and topology) if not already done.
7622  if (!__kmp_hh_affinity.flags.initialized)
7623  __kmp_affinity_initialize(__kmp_hh_affinity);
7624 #endif
7625 
7626  // Set the count of hidden helper tasks to be executed to zero
7627  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7628 
7629  // Set the global variable indicating that we're initializing hidden helper
7630  // team/threads
7631  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7632 
7633  // Platform independent initialization
7634  __kmp_do_initialize_hidden_helper_threads();
7635 
7636  // Wait here for the finish of initialization of hidden helper teams
7637  __kmp_hidden_helper_threads_initz_wait();
7638 
7639  // We have finished hidden helper initialization
7640  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7641 
7642  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7643 }
7644 
7645 /* ------------------------------------------------------------------------ */
7646 
7647 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7648  kmp_team_t *team) {
7649  kmp_disp_t *dispatch;
7650 
7651  KMP_MB();
7652 
7653  /* none of the threads have encountered any constructs, yet. */
7654  this_thr->th.th_local.this_construct = 0;
7655 #if KMP_CACHE_MANAGE
7656  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7657 #endif /* KMP_CACHE_MANAGE */
7658  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7659  KMP_DEBUG_ASSERT(dispatch);
7660  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7661  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7662  // this_thr->th.th_info.ds.ds_tid ] );
7663 
7664  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7665  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7666  if (__kmp_env_consistency_check)
7667  __kmp_push_parallel(gtid, team->t.t_ident);
7668 
7669  KMP_MB(); /* Flush all pending memory write invalidates. */
7670 }
7671 
7672 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7673  kmp_team_t *team) {
7674  if (__kmp_env_consistency_check)
7675  __kmp_pop_parallel(gtid, team->t.t_ident);
7676 
7677  __kmp_finish_implicit_task(this_thr);
7678 }
7679 
7680 int __kmp_invoke_task_func(int gtid) {
7681  int rc;
7682  int tid = __kmp_tid_from_gtid(gtid);
7683  kmp_info_t *this_thr = __kmp_threads[gtid];
7684  kmp_team_t *team = this_thr->th.th_team;
7685 
7686  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7687 #if USE_ITT_BUILD
7688  if (__itt_stack_caller_create_ptr) {
7689  // inform ittnotify about entering user's code
7690  if (team->t.t_stack_id != NULL) {
7691  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7692  } else {
7693  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7694  __kmp_itt_stack_callee_enter(
7695  (__itt_caller)team->t.t_parent->t.t_stack_id);
7696  }
7697  }
7698 #endif /* USE_ITT_BUILD */
7699 #if INCLUDE_SSC_MARKS
7700  SSC_MARK_INVOKING();
7701 #endif
7702 
7703 #if OMPT_SUPPORT
7704  void *dummy;
7705  void **exit_frame_p;
7706  ompt_data_t *my_task_data;
7707  ompt_data_t *my_parallel_data;
7708  int ompt_team_size;
7709 
7710  if (ompt_enabled.enabled) {
7711  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7712  .ompt_task_info.frame.exit_frame.ptr);
7713  } else {
7714  exit_frame_p = &dummy;
7715  }
7716 
7717  my_task_data =
7718  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7719  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7720  if (ompt_enabled.ompt_callback_implicit_task) {
7721  ompt_team_size = team->t.t_nproc;
7722  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7723  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7724  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7725  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7726  }
7727 #endif
7728 
7729 #if KMP_STATS_ENABLED
7730  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7731  if (previous_state == stats_state_e::TEAMS_REGION) {
7732  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7733  } else {
7734  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7735  }
7736  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7737 #endif
7738 
7739  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7740  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7741 #if OMPT_SUPPORT
7742  ,
7743  exit_frame_p
7744 #endif
7745  );
7746 #if OMPT_SUPPORT
7747  *exit_frame_p = NULL;
7748  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7749 #endif
7750 
7751 #if KMP_STATS_ENABLED
7752  if (previous_state == stats_state_e::TEAMS_REGION) {
7753  KMP_SET_THREAD_STATE(previous_state);
7754  }
7755  KMP_POP_PARTITIONED_TIMER();
7756 #endif
7757 
7758 #if USE_ITT_BUILD
7759  if (__itt_stack_caller_create_ptr) {
7760  // inform ittnotify about leaving user's code
7761  if (team->t.t_stack_id != NULL) {
7762  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7763  } else {
7764  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7765  __kmp_itt_stack_callee_leave(
7766  (__itt_caller)team->t.t_parent->t.t_stack_id);
7767  }
7768  }
7769 #endif /* USE_ITT_BUILD */
7770  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7771 
7772  return rc;
7773 }
7774 
7775 void __kmp_teams_master(int gtid) {
7776  // This routine is called by all primary threads in teams construct
7777  kmp_info_t *thr = __kmp_threads[gtid];
7778  kmp_team_t *team = thr->th.th_team;
7779  ident_t *loc = team->t.t_ident;
7780  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7781  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7782  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7783  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7784  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7785 
7786  // This thread is a new CG root. Set up the proper variables.
7787  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7788  tmp->cg_root = thr; // Make thr the CG root
7789  // Init to thread limit stored when league primary threads were forked
7790  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7791  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7792  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7793  " cg_nthreads to 1\n",
7794  thr, tmp));
7795  tmp->up = thr->th.th_cg_roots;
7796  thr->th.th_cg_roots = tmp;
7797 
7798 // Launch the league of teams now, but do not let workers execute
7799 // (they wait on the fork barrier until the next parallel region)
7800 #if INCLUDE_SSC_MARKS
7801  SSC_MARK_FORKING();
7802 #endif
7803  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7804  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7805  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7806 #if INCLUDE_SSC_MARKS
7807  SSC_MARK_JOINING();
7808 #endif
7809  // If the team size was reduced from the limit, set it to the new size
7810  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7811  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7812  // AC: last parameter "1" eliminates the join barrier, which won't work because
7813  // worker threads are waiting in a fork barrier for more parallel regions
7814  __kmp_join_call(loc, gtid
7815 #if OMPT_SUPPORT
7816  ,
7817  fork_context_intel
7818 #endif
7819  ,
7820  1);
7821 }
7822 
7823 int __kmp_invoke_teams_master(int gtid) {
7824  kmp_info_t *this_thr = __kmp_threads[gtid];
7825  kmp_team_t *team = this_thr->th.th_team;
7826 #if KMP_DEBUG
7827  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7828  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7829  (void *)__kmp_teams_master);
7830 #endif
7831  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7832 #if OMPT_SUPPORT
7833  int tid = __kmp_tid_from_gtid(gtid);
7834  ompt_data_t *task_data =
7835  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7836  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7837  if (ompt_enabled.ompt_callback_implicit_task) {
7838  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7839  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7840  ompt_task_initial);
7841  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7842  }
7843 #endif
7844  __kmp_teams_master(gtid);
7845 #if OMPT_SUPPORT
7846  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7847 #endif
7848  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7849  return 1;
7850 }
7851 
7852 /* This sets the requested number of threads for the next parallel region
7853  encountered by this team. Since this should be enclosed in the forkjoin
7854  critical section, it should avoid race conditions with asymmetrical nested
7855  parallelism. */
7856 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7857  kmp_info_t *thr = __kmp_threads[gtid];
7858 
7859  if (num_threads > 0)
7860  thr->th.th_set_nproc = num_threads;
7861 }
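
// Illustrative lowering example (hypothetical, not generated by this file):
// a num_threads clause typically reaches __kmp_push_num_threads through the
// __kmpc_push_num_threads compiler entry point just before the fork:
//
//   #pragma omp parallel num_threads(4)   // user code
//   { /* ... */ }
//
//   // roughly becomes:
//   __kmpc_push_num_threads(&loc, gtid, 4); // stores thr->th.th_set_nproc
//   __kmpc_fork_call(&loc, argc, microtask, /* shared args */ ...);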
7862 
7863 void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7864  int *num_threads_list) {
7865  kmp_info_t *thr = __kmp_threads[gtid];
7866 
7867  KMP_DEBUG_ASSERT(list_length > 1);
7868 
7869  if (num_threads_list[0] > 0)
7870  thr->th.th_set_nproc = num_threads_list[0];
7871  thr->th.th_set_nested_nth =
7872  (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7873  for (kmp_uint32 i = 0; i < list_length; ++i)
7874  thr->th.th_set_nested_nth[i] = num_threads_list[i];
7875  thr->th.th_set_nested_nth_sz = list_length;
7876 }
7877 
7878 void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7879  const char *msg) {
7880  kmp_info_t *thr = __kmp_threads[gtid];
7881  thr->th.th_nt_strict = true;
7882  thr->th.th_nt_loc = loc;
7883  // if sev is unset make fatal
7884  if (sev == severity_warning)
7885  thr->th.th_nt_sev = sev;
7886  else
7887  thr->th.th_nt_sev = severity_fatal;
7888  // if msg is unset, use an appropriate message
7889  if (msg)
7890  thr->th.th_nt_msg = msg;
7891  else
7892  thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7893  "strict num_threads clause.";
7894 }
7895 
7896 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7897  int num_threads) {
7898  KMP_DEBUG_ASSERT(thr);
7899  // Remember the number of threads for inner parallel regions
7900  if (!TCR_4(__kmp_init_middle))
7901  __kmp_middle_initialize(); // get internal globals calculated
7902  __kmp_assign_root_init_mask();
7903  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7904  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7905 
7906  if (num_threads == 0) {
7907  if (__kmp_teams_thread_limit > 0) {
7908  num_threads = __kmp_teams_thread_limit;
7909  } else {
7910  num_threads = __kmp_avail_proc / num_teams;
7911  }
7912  // adjust num_threads w/o warning as it is not a user setting
7913  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7914  // no thread_limit clause specified - do not change thread-limit-var ICV
7915  if (num_threads > __kmp_dflt_team_nth) {
7916  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7917  }
7918  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7919  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7920  } // prevent team size from exceeding thread-limit-var
7921  if (num_teams * num_threads > __kmp_teams_max_nth) {
7922  num_threads = __kmp_teams_max_nth / num_teams;
7923  }
7924  if (num_threads == 0) {
7925  num_threads = 1;
7926  }
7927  } else {
7928  if (num_threads < 0) {
7929  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7930  __kmp_msg_null);
7931  num_threads = 1;
7932  }
7933  // This thread will be the primary thread of the league's primary threads
7934  // Store new thread limit; old limit is saved in th_cg_roots list
7935  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7936  // num_threads = min(num_threads, nthreads-var)
7937  if (num_threads > __kmp_dflt_team_nth) {
7938  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7939  }
7940  if (num_teams * num_threads > __kmp_teams_max_nth) {
7941  int new_threads = __kmp_teams_max_nth / num_teams;
7942  if (new_threads == 0) {
7943  new_threads = 1;
7944  }
7945  if (new_threads != num_threads) {
7946  if (!__kmp_reserve_warn) { // user asked for too many threads
7947  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7948  __kmp_msg(kmp_ms_warning,
7949  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7950  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7951  }
7952  }
7953  num_threads = new_threads;
7954  }
7955  }
7956  thr->th.th_teams_size.nth = num_threads;
7957 }
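
// Worked example of the clamping above (illustrative numbers, assuming
// KMP_TEAMS_THREAD_LIMIT is unset): num_teams = 4, no thread_limit clause
// (num_threads == 0), __kmp_avail_proc = 16, nthreads-var = 16,
// __kmp_teams_max_nth = 32:
//   num_threads = 16 / 4 = 4    // split available procs across the teams
//   4 <= nthreads-var (16)      // not reduced
//   4 <= thread-limit-var       // not reduced (default limit assumed)
//   4 * 4 = 16 <= 32            // within __kmp_teams_max_nth
// so th_teams_size.nth ends up as 4.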
7958 
7959 /* this sets the requested number of teams for the teams region and/or
7960  the number of threads for the next parallel region encountered */
7961 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7962  int num_threads) {
7963  kmp_info_t *thr = __kmp_threads[gtid];
7964  if (num_teams < 0) {
7965  // OpenMP specification requires requested values to be positive,
7966  // but people can send us any value, so we'd better check
7967  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7968  __kmp_msg_null);
7969  num_teams = 1;
7970  }
7971  if (num_teams == 0) {
7972  if (__kmp_nteams > 0) {
7973  num_teams = __kmp_nteams;
7974  } else {
7975  num_teams = 1; // default number of teams is 1.
7976  }
7977  }
7978  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7979  if (!__kmp_reserve_warn) {
7980  __kmp_reserve_warn = 1;
7981  __kmp_msg(kmp_ms_warning,
7982  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7983  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7984  }
7985  num_teams = __kmp_teams_max_nth;
7986  }
7987  // Set number of teams (number of threads in the outer "parallel" of the
7988  // teams)
7989  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7990 
7991  __kmp_push_thread_limit(thr, num_teams, num_threads);
7992 }
7993 
7994 /* This sets the requested number of teams for the teams region and/or
7995  the number of threads for the next parallel region encountered */
7996 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7997  int num_teams_ub, int num_threads) {
7998  kmp_info_t *thr = __kmp_threads[gtid];
7999  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8000  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8001  KMP_DEBUG_ASSERT(num_threads >= 0);
8002 
8003  if (num_teams_lb > num_teams_ub) {
8004  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8005  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8006  }
8007 
8008  int num_teams = 1; // default number of teams is 1.
8009 
8010  if (num_teams_lb == 0 && num_teams_ub > 0)
8011  num_teams_lb = num_teams_ub;
8012 
8013  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8014  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8015  if (num_teams > __kmp_teams_max_nth) {
8016  if (!__kmp_reserve_warn) {
8017  __kmp_reserve_warn = 1;
8018  __kmp_msg(kmp_ms_warning,
8019  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8020  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8021  }
8022  num_teams = __kmp_teams_max_nth;
8023  }
8024  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8025  num_teams = num_teams_ub;
8026  } else { // num_teams_lb <= num_teams <= num_teams_ub
8027  if (num_threads <= 0) {
8028  if (num_teams_ub > __kmp_teams_max_nth) {
8029  num_teams = num_teams_lb;
8030  } else {
8031  num_teams = num_teams_ub;
8032  }
8033  } else {
8034  num_teams = (num_threads > __kmp_teams_max_nth)
8035  ? num_teams
8036  : __kmp_teams_max_nth / num_threads;
8037  if (num_teams < num_teams_lb) {
8038  num_teams = num_teams_lb;
8039  } else if (num_teams > num_teams_ub) {
8040  num_teams = num_teams_ub;
8041  }
8042  }
8043  }
8044  // Set number of teams (number of threads in the outer "parallel" of the
8045  // teams)
8046  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8047 
8048  __kmp_push_thread_limit(thr, num_teams, num_threads);
8049 }
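
// Worked example for the lower/upper bound case above (illustrative numbers):
// num_teams(4 : 8) with num_threads = 16 and __kmp_teams_max_nth = 64:
//   num_teams = 64 / 16 = 4    // as many teams as the cap allows
//   4 >= num_teams_lb (4)      // not raised
//   4 <= num_teams_ub (8)      // not lowered
// so 4 teams of up to 16 threads each are requested.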
8050 
8051 // Set the proc_bind var to use in the following parallel region.
8052 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8053  kmp_info_t *thr = __kmp_threads[gtid];
8054  thr->th.th_set_proc_bind = proc_bind;
8055 }
8056 
8057 /* Launch the worker threads into the microtask. */
8058 
8059 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8060  kmp_info_t *this_thr = __kmp_threads[gtid];
8061 
8062 #ifdef KMP_DEBUG
8063  int f;
8064 #endif /* KMP_DEBUG */
8065 
8066  KMP_DEBUG_ASSERT(team);
8067  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8068  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8069  KMP_MB(); /* Flush all pending memory write invalidates. */
8070 
8071  team->t.t_construct = 0; /* no single directives seen yet */
8072  team->t.t_ordered.dt.t_value =
8073  0; /* thread 0 enters the ordered section first */
8074 
8075  /* Reset the identifiers on the dispatch buffer */
8076  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8077  if (team->t.t_max_nproc > 1) {
8078  int i;
8079  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8080  team->t.t_disp_buffer[i].buffer_index = i;
8081  team->t.t_disp_buffer[i].doacross_buf_idx = i;
8082  }
8083  } else {
8084  team->t.t_disp_buffer[0].buffer_index = 0;
8085  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8086  }
8087 
8088  KMP_MB(); /* Flush all pending memory write invalidates. */
8089  KMP_ASSERT(this_thr->th.th_team == team);
8090 
8091 #ifdef KMP_DEBUG
8092  for (f = 0; f < team->t.t_nproc; f++) {
8093  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8094  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8095  }
8096 #endif /* KMP_DEBUG */
8097 
8098  /* release the worker threads so they may begin working */
8099  __kmp_fork_barrier(gtid, 0);
8100 }
8101 
8102 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8103  kmp_info_t *this_thr = __kmp_threads[gtid];
8104 
8105  KMP_DEBUG_ASSERT(team);
8106  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8107  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8108  KMP_MB(); /* Flush all pending memory write invalidates. */
8109 
8110  /* Join barrier after fork */
8111 
8112 #ifdef KMP_DEBUG
8113  if (__kmp_threads[gtid] &&
8114  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8115  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8116  __kmp_threads[gtid]);
8117  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8118  "team->t.t_nproc=%d\n",
8119  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8120  team->t.t_nproc);
8121  __kmp_print_structure();
8122  }
8123  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8124  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8125 #endif /* KMP_DEBUG */
8126 
8127  __kmp_join_barrier(gtid); /* wait for everyone */
8128 #if OMPT_SUPPORT
8129  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8130  if (ompt_enabled.enabled &&
8131  (ompt_state == ompt_state_wait_barrier_teams ||
8132  ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8133  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8134  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8135  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8136 #if OMPT_OPTIONAL
8137  void *codeptr = NULL;
8138  if (KMP_MASTER_TID(ds_tid) &&
8139  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8140  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8141  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8142 
8143  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8144  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8145  sync_kind = ompt_sync_region_barrier_teams;
8146  if (ompt_enabled.ompt_callback_sync_region_wait) {
8147  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8148  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8149  }
8150  if (ompt_enabled.ompt_callback_sync_region) {
8151  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8152  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8153  }
8154 #endif
8155  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8156  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8157  ompt_scope_end, NULL, task_data, 0, ds_tid,
8158  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8159  }
8160  }
8161 #endif
8162 
8163  KMP_MB(); /* Flush all pending memory write invalidates. */
8164  KMP_ASSERT(this_thr->th.th_team == team);
8165 }
8166 
8167 /* ------------------------------------------------------------------------ */
8168 
8169 #ifdef USE_LOAD_BALANCE
8170 
8171 // Return the number of worker threads actively spinning in the hot team, if
8172 // we are at the outermost level of parallelism. Otherwise, return 0.
8173 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8174  int i;
8175  int retval;
8176  kmp_team_t *hot_team;
8177 
8178  if (root->r.r_active) {
8179  return 0;
8180  }
8181  hot_team = root->r.r_hot_team;
8182  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8183  return hot_team->t.t_nproc - 1; // Don't count primary thread
8184  }
8185 
8186  // Skip the primary thread - it is accounted for elsewhere.
8187  retval = 0;
8188  for (i = 1; i < hot_team->t.t_nproc; i++) {
8189  if (hot_team->t.t_threads[i]->th.th_active) {
8190  retval++;
8191  }
8192  }
8193  return retval;
8194 }
8195 
8196 // Perform an automatic adjustment to the number of
8197 // threads used by the next parallel region.
8198 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8199  int retval;
8200  int pool_active;
8201  int hot_team_active;
8202  int team_curr_active;
8203  int system_active;
8204 
8205  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8206  set_nproc));
8207  KMP_DEBUG_ASSERT(root);
8208  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8209  ->th.th_current_task->td_icvs.dynamic == TRUE);
8210  KMP_DEBUG_ASSERT(set_nproc > 1);
8211 
8212  if (set_nproc == 1) {
8213  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8214  return 1;
8215  }
8216 
8217  // Threads that are active in the thread pool, active in the hot team for this
8218  // particular root (if we are at the outer par level), and the currently
8219  // executing thread (to become the primary thread) are available to add to the
8220  // new team, but are currently contributing to the system load, and must be
8221  // accounted for.
8222  pool_active = __kmp_thread_pool_active_nth;
8223  hot_team_active = __kmp_active_hot_team_nproc(root);
8224  team_curr_active = pool_active + hot_team_active + 1;
8225 
8226  // Check the system load.
8227  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8228  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8229  "hot team active = %d\n",
8230  system_active, pool_active, hot_team_active));
8231 
8232  if (system_active < 0) {
8233  // There was an error reading the necessary info from /proc, so use the
8234  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8235  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8236  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8237  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8238 
8239  // Make this call behave like the thread limit algorithm.
8240  retval = __kmp_avail_proc - __kmp_nth +
8241  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8242  if (retval > set_nproc) {
8243  retval = set_nproc;
8244  }
8245  if (retval < KMP_MIN_NTH) {
8246  retval = KMP_MIN_NTH;
8247  }
8248 
8249  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8250  retval));
8251  return retval;
8252  }
8253 
8254  // There is a slight delay in the load balance algorithm in detecting new
8255  // running procs. The real system load at this instant should be at least as
8256  // large as the number of active OMP threads available to add to the team.
8257  if (system_active < team_curr_active) {
8258  system_active = team_curr_active;
8259  }
8260  retval = __kmp_avail_proc - system_active + team_curr_active;
8261  if (retval > set_nproc) {
8262  retval = set_nproc;
8263  }
8264  if (retval < KMP_MIN_NTH) {
8265  retval = KMP_MIN_NTH;
8266  }
8267 
8268  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8269  return retval;
8270 } // __kmp_load_balance_nproc()
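
// Worked example of the formula above (illustrative numbers): with
// __kmp_avail_proc = 8, set_nproc = 8, measured system_active = 10 and
// team_curr_active = 3 (pool + hot team + this thread):
//   retval = 8 - 10 + 3 = 1
// which is then clamped to [KMP_MIN_NTH, set_nproc]; an oversubscribed
// machine therefore yields a small team, while an idle one yields up to
// set_nproc threads.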
8271 
8272 #endif /* USE_LOAD_BALANCE */
8273 
8274 /* ------------------------------------------------------------------------ */
8275 
8276 /* NOTE: this is called with the __kmp_init_lock held */
8277 void __kmp_cleanup(void) {
8278  int f;
8279 
8280  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8281 
8282  if (TCR_4(__kmp_init_parallel)) {
8283 #if KMP_HANDLE_SIGNALS
8284  __kmp_remove_signals();
8285 #endif
8286  TCW_4(__kmp_init_parallel, FALSE);
8287  }
8288 
8289  if (TCR_4(__kmp_init_middle)) {
8290 #if KMP_AFFINITY_SUPPORTED
8291  __kmp_affinity_uninitialize();
8292 #endif /* KMP_AFFINITY_SUPPORTED */
8293  __kmp_cleanup_hierarchy();
8294  TCW_4(__kmp_init_middle, FALSE);
8295  }
8296 
8297  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8298 
8299  if (__kmp_init_serial) {
8300  __kmp_runtime_destroy();
8301  __kmp_init_serial = FALSE;
8302  }
8303 
8304  __kmp_cleanup_threadprivate_caches();
8305 
8306  for (f = 0; f < __kmp_threads_capacity; f++) {
8307  if (__kmp_root[f] != NULL) {
8308  __kmp_free(__kmp_root[f]);
8309  __kmp_root[f] = NULL;
8310  }
8311  }
8312  __kmp_free(__kmp_threads);
8313  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8314  // there is no need to free __kmp_root separately.
8315  __kmp_threads = NULL;
8316  __kmp_root = NULL;
8317  __kmp_threads_capacity = 0;
8318 
8319  // Free old __kmp_threads arrays if they exist.
8320  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8321  while (ptr) {
8322  kmp_old_threads_list_t *next = ptr->next;
8323  __kmp_free(ptr->threads);
8324  __kmp_free(ptr);
8325  ptr = next;
8326  }
8327 
8328 #if KMP_USE_DYNAMIC_LOCK
8329  __kmp_cleanup_indirect_user_locks();
8330 #else
8331  __kmp_cleanup_user_locks();
8332 #endif
8333 #if OMPD_SUPPORT
8334  if (ompd_state) {
8335  __kmp_free(ompd_env_block);
8336  ompd_env_block = NULL;
8337  ompd_env_block_size = 0;
8338  }
8339 #endif
8340 
8341 #if KMP_AFFINITY_SUPPORTED
8342  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8343  __kmp_cpuinfo_file = NULL;
8344 #endif /* KMP_AFFINITY_SUPPORTED */
8345 
8346 #if KMP_USE_ADAPTIVE_LOCKS
8347 #if KMP_DEBUG_ADAPTIVE_LOCKS
8348  __kmp_print_speculative_stats();
8349 #endif
8350 #endif
8351  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8352  __kmp_nested_nth.nth = NULL;
8353  __kmp_nested_nth.size = 0;
8354  __kmp_nested_nth.used = 0;
8355 
8356  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8357  __kmp_nested_proc_bind.bind_types = NULL;
8358  __kmp_nested_proc_bind.size = 0;
8359  __kmp_nested_proc_bind.used = 0;
8360  if (__kmp_affinity_format) {
8361  KMP_INTERNAL_FREE(__kmp_affinity_format);
8362  __kmp_affinity_format = NULL;
8363  }
8364 
8365  __kmp_i18n_catclose();
8366 
8367 #if KMP_USE_HIER_SCHED
8368  __kmp_hier_scheds.deallocate();
8369 #endif
8370 
8371 #if KMP_STATS_ENABLED
8372  __kmp_stats_fini();
8373 #endif
8374 
8375  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8376 }
8377 
8378 /* ------------------------------------------------------------------------ */
8379 
8380 int __kmp_ignore_mppbeg(void) {
8381  char *env;
8382 
8383  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8384  if (__kmp_str_match_false(env))
8385  return FALSE;
8386  }
8387  // By default __kmpc_begin() is no-op.
8388  return TRUE;
8389 }
8390 
8391 int __kmp_ignore_mppend(void) {
8392  char *env;
8393 
8394  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8395  if (__kmp_str_match_false(env))
8396  return FALSE;
8397  }
8398  // By default __kmpc_end() is no-op.
8399  return TRUE;
8400 }
8401 
8402 void __kmp_internal_begin(void) {
8403  int gtid;
8404  kmp_root_t *root;
8405 
8406  /* This is a very important step as it registers new sibling threads
8407  and assigns these new uber threads a new gtid. */
8408  gtid = __kmp_entry_gtid();
8409  root = __kmp_threads[gtid]->th.th_root;
8410  KMP_ASSERT(KMP_UBER_GTID(gtid));
8411 
8412  if (root->r.r_begin)
8413  return;
8414  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8415  if (root->r.r_begin) {
8416  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8417  return;
8418  }
8419 
8420  root->r.r_begin = TRUE;
8421 
8422  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8423 }
8424 
8425 /* ------------------------------------------------------------------------ */
8426 
8427 void __kmp_user_set_library(enum library_type arg) {
8428  int gtid;
8429  kmp_root_t *root;
8430  kmp_info_t *thread;
8431 
8432  /* first, make sure we are initialized so we can get our gtid */
8433 
8434  gtid = __kmp_entry_gtid();
8435  thread = __kmp_threads[gtid];
8436 
8437  root = thread->th.th_root;
8438 
8439  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8440  library_serial));
8441  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8442  thread */
8443  KMP_WARNING(SetLibraryIncorrectCall);
8444  return;
8445  }
8446 
8447  switch (arg) {
8448  case library_serial:
8449  thread->th.th_set_nproc = 0;
8450  set__nproc(thread, 1);
8451  break;
8452  case library_turnaround:
8453  thread->th.th_set_nproc = 0;
8454  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8455  : __kmp_dflt_team_nth_ub);
8456  break;
8457  case library_throughput:
8458  thread->th.th_set_nproc = 0;
8459  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8460  : __kmp_dflt_team_nth_ub);
8461  break;
8462  default:
8463  KMP_FATAL(UnknownLibraryType, arg);
8464  }
8465 
8466  __kmp_aux_set_library(arg);
8467 }
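
// Hedged usage sketch: the library mode is user-controllable via the
// KMP_LIBRARY environment variable (serial | turnaround | throughput) or the
// kmp_set_library_* extensions declared in this runtime's omp.h, e.g.
//
//   kmp_set_library_throughput(); // same effect as KMP_LIBRARY=throughput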
8468 
8469 void __kmp_aux_set_stacksize(size_t arg) {
8470  if (!__kmp_init_serial)
8471  __kmp_serial_initialize();
8472 
8473 #if KMP_OS_DARWIN
8474  if (arg & (0x1000 - 1)) {
8475  arg &= ~(0x1000 - 1);
8476  if (arg + 0x1000) /* check for overflow if we round up */
8477  arg += 0x1000;
8478  }
8479 #endif
8480  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8481 
8482  /* only change the default stacksize before the first parallel region */
8483  if (!TCR_4(__kmp_init_parallel)) {
8484  size_t value = arg; /* argument is in bytes */
8485 
8486  if (value < __kmp_sys_min_stksize)
8487  value = __kmp_sys_min_stksize;
8488  else if (value > KMP_MAX_STKSIZE)
8489  value = KMP_MAX_STKSIZE;
8490 
8491  __kmp_stksize = value;
8492 
8493  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8494  }
8495 
8496  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8497 }
8498 
8499 /* set the behaviour of the runtime library */
8500 /* TODO this can cause some odd behaviour with sibling parallelism... */
8501 void __kmp_aux_set_library(enum library_type arg) {
8502  __kmp_library = arg;
8503 
8504  switch (__kmp_library) {
8505  case library_serial: {
8506  KMP_INFORM(LibraryIsSerial);
8507  } break;
8508  case library_turnaround:
8509  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8510  __kmp_use_yield = 2; // only yield when oversubscribed
8511  break;
8512  case library_throughput:
8513  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8514  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8515  break;
8516  default:
8517  KMP_FATAL(UnknownLibraryType, arg);
8518  }
8519 }
8520 
8521 /* Get team information common to all teams-construct API routines */
8522 // Returns NULL if not in a teams construct
8523 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8524  kmp_info_t *thr = __kmp_entry_thread();
8525  teams_serialized = 0;
8526  if (thr->th.th_teams_microtask) {
8527  kmp_team_t *team = thr->th.th_team;
8528  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8529  int ii = team->t.t_level;
8530  teams_serialized = team->t.t_serialized;
8531  int level = tlevel + 1;
8532  KMP_DEBUG_ASSERT(ii >= tlevel);
8533  while (ii > level) {
8534  for (teams_serialized = team->t.t_serialized;
8535  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8536  }
8537  if (team->t.t_serialized && (!teams_serialized)) {
8538  team = team->t.t_parent;
8539  continue;
8540  }
8541  if (ii > level) {
8542  team = team->t.t_parent;
8543  ii--;
8544  }
8545  }
8546  return team;
8547  }
8548  return NULL;
8549 }
8550 
8551 int __kmp_aux_get_team_num() {
8552  int serialized;
8553  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8554  if (team) {
8555  if (serialized > 1) {
8556  return 0; // teams region is serialized ( 1 team of 1 thread ).
8557  } else {
8558  return team->t.t_master_tid;
8559  }
8560  }
8561  return 0;
8562 }
8563 
8564 int __kmp_aux_get_num_teams() {
8565  int serialized;
8566  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8567  if (team) {
8568  if (serialized > 1) {
8569  return 1;
8570  } else {
8571  return team->t.t_parent->t.t_nproc;
8572  }
8573  }
8574  return 1;
8575 }
8576 
8577 /* ------------------------------------------------------------------------ */
8578 
8579 /*
8580  * Affinity Format Parser
8581  *
8582  * Field is in form of: %[[[0].]size]type
8583  * % and type are required (%% means print a literal '%')
8584  * type is either single char or long name surrounded by {},
8585  * e.g., N or {num_threads}
8586  * 0 => leading zeros
8587  * . => right justified when size is specified
8588  * by default output is left justified
8589  * size is the *minimum* field length
8590  * All other characters are printed as is
8591  *
8592  * Available field types:
8593  * L {thread_level} - omp_get_level()
8594  * n {thread_num} - omp_get_thread_num()
8595  * h {host} - name of host machine
8596  * P {process_id} - process id (integer)
8597  * T {thread_identifier} - native thread identifier (integer)
8598  * N {num_threads} - omp_get_num_threads()
8599  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8600  * a {thread_affinity} - comma separated list of integers or integer ranges
8601  * (values of affinity mask)
8602  *
8603  * Implementation-specific field types can be added
8604  * If a type is unknown, print "undefined"
8605  */
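
// Illustrative expansion of the grammar above (hypothetical values): the
// format "OMP: pid %P tid %n affinity %{thread_affinity}" could produce, for
// thread 2 of a team bound to cores 4-5, something like
//   OMP: pid 12345 tid 2 affinity 4,5
// and "%0.8N" would print the team size right-justified, zero-padded to a
// minimum width of 8 (e.g., "00000004").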
8606 
8607 // Structure holding the short name, long name, and corresponding data type
8608 // for snprintf. A table of these will represent the entire valid keyword
8609 // field types.
8610 typedef struct kmp_affinity_format_field_t {
8611  char short_name; // from spec e.g., L -> thread level
8612  const char *long_name; // from spec thread_level -> thread level
8613  char field_format; // data type for snprintf (typically 'd' or 's'
8614  // for integer or string)
8615 } kmp_affinity_format_field_t;
8616 
8617 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8618 #if KMP_AFFINITY_SUPPORTED
8619  {'A', "thread_affinity", 's'},
8620 #endif
8621  {'t', "team_num", 'd'},
8622  {'T', "num_teams", 'd'},
8623  {'L', "nesting_level", 'd'},
8624  {'n', "thread_num", 'd'},
8625  {'N', "num_threads", 'd'},
8626  {'a', "ancestor_tnum", 'd'},
8627  {'H', "host", 's'},
8628  {'P', "process_id", 'd'},
8629  {'i', "native_thread_id", 'd'}};
8630 
8631 // Return the number of characters it takes to hold field
8632 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8633  const char **ptr,
8634  kmp_str_buf_t *field_buffer) {
8635  int rc, format_index, field_value;
8636  const char *width_left, *width_right;
8637  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8638  static const int FORMAT_SIZE = 20;
8639  char format[FORMAT_SIZE] = {0};
8640  char absolute_short_name = 0;
8641 
8642  KMP_DEBUG_ASSERT(gtid >= 0);
8643  KMP_DEBUG_ASSERT(th);
8644  KMP_DEBUG_ASSERT(**ptr == '%');
8645  KMP_DEBUG_ASSERT(field_buffer);
8646 
8647  __kmp_str_buf_clear(field_buffer);
8648 
8649  // Skip the initial %
8650  (*ptr)++;
8651 
8652  // Check for %% first
8653  if (**ptr == '%') {
8654  __kmp_str_buf_cat(field_buffer, "%", 1);
8655  (*ptr)++; // skip over the second %
8656  return 1;
8657  }
8658 
8659  // Parse field modifiers if they are present
8660  pad_zeros = false;
8661  if (**ptr == '0') {
8662  pad_zeros = true;
8663  (*ptr)++; // skip over 0
8664  }
8665  right_justify = false;
8666  if (**ptr == '.') {
8667  right_justify = true;
8668  (*ptr)++; // skip over .
8669  }
8670  // Parse width of field: [width_left, width_right)
8671  width_left = width_right = NULL;
8672  if (**ptr >= '0' && **ptr <= '9') {
8673  width_left = *ptr;
8674  SKIP_DIGITS(*ptr);
8675  width_right = *ptr;
8676  }
8677 
8678  // Create the format for KMP_SNPRINTF based on flags parsed above
8679  format_index = 0;
8680  format[format_index++] = '%';
8681  if (!right_justify)
8682  format[format_index++] = '-';
8683  if (pad_zeros)
8684  format[format_index++] = '0';
8685  if (width_left && width_right) {
8686  int i = 0;
8687  // Only allow 8-digit number widths.
8688  // This also prevents overflowing the format buffer
8689  while (i < 8 && width_left < width_right) {
8690  format[format_index++] = *width_left;
8691  width_left++;
8692  i++;
8693  }
8694  }
8695 
8696  // Parse a name (long or short)
8697  // Canonicalize the name into absolute_short_name
8698  found_valid_name = false;
8699  parse_long_name = (**ptr == '{');
8700  if (parse_long_name)
8701  (*ptr)++; // skip initial left brace
8702  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8703  sizeof(__kmp_affinity_format_table[0]);
8704  ++i) {
8705  char short_name = __kmp_affinity_format_table[i].short_name;
8706  const char *long_name = __kmp_affinity_format_table[i].long_name;
8707  char field_format = __kmp_affinity_format_table[i].field_format;
8708  if (parse_long_name) {
8709  size_t length = KMP_STRLEN(long_name);
8710  if (strncmp(*ptr, long_name, length) == 0) {
8711  found_valid_name = true;
8712  (*ptr) += length; // skip the long name
8713  }
8714  } else if (**ptr == short_name) {
8715  found_valid_name = true;
8716  (*ptr)++; // skip the short name
8717  }
8718  if (found_valid_name) {
8719  format[format_index++] = field_format;
8720  format[format_index++] = '\0';
8721  absolute_short_name = short_name;
8722  break;
8723  }
8724  }
8725  if (parse_long_name) {
8726  if (**ptr != '}') {
8727  absolute_short_name = 0;
8728  } else {
8729  (*ptr)++; // skip over the right brace
8730  }
8731  }
8732 
8733  // Attempt to fill the buffer with the requested
8734  // value using snprintf within __kmp_str_buf_print()
8735  switch (absolute_short_name) {
8736  case 't':
8737  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8738  break;
8739  case 'T':
8740  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8741  break;
8742  case 'L':
8743  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8744  break;
8745  case 'n':
8746  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8747  break;
8748  case 'H': {
8749  static const int BUFFER_SIZE = 256;
8750  char buf[BUFFER_SIZE];
8751  __kmp_expand_host_name(buf, BUFFER_SIZE);
8752  rc = __kmp_str_buf_print(field_buffer, format, buf);
8753  } break;
8754  case 'P':
8755  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8756  break;
8757  case 'i':
8758  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8759  break;
8760  case 'N':
8761  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8762  break;
8763  case 'a':
8764  field_value =
8765  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8766  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8767  break;
8768 #if KMP_AFFINITY_SUPPORTED
8769  case 'A': {
8770  kmp_str_buf_t buf;
8771  __kmp_str_buf_init(&buf);
8772  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8773  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8774  __kmp_str_buf_free(&buf);
8775  } break;
8776 #endif
8777  default:
8778  // According to the spec, if an implementation does not have info for a
8779  // field type, then "undefined" is printed
8780  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8781  // Skip the field
8782  if (parse_long_name) {
8783  SKIP_TOKEN(*ptr);
8784  if (**ptr == '}')
8785  (*ptr)++;
8786  } else {
8787  (*ptr)++;
8788  }
8789  }
8790 
8791  KMP_ASSERT(format_index <= FORMAT_SIZE);
8792  return rc;
8793 }
8794 
8795 /*
8796  * Return the number of characters needed to hold the affinity string
8797  * (not including the terminating null byte).
8798  * The resulting string is printed to buffer, which the caller can then
8799  * handle afterwards.
8800  */
8801 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8802  kmp_str_buf_t *buffer) {
8803  const char *parse_ptr;
8804  size_t retval;
8805  const kmp_info_t *th;
8806  kmp_str_buf_t field;
8807 
8808  KMP_DEBUG_ASSERT(buffer);
8809  KMP_DEBUG_ASSERT(gtid >= 0);
8810 
8811  __kmp_str_buf_init(&field);
8812  __kmp_str_buf_clear(buffer);
8813 
8814  th = __kmp_threads[gtid];
8815  retval = 0;
8816 
8817  // If format is NULL or zero-length string, then we use
8818  // affinity-format-var ICV
8819  parse_ptr = format;
8820  if (parse_ptr == NULL || *parse_ptr == '\0') {
8821  parse_ptr = __kmp_affinity_format;
8822  }
8823  KMP_DEBUG_ASSERT(parse_ptr);
8824 
8825  while (*parse_ptr != '\0') {
8826  // Parse a field
8827  if (*parse_ptr == '%') {
8828  // Put field in the buffer
8829  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8830  __kmp_str_buf_catbuf(buffer, &field);
8831  retval += rc;
8832  } else {
8833  // Put literal character in buffer
8834  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8835  retval++;
8836  parse_ptr++;
8837  }
8838  }
8839  __kmp_str_buf_free(&field);
8840  return retval;
8841 }
8842 
8843 // Displays the affinity string to stdout
8844 void __kmp_aux_display_affinity(int gtid, const char *format) {
8845  kmp_str_buf_t buf;
8846  __kmp_str_buf_init(&buf);
8847  __kmp_aux_capture_affinity(gtid, format, &buf);
8848  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8849  __kmp_str_buf_free(&buf);
8850 }
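
// Hedged usage sketch (standard OpenMP 5.0 API; values illustrative): the
// user-visible affinity-display features end up in these __kmp_aux_* helpers.
//
//   // OMP_DISPLAY_AFFINITY=TRUE
//   // OMP_AFFINITY_FORMAT="host %H tid %n binds to %A"
//   #include <omp.h>
//   #pragma omp parallel
//   { omp_display_affinity(NULL); } // NULL -> use affinity-format-var ICV
//
// omp_capture_affinity() likewise returns the number of characters needed
// (excluding the terminating null), mirroring __kmp_aux_capture_affinity.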
8851 
8852 /* ------------------------------------------------------------------------ */
8853 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8854  int blocktime = arg; /* argument is in microseconds */
8855 #if KMP_USE_MONITOR
8856  int bt_intervals;
8857 #endif
8858  kmp_int8 bt_set;
8859 
8860  __kmp_save_internal_controls(thread);
8861 
8862  /* Normalize and set blocktime for the teams */
8863  if (blocktime < KMP_MIN_BLOCKTIME)
8864  blocktime = KMP_MIN_BLOCKTIME;
8865  else if (blocktime > KMP_MAX_BLOCKTIME)
8866  blocktime = KMP_MAX_BLOCKTIME;
8867 
8868  set__blocktime_team(thread->th.th_team, tid, blocktime);
8869  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8870 
8871 #if KMP_USE_MONITOR
8872  /* Calculate and set blocktime intervals for the teams */
8873  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8874 
8875  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8876  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8877 #endif
8878 
8879  /* Set whether blocktime has been set to "TRUE" */
8880  bt_set = TRUE;
8881 
8882  set__bt_set_team(thread->th.th_team, tid, bt_set);
8883  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8884 #if KMP_USE_MONITOR
8885  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8886  "bt_intervals=%d, monitor_updates=%d\n",
8887  __kmp_gtid_from_tid(tid, thread->th.th_team),
8888  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8889  __kmp_monitor_wakeups));
8890 #else
8891  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8892  __kmp_gtid_from_tid(tid, thread->th.th_team),
8893  thread->th.th_team->t.t_id, tid, blocktime));
8894 #endif
8895 }
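
// Hedged usage sketch (illustrative): the blocktime normalized above is what
// users control with the KMP_BLOCKTIME environment variable or the
// kmp_set_blocktime() extension, e.g.
//
//   // KMP_BLOCKTIME=0        -> workers sleep immediately at barriers
//   // KMP_BLOCKTIME=infinite -> workers spin and never sleep
//   kmp_set_blocktime(0);     // programmatic equivalent of KMP_BLOCKTIME=0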
8896 
8897 void __kmp_aux_set_defaults(char const *str, size_t len) {
8898  if (!__kmp_init_serial) {
8899  __kmp_serial_initialize();
8900  }
8901  __kmp_env_initialize(str);
8902 
8903  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8904  __kmp_env_print();
8905  }
8906 } // __kmp_aux_set_defaults
8907 
8908 /* ------------------------------------------------------------------------ */
8909 /* internal fast reduction routines */
8910 
8911 PACKED_REDUCTION_METHOD_T
8912 __kmp_determine_reduction_method(
8913  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8914  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8915  kmp_critical_name *lck) {
8916 
8917  // Default reduction method: critical construct ( lck != NULL, like in current
8918  // PAROPT )
8919  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8920  // can be selected by RTL
8921  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8922  // can be selected by RTL
8923  // Finally, it's up to the OpenMP RTL to decide which method to select
8924  // among those generated by PAROPT.
8925 
8926  PACKED_REDUCTION_METHOD_T retval;
8927 
8928  int team_size;
8929 
8930  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8931 
8932 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8933  (loc && \
8934  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8935 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8936 
8937  retval = critical_reduce_block;
8938 
8939  // another way of getting the team size (with 1 dynamic dereference) is slower
8940  team_size = __kmp_get_team_num_threads(global_tid);
8941  if (team_size == 1) {
8942 
8943  retval = empty_reduce_block;
8944 
8945  } else {
8946 
8947  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8948 
8949 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8950  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8951  KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8952 
8953 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8954  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
8955  KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8956 
8957  int teamsize_cutoff = 4;
8958 
8959 #if KMP_MIC_SUPPORTED
8960  if (__kmp_mic_type != non_mic) {
8961  teamsize_cutoff = 8;
8962  }
8963 #endif
8964  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8965  if (tree_available) {
8966  if (team_size <= teamsize_cutoff) {
8967  if (atomic_available) {
8968  retval = atomic_reduce_block;
8969  }
8970  } else {
8971  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8972  }
8973  } else if (atomic_available) {
8974  retval = atomic_reduce_block;
8975  }
8976 #else
8977 #error "Unknown or unsupported OS"
8978 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8979  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8980  // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8981 
8982 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8983  KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
8984 
8985 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8986  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
8987  KMP_OS_WASI || KMP_OS_AIX
8988 
8989  // basic tuning
8990 
8991  if (atomic_available) {
8992  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8993  retval = atomic_reduce_block;
8994  }
8995  } // otherwise: use critical section
8996 
8997 #elif KMP_OS_DARWIN
8998 
8999  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9000  if (atomic_available && (num_vars <= 3)) {
9001  retval = atomic_reduce_block;
9002  } else if (tree_available) {
9003  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9004  (reduce_size < (2000 * sizeof(kmp_real64)))) {
9005  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9006  }
9007  } // otherwise: use critical section
9008 
9009 #else
9010 #error "Unknown or unsupported OS"
9011 #endif
9012 
9013 #else
9014 #error "Unknown or unsupported architecture"
9015 #endif
9016  }
9017 
9018  // KMP_FORCE_REDUCTION
9019 
9020  // If the team is serialized (team_size == 1), ignore the forced reduction
9021  // method and stay with the unsynchronized method (empty_reduce_block)
9022  if (__kmp_force_reduction_method != reduction_method_not_defined &&
9023  team_size != 1) {
9024 
9025  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9026 
9027  int atomic_available, tree_available;
9028 
9029  switch ((forced_retval = __kmp_force_reduction_method)) {
9030  case critical_reduce_block:
9031  KMP_ASSERT(lck); // lck should be != 0
9032  break;
9033 
9034  case atomic_reduce_block:
9035  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9036  if (!atomic_available) {
9037  KMP_WARNING(RedMethodNotSupported, "atomic");
9038  forced_retval = critical_reduce_block;
9039  }
9040  break;
9041 
9042  case tree_reduce_block:
9043  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9044  if (!tree_available) {
9045  KMP_WARNING(RedMethodNotSupported, "tree");
9046  forced_retval = critical_reduce_block;
9047  } else {
9048 #if KMP_FAST_REDUCTION_BARRIER
9049  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9050 #endif
9051  }
9052  break;
9053 
9054  default:
9055  KMP_ASSERT(0); // "unsupported method specified"
9056  }
9057 
9058  retval = forced_retval;
9059  }
9060 
9061  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9062 
9063 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9064 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9065 
9066  return (retval);
9067 }
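
// Worked example of the selection above, absent a KMP_FORCE_REDUCTION
// override (illustrative: x86_64 Linux, non-MIC, so teamsize_cutoff == 4):
// for a team of 8 threads where the compiler emitted both tree data
// (reduce_data/reduce_func) and an atomic path (KMP_IDENT_ATOMIC_REDUCE set),
// team_size (8) > cutoff (4), so the tree method with reduction barrier is
// chosen; the same reduction in a team of 2 picks the atomic method instead.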
9068 // this function is for testing set/get/determine reduce method
9069 kmp_int32 __kmp_get_reduce_method(void) {
9070  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9071 }
9072 
9073 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9074 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9075 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9076 
9077 // Hard pause shuts down the runtime completely. Resume happens naturally when
9078 // OpenMP is used subsequently.
9079 void __kmp_hard_pause() {
9080  __kmp_pause_status = kmp_hard_paused;
9081  __kmp_internal_end_thread(-1);
9082 }
9083 
9084 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9085 void __kmp_resume_if_soft_paused() {
9086  if (__kmp_pause_status == kmp_soft_paused) {
9087  __kmp_pause_status = kmp_not_paused;
9088 
9089  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9090  kmp_info_t *thread = __kmp_threads[gtid];
9091  if (thread) { // Wake it if sleeping
9092  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9093  thread);
9094  if (fl.is_sleeping())
9095  fl.resume(gtid);
9096  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9097  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9098  } else { // thread holds the lock and may sleep soon
9099  do { // until either the thread sleeps, or we can get the lock
9100  if (fl.is_sleeping()) {
9101  fl.resume(gtid);
9102  break;
9103  } else if (__kmp_try_suspend_mx(thread)) {
9104  __kmp_unlock_suspend_mx(thread);
9105  break;
9106  }
9107  } while (1);
9108  }
9109  }
9110  }
9111  }
9112 }
9113 
9114 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9115 // TODO: add warning messages
9116 int __kmp_pause_resource(kmp_pause_status_t level) {
9117  if (level == kmp_not_paused) { // requesting resume
9118  if (__kmp_pause_status == kmp_not_paused) {
9119  // error message about runtime not being paused, so can't resume
9120  return 1;
9121  } else {
9122  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9123  __kmp_pause_status == kmp_hard_paused);
9124  __kmp_pause_status = kmp_not_paused;
9125  return 0;
9126  }
9127  } else if (level == kmp_soft_paused) { // requesting soft pause
9128  if (__kmp_pause_status != kmp_not_paused) {
9129  // error message about already being paused
9130  return 1;
9131  } else {
9132  __kmp_soft_pause();
9133  return 0;
9134  }
9135  } else if (level == kmp_hard_paused) { // requesting hard pause
9136  if (__kmp_pause_status != kmp_not_paused) {
9137  // error message about already being paused
9138  return 1;
9139  } else {
9140  __kmp_hard_pause();
9141  return 0;
9142  }
9143  } else {
9144  // error message about invalid level
9145  return 1;
9146  }
9147 }
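
// Hedged usage sketch (standard OpenMP 5.0 API; routing to this function is
// via compiler/runtime entry points elsewhere in the runtime):
//
//   #include <omp.h>
//   // soft pause: threads go to sleep but runtime state is kept; 0 == success
//   int rc = omp_pause_resource(omp_pause_soft, omp_get_initial_device());
//   // hard pause: runtime is torn down and re-initialized lazily on the next
//   // OpenMP construct
//   rc = omp_pause_resource_all(omp_pause_hard);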
9148 
9149 void __kmp_omp_display_env(int verbose) {
9150  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9151  if (__kmp_init_serial == 0)
9152  __kmp_do_serial_initialize();
9153  __kmp_display_env_impl(!verbose, verbose);
9154  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9155 }
9156 
9157 // The team size is changing, so the distributed barrier must be resized
9158 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9159  int new_nthreads) {
9160  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9161  bp_dist_bar);
9162  kmp_info_t **other_threads = team->t.t_threads;
9163 
9164  // We want all the workers to stop waiting on the barrier while we adjust the
9165  // size of the team.
9166  for (int f = 1; f < old_nthreads; ++f) {
9167  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9168  // Ignore threads that are already inactive or not present in the team
9169  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9170  // teams construct causes thread_limit to get passed in, and some of
9171  // those could be inactive; just ignore them
9172  continue;
9173  }
9174  // If thread is transitioning still to in_use state, wait for it
9175  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9176  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9177  KMP_CPU_PAUSE();
9178  }
9179  // The thread should be in_use now
9180  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9181  // Transition to unused state
9182  team->t.t_threads[f]->th.th_used_in_team.store(2);
9183  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9184  }
9185  // Release all the workers
9186  team->t.b->go_release();
9187 
9188  KMP_MFENCE();
9189 
9190  // Workers should see transition status 2 and move to 0; but may need to be
9191  // woken up first
9192  int count = old_nthreads - 1;
9193  while (count > 0) {
9194  count = old_nthreads - 1;
9195  for (int f = 1; f < old_nthreads; ++f) {
9196  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9197  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9198  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9199  void *, other_threads[f]->th.th_sleep_loc);
9200  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9201  }
9202  } else {
9203  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9204  count--;
9205  }
9206  }
9207  }
9208  // Now update the barrier size
9209  team->t.b->update_num_threads(new_nthreads);
9210  team->t.b->go_reset();
9211 }
9212 
9213 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9214  // Add the threads back to the team
9215  KMP_DEBUG_ASSERT(team);
9216  // Threads were paused and pointed at th_used_in_team temporarily during a
9217  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9218  // the thread that it should transition itself back into the team. Then, if
9219  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9220  // to wake it up.
9221  for (int f = 1; f < new_nthreads; ++f) {
9222  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9223  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9224  3);
9225  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9226  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9227  (kmp_flag_32<false, false> *)NULL);
9228  }
9229  }
9230  // The threads should be transitioning to the team; when they are done, they
9231  // should have set th_used_in_team to 1. This loop forces the master thread to
9232  // wait until all threads have moved into the team and are waiting in the barrier.
9233  int count = new_nthreads - 1;
9234  while (count > 0) {
9235  count = new_nthreads - 1;
9236  for (int f = 1; f < new_nthreads; ++f) {
9237  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9238  count--;
9239  }
9240  }
9241  }
9242 }
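// Caller-side sequence, sketched (the actual call sites live in the team
// allocation/release paths elsewhere in this file; only the two routines
// defined above are real names here):
//   __kmp_resize_dist_barrier(team, old_nth, new_nth); // eject workers, resize
//   /* ... reassign team->t.t_threads entries for the new size ... */
//   __kmp_add_threads_to_team(team, new_nth);          // re-admit workers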
9243 
9244 // Globals and functions for hidden helper task
9245 kmp_info_t **__kmp_hidden_helper_threads;
9246 kmp_info_t *__kmp_hidden_helper_main_thread;
9247 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9248 #if KMP_OS_LINUX
9249 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9250 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9251 #else
9252 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9253 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9254 #endif
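// These defaults can normally be overridden through the runtime's settings
// machinery (see kmp_settings.cpp); the environment controls are assumed here
// to be LIBOMP_USE_HIDDEN_HELPER and LIBOMP_NUM_HIDDEN_HELPER_THREADS. Hidden
// helper threads exist to execute "hidden helper" tasks, i.e. deferred target
// tasks such as the following sketch:
#if 0
#include <omp.h>
void hidden_helper_sketch(int *a, int n) {
  // An asynchronous target region generates a hidden helper task that one of
  // the hidden helper threads can pick up and execute.
#pragma omp target nowait map(tofrom : a[0 : n])
  for (int i = 0; i < n; ++i)
    a[i] += 1;
#pragma omp taskwait // wait for the deferred target task to finish
}
#endif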
9255 
9256 namespace {
9257 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9258 
9259 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9260  // This is an explicit synchronization on all hidden helper threads, in case
9261  // a regular thread pushes a hidden helper task to a hidden helper thread
9262  // that has not yet been awakened since being released by the main thread
9263  // after the team was created.
9264  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9265  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9266  __kmp_hidden_helper_threads_num)
9267  ;
9268 
9269  // If main thread, then wait for signal
9270  if (__kmpc_master(nullptr, *gtid)) {
9271  // First, unset the initial state and release the initial thread
9272  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9273  __kmp_hidden_helper_initz_release();
9274  __kmp_hidden_helper_main_thread_wait();
9275  // Now wake up all worker threads
9276  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9277  __kmp_hidden_helper_worker_thread_signal();
9278  }
9279  }
9280 }
9281 } // namespace
9282 
9283 void __kmp_hidden_helper_threads_initz_routine() {
9284  // Create a new root for hidden helper team/threads
9285  const int gtid = __kmp_register_root(TRUE);
9286  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9287  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9288  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9289  __kmp_hidden_helper_threads_num;
9290 
9291  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9292 
9293  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9294 
9295  // Set the initialization flag to FALSE
9296  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9297 
9298  __kmp_hidden_helper_threads_deinitz_release();
9299 }
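// Initialization flow, in brief (a summary of the code above): the hidden
// helper "main" thread registers a fresh root, requests a team of
// __kmp_hidden_helper_threads_num threads, and forks
// __kmp_hidden_helper_wrapper_fn. Inside the wrapper each helper bumps the hit
// counter; the master helper then releases the thread that triggered
// initialization, blocks until work is signaled, and finally wakes the
// remaining helper workers.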
9300 
9301 /* Nesting Mode:
9302  Set via KMP_NESTING_MODE, which takes an integer.
9303  Note: we skip duplicate topology levels, and skip levels with only
9304  one entity.
9305  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9306  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9307  in the topology, and initializes the number of threads at each of those
9308  levels to the number of entities at each level, respectively, below the
9309  entity at the parent level.
9310  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9311  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9312  the user to turn nesting on explicitly. This is an even more experimental
9313  option within an already experimental feature, and may change or go away
9314  in the future.
9315 */
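// Shell usage sketch (KMP_NESTING_MODE as documented above; OMP_MAX_ACTIVE_LEVELS
// is one standard way for the user to turn nesting on in the N>1 case):
//   KMP_NESTING_MODE=1 ./app
//       # one nesting level per distinct topology level, nesting enabled
//   KMP_NESTING_MODE=3 OMP_MAX_ACTIVE_LEVELS=3 ./app
//       # up to 3 levels prepared, but nesting stays off until enabled as here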
9316 
9317 // Allocate space to store nesting levels
9318 void __kmp_init_nesting_mode() {
9319  int levels = KMP_HW_LAST;
9320  __kmp_nesting_mode_nlevels = levels;
9321  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9322  for (int i = 0; i < levels; ++i)
9323  __kmp_nesting_nth_level[i] = 0;
9324  if (__kmp_nested_nth.size < levels) {
9325  __kmp_nested_nth.nth =
9326  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9327  __kmp_nested_nth.size = levels;
9328  }
9329 }
9330 
9331  // Set # threads for top levels of nesting; must be called after the topology is set
9332 void __kmp_set_nesting_mode_threads() {
9333  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9334 
9335  if (__kmp_nesting_mode == 1)
9336  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9337  else if (__kmp_nesting_mode > 1)
9338  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9339 
9340  if (__kmp_topology) { // use topology info
9341  int loc, hw_level;
9342  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9343  loc < __kmp_nesting_mode_nlevels;
9344  loc++, hw_level++) {
9345  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9346  if (__kmp_nesting_nth_level[loc] == 1)
9347  loc--;
9348  }
9349  // Make sure all cores are used
9350  if (__kmp_nesting_mode > 1 && loc > 1) {
9351  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9352  int num_cores = __kmp_topology->get_count(core_level);
9353  int upper_levels = 1;
9354  for (int level = 0; level < loc - 1; ++level)
9355  upper_levels *= __kmp_nesting_nth_level[level];
9356  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9357  __kmp_nesting_nth_level[loc - 1] =
9358  num_cores / __kmp_nesting_nth_level[loc - 2];
9359  }
9360  __kmp_nesting_mode_nlevels = loc;
9361  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9362  } else { // no topology info available; provide a reasonable guesstimation
9363  if (__kmp_avail_proc >= 4) {
9364  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9365  __kmp_nesting_nth_level[1] = 2;
9366  __kmp_nesting_mode_nlevels = 2;
9367  } else {
9368  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9369  __kmp_nesting_mode_nlevels = 1;
9370  }
9371  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9372  }
9373  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9374  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9375  }
9376  set__nproc(thread, __kmp_nesting_nth_level[0]);
9377  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9378  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9379  if (get__max_active_levels(thread) > 1) {
9380  // if max levels was set, set nesting mode levels to same
9381  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9382  }
9383  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9384  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9385 }
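// Worked example (a sketch, assuming the topology is detected as 2 sockets x
// 8 cores x 2 hardware threads and no level is folded away): the get_ratio()
// walk above yields __kmp_nesting_nth_level = {2, 8, 2} and
// __kmp_nesting_mode_nlevels = 3; any level whose ratio is 1 would instead be
// skipped via the loc-- adjustment.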
9386 
9387 // Empty symbols to export (see exports_so.txt) when feature is disabled
9388 extern "C" {
9389 #if !KMP_STATS_ENABLED
9390 void __kmp_reset_stats() {}
9391 #endif
9392 #if !USE_DEBUGGER
9393 int __kmp_omp_debug_struct_info = FALSE;
9394 int __kmp_debugging = FALSE;
9395 #endif
9396 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9397 void __kmp_itt_fini_ittlib() {}
9398 void __kmp_itt_init_ittlib() {}
9399 #endif
9400 }
9401 
9402 // end of file