1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 // Distributed barrier
44 
45 // Compute how many threads to have polling each cache-line.
46 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47 void distributedBarrier::computeVarsForN(size_t n) {
48  int nsockets = 1;
49  if (__kmp_topology) {
50  int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52  int ncores_per_socket =
53  __kmp_topology->calculate_ratio(core_level, socket_level);
54  nsockets = __kmp_topology->get_count(socket_level);
55 
56  if (nsockets <= 0)
57  nsockets = 1;
58  if (ncores_per_socket <= 0)
59  ncores_per_socket = 1;
60 
61  threads_per_go = ncores_per_socket >> 1;
62  if (!fix_threads_per_go) {
63  // Minimize num_gos
64  if (threads_per_go > 4) {
65  if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66  threads_per_go = threads_per_go >> 1;
67  }
68  if (threads_per_go > 4 && nsockets == 1)
69  threads_per_go = threads_per_go >> 1;
70  }
71  }
72  if (threads_per_go == 0)
73  threads_per_go = 1;
74  fix_threads_per_go = true;
75  num_gos = n / threads_per_go;
76  if (n % threads_per_go)
77  num_gos++;
78  if (nsockets == 1 || num_gos == 1)
79  num_groups = 1;
80  else {
81  num_groups = num_gos / nsockets;
82  if (num_gos % nsockets)
83  num_groups++;
84  }
85  if (num_groups <= 0)
86  num_groups = 1;
87  gos_per_group = num_gos / num_groups;
88  if (num_gos % num_groups)
89  gos_per_group++;
90  threads_per_group = threads_per_go * gos_per_group;
91  } else {
92  num_gos = n / threads_per_go;
93  if (n % threads_per_go)
94  num_gos++;
95  if (num_gos == 1)
96  num_groups = 1;
97  else {
98  num_groups = num_gos / 2;
99  if (num_gos % 2)
100  num_groups++;
101  }
102  gos_per_group = num_gos / num_groups;
103  if (num_gos % num_groups)
104  gos_per_group++;
105  threads_per_group = threads_per_go * gos_per_group;
106  }
107 }
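// Editorial worked example (values assumed for illustration): on a hypothetical
// 2-socket machine with 16 cores per socket and n = 48 threads, and with the
// KMP_OPTIMIZE_FOR_REDUCTIONS halving not applied, the code above yields
// threads_per_go = 16 >> 1 = 8, num_gos = ceil(48/8) = 6, num_groups =
// ceil(6/2) = 3, gos_per_group = ceil(6/3) = 2, and threads_per_group = 16,
// i.e. the 48 threads form 3 groups of 16 and each group polls two "go"
// cache lines.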
108 
109 void distributedBarrier::computeGo(size_t n) {
110  // Minimize num_gos
111  for (num_gos = 1;; num_gos++)
112  if (IDEAL_CONTENTION * num_gos >= n)
113  break;
114  threads_per_go = n / num_gos;
115  if (n % num_gos)
116  threads_per_go++;
117  while (num_gos > MAX_GOS) {
118  threads_per_go++;
119  num_gos = n / threads_per_go;
120  if (n % threads_per_go)
121  num_gos++;
122  }
123  computeVarsForN(n);
124 }
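// Editorial worked example (the constants live in kmp_barrier.h; the values
// used here are assumptions for illustration): if IDEAL_CONTENTION were 16 and
// n = 100, the loop above picks the smallest num_gos with 16 * num_gos >= 100,
// i.e. num_gos = 7, giving threads_per_go = ceil(100/7) = 15. The while loop
// only kicks in when that initial num_gos exceeds MAX_GOS, trading more
// pollers per cache line for fewer lines to write.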
125 
126 // This function resizes the barrier arrays when the new number of threads
127 // exceeds max_threads, which is the current size of all the arrays
128 void distributedBarrier::resize(size_t nthr) {
129  KMP_DEBUG_ASSERT(nthr > max_threads);
130 
131  // expand to requested size * 2
132  max_threads = nthr * 2;
133 
134  // allocate arrays to new max threads
135  for (int i = 0; i < MAX_ITERS; ++i) {
136  if (flags[i])
137  flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138  max_threads * sizeof(flags_s));
139  else
140  flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141  }
142 
143  if (go)
144  go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145  else
146  go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147 
148  if (iter)
149  iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150  else
151  iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152 
153  if (sleep)
154  sleep =
155  (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156  else
157  sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158 }
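// Editorial note: growing to nthr * 2 rather than exactly nthr amortizes the
// cost of repeated team growth; since the capacity at least doubles on every
// resize, a long sequence of small size increases triggers only O(log N)
// reallocations.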
159 
160 // This function sets all the go flags that threads might be waiting on;
161 // when blocktime is not infinite, it should be followed by a wake-up call
162 // to each thread
163 kmp_uint64 distributedBarrier::go_release() {
164  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165  for (size_t j = 0; j < num_gos; j++) {
166  go[j].go.store(next_go);
167  }
168  return next_go;
169 }
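// Editorial sketch (standard C++ only; not part of the runtime, names are
// hypothetical): the go/iter protocol above in miniature. A waiter spins until
// its "go" word reaches its current iteration plus MAX_ITERS, then advances
// its iteration modulo MAX_ITERS; because the published value cycles through
// MAX_ITERS distinct values, the same words can be reused across barrier
// episodes without being reset.
#if 0
#include <atomic>
#include <cstddef>
#include <cstdint>

static const uint64_t kMaxIters = 3; // placeholder for MAX_ITERS (value assumed)

// Waiter side: block until the polled "go" word reaches my_iter + kMaxIters.
static void wait_on_go(std::atomic<uint64_t> &go, uint64_t &my_iter) {
  const uint64_t next_go = my_iter + kMaxIters;
  while (go.load(std::memory_order_acquire) != next_go) {
    // spin (the runtime may also sleep or execute tasks here)
  }
  my_iter = (my_iter + 1) % kMaxIters; // reuse the word in the next episode
}

// Releaser side: publish next_go to every "go" word the waiters poll.
static void release_all(std::atomic<uint64_t> *go, size_t num_gos,
                        uint64_t leader_iter) {
  const uint64_t next_go = leader_iter + kMaxIters;
  for (size_t j = 0; j < num_gos; ++j)
    go[j].store(next_go, std::memory_order_release);
}
#endif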
170 
171 void distributedBarrier::go_reset() {
172  for (size_t j = 0; j < max_threads; ++j) {
173  for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174  flags[i][j].stillNeed = 1;
175  }
176  go[j].go.store(0);
177  iter[j].iter = 0;
178  }
179 }
180 
181 // This function initializes/re-initializes the distributed barrier for a
182 // particular number of threads, calling resize() if the arrays need to grow.
183 void distributedBarrier::init(size_t nthr) {
184  size_t old_max = max_threads;
185  if (nthr > max_threads) { // need more space in arrays
186  resize(nthr);
187  }
188 
189  for (size_t i = 0; i < max_threads; i++) {
190  for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191  flags[j][i].stillNeed = 1;
192  }
193  go[i].go.store(0);
194  iter[i].iter = 0;
195  if (i >= old_max)
196  sleep[i].sleep = false;
197  }
198 
199  // Recalculate num_gos, etc. based on new nthr
200  computeVarsForN(nthr);
201 
202  num_threads = nthr;
203 
204  if (team_icvs == NULL)
205  team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206 }
207 
208 // This function is used only when KMP_BLOCKTIME is not infinite.
209 // static
210 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211  size_t start, size_t stop, size_t inc,
212  size_t tid) {
213  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215  return;
216 
217  kmp_info_t **other_threads = team->t.t_threads;
218  for (size_t thr = start; thr < stop; thr += inc) {
219  KMP_DEBUG_ASSERT(other_threads[thr]);
220  int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221  // Wake up the worker regardless of whether it appears to be sleeping or not
222  __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223  }
224 }
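// Editorial note: the release code below calls this helper with
// (start, stop, inc) = (tid + 1, group_end, 1) to wake the rest of a group and
// with (tid + threads_per_group, nproc, threads_per_group) to wake the other
// group leaders, so the three parameters simply describe a strided slice of
// the team.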
225 
226 static void __kmp_dist_barrier_gather(
227  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
228  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
229  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230  kmp_team_t *team;
231  distributedBarrier *b;
232  kmp_info_t **other_threads;
233  kmp_uint64 my_current_iter, my_next_iter;
234  kmp_uint32 nproc;
235  bool group_leader;
236 
237  team = this_thr->th.th_team;
238  nproc = this_thr->th.th_team_nproc;
239  other_threads = team->t.t_threads;
240  b = team->t.b;
241  my_current_iter = b->iter[tid].iter;
242  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
243  group_leader = ((tid % b->threads_per_group) == 0);
244 
245  KA_TRACE(20,
246  ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247  gtid, team->t.t_id, tid, bt));
248 
249 #if USE_ITT_BUILD && USE_ITT_NOTIFY
250  // Barrier imbalance - save arrive time to the thread
251  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
252  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253  __itt_get_timestamp();
254  }
255 #endif
256 
257  if (group_leader) {
258  // Start from the thread after the group leader
259  size_t group_start = tid + 1;
260  size_t group_end = tid + b->threads_per_group;
261  size_t threads_pending = 0;
262 
263  if (group_end > nproc)
264  group_end = nproc;
265  do { // wait for threads in my group
266  threads_pending = 0;
267  // Check all the flags every time to avoid branch mispredicts
268  for (size_t thr = group_start; thr < group_end; thr++) {
269  // Each thread uses a different cache line
270  threads_pending += b->flags[my_current_iter][thr].stillNeed;
271  }
272  // Execute tasks here
273  if (__kmp_tasking_mode != tskm_immediate_exec) {
274  kmp_task_team_t *task_team = this_thr->th.th_task_team;
275  if (task_team != NULL) {
276  if (TCR_SYNC_4(task_team->tt.tt_active)) {
277  if (KMP_TASKING_ENABLED(task_team)) {
278  int tasks_completed = FALSE;
279  __kmp_atomic_execute_tasks_64(
280  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
281  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282  } else
283  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
284  }
285  } else {
286  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287  } // if
288  }
289  if (TCR_4(__kmp_global.g.g_done)) {
290  if (__kmp_global.g.g_abort)
291  __kmp_abort_thread();
292  break;
293  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
294  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
296  }
297  } while (threads_pending > 0);
298 
299  if (reduce) { // Perform reduction if needed
300  OMPT_REDUCTION_DECL(this_thr, gtid);
301  OMPT_REDUCTION_BEGIN;
302  // Group leader reduces all threads in group
303  for (size_t thr = group_start; thr < group_end; thr++) {
304  (*reduce)(this_thr->th.th_local.reduce_data,
305  other_threads[thr]->th.th_local.reduce_data);
306  }
307  OMPT_REDUCTION_END;
308  }
309 
310  // Set flag for next iteration
311  b->flags[my_next_iter][tid].stillNeed = 1;
312  // Each thread uses a different cache line; setting stillNeed to 0
313  // indicates that this thread has reached the barrier
314  b->flags[my_current_iter][tid].stillNeed = 0;
315 
316  do { // wait for all group leaders
317  threads_pending = 0;
318  for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
319  threads_pending += b->flags[my_current_iter][thr].stillNeed;
320  }
321  // Execute tasks here
322  if (__kmp_tasking_mode != tskm_immediate_exec) {
323  kmp_task_team_t *task_team = this_thr->th.th_task_team;
324  if (task_team != NULL) {
325  if (TCR_SYNC_4(task_team->tt.tt_active)) {
326  if (KMP_TASKING_ENABLED(task_team)) {
327  int tasks_completed = FALSE;
328  __kmp_atomic_execute_tasks_64(
329  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
330  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331  } else
332  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
333  }
334  } else {
335  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336  } // if
337  }
338  if (TCR_4(__kmp_global.g.g_done)) {
339  if (__kmp_global.g.g_abort)
340  __kmp_abort_thread();
341  break;
342  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
343  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
345  }
346  } while (threads_pending > 0);
347 
348  if (reduce) { // Perform reduction if needed
349  if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
350  OMPT_REDUCTION_DECL(this_thr, gtid);
351  OMPT_REDUCTION_BEGIN;
352  for (size_t thr = b->threads_per_group; thr < nproc;
353  thr += b->threads_per_group) {
354  (*reduce)(this_thr->th.th_local.reduce_data,
355  other_threads[thr]->th.th_local.reduce_data);
356  }
357  OMPT_REDUCTION_END;
358  }
359  }
360  } else {
361  // Set flag for next iteration
362  b->flags[my_next_iter][tid].stillNeed = 1;
363  // Each thread uses a different cache line; setting stillNeed to 0
364  // indicates that this thread has reached the barrier
365  b->flags[my_current_iter][tid].stillNeed = 0;
366  }
367 
368  KMP_MFENCE();
369 
370  KA_TRACE(20,
371  ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372  gtid, team->t.t_id, tid, bt));
373 }
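// Editorial sketch (standard C++ only; not part of the runtime, names are
// hypothetical): the two-level gather above in miniature for a single barrier
// episode. Workers clear their per-thread flag; each group leader first polls
// the flags of its own group, then announces its group by clearing its own
// flag, and finally polls the flags of all group leaders. Task execution,
// reductions, and the next-iteration flags are omitted.
#if 0
#include <algorithm>
#include <atomic>
#include <cstddef>

// still_need[t] starts at 1 for every thread in this episode.
static void mini_dist_gather(std::atomic<int> *still_need, size_t nproc,
                             size_t threads_per_group, size_t tid) {
  if (tid % threads_per_group == 0) { // group leader
    size_t group_end = std::min(tid + threads_per_group, nproc);
    for (size_t t = tid + 1; t < group_end; ++t) // 1) my group arrives
      while (still_need[t].load(std::memory_order_acquire)) { /* spin */ }
    still_need[tid].store(0, std::memory_order_release); // 2) announce my group
    for (size_t t = 0; t < nproc; t += threads_per_group) // 3) all leaders
      while (still_need[t].load(std::memory_order_acquire)) { /* spin */ }
  } else {
    still_need[tid].store(0, std::memory_order_release); // worker has arrived
  }
}
#endif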
374 
375 static void __kmp_dist_barrier_release(
376  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
377  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379  kmp_team_t *team;
380  distributedBarrier *b;
381  kmp_bstate_t *thr_bar;
382  kmp_uint64 my_current_iter, next_go;
383  size_t my_go_index;
384  bool group_leader;
385 
386  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387  gtid, tid, bt));
388 
389  thr_bar = &this_thr->th.th_bar[bt].bb;
390 
391  if (!KMP_MASTER_TID(tid)) {
392  // workers and non-master group leaders need to check their presence in team
393  do {
394  if (this_thr->th.th_used_in_team.load() != 1 &&
395  this_thr->th.th_used_in_team.load() != 3) {
396  // Thread is not in use in a team. Wait on the location in this thread's
397  // struct. The 0 value tells anyone looking that this thread is spinning
398  // or sleeping until this location becomes 3 again; 3 is the transition
399  // state on the way to 1, which means waiting on go and being in the team
400  kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
401  if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
402  0) ||
403  this_thr->th.th_used_in_team.load() == 0) {
404  my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405  }
406 #if USE_ITT_BUILD && USE_ITT_NOTIFY
407  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
408  // In fork barrier where we could not get the object reliably
409  itt_sync_obj =
410  __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
411  // Cancel wait on previous parallel region...
412  __kmp_itt_task_starting(itt_sync_obj);
413 
414  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415  return;
416 
417  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418  if (itt_sync_obj != NULL)
419  // Call prepare as early as possible for "new" barrier
420  __kmp_itt_task_finished(itt_sync_obj);
421  } else
422 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424  return;
425  }
426  if (this_thr->th.th_used_in_team.load() != 1 &&
427  this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
428  continue;
429  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430  return;
431 
432  // At this point, the thread thinks it is in use in a team, or in
433  // transition to be used in a team, but it might have reached this barrier
434  // before it was marked unused by the team. Unused threads are awoken and
435  // shifted to wait on local thread struct elsewhere. It also might reach
436  // this point by being picked up for use by a different team. Either way,
437  // we need to update the tid.
438  tid = __kmp_tid_from_gtid(gtid);
439  team = this_thr->th.th_team;
440  KMP_DEBUG_ASSERT(tid >= 0);
441  KMP_DEBUG_ASSERT(team);
442  b = team->t.b;
443  my_current_iter = b->iter[tid].iter;
444  next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445  my_go_index = tid / b->threads_per_go;
446  if (this_thr->th.th_used_in_team.load() == 3) {
447  KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
448  }
449  // Check if go flag is set
450  if (b->go[my_go_index].go.load() != next_go) {
451  // Wait on go flag on team
452  kmp_atomic_flag_64<false, true> my_flag(
453  &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
454  my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
455  KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
456  b->iter[tid].iter == 0);
457  KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
458  }
459 
460  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
461  return;
462  // At this point, the thread's go location was set. This means the primary
463  // thread is safely in the barrier, and so this thread's data is
464  // up-to-date, but we should check again that this thread is really in
465  // use in the team, as it could have been woken up for the purpose of
466  // changing team size, or reaping threads at shutdown.
467  if (this_thr->th.th_used_in_team.load() == 1)
468  break;
469  } while (1);
470 
471  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
472  return;
473 
474  group_leader = ((tid % b->threads_per_group) == 0);
475  if (group_leader) {
476  // Tell all the threads in my group they can go!
477  for (size_t go_idx = my_go_index + 1;
478  go_idx < my_go_index + b->gos_per_group; go_idx++) {
479  b->go[go_idx].go.store(next_go);
480  }
481  // Fence added so that workers can see changes to go. sfence inadequate.
482  KMP_MFENCE();
483  }
484 
485 #if KMP_BARRIER_ICV_PUSH
486  if (propagate_icvs) { // copy ICVs to final dest
487  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
488  tid, FALSE);
489  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
490  (kmp_internal_control_t *)team->t.b->team_icvs);
491  copy_icvs(&thr_bar->th_fixed_icvs,
492  &team->t.t_implicit_task_taskdata[tid].td_icvs);
493  }
494 #endif
495  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
496  // This thread is now awake and participating in the barrier;
497  // wake up the other threads in the group
498  size_t nproc = this_thr->th.th_team_nproc;
499  size_t group_end = tid + b->threads_per_group;
500  if (nproc < group_end)
501  group_end = nproc;
502  __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
503  }
504  } else { // Primary thread
505  team = this_thr->th.th_team;
506  b = team->t.b;
507  my_current_iter = b->iter[tid].iter;
508  next_go = my_current_iter + distributedBarrier::MAX_ITERS;
509 #if KMP_BARRIER_ICV_PUSH
510  if (propagate_icvs) {
511  // primary thread has ICVs in final destination; copy
512  copy_icvs(&thr_bar->th_fixed_icvs,
513  &team->t.t_implicit_task_taskdata[tid].td_icvs);
514  }
515 #endif
516  // Tell all the group leaders they can go!
517  for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
518  b->go[go_idx].go.store(next_go);
519  }
520 
521  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
522  // Wake-up the group leaders
523  size_t nproc = this_thr->th.th_team_nproc;
524  __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
525  b->threads_per_group, tid);
526  }
527 
528  // Tell all the threads in my group they can go!
529  for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
530  b->go[go_idx].go.store(next_go);
531  }
532 
533  // Fence added so that workers can see changes to go. sfence inadequate.
534  KMP_MFENCE();
535 
536  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
537  // Wake-up the other threads in my group
538  size_t nproc = this_thr->th.th_team_nproc;
539  size_t group_end = tid + b->threads_per_group;
540  if (nproc < group_end)
541  group_end = nproc;
542  __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
543  }
544  }
545  // Update to next iteration
546  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
547  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
548 
549  KA_TRACE(
550  20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
551  gtid, team->t.t_id, tid, bt));
552 }
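// Editorial note: release fans out in two levels mirroring the gather -- the
// primary thread stores next_go into every group leader's "go" slot and into
// the slots of its own group, each group leader then stores next_go for the
// rest of its group, and when blocktime is finite each stage is followed by an
// explicit wakeup of the threads it just released.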
553 
554 // Linear Barrier
555 template <bool cancellable = false>
556 static bool __kmp_linear_barrier_gather_template(
557  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
558  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
559  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
560  kmp_team_t *team = this_thr->th.th_team;
561  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
562  kmp_info_t **other_threads = team->t.t_threads;
563 
564  KA_TRACE(
565  20,
566  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
567  gtid, team->t.t_id, tid, bt));
568  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
569 
570 #if USE_ITT_BUILD && USE_ITT_NOTIFY
571  // Barrier imbalance - save arrive time to the thread
572  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
573  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
574  __itt_get_timestamp();
575  }
576 #endif
577  // We now perform a linear reduction to signal that all of the threads have
578  // arrived.
579  if (!KMP_MASTER_TID(tid)) {
580  KA_TRACE(20,
581  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
582  "arrived(%p): %llu => %llu\n",
583  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
584  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
585  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
586  // Mark arrival to primary thread
587  /* After performing this write, a worker thread may not assume that the team
588  is valid any more - it could be deallocated by the primary thread at any
589  time. */
590  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
591  flag.release();
592  } else {
593  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
594  int nproc = this_thr->th.th_team_nproc;
595  int i;
596  // Don't have to worry about sleep bit here or atomic since team setting
597  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
598 
599  // Collect all the worker team member threads.
600  for (i = 1; i < nproc; ++i) {
601 #if KMP_CACHE_MANAGE
602  // Prefetch next thread's arrived count
603  if (i + 1 < nproc)
604  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
605 #endif /* KMP_CACHE_MANAGE */
606  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
607  "arrived(%p) == %llu\n",
608  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
609  team->t.t_id, i,
610  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
611 
612  // Wait for worker thread to arrive
613  if (cancellable) {
614  kmp_flag_64<true, false> flag(
615  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
616  if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
617  return true;
618  } else {
619  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
620  new_state);
621  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
622  }
623 #if USE_ITT_BUILD && USE_ITT_NOTIFY
624  // Barrier imbalance - write min of the thread time and the other thread
625  // time to the thread.
626  if (__kmp_forkjoin_frames_mode == 2) {
627  this_thr->th.th_bar_min_time = KMP_MIN(
628  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
629  }
630 #endif
631  if (reduce) {
632  KA_TRACE(100,
633  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
634  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635  team->t.t_id, i));
636  OMPT_REDUCTION_DECL(this_thr, gtid);
637  OMPT_REDUCTION_BEGIN;
638  (*reduce)(this_thr->th.th_local.reduce_data,
639  other_threads[i]->th.th_local.reduce_data);
640  OMPT_REDUCTION_END;
641  }
642  }
643  // Don't have to worry about sleep bit here or atomic since team setting
644  team_bar->b_arrived = new_state;
645  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
646  "arrived(%p) = %llu\n",
647  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
648  new_state));
649  }
650  KA_TRACE(
651  20,
652  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
653  gtid, team->t.t_id, tid, bt));
654  return false;
655 }
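// Editorial sketch (standard C++ only; not part of the runtime, names are
// hypothetical): the linear gather in miniature. Each worker publishes the new
// epoch in its own arrived slot and the primary thread polls every slot in
// turn, which is O(nproc) work on the primary but needs only one word per
// thread.
#if 0
#include <atomic>
#include <cstddef>
#include <cstdint>

static void mini_linear_gather(std::atomic<uint64_t> *arrived, size_t nproc,
                               size_t tid, uint64_t new_epoch) {
  if (tid != 0) {
    arrived[tid].store(new_epoch, std::memory_order_release); // signal primary
  } else {
    for (size_t i = 1; i < nproc; ++i) // primary waits for every worker
      while (arrived[i].load(std::memory_order_acquire) != new_epoch) {
        // spin (the runtime may also sleep, execute tasks, or run reductions)
      }
  }
}
#endif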
656 
657 template <bool cancellable = false>
658 static bool __kmp_linear_barrier_release_template(
659  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
660  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
661  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
662  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
663  kmp_team_t *team;
664 
665  if (KMP_MASTER_TID(tid)) {
666  unsigned int i;
667  kmp_uint32 nproc = this_thr->th.th_team_nproc;
668  kmp_info_t **other_threads;
669 
670  team = __kmp_threads[gtid]->th.th_team;
671  KMP_DEBUG_ASSERT(team != NULL);
672  other_threads = team->t.t_threads;
673 
674  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
675  "barrier type %d\n",
676  gtid, team->t.t_id, tid, bt));
677 
678  if (nproc > 1) {
679 #if KMP_BARRIER_ICV_PUSH
680  {
681  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
682  if (propagate_icvs) {
683  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
684  for (i = 1; i < nproc; ++i) {
685  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
686  team, i, FALSE);
687  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
688  &team->t.t_implicit_task_taskdata[0].td_icvs);
689  }
690  ngo_sync();
691  }
692  }
693 #endif // KMP_BARRIER_ICV_PUSH
694 
695  // Now, release all of the worker threads
696  for (i = 1; i < nproc; ++i) {
697 #if KMP_CACHE_MANAGE
698  // Prefetch next thread's go flag
699  if (i + 1 < nproc)
700  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
701 #endif /* KMP_CACHE_MANAGE */
702  KA_TRACE(
703  20,
704  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
705  "go(%p): %u => %u\n",
706  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
707  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
708  other_threads[i]->th.th_bar[bt].bb.b_go,
709  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
710  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
711  other_threads[i]);
712  flag.release();
713  }
714  }
715  } else { // Wait for the PRIMARY thread to release us
716  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
717  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
718  if (cancellable) {
719  kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
720  if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
721  return true;
722  } else {
723  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
724  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
725  }
726 #if USE_ITT_BUILD && USE_ITT_NOTIFY
727  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
728  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
729  // disabled)
730  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
731  // Cancel wait on previous parallel region...
732  __kmp_itt_task_starting(itt_sync_obj);
733 
734  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
735  return false;
736 
737  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
738  if (itt_sync_obj != NULL)
739  // Call prepare as early as possible for "new" barrier
740  __kmp_itt_task_finished(itt_sync_obj);
741  } else
742 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
743  // Early exit for reaping threads releasing forkjoin barrier
744  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
745  return false;
746 // The worker thread may now assume that the team is valid.
747 #ifdef KMP_DEBUG
748  tid = __kmp_tid_from_gtid(gtid);
749  team = __kmp_threads[gtid]->th.th_team;
750 #endif
751  KMP_DEBUG_ASSERT(team != NULL);
752  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
753  KA_TRACE(20,
754  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
755  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
756  KMP_MB(); // Flush all pending memory write invalidates.
757  }
758  KA_TRACE(
759  20,
760  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
761  gtid, team->t.t_id, tid, bt));
762  return false;
763 }
764 
765 static void __kmp_linear_barrier_gather(
766  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
767  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
768  __kmp_linear_barrier_gather_template<false>(
769  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
770 }
771 
772 static bool __kmp_linear_barrier_gather_cancellable(
773  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
774  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
775  return __kmp_linear_barrier_gather_template<true>(
776  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
777 }
778 
779 static void __kmp_linear_barrier_release(
780  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
781  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
782  __kmp_linear_barrier_release_template<false>(
783  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
784 }
785 
786 static bool __kmp_linear_barrier_release_cancellable(
787  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
788  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
789  return __kmp_linear_barrier_release_template<true>(
790  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
791 }
792 
793 // Tree barrier
794 static void __kmp_tree_barrier_gather(
795  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
796  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
797  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
798  kmp_team_t *team = this_thr->th.th_team;
799  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
800  kmp_info_t **other_threads = team->t.t_threads;
801  kmp_uint32 nproc = this_thr->th.th_team_nproc;
802  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
803  kmp_uint32 branch_factor = 1 << branch_bits;
804  kmp_uint32 child;
805  kmp_uint32 child_tid;
806  kmp_uint64 new_state = 0;
807 
808  KA_TRACE(
809  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
810  gtid, team->t.t_id, tid, bt));
811  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
812 
813 #if USE_ITT_BUILD && USE_ITT_NOTIFY
814  // Barrier imbalance - save arrive time to the thread
815  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
816  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
817  __itt_get_timestamp();
818  }
819 #endif
820  // Perform tree gather to wait until all threads have arrived; reduce any
821  // required data as we go
822  child_tid = (tid << branch_bits) + 1;
823  if (child_tid < nproc) {
824  // Parent threads wait for all their children to arrive
825  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
826  child = 1;
827  do {
828  kmp_info_t *child_thr = other_threads[child_tid];
829  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
830 #if KMP_CACHE_MANAGE
831  // Prefetch next thread's arrived count
832  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
833  KMP_CACHE_PREFETCH(
834  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
835 #endif /* KMP_CACHE_MANAGE */
836  KA_TRACE(20,
837  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
838  "arrived(%p) == %llu\n",
839  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
840  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
841  // Wait for child to arrive
842  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
843  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
844 #if USE_ITT_BUILD && USE_ITT_NOTIFY
845  // Barrier imbalance - write min of the thread time and a child time to
846  // the thread.
847  if (__kmp_forkjoin_frames_mode == 2) {
848  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
849  child_thr->th.th_bar_min_time);
850  }
851 #endif
852  if (reduce) {
853  KA_TRACE(100,
854  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
855  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
856  team->t.t_id, child_tid));
857  OMPT_REDUCTION_DECL(this_thr, gtid);
858  OMPT_REDUCTION_BEGIN;
859  (*reduce)(this_thr->th.th_local.reduce_data,
860  child_thr->th.th_local.reduce_data);
861  OMPT_REDUCTION_END;
862  }
863  child++;
864  child_tid++;
865  } while (child <= branch_factor && child_tid < nproc);
866  }
867 
868  if (!KMP_MASTER_TID(tid)) { // Worker threads
869  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
870 
871  KA_TRACE(20,
872  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
873  "arrived(%p): %llu => %llu\n",
874  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
875  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
876  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
877 
878  // Mark arrival to parent thread
879  /* After performing this write, a worker thread may not assume that the team
880  is valid any more - it could be deallocated by the primary thread at any
881  time. */
882  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
883  flag.release();
884  } else {
885  // Need to update the team arrived pointer if we are the primary thread
886  if (nproc > 1) // New value was already computed above
887  team->t.t_bar[bt].b_arrived = new_state;
888  else
889  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
890  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
891  "arrived(%p) = %llu\n",
892  gtid, team->t.t_id, tid, team->t.t_id,
893  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
894  }
895  KA_TRACE(20,
896  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
897  gtid, team->t.t_id, tid, bt));
898 }
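// Editorial sketch (hypothetical helpers, standard C++ only): the implicit
// tree used by the gather above. With branch_bits = 2 (branch_factor = 4),
// thread 0 has children 1..4, thread 1 has children 5..8, and the parent of
// thread 6 is (6 - 1) >> 2 = 1.
#if 0
#include <cstdint>

static uint32_t first_child(uint32_t tid, uint32_t branch_bits) {
  // children are first_child .. first_child + branch_factor - 1 (if they exist)
  return (tid << branch_bits) + 1;
}

static uint32_t parent_of(uint32_t tid, uint32_t branch_bits) {
  // undefined for tid == 0, which is the root (primary thread)
  return (tid - 1) >> branch_bits;
}
#endif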
899 
900 static void __kmp_tree_barrier_release(
901  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
902  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
903  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
904  kmp_team_t *team;
905  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
906  kmp_uint32 nproc;
907  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
908  kmp_uint32 branch_factor = 1 << branch_bits;
909  kmp_uint32 child;
910  kmp_uint32 child_tid;
911 
912  // Perform a tree release for all of the threads that have been gathered
913  if (!KMP_MASTER_TID(
914  tid)) { // Handle fork barrier workers who aren't part of a team yet
915  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
916  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
917  // Wait for parent thread to release us
918  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
919  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
920 #if USE_ITT_BUILD && USE_ITT_NOTIFY
921  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
922  // In fork barrier where we could not get the object reliably (or
923  // ITTNOTIFY is disabled)
924  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
925  // Cancel wait on previous parallel region...
926  __kmp_itt_task_starting(itt_sync_obj);
927 
928  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
929  return;
930 
931  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
932  if (itt_sync_obj != NULL)
933  // Call prepare as early as possible for "new" barrier
934  __kmp_itt_task_finished(itt_sync_obj);
935  } else
936 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
937  // Early exit for reaping threads releasing forkjoin barrier
938  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
939  return;
940 
941  // The worker thread may now assume that the team is valid.
942  team = __kmp_threads[gtid]->th.th_team;
943  KMP_DEBUG_ASSERT(team != NULL);
944  tid = __kmp_tid_from_gtid(gtid);
945 
946  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
947  KA_TRACE(20,
948  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
949  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
950  KMP_MB(); // Flush all pending memory write invalidates.
951  } else {
952  team = __kmp_threads[gtid]->th.th_team;
953  KMP_DEBUG_ASSERT(team != NULL);
954  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
955  "barrier type %d\n",
956  gtid, team->t.t_id, tid, bt));
957  }
958  nproc = this_thr->th.th_team_nproc;
959  child_tid = (tid << branch_bits) + 1;
960 
961  if (child_tid < nproc) {
962  kmp_info_t **other_threads = team->t.t_threads;
963  child = 1;
964  // Parent threads release all their children
965  do {
966  kmp_info_t *child_thr = other_threads[child_tid];
967  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
968 #if KMP_CACHE_MANAGE
969  // Prefetch next thread's go count
970  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
971  KMP_CACHE_PREFETCH(
972  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
973 #endif /* KMP_CACHE_MANAGE */
974 
975 #if KMP_BARRIER_ICV_PUSH
976  {
977  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
978  if (propagate_icvs) {
979  __kmp_init_implicit_task(team->t.t_ident,
980  team->t.t_threads[child_tid], team,
981  child_tid, FALSE);
982  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
983  &team->t.t_implicit_task_taskdata[0].td_icvs);
984  }
985  }
986 #endif // KMP_BARRIER_ICV_PUSH
987  KA_TRACE(20,
988  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
989  "go(%p): %u => %u\n",
990  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
991  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
992  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
993  // Release child from barrier
994  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
995  flag.release();
996  child++;
997  child_tid++;
998  } while (child <= branch_factor && child_tid < nproc);
999  }
1000  KA_TRACE(
1001  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1002  gtid, team->t.t_id, tid, bt));
1003 }
1004 
1005 // Hyper Barrier
1006 static void __kmp_hyper_barrier_gather(
1007  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1008  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1009  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1010  kmp_team_t *team = this_thr->th.th_team;
1011  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1012  kmp_info_t **other_threads = team->t.t_threads;
1013  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1014  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1015  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1016  kmp_uint32 branch_factor = 1 << branch_bits;
1017  kmp_uint32 offset;
1018  kmp_uint32 level;
1019 
1020  KA_TRACE(
1021  20,
1022  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1023  gtid, team->t.t_id, tid, bt));
1024  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1025 
1026 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1027  // Barrier imbalance - save arrive time to the thread
1028  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1029  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1030  __itt_get_timestamp();
1031  }
1032 #endif
1033  /* Perform a hypercube-embedded tree gather to wait until all of the threads
1034  have arrived, and reduce any required data as we go. */
1035  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1036  for (level = 0, offset = 1; offset < num_threads;
1037  level += branch_bits, offset <<= branch_bits) {
1038  kmp_uint32 child;
1039  kmp_uint32 child_tid;
1040 
1041  if (((tid >> level) & (branch_factor - 1)) != 0) {
1042  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1043 
1044  KMP_MB(); // Synchronize parent and child threads.
1045  KA_TRACE(20,
1046  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1047  "arrived(%p): %llu => %llu\n",
1048  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1049  team->t.t_id, parent_tid, &thr_bar->b_arrived,
1050  thr_bar->b_arrived,
1051  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1052  // Mark arrival to parent thread
1053  /* After performing this write (in the last iteration of the enclosing for
1054  loop), a worker thread may not assume that the team is valid any more
1055  - it could be deallocated by the primary thread at any time. */
1056  p_flag.set_waiter(other_threads[parent_tid]);
1057  p_flag.release();
1058  break;
1059  }
1060 
1061  // Parent threads wait for children to arrive
1062  if (new_state == KMP_BARRIER_UNUSED_STATE)
1063  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1064  for (child = 1, child_tid = tid + (1 << level);
1065  child < branch_factor && child_tid < num_threads;
1066  child++, child_tid += (1 << level)) {
1067  kmp_info_t *child_thr = other_threads[child_tid];
1068  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1069 #if KMP_CACHE_MANAGE
1070  kmp_uint32 next_child_tid = child_tid + (1 << level);
1071  // Prefetch next thread's arrived count
1072  if (child + 1 < branch_factor && next_child_tid < num_threads)
1073  KMP_CACHE_PREFETCH(
1074  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1075 #endif /* KMP_CACHE_MANAGE */
1076  KA_TRACE(20,
1077  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1078  "arrived(%p) == %llu\n",
1079  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1080  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1081  // Wait for child to arrive
1082  kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1083  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1084  KMP_MB(); // Synchronize parent and child threads.
1085 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1086  // Barrier imbalance - write min of the thread time and a child time to
1087  // the thread.
1088  if (__kmp_forkjoin_frames_mode == 2) {
1089  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1090  child_thr->th.th_bar_min_time);
1091  }
1092 #endif
1093  if (reduce) {
1094  KA_TRACE(100,
1095  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1096  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1097  team->t.t_id, child_tid));
1098  OMPT_REDUCTION_DECL(this_thr, gtid);
1099  OMPT_REDUCTION_BEGIN;
1100  (*reduce)(this_thr->th.th_local.reduce_data,
1101  child_thr->th.th_local.reduce_data);
1102  OMPT_REDUCTION_END;
1103  }
1104  }
1105  }
1106 
1107  if (KMP_MASTER_TID(tid)) {
1108  // Need to update the team arrived pointer if we are the primary thread
1109  if (new_state == KMP_BARRIER_UNUSED_STATE)
1110  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1111  else
1112  team->t.t_bar[bt].b_arrived = new_state;
1113  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1114  "arrived(%p) = %llu\n",
1115  gtid, team->t.t_id, tid, team->t.t_id,
1116  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1117  }
1118  KA_TRACE(
1119  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1120  gtid, team->t.t_id, tid, bt));
1121 }
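// Editorial worked example: with branch_bits == 1 (pairwise exchange) the
// gather above proceeds in rounds. In round 0 every odd thread signals
// tid & ~1 (5 -> 4, 7 -> 6); in round 1 the remaining threads with bit 1 set
// signal tid & ~3 (6 -> 4); and so on, so thread 0 has gathered everyone after
// ceil(log2(nproc)) rounds.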
1122 
1123 // The reverse versions seem to beat the forward versions overall
1124 #define KMP_REVERSE_HYPER_BAR
1125 static void __kmp_hyper_barrier_release(
1126  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1127  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1128  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1129  kmp_team_t *team;
1130  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1131  kmp_info_t **other_threads;
1132  kmp_uint32 num_threads;
1133  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1134  kmp_uint32 branch_factor = 1 << branch_bits;
1135  kmp_uint32 child;
1136  kmp_uint32 child_tid;
1137  kmp_uint32 offset;
1138  kmp_uint32 level;
1139 
1140  /* Perform a hypercube-embedded tree release for all of the threads that have
1141  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1142  are released in the reverse order of the corresponding gather, otherwise
1143  threads are released in the same order. */
1144  if (KMP_MASTER_TID(tid)) { // primary thread
1145  team = __kmp_threads[gtid]->th.th_team;
1146  KMP_DEBUG_ASSERT(team != NULL);
1147  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1148  "barrier type %d\n",
1149  gtid, team->t.t_id, tid, bt));
1150 #if KMP_BARRIER_ICV_PUSH
1151  if (propagate_icvs) { // primary already has ICVs in final destination; copy
1152  copy_icvs(&thr_bar->th_fixed_icvs,
1153  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1154  }
1155 #endif
1156  } else { // Handle fork barrier workers who aren't part of a team yet
1157  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1158  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1159  // Wait for parent thread to release us
1160  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1161  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1162 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1163  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1164  // In fork barrier where we could not get the object reliably
1165  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1166  // Cancel wait on previous parallel region...
1167  __kmp_itt_task_starting(itt_sync_obj);
1168 
1169  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1170  return;
1171 
1172  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1173  if (itt_sync_obj != NULL)
1174  // Call prepare as early as possible for "new" barrier
1175  __kmp_itt_task_finished(itt_sync_obj);
1176  } else
1177 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1178  // Early exit for reaping threads releasing forkjoin barrier
1179  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1180  return;
1181 
1182  // The worker thread may now assume that the team is valid.
1183  team = __kmp_threads[gtid]->th.th_team;
1184  KMP_DEBUG_ASSERT(team != NULL);
1185  tid = __kmp_tid_from_gtid(gtid);
1186 
1187  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1188  KA_TRACE(20,
1189  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1190  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1191  KMP_MB(); // Flush all pending memory write invalidates.
1192  }
1193  num_threads = this_thr->th.th_team_nproc;
1194  other_threads = team->t.t_threads;
1195 
1196 #ifdef KMP_REVERSE_HYPER_BAR
1197  // Count up to correct level for parent
1198  for (level = 0, offset = 1;
1199  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1200  level += branch_bits, offset <<= branch_bits)
1201  ;
1202 
1203  // Now go down from there
1204  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1205  level -= branch_bits, offset >>= branch_bits)
1206 #else
1207  // Go down the tree, level by level
1208  for (level = 0, offset = 1; offset < num_threads;
1209  level += branch_bits, offset <<= branch_bits)
1210 #endif // KMP_REVERSE_HYPER_BAR
1211  {
1212 #ifdef KMP_REVERSE_HYPER_BAR
1213  /* Now go in reverse order through the children, highest to lowest.
1214  Initial setting of child is conservative here. */
1215  child = num_threads >> ((level == 0) ? level : level - 1);
1216  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1217  child_tid = tid + (child << level);
1218  child >= 1; child--, child_tid -= (1 << level))
1219 #else
1220  if (((tid >> level) & (branch_factor - 1)) != 0)
1221  // No need to go lower than this, since this is the level parent would be
1222  // notified
1223  break;
1224  // Iterate through children on this level of the tree
1225  for (child = 1, child_tid = tid + (1 << level);
1226  child < branch_factor && child_tid < num_threads;
1227  child++, child_tid += (1 << level))
1228 #endif // KMP_REVERSE_HYPER_BAR
1229  {
1230  if (child_tid >= num_threads)
1231  continue; // Child doesn't exist so keep going
1232  else {
1233  kmp_info_t *child_thr = other_threads[child_tid];
1234  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1235 #if KMP_CACHE_MANAGE
1236  kmp_uint32 next_child_tid = child_tid - (1 << level);
1237 // Prefetch next thread's go count
1238 #ifdef KMP_REVERSE_HYPER_BAR
1239  if (child - 1 >= 1 && next_child_tid < num_threads)
1240 #else
1241  if (child + 1 < branch_factor && next_child_tid < num_threads)
1242 #endif // KMP_REVERSE_HYPER_BAR
1243  KMP_CACHE_PREFETCH(
1244  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1245 #endif /* KMP_CACHE_MANAGE */
1246 
1247 #if KMP_BARRIER_ICV_PUSH
1248  if (propagate_icvs) // push my fixed ICVs to my child
1249  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1250 #endif // KMP_BARRIER_ICV_PUSH
1251 
1252  KA_TRACE(
1253  20,
1254  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1255  "go(%p): %u => %u\n",
1256  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1257  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1258  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1259  // Release child from barrier
1260  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1261  flag.release();
1262  }
1263  }
1264  }
1265 #if KMP_BARRIER_ICV_PUSH
1266  if (propagate_icvs &&
1267  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1268  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1269  FALSE);
1270  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1271  &thr_bar->th_fixed_icvs);
1272  }
1273 #endif
1274  KA_TRACE(
1275  20,
1276  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1277  gtid, team->t.t_id, tid, bt));
1278 }
1279 
1280 // Hierarchical Barrier
1281 
1282 // Initialize thread barrier data
1283 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1284  Performs the minimum amount of initialization required based on how the team
1285  has changed. Returns true if leaf children will require both on-core and
1286  traditional wake-up mechanisms. For example, if the team size increases,
1287  threads already in the team will respond to on-core wakeup on their parent
1288  thread, but threads newly added to the team will only be listening on
1289  their local b_go. */
1290 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1291  kmp_bstate_t *thr_bar,
1292  kmp_uint32 nproc, int gtid,
1293  int tid, kmp_team_t *team) {
1294  // Checks to determine if (re-)initialization is needed
1295  bool uninitialized = thr_bar->team == NULL;
1296  bool team_changed = team != thr_bar->team;
1297  bool team_sz_changed = nproc != thr_bar->nproc;
1298  bool tid_changed = tid != thr_bar->old_tid;
1299  bool retval = false;
1300 
1301  if (uninitialized || team_sz_changed) {
1302  __kmp_get_hierarchy(nproc, thr_bar);
1303  }
1304 
1305  if (uninitialized || team_sz_changed || tid_changed) {
1306  thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1307  thr_bar->parent_tid = -1; // default for primary thread
1308  if (!KMP_MASTER_TID(tid)) {
1309  // if not primary thread, find parent thread in hierarchy
1310  kmp_uint32 d = 0;
1311  while (d < thr_bar->depth) { // find parent based on level of thread in
1312  // hierarchy, and note level
1313  kmp_uint32 rem;
1314  if (d == thr_bar->depth - 2) { // reached level right below the primary
1315  thr_bar->parent_tid = 0;
1316  thr_bar->my_level = d;
1317  break;
1318  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1319  // TODO: can we make the above op faster?
1320  // thread is not a subtree root at next level, so this is max
1321  thr_bar->parent_tid = tid - rem;
1322  thr_bar->my_level = d;
1323  break;
1324  }
1325  ++d;
1326  }
1327  }
1328  __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1329  (thr_bar->skip_per_level[thr_bar->my_level])),
1330  &(thr_bar->offset));
1331  thr_bar->old_tid = tid;
1332  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1333  thr_bar->team = team;
1334  thr_bar->parent_bar =
1335  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1336  }
1337  if (uninitialized || team_changed || tid_changed) {
1338  thr_bar->team = team;
1339  thr_bar->parent_bar =
1340  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1341  retval = true;
1342  }
1343  if (uninitialized || team_sz_changed || tid_changed) {
1344  thr_bar->nproc = nproc;
1345  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1346  if (thr_bar->my_level == 0)
1347  thr_bar->leaf_kids = 0;
1348  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1349  __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1350  thr_bar->leaf_state = 0;
1351  for (int i = 0; i < thr_bar->leaf_kids; ++i)
1352  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1353  }
1354  return retval;
1355 }
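// Editorial sketch (standard C++ only; not part of the runtime): leaf_state
// packs one flag byte per leaf child into a single 64-bit word, at the same
// byte offsets the loop above writes in place. That lets the parent wait for
// all of its leaves with one 64-bit comparison against b_arrived | leaf_state
// and clear all of their check-ins with one atomic AND of ~leaf_state, as the
// gather code below does.
#if 0
#include <cstdint>
#include <cstring>

static uint64_t make_leaf_state(int leaf_kids) {
  unsigned char bytes[8] = {0};
  for (int i = 0; i < leaf_kids && i < 8; ++i)
    bytes[7 - i] = 1; // one byte per leaf child, at offsets 7, 6, ...
  uint64_t state = 0;
  std::memcpy(&state, bytes, sizeof(state));
  return state;
}
#endif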
1356 
1357 static void __kmp_hierarchical_barrier_gather(
1358  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1359  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1360  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1361  kmp_team_t *team = this_thr->th.th_team;
1362  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1363  kmp_uint32 nproc = this_thr->th.th_team_nproc;
1364  kmp_info_t **other_threads = team->t.t_threads;
1365  kmp_uint64 new_state = 0;
1366 
1367  int level = team->t.t_level;
1368  if (other_threads[0]
1369  ->th.th_teams_microtask) // are we inside the teams construct?
1370  if (this_thr->th.th_teams_size.nteams > 1)
1371  ++level; // level was not increased in teams construct for team_of_masters
1372  if (level == 1)
1373  thr_bar->use_oncore_barrier = 1;
1374  else
1375  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1376 
1377  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1378  "barrier type %d\n",
1379  gtid, team->t.t_id, tid, bt));
1380  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1381 
1382 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1383  // Barrier imbalance - save arrive time to the thread
1384  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1385  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1386  }
1387 #endif
1388 
1389  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1390  team);
1391 
1392  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1393  kmp_int32 child_tid;
1394  new_state =
1395  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1396  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1397  thr_bar->use_oncore_barrier) {
1398  if (thr_bar->leaf_kids) {
1399  // First, wait for leaf children to check-in on my b_arrived flag
1400  kmp_uint64 leaf_state =
1401  KMP_MASTER_TID(tid)
1402  ? thr_bar->b_arrived | thr_bar->leaf_state
1403  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1404  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1405  "for leaf kids\n",
1406  gtid, team->t.t_id, tid));
1407  kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1408  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1409  if (reduce) {
1410  OMPT_REDUCTION_DECL(this_thr, gtid);
1411  OMPT_REDUCTION_BEGIN;
1412  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1413  ++child_tid) {
1414  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1415  "T#%d(%d:%d)\n",
1416  gtid, team->t.t_id, tid,
1417  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1418  child_tid));
1419  (*reduce)(this_thr->th.th_local.reduce_data,
1420  other_threads[child_tid]->th.th_local.reduce_data);
1421  }
1422  OMPT_REDUCTION_END;
1423  }
1424  // clear leaf_state bits
1425  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1426  }
1427  // Next, wait for higher level children on each child's b_arrived flag
1428  for (kmp_uint32 d = 1; d < thr_bar->my_level;
1429  ++d) { // gather lowest level threads first, but skip 0
1430  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1431  skip = thr_bar->skip_per_level[d];
1432  if (last > nproc)
1433  last = nproc;
1434  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1435  kmp_info_t *child_thr = other_threads[child_tid];
1436  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1437  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1438  "T#%d(%d:%d) "
1439  "arrived(%p) == %llu\n",
1440  gtid, team->t.t_id, tid,
1441  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1442  child_tid, &child_bar->b_arrived, new_state));
1443  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1444  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1445  if (reduce) {
1446  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1447  "T#%d(%d:%d)\n",
1448  gtid, team->t.t_id, tid,
1449  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1450  child_tid));
1451  (*reduce)(this_thr->th.th_local.reduce_data,
1452  child_thr->th.th_local.reduce_data);
1453  }
1454  }
1455  }
1456  } else { // Blocktime is not infinite
1457  for (kmp_uint32 d = 0; d < thr_bar->my_level;
1458  ++d) { // Gather lowest level threads first
1459  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1460  skip = thr_bar->skip_per_level[d];
1461  if (last > nproc)
1462  last = nproc;
1463  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1464  kmp_info_t *child_thr = other_threads[child_tid];
1465  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1466  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1467  "T#%d(%d:%d) "
1468  "arrived(%p) == %llu\n",
1469  gtid, team->t.t_id, tid,
1470  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1471  child_tid, &child_bar->b_arrived, new_state));
1472  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1473  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474  if (reduce) {
1475  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1476  "T#%d(%d:%d)\n",
1477  gtid, team->t.t_id, tid,
1478  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1479  child_tid));
1480  (*reduce)(this_thr->th.th_local.reduce_data,
1481  child_thr->th.th_local.reduce_data);
1482  }
1483  }
1484  }
1485  }
1486  }
1487  // All subordinates are gathered; now release parent if not primary thread
1488 
1489  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1490  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1491  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1492  gtid, team->t.t_id, tid,
1493  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1494  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1495  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1496  /* Mark arrival to parent: After performing this write, a worker thread may
1497  not assume that the team is valid any more - it could be deallocated by
1498  the primary thread at any time. */
1499  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1500  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1501  // flag; release it
1502  kmp_flag_64<> flag(&thr_bar->b_arrived,
1503  other_threads[thr_bar->parent_tid]);
1504  flag.release();
1505  } else {
1506  // Leaf does special release on "offset" bits of parent's b_arrived flag
1507  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1508  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1509  thr_bar->offset + 1);
1510  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1511  flag.release();
1512  }
1513  } else { // Primary thread needs to update the team's b_arrived value
1514  team->t.t_bar[bt].b_arrived = new_state;
1515  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1516  "arrived(%p) = %llu\n",
1517  gtid, team->t.t_id, tid, team->t.t_id,
1518  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1519  }
1520  // Is the team access below unsafe or just technically invalid?
1521  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1522  "barrier type %d\n",
1523  gtid, team->t.t_id, tid, bt));
1524 }
1525 
1526 static void __kmp_hierarchical_barrier_release(
1527  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1528  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1529  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1530  kmp_team_t *team;
1531  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1532  kmp_uint32 nproc;
1533  bool team_change = false; // indicates on-core barrier shouldn't be used
1534 
1535  if (KMP_MASTER_TID(tid)) {
1536  team = __kmp_threads[gtid]->th.th_team;
1537  KMP_DEBUG_ASSERT(team != NULL);
1538  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1539  "entered barrier type %d\n",
1540  gtid, team->t.t_id, tid, bt));
1541  } else { // Worker threads
1542  // Wait for parent thread to release me
1543  if (!thr_bar->use_oncore_barrier ||
1544  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1545  thr_bar->team == NULL) {
1546  // Use traditional method of waiting on my own b_go flag
1547  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1548  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1549  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1550  TCW_8(thr_bar->b_go,
1551  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1552  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1553  // infinite, not nested
1554  // Wait on my "offset" bits on parent's b_go flag
1555  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1556  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1557  thr_bar->offset + 1, bt,
1558  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1559  flag.wait(this_thr, TRUE);
1560  if (thr_bar->wait_flag ==
1561  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1562  TCW_8(thr_bar->b_go,
1563  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1564  } else { // Reset my bits on parent's b_go flag
1565  (RCAST(volatile char *,
1566  &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1567  }
1568  }
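    // Explanatory sketch of the oncore flag layout relied on above (the
    // authoritative definitions live in kmp.h / kmp_wait_release.h): the 64-bit
    // b_go word is treated as eight byte-wide slots; byte 0 carries the owner's
    // ordinary go value, while each on-core leaf child is assigned the byte at
    // index offset + 1, which is why a single char-sized store can clear just
    // this thread's slot without disturbing its siblings.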
1569  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1570  // Early exit for reaping threads releasing forkjoin barrier
1571  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1572  return;
1573  // The worker thread may now assume that the team is valid.
1574  team = __kmp_threads[gtid]->th.th_team;
1575  KMP_DEBUG_ASSERT(team != NULL);
1576  tid = __kmp_tid_from_gtid(gtid);
1577 
1578  KA_TRACE(
1579  20,
1580  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1581  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1582  KMP_MB(); // Flush all pending memory write invalidates.
1583  }
1584 
1585  nproc = this_thr->th.th_team_nproc;
1586  int level = team->t.t_level;
1587  if (team->t.t_threads[0]
1588  ->th.th_teams_microtask) { // are we inside the teams construct?
1589  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1590  this_thr->th.th_teams_level == level)
1591  ++level; // level was not increased in teams construct for team_of_workers
1592  if (this_thr->th.th_teams_size.nteams > 1)
1593  ++level; // level was not increased in teams construct for team_of_masters
1594  }
1595  if (level == 1)
1596  thr_bar->use_oncore_barrier = 1;
1597  else
1598  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1599 
1600  // If the team size has increased, we still communicate with old leaves via
1601  // oncore barrier.
1602  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1603  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1604  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1605  tid, team);
1606  // But if the entire team changes, we won't use oncore barrier at all
1607  if (team_change)
1608  old_leaf_kids = 0;
1609 
1610 #if KMP_BARRIER_ICV_PUSH
1611  if (propagate_icvs) {
1612  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1613  FALSE);
1614  if (KMP_MASTER_TID(
1615  tid)) { // primary already has copy in final destination; copy
1616  copy_icvs(&thr_bar->th_fixed_icvs,
1617  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1618  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1619  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1620  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1621  // leaves (on-core children) pull parent's fixed ICVs directly to local
1622  // ICV store
1623  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1624  &thr_bar->parent_bar->th_fixed_icvs);
1625  // non-leaves will get ICVs piggybacked with b_go via NGO store
1626  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1627  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1628  // access
1629  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1630  else // leaves copy parent's fixed ICVs directly to local ICV store
1631  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1632  &thr_bar->parent_bar->th_fixed_icvs);
1633  }
1634  }
1635 #endif // KMP_BARRIER_ICV_PUSH
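  // Rough summary of the ICV push above (explanatory only): the primary thread
  // stages the team ICVs in its own th_fixed_icvs; leaves copy their parent's
  // th_fixed_icvs straight into their implicit task; non-leaves either receive
  // the ICVs piggybacked on the NGO store of b_go (infinite blocktime + oncore)
  // or copy the parent's th_fixed_icvs into their own, and in both cases copy
  // them into their implicit task after releasing their children (see the
  // KMP_BARRIER_ICV_PUSH block at the end of this function).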
1636 
1637  // Now, release my children
1638  if (thr_bar->my_level) { // not a leaf
1639  kmp_int32 child_tid;
1640  kmp_uint32 last;
1641  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1642  thr_bar->use_oncore_barrier) {
1643  if (KMP_MASTER_TID(tid)) { // do a flat release
1644  // Set local b_go to bump children via NGO store of the cache line
1645  // containing ICVs and b_go.
1646  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1647  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1648  // the cache line
1649  ngo_load(&thr_bar->th_fixed_icvs);
1650  // This loops over all the threads skipping only the leaf nodes in the
1651  // hierarchy
1652  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1653  child_tid += thr_bar->skip_per_level[1]) {
1654  kmp_bstate_t *child_bar =
1655  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1656  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1657  "releasing T#%d(%d:%d)"
1658  " go(%p): %u => %u\n",
1659  gtid, team->t.t_id, tid,
1660  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1661  child_tid, &child_bar->b_go, child_bar->b_go,
1662  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1663  // Use ngo store (if available) to both store ICVs and release child
1664  // via child's b_go
1665  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1666  }
1667  ngo_sync();
1668  }
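      // Layout sketch behind the flat NGO release above (informational; the
      // authoritative layout is kmp_bstate_t in kmp.h): th_fixed_icvs and b_go
      // are assumed to share one cache line, with b_go occupying the last 8
      // bytes, roughly
      //   | th_fixed_icvs (ICV payload) ...           | b_go (8 bytes) |
      //   |<------------------ CACHE_LINE bytes ---------------------->|
      // so one non-globally-ordered store of that line both publishes the ICVs
      // and flips the child's go flag.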
1669  TCW_8(thr_bar->b_go,
1670  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1671  // Now, release leaf children
1672  if (thr_bar->leaf_kids) { // if there are any
1673  // We test team_change on the off-chance that the level 1 team changed.
1674  if (team_change ||
1675  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1676  if (old_leaf_kids) { // release old leaf kids
1677  thr_bar->b_go |= old_leaf_state;
1678  }
1679  // Release new leaf kids
1680  last = tid + thr_bar->skip_per_level[1];
1681  if (last > nproc)
1682  last = nproc;
1683  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1684  ++child_tid) { // skip_per_level[0]=1
1685  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1686  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1687  KA_TRACE(
1688  20,
1689  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1690  " T#%d(%d:%d) go(%p): %u => %u\n",
1691  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1692  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1693  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1694  // Release child using child's b_go flag
1695  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1696  flag.release();
1697  }
1698  } else { // Release all children at once with leaf_state bits on my own
1699  // b_go flag
1700  thr_bar->b_go |= thr_bar->leaf_state;
1701  }
1702  }
1703  } else { // Blocktime is not infinite; do a simple hierarchical release
1704  for (int d = thr_bar->my_level - 1; d >= 0;
1705  --d) { // Release highest level threads first
1706  last = tid + thr_bar->skip_per_level[d + 1];
1707  kmp_uint32 skip = thr_bar->skip_per_level[d];
1708  if (last > nproc)
1709  last = nproc;
1710  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1711  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1714  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1715  gtid, team->t.t_id, tid,
1716  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1717  child_tid, &child_bar->b_go, child_bar->b_go,
1718  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1719  // Release child using child's b_go flag
1720  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1721  flag.release();
1722  }
1723  }
1724  }
1725 #if KMP_BARRIER_ICV_PUSH
1726  if (propagate_icvs && !KMP_MASTER_TID(tid))
1727  // non-leaves copy ICVs from fixed ICVs to local dest
1728  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1729  &thr_bar->th_fixed_icvs);
1730 #endif // KMP_BARRIER_ICV_PUSH
1731  }
1732  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1733  "barrier type %d\n",
1734  gtid, team->t.t_id, tid, bt));
1735 }
1736 
1737 // End of Barrier Algorithms
1738 
1739 // type traits for cancellable value
1740 // if cancellable is true, then is_cancellable is a normal boolean variable
1741 // if cancellable is false, then is_cancellable is a compile-time constant (false)
1742 template <bool cancellable> struct is_cancellable {};
1743 template <> struct is_cancellable<true> {
1744  bool value;
1745  is_cancellable() : value(false) {}
1746  is_cancellable(bool b) : value(b) {}
1747  is_cancellable &operator=(bool b) {
1748  value = b;
1749  return *this;
1750  }
1751  operator bool() const { return value; }
1752 };
1753 template <> struct is_cancellable<false> {
1754  is_cancellable &operator=(bool b) { return *this; }
1755  constexpr operator bool() const { return false; }
1756 };
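// Usage sketch (illustrative only): __kmp_barrier_template<false>, the common
// instantiation, declares "is_cancellable<false> cancelled;" whose conversion
// to bool is constexpr false, so every "if (cancelled)" check becomes a
// compile-time-false branch the optimizer can drop:
//
//   is_cancellable<false> c; // carries no data member
//   c = some_runtime_flag;   // assignment is a no-op
//   if (c) { /* never taken; condition is a constant */ }
//
// The <true> specialization stores a real bool, as used by the cancellable
// GOMP barrier entry point further below.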
1757 
1758 // Internal function to do a barrier.
1759 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1760  If reduce is non-NULL, do a split reduction barrier; otherwise, do a
1761  barrier without reduction.
1762  When cancellable = false:
1763  Returns 0 if primary thread, 1 if worker thread.
1764  When cancellable = true:
1765  Returns 0 if not cancelled, 1 if cancelled. */
1766 template <bool cancellable = false>
1767 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1768  size_t reduce_size, void *reduce_data,
1769  void (*reduce)(void *, void *)) {
1770  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1771  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1772  int tid = __kmp_tid_from_gtid(gtid);
1773  kmp_info_t *this_thr = __kmp_threads[gtid];
1774  kmp_team_t *team = this_thr->th.th_team;
1775  int status = 0;
1776  is_cancellable<cancellable> cancelled;
1777 #if OMPT_SUPPORT && OMPT_OPTIONAL
1778  ompt_data_t *my_task_data;
1779  ompt_data_t *my_parallel_data;
1780  void *return_address;
1781  ompt_sync_region_t barrier_kind;
1782 #endif
1783 
1784  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1785  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1786 
1787 #if OMPT_SUPPORT
1788  if (ompt_enabled.enabled) {
1789 #if OMPT_OPTIONAL
1790  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1791  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1792  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1793  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1794  if (ompt_enabled.ompt_callback_sync_region) {
1795  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1796  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1797  return_address);
1798  }
1799  if (ompt_enabled.ompt_callback_sync_region_wait) {
1800  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1801  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1802  return_address);
1803  }
1804 #endif
1805  // It is OK to report the barrier state after the barrier begin callback.
1806  // According to the OMPT specification, a compliant implementation may
1807  // even delay reporting this state until the barrier begins to wait.
1808  auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1809  switch (barrier_kind) {
1810  case ompt_sync_region_barrier_explicit:
1811  ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1812  break;
1813  case ompt_sync_region_barrier_implicit_workshare:
1814  ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1815  break;
1816  case ompt_sync_region_barrier_implicit_parallel:
1817  ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1818  break;
1819  case ompt_sync_region_barrier_teams:
1820  ompt_thr_info->state = ompt_state_wait_barrier_teams;
1821  break;
1822  case ompt_sync_region_barrier_implementation:
1823  [[fallthrough]];
1824  default:
1825  ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1826  }
1827  }
1828 #endif
1829 
1830  if (!team->t.t_serialized) {
1831 #if USE_ITT_BUILD
1832  // This value will be used in itt notify events below.
1833  void *itt_sync_obj = NULL;
1834 #if USE_ITT_NOTIFY
1835  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1836  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1837 #endif
1838 #endif /* USE_ITT_BUILD */
1839  if (__kmp_tasking_mode == tskm_extra_barrier) {
1840  __kmp_tasking_barrier(team, this_thr, gtid);
1841  KA_TRACE(15,
1842  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1843  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1844  }
1845 
1846  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1847  access it when the team struct is not guaranteed to exist. */
1848  // See note about the corresponding code in __kmp_join_barrier() being
1849  // performance-critical.
1850  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1851 #if KMP_USE_MONITOR
1852  this_thr->th.th_team_bt_intervals =
1853  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1854  this_thr->th.th_team_bt_set =
1855  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1856 #else
1857  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1858 #endif
1859  }
1860 
1861 #if USE_ITT_BUILD
1862  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1863  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1864 #endif /* USE_ITT_BUILD */
1865 #if USE_DEBUGGER
1866  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1867  if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1868  team->t.t_bar[bt].b_master_arrived += 1;
1869  } else {
1870  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1871  } // if
1872 #endif /* USE_DEBUGGER */
1873  if (reduce != NULL) {
1874  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1875  this_thr->th.th_local.reduce_data = reduce_data;
1876  }
1877 
1878  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1879  __kmp_task_team_setup(this_thr, team);
1880 
1881  if (cancellable) {
1882  cancelled = __kmp_linear_barrier_gather_cancellable(
1883  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1884  } else {
1885  switch (__kmp_barrier_gather_pattern[bt]) {
1886  case bp_dist_bar: {
1887  __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1888  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1889  break;
1890  }
1891  case bp_hyper_bar: {
1892  // don't set branch bits to 0; use linear
1893  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1894  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1895  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1896  break;
1897  }
1898  case bp_hierarchical_bar: {
1899  __kmp_hierarchical_barrier_gather(
1900  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1901  break;
1902  }
1903  case bp_tree_bar: {
1904  // don't set branch bits to 0; use linear
1905  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1906  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1907  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1908  break;
1909  }
1910  default: {
1911  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1912  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1913  }
1914  }
1915  }
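    // Note on pattern selection (explanatory): the gather algorithm is chosen
    // per barrier type via __kmp_barrier_gather_pattern[bt] and its fan-in via
    // __kmp_barrier_gather_branch_bits[bt]; the release phase below consults
    // the matching __kmp_barrier_release_* tables. These are normally filled
    // from settings parsed in kmp_settings.cpp (e.g. the
    // KMP_PLAIN_BARRIER_PATTERN-family environment variables, which take a
    // "gather,release" pair such as "hyper,hyper"); consult that file for the
    // exact names and accepted values rather than this note.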
1916 
1917  KMP_MB();
1918 
1919  if (KMP_MASTER_TID(tid)) {
1920  status = 0;
1921  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1922  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1923  }
1924 #if USE_DEBUGGER
1925  // Let the debugger know: all threads have arrived and are starting to
1926  // leave the barrier.
1927  team->t.t_bar[bt].b_team_arrived += 1;
1928 #endif
1929 
1930  if (__kmp_omp_cancellation) {
1931  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1932  // Reset cancellation flag for worksharing constructs
1933  if (cancel_request == cancel_loop ||
1934  cancel_request == cancel_sections) {
1935  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1936  }
1937  }
1938 #if USE_ITT_BUILD
1939  /* TODO: In the case of a split reduction barrier, the primary thread may
1940  send the acquired event early, before the final summation into the shared
1941  variable is done (the final summation can be a long operation for array
1942  reductions). */
1943  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1944  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1945 #endif /* USE_ITT_BUILD */
1946 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1947  // Barrier - report frame end (only if active_level == 1)
1948  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1949  __kmp_forkjoin_frames_mode &&
1950  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1951  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1952  team->t.t_active_level == 1) {
1953  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1954  kmp_uint64 cur_time = __itt_get_timestamp();
1955  kmp_info_t **other_threads = team->t.t_threads;
1956  int nproc = this_thr->th.th_team_nproc;
1957  int i;
1958  switch (__kmp_forkjoin_frames_mode) {
1959  case 1:
1960  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1961  loc, nproc);
1962  this_thr->th.th_frame_time = cur_time;
1963  break;
1964  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1965  // be fixed)
1966  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1967  1, loc, nproc);
1968  break;
1969  case 3:
1970  if (__itt_metadata_add_ptr) {
1971  // Initialize with primary thread's wait time
1972  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1973  // Set arrive time to zero to be able to check it in
1974  // __kmp_invoke_task(); the same is done inside the loop below
1975  this_thr->th.th_bar_arrive_time = 0;
1976  for (i = 1; i < nproc; ++i) {
1977  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1978  other_threads[i]->th.th_bar_arrive_time = 0;
1979  }
1980  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1981  cur_time, delta,
1982  (kmp_uint64)(reduce != NULL));
1983  }
1984  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1985  loc, nproc);
1986  this_thr->th.th_frame_time = cur_time;
1987  break;
1988  }
1989  }
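      // How the imbalance metadata above is formed (informational): delta is
      // the sum over all team members of (cur_time - th_bar_arrive_time), i.e.
      // the total time spent waiting at this barrier. For example (timestamps
      // invented purely for illustration), four threads arriving at t = 10,
      // 12, 14 and 20 with cur_time = 20 give delta = 10 + 8 + 6 + 0 = 24.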
1990 #endif /* USE_ITT_BUILD */
1991  } else {
1992  status = 1;
1993 #if USE_ITT_BUILD
1994  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1995  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1996 #endif /* USE_ITT_BUILD */
1997  }
1998  if ((status == 1 || !is_split) && !cancelled) {
1999  if (cancellable) {
2000  cancelled = __kmp_linear_barrier_release_cancellable(
2001  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2002  } else {
2003  switch (__kmp_barrier_release_pattern[bt]) {
2004  case bp_dist_bar: {
2005  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2006  __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2007  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2008  break;
2009  }
2010  case bp_hyper_bar: {
2011  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2012  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2013  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2014  break;
2015  }
2016  case bp_hierarchical_bar: {
2017  __kmp_hierarchical_barrier_release(
2018  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2019  break;
2020  }
2021  case bp_tree_bar: {
2022  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2023  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2024  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2025  break;
2026  }
2027  default: {
2028  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2029  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2030  }
2031  }
2032  }
2033  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2034  __kmp_task_team_sync(this_thr, team);
2035  }
2036  }
2037 
2038 #if USE_ITT_BUILD
2039  /* GEH: TODO: Move this under if-condition above and also include in
2040  __kmp_end_split_barrier(). This will more accurately represent the actual
2041  release time of the threads for split barriers. */
2042  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2043  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2044 #endif /* USE_ITT_BUILD */
2045  } else { // Team is serialized.
2046  status = 0;
2047  if (__kmp_tasking_mode != tskm_immediate_exec) {
2048  if (this_thr->th.th_task_team != NULL) {
2049 #if USE_ITT_NOTIFY
2050  void *itt_sync_obj = NULL;
2051  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2052  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2053  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2054  }
2055 #endif
2056 
2057  KMP_DEBUG_ASSERT(
2058  this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2059  this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2060  TRUE);
2061  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2062  __kmp_task_team_setup(this_thr, team);
2063 
2064 #if USE_ITT_BUILD
2065  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2066  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2067 #endif /* USE_ITT_BUILD */
2068  }
2069  }
2070  }
2071  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2072  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2073  __kmp_tid_from_gtid(gtid), status));
2074 
2075 #if OMPT_SUPPORT
2076  if (ompt_enabled.enabled) {
2077 #if OMPT_OPTIONAL
2078  if (ompt_enabled.ompt_callback_sync_region_wait) {
2079  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2080  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2081  return_address);
2082  }
2083  if (ompt_enabled.ompt_callback_sync_region) {
2084  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2085  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2086  return_address);
2087  }
2088 #endif
2089  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2090  }
2091 #endif
2092 
2093  if (cancellable)
2094  return (int)cancelled;
2095  return status;
2096 }
2097 
2098 // Returns 0 if primary thread, 1 if worker thread.
2099 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2100  size_t reduce_size, void *reduce_data,
2101  void (*reduce)(void *, void *)) {
2102  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2103  reduce);
2104 }
2105 
2106 #if defined(KMP_GOMP_COMPAT)
2107 // Returns 1 if cancelled, 0 otherwise
2108 int __kmp_barrier_gomp_cancel(int gtid) {
2109  if (__kmp_omp_cancellation) {
2110  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2111  0, NULL, NULL);
2112  if (cancelled) {
2113  int tid = __kmp_tid_from_gtid(gtid);
2114  kmp_info_t *this_thr = __kmp_threads[gtid];
2115  if (KMP_MASTER_TID(tid)) {
2116  // Primary thread does not need to revert anything
2117  } else {
2118  // Workers need to revert their private b_arrived flag
2119  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2120  KMP_BARRIER_STATE_BUMP;
2121  }
2122  }
2123  return cancelled;
2124  }
2125  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2126  return FALSE;
2127 }
2128 #endif
2129 
2130 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2131  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2132  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2133  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2134  int tid = __kmp_tid_from_gtid(gtid);
2135  kmp_info_t *this_thr = __kmp_threads[gtid];
2136  kmp_team_t *team = this_thr->th.th_team;
2137 
2138  if (!team->t.t_serialized) {
2139  if (KMP_MASTER_GTID(gtid)) {
2140  switch (__kmp_barrier_release_pattern[bt]) {
2141  case bp_dist_bar: {
2142  __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2143  FALSE USE_ITT_BUILD_ARG(NULL));
2144  break;
2145  }
2146  case bp_hyper_bar: {
2147  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2148  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2149  FALSE USE_ITT_BUILD_ARG(NULL));
2150  break;
2151  }
2152  case bp_hierarchical_bar: {
2153  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2154  FALSE USE_ITT_BUILD_ARG(NULL));
2155  break;
2156  }
2157  case bp_tree_bar: {
2158  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2159  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2160  FALSE USE_ITT_BUILD_ARG(NULL));
2161  break;
2162  }
2163  default: {
2164  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2165  FALSE USE_ITT_BUILD_ARG(NULL));
2166  }
2167  }
2168  if (__kmp_tasking_mode != tskm_immediate_exec) {
2169  __kmp_task_team_sync(this_thr, team);
2170  } // if
2171  }
2172  }
2173 }
2174 
2175 void __kmp_join_barrier(int gtid) {
2176  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2177  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2178 
2179  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2180 
2181  kmp_info_t *this_thr = __kmp_threads[gtid];
2182  kmp_team_t *team;
2183  int tid;
2184 #ifdef KMP_DEBUG
2185  int team_id;
2186 #endif /* KMP_DEBUG */
2187 #if USE_ITT_BUILD
2188  void *itt_sync_obj = NULL;
2189 #if USE_ITT_NOTIFY
2190  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
2191  // Get object created at fork_barrier
2192  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2193 #endif
2194 #endif /* USE_ITT_BUILD */
2195 #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2196  int nproc = this_thr->th.th_team_nproc;
2197 #endif
2198  KMP_MB();
2199 
2200  // Get current info
2201  team = this_thr->th.th_team;
2202  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2203  tid = __kmp_tid_from_gtid(gtid);
2204 #ifdef KMP_DEBUG
2205  team_id = team->t.t_id;
2206  kmp_info_t *master_thread = this_thr->th.th_team_master;
2207  if (master_thread != team->t.t_threads[0]) {
2208  __kmp_print_structure();
2209  }
2210 #endif /* KMP_DEBUG */
2211  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2212  KMP_MB();
2213 
2214  // Verify state
2215  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2216  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2217  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2218  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2219  gtid, team_id, tid));
2220 
2221 #if OMPT_SUPPORT
2222  if (ompt_enabled.enabled) {
2223 #if OMPT_OPTIONAL
2224  ompt_data_t *my_task_data;
2225  ompt_data_t *my_parallel_data;
2226  void *codeptr = NULL;
2227  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2228  if (KMP_MASTER_TID(ds_tid) &&
2229  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2230  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2231  codeptr = team->t.ompt_team_info.master_return_address;
2232  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2233  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2234  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2235  ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
2236  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
2237  sync_kind = ompt_sync_region_barrier_teams;
2238  ompt_state = ompt_state_wait_barrier_teams;
2239  }
2240  if (ompt_enabled.ompt_callback_sync_region) {
2241  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2242  sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2243  }
2244  if (ompt_enabled.ompt_callback_sync_region_wait) {
2245  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2246  sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2247  }
2248  if (!KMP_MASTER_TID(ds_tid))
2249  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2250 #endif
2251  this_thr->th.ompt_thread_info.state = ompt_state;
2252  }
2253 #endif
2254 
2255  if (__kmp_tasking_mode == tskm_extra_barrier) {
2256  __kmp_tasking_barrier(team, this_thr, gtid);
2257  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2258  gtid, team_id, tid));
2259  }
2260 #ifdef KMP_DEBUG
2261  if (__kmp_tasking_mode != tskm_immediate_exec) {
2262  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2263  "%p, th_task_team = %p\n",
2264  __kmp_gtid_from_thread(this_thr), team_id,
2265  team->t.t_task_team[this_thr->th.th_task_state],
2266  this_thr->th.th_task_team));
2267  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2268  }
2269 #endif /* KMP_DEBUG */
2270 
2271  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2272  access it when the team struct is not guaranteed to exist. Doing these
2273  loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
2274  we do not perform the copy if blocktime=infinite, since the values are not
2275  used by __kmp_wait_template() in that case. */
2276  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2277 #if KMP_USE_MONITOR
2278  this_thr->th.th_team_bt_intervals =
2279  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2280  this_thr->th.th_team_bt_set =
2281  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2282 #else
2283  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2284 #endif
2285  }
2286 
2287 #if USE_ITT_BUILD
2288  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2289  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2290 #endif /* USE_ITT_BUILD */
2291 
2292  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2293  case bp_dist_bar: {
2294  __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2295  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2296  break;
2297  }
2298  case bp_hyper_bar: {
2299  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2300  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2301  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2302  break;
2303  }
2304  case bp_hierarchical_bar: {
2305  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2306  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2307  break;
2308  }
2309  case bp_tree_bar: {
2310  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2311  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2312  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2313  break;
2314  }
2315  default: {
2316  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2317  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2318  }
2319  }
2320 
2321  /* From this point on, the team data structure may be deallocated at any time
2322  by the primary thread - it is unsafe to reference it in any of the worker
2323  threads. Any per-team data items that need to be referenced before the
2324  end of the barrier should be moved to the kmp_task_team_t structs. */
2325  if (KMP_MASTER_TID(tid)) {
2326  if (__kmp_tasking_mode != tskm_immediate_exec) {
2327  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2328  }
2329  if (__kmp_display_affinity) {
2330  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2331  }
2332 #if KMP_STATS_ENABLED
2333  // Have the primary thread flag the workers to indicate they are now waiting
2334  // for the next parallel region; also wake them up so they switch their
2335  // timers to idle.
2336  for (int i = 0; i < team->t.t_nproc; ++i) {
2337  kmp_info_t *team_thread = team->t.t_threads[i];
2338  if (team_thread == this_thr)
2339  continue;
2340  team_thread->th.th_stats->setIdleFlag();
2341  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2342  team_thread->th.th_sleep_loc != NULL)
2343  __kmp_null_resume_wrapper(team_thread);
2344  }
2345 #endif
2346 #if USE_ITT_BUILD
2347  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2348  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2349 #endif /* USE_ITT_BUILD */
2350 
2351 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2352  // Join barrier - report frame end
2353  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2354  __kmp_forkjoin_frames_mode &&
2355  (this_thr->th.th_teams_microtask == NULL || // either not in teams
2356  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2357  team->t.t_active_level == 1) {
2358  kmp_uint64 cur_time = __itt_get_timestamp();
2359  ident_t *loc = team->t.t_ident;
2360  kmp_info_t **other_threads = team->t.t_threads;
2361  switch (__kmp_forkjoin_frames_mode) {
2362  case 1:
2363  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2364  loc, nproc);
2365  break;
2366  case 2:
2367  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2368  loc, nproc);
2369  break;
2370  case 3:
2371  if (__itt_metadata_add_ptr) {
2372  // Initialize with primary thread's wait time
2373  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2374  // Set arrive time to zero to be able to check it in
2375  // __kmp_invoke_task(); the same is done inside the loop below
2376  this_thr->th.th_bar_arrive_time = 0;
2377  for (int i = 1; i < nproc; ++i) {
2378  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2379  other_threads[i]->th.th_bar_arrive_time = 0;
2380  }
2381  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2382  cur_time, delta, 0);
2383  }
2384  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2385  loc, nproc);
2386  this_thr->th.th_frame_time = cur_time;
2387  break;
2388  }
2389  }
2390 #endif /* USE_ITT_BUILD */
2391  }
2392 #if USE_ITT_BUILD
2393  else {
2394  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2395  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2396  }
2397 #endif /* USE_ITT_BUILD */
2398 
2399 #if KMP_DEBUG
2400  if (KMP_MASTER_TID(tid)) {
2401  KA_TRACE(
2402  15,
2403  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2404  gtid, team_id, tid, nproc));
2405  }
2406 #endif /* KMP_DEBUG */
2407 
2408  // TODO now, mark worker threads as done so they may be disbanded
2409  KMP_MB(); // Flush all pending memory write invalidates.
2410  KA_TRACE(10,
2411  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2412 
2413 }
2414 
2415 // TODO release worker threads' fork barriers as we are ready instead of all at
2416 // once
2417 void __kmp_fork_barrier(int gtid, int tid) {
2418  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2419  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2420  kmp_info_t *this_thr = __kmp_threads[gtid];
2421  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2422 #if USE_ITT_BUILD
2423  void *itt_sync_obj = NULL;
2424 #endif /* USE_ITT_BUILD */
2425 #ifdef KMP_DEBUG
2426  if (team)
2427  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2428  (team != NULL) ? team->t.t_id : -1, tid));
2429 #endif
2430  // th_team pointer only valid for primary thread here
2431  if (KMP_MASTER_TID(tid)) {
2432 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2433  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2434  // Create itt barrier object
2435  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2436  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2437  }
2438 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2439 
2440 #ifdef KMP_DEBUG
2441  KMP_DEBUG_ASSERT(team);
2442  kmp_info_t **other_threads = team->t.t_threads;
2443  int i;
2444 
2445  // Verify state
2446  KMP_MB();
2447 
2448  for (i = 1; i < team->t.t_nproc; ++i) {
2449  KA_TRACE(500,
2450  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2451  "== %u.\n",
2452  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2453  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2454  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2455  KMP_DEBUG_ASSERT(
2456  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2457  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2458  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2459  }
2460 #endif
2461 
2462  if (__kmp_tasking_mode != tskm_immediate_exec)
2463  __kmp_task_team_setup(this_thr, team);
2464 
2465  /* The primary thread may have changed its blocktime between join barrier
2466  and fork barrier. Copy the blocktime info to the thread, where
2467  __kmp_wait_template() can access it when the team struct is not
2468  guaranteed to exist. */
2469  // See note about the corresponding code in __kmp_join_barrier() being
2470  // performance-critical
2471  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2472 #if KMP_USE_MONITOR
2473  this_thr->th.th_team_bt_intervals =
2474  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2475  this_thr->th.th_team_bt_set =
2476  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2477 #else
2478  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2479 #endif
2480  }
2481  } // primary thread
2482 
2483  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2484  case bp_dist_bar: {
2485  __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2486  TRUE USE_ITT_BUILD_ARG(NULL));
2487  break;
2488  }
2489  case bp_hyper_bar: {
2490  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2491  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2492  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2493  break;
2494  }
2495  case bp_hierarchical_bar: {
2496  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2497  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2498  break;
2499  }
2500  case bp_tree_bar: {
2501  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2502  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2503  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2504  break;
2505  }
2506  default: {
2507  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2508  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2509  }
2510  }
2511 
2512 #if OMPT_SUPPORT
2513  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2514  if (ompt_enabled.enabled &&
2515  (ompt_state == ompt_state_wait_barrier_teams ||
2516  ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2517  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2518  ompt_data_t *task_data = (team)
2519  ? OMPT_CUR_TASK_DATA(this_thr)
2520  : &(this_thr->th.ompt_thread_info.task_data);
2521  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2522 #if OMPT_OPTIONAL
2523  void *codeptr = NULL;
2524  if (KMP_MASTER_TID(ds_tid) &&
2525  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2526  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2527  codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2528  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2529  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2530  sync_kind = ompt_sync_region_barrier_teams;
2531  if (ompt_enabled.ompt_callback_sync_region_wait) {
2532  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2533  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2534  }
2535  if (ompt_enabled.ompt_callback_sync_region) {
2536  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2537  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2538  }
2539 #endif
2540  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2541  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2542  ompt_scope_end, NULL, task_data, 0, ds_tid,
2543  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2544  }
2545  }
2546 #endif
2547 
2548  // Early exit for reaping threads releasing forkjoin barrier
2549  if (TCR_4(__kmp_global.g.g_done)) {
2550  this_thr->th.th_task_team = NULL;
2551 
2552 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2553  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2554  if (!KMP_MASTER_TID(tid)) {
2555  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2556  if (itt_sync_obj)
2557  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2558  }
2559  }
2560 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2561  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2562  return;
2563  }
2564 
2565  /* We can now assume that a valid team structure has been allocated by the
2566  primary thread and propagated to all worker threads. The current thread,
2567  however, may not be part of the team, so we can't blindly assume that the
2568  team pointer is non-null. */
2569  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2570  KMP_DEBUG_ASSERT(team != NULL);
2571  tid = __kmp_tid_from_gtid(gtid);
2572 
2573 #if KMP_BARRIER_ICV_PULL
2574  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2575  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2576  implicit task has this data before this function is called. We cannot
2577  modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2578  thread struct, because it is not always the case that the threads arrays
2579  have been allocated when __kmp_fork_call() is executed. */
2580  {
2581  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2582  if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2583  // Copy the initial ICVs from the primary thread's thread struct to the
2584  // implicit task for this tid.
2585  KA_TRACE(10,
2586  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2587  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2588  tid, FALSE);
2589  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2590  &team->t.t_threads[0]
2591  ->th.th_bar[bs_forkjoin_barrier]
2592  .bb.th_fixed_icvs);
2593  }
2594  }
2595 #endif // KMP_BARRIER_ICV_PULL
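  // Recap of the KMP_BARRIER_ICV_PULL flow (explanatory only): at fork time,
  // __kmp_setup_icv_copy() below stashes the new ICVs in the primary thread's
  // th_fixed_icvs for the fork/join barrier, and each worker pulls them into
  // its own implicit task here after being released -- roughly:
  //   primary: new_icvs -> t_threads[0] fork/join th_fixed_icvs
  //   worker:  that th_fixed_icvs -> t_implicit_task_taskdata[tid].td_icvs
  // Under KMP_BARRIER_ICV_PUSH the copies instead ride down the barrier tree
  // during the release phase, so nothing extra is needed at this point.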
2596 
2597  if (__kmp_tasking_mode != tskm_immediate_exec) {
2598  __kmp_task_team_sync(this_thr, team);
2599  }
2600 
2601 #if KMP_AFFINITY_SUPPORTED
2602  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2603  if (proc_bind == proc_bind_intel) {
2604  // Call dynamic affinity settings
2605  if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2606  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2607  }
2608  } else if (proc_bind != proc_bind_false) {
2609  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2610  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2611  __kmp_gtid_from_thread(this_thr),
2612  this_thr->th.th_current_place));
2613  } else {
2614  __kmp_affinity_bind_place(gtid);
2615  }
2616  }
2617 #endif // KMP_AFFINITY_SUPPORTED
2618  // Perform the display affinity functionality
2619  if (__kmp_display_affinity) {
2620  if (team->t.t_display_affinity
2621 #if KMP_AFFINITY_SUPPORTED
2622  || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2623 #endif
2624  ) {
2625  // NULL means use the affinity-format-var ICV
2626  __kmp_aux_display_affinity(gtid, NULL);
2627  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2628  this_thr->th.th_prev_level = team->t.t_level;
2629  }
2630  }
2631  if (!KMP_MASTER_TID(tid))
2632  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2633 
2634 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2635  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2636  if (!KMP_MASTER_TID(tid)) {
2637  // Get correct barrier object
2638  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2639  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2640  } // (prepare called inside barrier_release)
2641  }
2642 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2643  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2644  team->t.t_id, tid));
2645 }
2646 
2647 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2648  kmp_internal_control_t *new_icvs, ident_t *loc) {
2649  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2650 
2651  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2652  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2653 
2654 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2655  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2656  implicit task has this data before this function is called. */
2657 #if KMP_BARRIER_ICV_PULL
2658  /* Copy the ICVs into the primary thread's th_fixed_icvs (which otherwise
2659  remains untouched), where all of the worker threads can access them and
2660  make their own copies after the barrier. */
2661  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2662  // allocated at this point
2663  copy_icvs(
2664  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2665  new_icvs);
2666  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2667  team->t.t_threads[0], team));
2668 #elif KMP_BARRIER_ICV_PUSH
2669  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2670  // done here.
2671  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2672  team->t.t_threads[0], team));
2673 #else
2674  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2675  // time.
2676  ngo_load(new_icvs);
2677  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2678  // allocated at this point
2679  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2680  // TODO: GEH - pass in better source location info since usually NULL here
2681  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2682  f, team->t.t_threads[f], team));
2683  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2684  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2685  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2686  f, team->t.t_threads[f], team));
2687  }
2688  ngo_sync();
2689 #endif // KMP_BARRIER_ICV_PULL
2690 }