LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50  kmp_team_t *team = this_thr->th.th_team;
51  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52  kmp_info_t **other_threads = team->t.t_threads;
53 
54  KA_TRACE(
55  20,
56  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57  gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64  __itt_get_timestamp();
65  }
66 #endif
67  // We now perform a linear reduction to signal that all of the threads have
68  // arrived.
69  if (!KMP_MASTER_TID(tid)) {
70  KA_TRACE(20,
71  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72  "arrived(%p): %llu => %llu\n",
73  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76  // Mark arrival to master thread
77  /* After performing this write, a worker thread may not assume that the team
78  is valid any more - it could be deallocated by the master thread at any
79  time. */
80  ANNOTATE_BARRIER_BEGIN(this_thr);
81  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
82  flag.release();
83  } else {
84  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85  int nproc = this_thr->th.th_team_nproc;
86  int i;
87  // Don't have to worry about sleep bit here or atomic since team setting
88  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89 
90  // Collect all the worker team member threads.
91  for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93  // Prefetch next thread's arrived count
94  if (i + 1 < nproc)
95  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98  "arrived(%p) == %llu\n",
99  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100  team->t.t_id, i,
101  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103  // Wait for worker thread to arrive
104  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
105  new_state);
106  if (cancellable) {
107  bool cancelled = flag.wait_cancellable_nosleep(
108  this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109  if (cancelled)
110  return true;
111  } else {
112  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113  }
114  ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116  // Barrier imbalance - write min of the thread time and the other thread
117  // time to the thread.
118  if (__kmp_forkjoin_frames_mode == 2) {
119  this_thr->th.th_bar_min_time = KMP_MIN(
120  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121  }
122 #endif
123  if (reduce) {
124  KA_TRACE(100,
125  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127  team->t.t_id, i));
128  ANNOTATE_REDUCE_AFTER(reduce);
129  OMPT_REDUCTION_DECL(this_thr, gtid);
130  OMPT_REDUCTION_BEGIN;
131  (*reduce)(this_thr->th.th_local.reduce_data,
132  other_threads[i]->th.th_local.reduce_data);
133  OMPT_REDUCTION_END;
134  ANNOTATE_REDUCE_BEFORE(reduce);
135  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136  }
137  }
138  // Don't have to worry about sleep bit here or atomic since team setting
139  team_bar->b_arrived = new_state;
140  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141  "arrived(%p) = %llu\n",
142  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143  new_state));
144  }
145  KA_TRACE(
146  20,
147  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148  gtid, team->t.t_id, tid, bt));
149  return false;
150 }
151 
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158  kmp_team_t *team;
159 
160  if (KMP_MASTER_TID(tid)) {
161  unsigned int i;
162  kmp_uint32 nproc = this_thr->th.th_team_nproc;
163  kmp_info_t **other_threads;
164 
165  team = __kmp_threads[gtid]->th.th_team;
166  KMP_DEBUG_ASSERT(team != NULL);
167  other_threads = team->t.t_threads;
168 
169  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170  "barrier type %d\n",
171  gtid, team->t.t_id, tid, bt));
172 
173  if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175  {
176  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177  if (propagate_icvs) {
178  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179  for (i = 1; i < nproc; ++i) {
180  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181  team, i, FALSE);
182  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183  &team->t.t_implicit_task_taskdata[0].td_icvs);
184  }
185  ngo_sync();
186  }
187  }
188 #endif // KMP_BARRIER_ICV_PUSH
189 
190  // Now, release all of the worker threads
191  for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193  // Prefetch next thread's go flag
194  if (i + 1 < nproc)
195  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197  KA_TRACE(
198  20,
199  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200  "go(%p): %u => %u\n",
201  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203  other_threads[i]->th.th_bar[bt].bb.b_go,
204  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207  other_threads[i]);
208  flag.release();
209  }
210  }
211  } else { // Wait for the MASTER thread to release us
212  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215  if (cancellable) {
216  bool cancelled = flag.wait_cancellable_nosleep(
217  this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
218  if (cancelled) {
219  return true;
220  }
221  } else {
222  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223  }
224  ANNOTATE_BARRIER_END(this_thr);
225 #if USE_ITT_BUILD && USE_ITT_NOTIFY
226  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
228  // disabled)
229  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230  // Cancel wait on previous parallel region...
231  __kmp_itt_task_starting(itt_sync_obj);
232 
233  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234  return false;
235 
236  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
237  if (itt_sync_obj != NULL)
238  // Call prepare as early as possible for "new" barrier
239  __kmp_itt_task_finished(itt_sync_obj);
240  } else
241 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
242  // Early exit for reaping threads releasing forkjoin barrier
243  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
244  return false;
245 // The worker thread may now assume that the team is valid.
246 #ifdef KMP_DEBUG
247  tid = __kmp_tid_from_gtid(gtid);
248  team = __kmp_threads[gtid]->th.th_team;
249 #endif
250  KMP_DEBUG_ASSERT(team != NULL);
251  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252  KA_TRACE(20,
253  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
254  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
255  KMP_MB(); // Flush all pending memory write invalidates.
256  }
257  KA_TRACE(
258  20,
259  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
260  gtid, team->t.t_id, tid, bt));
261  return false;
262 }
263 
264 static void __kmp_linear_barrier_gather(
265  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267  __kmp_linear_barrier_gather_template<false>(
268  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269 }
270 
271 static bool __kmp_linear_barrier_gather_cancellable(
272  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274  return __kmp_linear_barrier_gather_template<true>(
275  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276 }
277 
278 static void __kmp_linear_barrier_release(
279  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281  __kmp_linear_barrier_release_template<false>(
282  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283 }
284 
285 static bool __kmp_linear_barrier_release_cancellable(
286  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288  return __kmp_linear_barrier_release_template<true>(
289  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290 }
291 
292 // Tree barrier
293 static void
294 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
295  int tid, void (*reduce)(void *, void *)
296  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298  kmp_team_t *team = this_thr->th.th_team;
299  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300  kmp_info_t **other_threads = team->t.t_threads;
301  kmp_uint32 nproc = this_thr->th.th_team_nproc;
302  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303  kmp_uint32 branch_factor = 1 << branch_bits;
304  kmp_uint32 child;
305  kmp_uint32 child_tid;
306  kmp_uint64 new_state;
307 
308  KA_TRACE(
309  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310  gtid, team->t.t_id, tid, bt));
311  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312 
313 #if USE_ITT_BUILD && USE_ITT_NOTIFY
314  // Barrier imbalance - save arrive time to the thread
315  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317  __itt_get_timestamp();
318  }
319 #endif
320  // Perform tree gather to wait until all threads have arrived; reduce any
321  // required data as we go
322  child_tid = (tid << branch_bits) + 1;
323  if (child_tid < nproc) {
324  // Parent threads wait for all their children to arrive
325  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326  child = 1;
327  do {
328  kmp_info_t *child_thr = other_threads[child_tid];
329  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
330 #if KMP_CACHE_MANAGE
331  // Prefetch next thread's arrived count
332  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333  KMP_CACHE_PREFETCH(
334  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335 #endif /* KMP_CACHE_MANAGE */
336  KA_TRACE(20,
337  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338  "arrived(%p) == %llu\n",
339  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
341  // Wait for child to arrive
342  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344  ANNOTATE_BARRIER_END(child_thr);
345 #if USE_ITT_BUILD && USE_ITT_NOTIFY
346  // Barrier imbalance - write min of the thread time and a child time to
347  // the thread.
348  if (__kmp_forkjoin_frames_mode == 2) {
349  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350  child_thr->th.th_bar_min_time);
351  }
352 #endif
353  if (reduce) {
354  KA_TRACE(100,
355  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357  team->t.t_id, child_tid));
358  ANNOTATE_REDUCE_AFTER(reduce);
359  OMPT_REDUCTION_DECL(this_thr, gtid);
360  OMPT_REDUCTION_BEGIN;
361  (*reduce)(this_thr->th.th_local.reduce_data,
362  child_thr->th.th_local.reduce_data);
363  OMPT_REDUCTION_END;
364  ANNOTATE_REDUCE_BEFORE(reduce);
365  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
366  }
367  child++;
368  child_tid++;
369  } while (child <= branch_factor && child_tid < nproc);
370  }
371 
372  if (!KMP_MASTER_TID(tid)) { // Worker threads
373  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374 
375  KA_TRACE(20,
376  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377  "arrived(%p): %llu => %llu\n",
378  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
381 
382  // Mark arrival to parent thread
383  /* After performing this write, a worker thread may not assume that the team
384  is valid any more - it could be deallocated by the master thread at any
385  time. */
386  ANNOTATE_BARRIER_BEGIN(this_thr);
387  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
388  flag.release();
389  } else {
390  // Need to update the team arrived pointer if we are the master thread
391  if (nproc > 1) // New value was already computed above
392  team->t.t_bar[bt].b_arrived = new_state;
393  else
394  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396  "arrived(%p) = %llu\n",
397  gtid, team->t.t_id, tid, team->t.t_id,
398  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399  }
400  KA_TRACE(20,
401  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402  gtid, team->t.t_id, tid, bt));
403 }
404 
405 static void __kmp_tree_barrier_release(
406  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
407  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
408  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
409  kmp_team_t *team;
410  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
411  kmp_uint32 nproc;
412  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
413  kmp_uint32 branch_factor = 1 << branch_bits;
414  kmp_uint32 child;
415  kmp_uint32 child_tid;
416 
417  // Perform a tree release for all of the threads that have been gathered
418  if (!KMP_MASTER_TID(
419  tid)) { // Handle fork barrier workers who aren't part of a team yet
420  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
421  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
422  // Wait for parent thread to release us
423  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
424  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
425  ANNOTATE_BARRIER_END(this_thr);
426 #if USE_ITT_BUILD && USE_ITT_NOTIFY
427  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428  // In fork barrier where we could not get the object reliably (or
429  // ITTNOTIFY is disabled)
430  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
431  // Cancel wait on previous parallel region...
432  __kmp_itt_task_starting(itt_sync_obj);
433 
434  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435  return;
436 
437  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438  if (itt_sync_obj != NULL)
439  // Call prepare as early as possible for "new" barrier
440  __kmp_itt_task_finished(itt_sync_obj);
441  } else
442 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
443  // Early exit for reaping threads releasing forkjoin barrier
444  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445  return;
446 
447  // The worker thread may now assume that the team is valid.
448  team = __kmp_threads[gtid]->th.th_team;
449  KMP_DEBUG_ASSERT(team != NULL);
450  tid = __kmp_tid_from_gtid(gtid);
451 
452  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
453  KA_TRACE(20,
454  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
456  KMP_MB(); // Flush all pending memory write invalidates.
457  } else {
458  team = __kmp_threads[gtid]->th.th_team;
459  KMP_DEBUG_ASSERT(team != NULL);
460  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
461  "barrier type %d\n",
462  gtid, team->t.t_id, tid, bt));
463  }
464  nproc = this_thr->th.th_team_nproc;
465  child_tid = (tid << branch_bits) + 1;
466 
467  if (child_tid < nproc) {
468  kmp_info_t **other_threads = team->t.t_threads;
469  child = 1;
470  // Parent threads release all their children
471  do {
472  kmp_info_t *child_thr = other_threads[child_tid];
473  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474 #if KMP_CACHE_MANAGE
475  // Prefetch next thread's go count
476  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
477  KMP_CACHE_PREFETCH(
478  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
479 #endif /* KMP_CACHE_MANAGE */
480 
481 #if KMP_BARRIER_ICV_PUSH
482  {
483  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
484  if (propagate_icvs) {
485  __kmp_init_implicit_task(team->t.t_ident,
486  team->t.t_threads[child_tid], team,
487  child_tid, FALSE);
488  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489  &team->t.t_implicit_task_taskdata[0].td_icvs);
490  }
491  }
492 #endif // KMP_BARRIER_ICV_PUSH
493  KA_TRACE(20,
494  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495  "go(%p): %u => %u\n",
496  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
499  // Release child from barrier
500  ANNOTATE_BARRIER_BEGIN(child_thr);
501  kmp_flag_64 flag(&child_bar->b_go, child_thr);
502  flag.release();
503  child++;
504  child_tid++;
505  } while (child <= branch_factor && child_tid < nproc);
506  }
507  KA_TRACE(
508  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509  gtid, team->t.t_id, tid, bt));
510 }
511 
512 // Hyper Barrier
513 static void
514 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
515  int tid, void (*reduce)(void *, void *)
516  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518  kmp_team_t *team = this_thr->th.th_team;
519  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520  kmp_info_t **other_threads = team->t.t_threads;
521  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524  kmp_uint32 branch_factor = 1 << branch_bits;
525  kmp_uint32 offset;
526  kmp_uint32 level;
527 
528  KA_TRACE(
529  20,
530  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531  gtid, team->t.t_id, tid, bt));
532  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
533 
534 #if USE_ITT_BUILD && USE_ITT_NOTIFY
535  // Barrier imbalance - save arrive time to the thread
536  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538  __itt_get_timestamp();
539  }
540 #endif
541  /* Perform a hypercube-embedded tree gather to wait until all of the threads
542  have arrived, and reduce any required data as we go. */
543  kmp_flag_64 p_flag(&thr_bar->b_arrived);
544  for (level = 0, offset = 1; offset < num_threads;
545  level += branch_bits, offset <<= branch_bits) {
546  kmp_uint32 child;
547  kmp_uint32 child_tid;
548 
549  if (((tid >> level) & (branch_factor - 1)) != 0) {
550  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
551 
552  KMP_MB(); // Synchronize parent and child threads.
553  KA_TRACE(20,
554  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
555  "arrived(%p): %llu => %llu\n",
556  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
557  team->t.t_id, parent_tid, &thr_bar->b_arrived,
558  thr_bar->b_arrived,
559  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
560  // Mark arrival to parent thread
561  /* After performing this write (in the last iteration of the enclosing for
562  loop), a worker thread may not assume that the team is valid any more
563  - it could be deallocated by the master thread at any time. */
564  ANNOTATE_BARRIER_BEGIN(this_thr);
565  p_flag.set_waiter(other_threads[parent_tid]);
566  p_flag.release();
567  break;
568  }
569 
570  // Parent threads wait for children to arrive
571  if (new_state == KMP_BARRIER_UNUSED_STATE)
572  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
573  for (child = 1, child_tid = tid + (1 << level);
574  child < branch_factor && child_tid < num_threads;
575  child++, child_tid += (1 << level)) {
576  kmp_info_t *child_thr = other_threads[child_tid];
577  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
578 #if KMP_CACHE_MANAGE
579  kmp_uint32 next_child_tid = child_tid + (1 << level);
580  // Prefetch next thread's arrived count
581  if (child + 1 < branch_factor && next_child_tid < num_threads)
582  KMP_CACHE_PREFETCH(
583  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
584 #endif /* KMP_CACHE_MANAGE */
585  KA_TRACE(20,
586  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
587  "arrived(%p) == %llu\n",
588  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
589  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
590  // Wait for child to arrive
591  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
592  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
593  ANNOTATE_BARRIER_END(child_thr);
594  KMP_MB(); // Synchronize parent and child threads.
595 #if USE_ITT_BUILD && USE_ITT_NOTIFY
596  // Barrier imbalance - write min of the thread time and a child time to
597  // the thread.
598  if (__kmp_forkjoin_frames_mode == 2) {
599  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
600  child_thr->th.th_bar_min_time);
601  }
602 #endif
603  if (reduce) {
604  KA_TRACE(100,
605  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
606  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
607  team->t.t_id, child_tid));
608  ANNOTATE_REDUCE_AFTER(reduce);
609  OMPT_REDUCTION_DECL(this_thr, gtid);
610  OMPT_REDUCTION_BEGIN;
611  (*reduce)(this_thr->th.th_local.reduce_data,
612  child_thr->th.th_local.reduce_data);
613  OMPT_REDUCTION_END;
614  ANNOTATE_REDUCE_BEFORE(reduce);
615  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
616  }
617  }
618  }
619 
620  if (KMP_MASTER_TID(tid)) {
621  // Need to update the team arrived pointer if we are the master thread
622  if (new_state == KMP_BARRIER_UNUSED_STATE)
623  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
624  else
625  team->t.t_bar[bt].b_arrived = new_state;
626  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
627  "arrived(%p) = %llu\n",
628  gtid, team->t.t_id, tid, team->t.t_id,
629  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
630  }
631  KA_TRACE(
632  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
633  gtid, team->t.t_id, tid, bt));
634 }
635 
636 // The reverse versions seem to beat the forward versions overall
637 #define KMP_REVERSE_HYPER_BAR
638 static void __kmp_hyper_barrier_release(
639  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
640  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
641  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
642  kmp_team_t *team;
643  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
644  kmp_info_t **other_threads;
645  kmp_uint32 num_threads;
646  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
647  kmp_uint32 branch_factor = 1 << branch_bits;
648  kmp_uint32 child;
649  kmp_uint32 child_tid;
650  kmp_uint32 offset;
651  kmp_uint32 level;
652 
653  /* Perform a hypercube-embedded tree release for all of the threads that have
654  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
655  are released in the reverse order of the corresponding gather, otherwise
656  threads are released in the same order. */
657  if (KMP_MASTER_TID(tid)) { // master
658  team = __kmp_threads[gtid]->th.th_team;
659  KMP_DEBUG_ASSERT(team != NULL);
660  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
661  "barrier type %d\n",
662  gtid, team->t.t_id, tid, bt));
663 #if KMP_BARRIER_ICV_PUSH
664  if (propagate_icvs) { // master already has ICVs in final destination; copy
665  copy_icvs(&thr_bar->th_fixed_icvs,
666  &team->t.t_implicit_task_taskdata[tid].td_icvs);
667  }
668 #endif
669  } else { // Handle fork barrier workers who aren't part of a team yet
670  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
671  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
672  // Wait for parent thread to release us
673  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
674  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
675  ANNOTATE_BARRIER_END(this_thr);
676 #if USE_ITT_BUILD && USE_ITT_NOTIFY
677  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
678  // In fork barrier where we could not get the object reliably
679  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
680  // Cancel wait on previous parallel region...
681  __kmp_itt_task_starting(itt_sync_obj);
682 
683  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
684  return;
685 
686  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
687  if (itt_sync_obj != NULL)
688  // Call prepare as early as possible for "new" barrier
689  __kmp_itt_task_finished(itt_sync_obj);
690  } else
691 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
692  // Early exit for reaping threads releasing forkjoin barrier
693  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
694  return;
695 
696  // The worker thread may now assume that the team is valid.
697  team = __kmp_threads[gtid]->th.th_team;
698  KMP_DEBUG_ASSERT(team != NULL);
699  tid = __kmp_tid_from_gtid(gtid);
700 
701  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
702  KA_TRACE(20,
703  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
704  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
705  KMP_MB(); // Flush all pending memory write invalidates.
706  }
707  num_threads = this_thr->th.th_team_nproc;
708  other_threads = team->t.t_threads;
709 
710 #ifdef KMP_REVERSE_HYPER_BAR
711  // Count up to correct level for parent
712  for (level = 0, offset = 1;
713  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
714  level += branch_bits, offset <<= branch_bits)
715  ;
716 
717  // Now go down from there
718  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
719  level -= branch_bits, offset >>= branch_bits)
720 #else
721  // Go down the tree, level by level
722  for (level = 0, offset = 1; offset < num_threads;
723  level += branch_bits, offset <<= branch_bits)
724 #endif // KMP_REVERSE_HYPER_BAR
725  {
726 #ifdef KMP_REVERSE_HYPER_BAR
727  /* Now go in reverse order through the children, highest to lowest.
728  Initial setting of child is conservative here. */
729  child = num_threads >> ((level == 0) ? level : level - 1);
730  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
731  child_tid = tid + (child << level);
732  child >= 1; child--, child_tid -= (1 << level))
733 #else
734  if (((tid >> level) & (branch_factor - 1)) != 0)
735  // No need to go lower than this, since this is the level parent would be
736  // notified
737  break;
738  // Iterate through children on this level of the tree
739  for (child = 1, child_tid = tid + (1 << level);
740  child < branch_factor && child_tid < num_threads;
741  child++, child_tid += (1 << level))
742 #endif // KMP_REVERSE_HYPER_BAR
743  {
744  if (child_tid >= num_threads)
745  continue; // Child doesn't exist so keep going
746  else {
747  kmp_info_t *child_thr = other_threads[child_tid];
748  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
749 #if KMP_CACHE_MANAGE
750  kmp_uint32 next_child_tid = child_tid - (1 << level);
751 // Prefetch next thread's go count
752 #ifdef KMP_REVERSE_HYPER_BAR
753  if (child - 1 >= 1 && next_child_tid < num_threads)
754 #else
755  if (child + 1 < branch_factor && next_child_tid < num_threads)
756 #endif // KMP_REVERSE_HYPER_BAR
757  KMP_CACHE_PREFETCH(
758  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
759 #endif /* KMP_CACHE_MANAGE */
760 
761 #if KMP_BARRIER_ICV_PUSH
762  if (propagate_icvs) // push my fixed ICVs to my child
763  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
764 #endif // KMP_BARRIER_ICV_PUSH
765 
766  KA_TRACE(
767  20,
768  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
769  "go(%p): %u => %u\n",
770  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
771  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
772  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
773  // Release child from barrier
774  ANNOTATE_BARRIER_BEGIN(child_thr);
775  kmp_flag_64 flag(&child_bar->b_go, child_thr);
776  flag.release();
777  }
778  }
779  }
780 #if KMP_BARRIER_ICV_PUSH
781  if (propagate_icvs &&
782  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
783  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
784  FALSE);
785  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
786  &thr_bar->th_fixed_icvs);
787  }
788 #endif
789  KA_TRACE(
790  20,
791  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
792  gtid, team->t.t_id, tid, bt));
793 }
794 
795 // Hierarchical Barrier
796 
797 // Initialize thread barrier data
798 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
799  Performs the minimum amount of initialization required based on how the team
800  has changed. Returns true if leaf children will require both on-core and
801  traditional wake-up mechanisms. For example, if the team size increases,
802  threads already in the team will respond to on-core wakeup on their parent
803  thread, but threads newly added to the team will only be listening on the
804  their local b_go. */
805 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
806  kmp_bstate_t *thr_bar,
807  kmp_uint32 nproc, int gtid,
808  int tid, kmp_team_t *team) {
809  // Checks to determine if (re-)initialization is needed
810  bool uninitialized = thr_bar->team == NULL;
811  bool team_changed = team != thr_bar->team;
812  bool team_sz_changed = nproc != thr_bar->nproc;
813  bool tid_changed = tid != thr_bar->old_tid;
814  bool retval = false;
815 
816  if (uninitialized || team_sz_changed) {
817  __kmp_get_hierarchy(nproc, thr_bar);
818  }
819 
820  if (uninitialized || team_sz_changed || tid_changed) {
821  thr_bar->my_level = thr_bar->depth - 1; // default for master
822  thr_bar->parent_tid = -1; // default for master
823  if (!KMP_MASTER_TID(
824  tid)) { // if not master, find parent thread in hierarchy
825  kmp_uint32 d = 0;
826  while (d < thr_bar->depth) { // find parent based on level of thread in
827  // hierarchy, and note level
828  kmp_uint32 rem;
829  if (d == thr_bar->depth - 2) { // reached level right below the master
830  thr_bar->parent_tid = 0;
831  thr_bar->my_level = d;
832  break;
833  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
834  0) { // TODO: can we make this op faster?
835  // thread is not a subtree root at next level, so this is max
836  thr_bar->parent_tid = tid - rem;
837  thr_bar->my_level = d;
838  break;
839  }
840  ++d;
841  }
842  }
843  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
844  thr_bar->old_tid = tid;
845  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
846  thr_bar->team = team;
847  thr_bar->parent_bar =
848  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
849  }
850  if (uninitialized || team_changed || tid_changed) {
851  thr_bar->team = team;
852  thr_bar->parent_bar =
853  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854  retval = true;
855  }
856  if (uninitialized || team_sz_changed || tid_changed) {
857  thr_bar->nproc = nproc;
858  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
859  if (thr_bar->my_level == 0)
860  thr_bar->leaf_kids = 0;
861  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
862  thr_bar->leaf_kids = nproc - tid - 1;
863  thr_bar->leaf_state = 0;
864  for (int i = 0; i < thr_bar->leaf_kids; ++i)
865  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
866  }
867  return retval;
868 }
869 
870 static void __kmp_hierarchical_barrier_gather(
871  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
873  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874  kmp_team_t *team = this_thr->th.th_team;
875  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876  kmp_uint32 nproc = this_thr->th.th_team_nproc;
877  kmp_info_t **other_threads = team->t.t_threads;
878  kmp_uint64 new_state;
879 
880  int level = team->t.t_level;
881  if (other_threads[0]
882  ->th.th_teams_microtask) // are we inside the teams construct?
883  if (this_thr->th.th_teams_size.nteams > 1)
884  ++level; // level was not increased in teams construct for team_of_masters
885  if (level == 1)
886  thr_bar->use_oncore_barrier = 1;
887  else
888  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
889 
890  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
891  "barrier type %d\n",
892  gtid, team->t.t_id, tid, bt));
893  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
894 
895 #if USE_ITT_BUILD && USE_ITT_NOTIFY
896  // Barrier imbalance - save arrive time to the thread
897  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
899  }
900 #endif
901 
902  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903  team);
904 
905  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
906  kmp_int32 child_tid;
907  new_state =
908  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
909  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910  thr_bar->use_oncore_barrier) {
911  if (thr_bar->leaf_kids) {
912  // First, wait for leaf children to check-in on my b_arrived flag
913  kmp_uint64 leaf_state =
914  KMP_MASTER_TID(tid)
915  ? thr_bar->b_arrived | thr_bar->leaf_state
916  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
918  "for leaf kids\n",
919  gtid, team->t.t_id, tid));
920  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
921  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
922  if (reduce) {
923  ANNOTATE_REDUCE_AFTER(reduce);
924  OMPT_REDUCTION_DECL(this_thr, gtid);
925  OMPT_REDUCTION_BEGIN;
926  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
927  ++child_tid) {
928  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
929  "T#%d(%d:%d)\n",
930  gtid, team->t.t_id, tid,
931  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
932  child_tid));
933  ANNOTATE_BARRIER_END(other_threads[child_tid]);
934  (*reduce)(this_thr->th.th_local.reduce_data,
935  other_threads[child_tid]->th.th_local.reduce_data);
936  }
937  OMPT_REDUCTION_END;
938  ANNOTATE_REDUCE_BEFORE(reduce);
939  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940  }
941  // clear leaf_state bits
942  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943  }
944  // Next, wait for higher level children on each child's b_arrived flag
945  for (kmp_uint32 d = 1; d < thr_bar->my_level;
946  ++d) { // gather lowest level threads first, but skip 0
947  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948  skip = thr_bar->skip_per_level[d];
949  if (last > nproc)
950  last = nproc;
951  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952  kmp_info_t *child_thr = other_threads[child_tid];
953  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955  "T#%d(%d:%d) "
956  "arrived(%p) == %llu\n",
957  gtid, team->t.t_id, tid,
958  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959  child_tid, &child_bar->b_arrived, new_state));
960  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
961  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962  ANNOTATE_BARRIER_END(child_thr);
963  if (reduce) {
964  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
965  "T#%d(%d:%d)\n",
966  gtid, team->t.t_id, tid,
967  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
968  child_tid));
969  ANNOTATE_REDUCE_AFTER(reduce);
970  (*reduce)(this_thr->th.th_local.reduce_data,
971  child_thr->th.th_local.reduce_data);
972  ANNOTATE_REDUCE_BEFORE(reduce);
973  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
974  }
975  }
976  }
977  } else { // Blocktime is not infinite
978  for (kmp_uint32 d = 0; d < thr_bar->my_level;
979  ++d) { // Gather lowest level threads first
980  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981  skip = thr_bar->skip_per_level[d];
982  if (last > nproc)
983  last = nproc;
984  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985  kmp_info_t *child_thr = other_threads[child_tid];
986  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
988  "T#%d(%d:%d) "
989  "arrived(%p) == %llu\n",
990  gtid, team->t.t_id, tid,
991  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992  child_tid, &child_bar->b_arrived, new_state));
993  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
994  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995  ANNOTATE_BARRIER_END(child_thr);
996  if (reduce) {
997  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
998  "T#%d(%d:%d)\n",
999  gtid, team->t.t_id, tid,
1000  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1001  child_tid));
1002  ANNOTATE_REDUCE_AFTER(reduce);
1003  (*reduce)(this_thr->th.th_local.reduce_data,
1004  child_thr->th.th_local.reduce_data);
1005  ANNOTATE_REDUCE_BEFORE(reduce);
1006  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1007  }
1008  }
1009  }
1010  }
1011  }
1012  // All subordinates are gathered; now release parent if not master thread
1013 
1014  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1015  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017  gtid, team->t.t_id, tid,
1018  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1021  /* Mark arrival to parent: After performing this write, a worker thread may
1022  not assume that the team is valid any more - it could be deallocated by
1023  the master thread at any time. */
1024  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1026  // flag; release it
1027  ANNOTATE_BARRIER_BEGIN(this_thr);
1028  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1029  flag.release();
1030  } else {
1031  // Leaf does special release on "offset" bits of parent's b_arrived flag
1032  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1033  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1034  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1035  flag.release();
1036  }
1037  } else { // Master thread needs to update the team's b_arrived value
1038  team->t.t_bar[bt].b_arrived = new_state;
1039  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1040  "arrived(%p) = %llu\n",
1041  gtid, team->t.t_id, tid, team->t.t_id,
1042  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1043  }
1044  // Is the team access below unsafe or just technically invalid?
1045  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1046  "barrier type %d\n",
1047  gtid, team->t.t_id, tid, bt));
1048 }
1049 
1050 static void __kmp_hierarchical_barrier_release(
1051  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1052  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1053  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1054  kmp_team_t *team;
1055  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1056  kmp_uint32 nproc;
1057  bool team_change = false; // indicates on-core barrier shouldn't be used
1058 
1059  if (KMP_MASTER_TID(tid)) {
1060  team = __kmp_threads[gtid]->th.th_team;
1061  KMP_DEBUG_ASSERT(team != NULL);
1062  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1063  "entered barrier type %d\n",
1064  gtid, team->t.t_id, tid, bt));
1065  } else { // Worker threads
1066  // Wait for parent thread to release me
1067  if (!thr_bar->use_oncore_barrier ||
1068  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1069  thr_bar->team == NULL) {
1070  // Use traditional method of waiting on my own b_go flag
1071  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1072  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1073  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1074  ANNOTATE_BARRIER_END(this_thr);
1075  TCW_8(thr_bar->b_go,
1076  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1077  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1078  // infinite, not nested
1079  // Wait on my "offset" bits on parent's b_go flag
1080  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1081  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1082  thr_bar->offset, bt,
1083  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1084  flag.wait(this_thr, TRUE);
1085  if (thr_bar->wait_flag ==
1086  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1087  TCW_8(thr_bar->b_go,
1088  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1089  } else { // Reset my bits on parent's b_go flag
1090  (RCAST(volatile char *,
1091  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1092  }
1093  }
1094  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1095  // Early exit for reaping threads releasing forkjoin barrier
1096  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1097  return;
1098  // The worker thread may now assume that the team is valid.
1099  team = __kmp_threads[gtid]->th.th_team;
1100  KMP_DEBUG_ASSERT(team != NULL);
1101  tid = __kmp_tid_from_gtid(gtid);
1102 
1103  KA_TRACE(
1104  20,
1105  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1106  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1107  KMP_MB(); // Flush all pending memory write invalidates.
1108  }
1109 
1110  nproc = this_thr->th.th_team_nproc;
1111  int level = team->t.t_level;
1112  if (team->t.t_threads[0]
1113  ->th.th_teams_microtask) { // are we inside the teams construct?
1114  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1115  this_thr->th.th_teams_level == level)
1116  ++level; // level was not increased in teams construct for team_of_workers
1117  if (this_thr->th.th_teams_size.nteams > 1)
1118  ++level; // level was not increased in teams construct for team_of_masters
1119  }
1120  if (level == 1)
1121  thr_bar->use_oncore_barrier = 1;
1122  else
1123  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1124 
1125  // If the team size has increased, we still communicate with old leaves via
1126  // oncore barrier.
1127  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1128  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1129  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1130  tid, team);
1131  // But if the entire team changes, we won't use oncore barrier at all
1132  if (team_change)
1133  old_leaf_kids = 0;
1134 
1135 #if KMP_BARRIER_ICV_PUSH
1136  if (propagate_icvs) {
1137  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1138  FALSE);
1139  if (KMP_MASTER_TID(
1140  tid)) { // master already has copy in final destination; copy
1141  copy_icvs(&thr_bar->th_fixed_icvs,
1142  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1143  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1144  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1145  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1146  // leaves (on-core children) pull parent's fixed ICVs directly to local
1147  // ICV store
1148  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149  &thr_bar->parent_bar->th_fixed_icvs);
1150  // non-leaves will get ICVs piggybacked with b_go via NGO store
1151  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1152  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1153  // access
1154  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1155  else // leaves copy parent's fixed ICVs directly to local ICV store
1156  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1157  &thr_bar->parent_bar->th_fixed_icvs);
1158  }
1159  }
1160 #endif // KMP_BARRIER_ICV_PUSH
1161 
1162  // Now, release my children
1163  if (thr_bar->my_level) { // not a leaf
1164  kmp_int32 child_tid;
1165  kmp_uint32 last;
1166  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1167  thr_bar->use_oncore_barrier) {
1168  if (KMP_MASTER_TID(tid)) { // do a flat release
1169  // Set local b_go to bump children via NGO store of the cache line
1170  // containing IVCs and b_go.
1171  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1172  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1173  // the cache line
1174  ngo_load(&thr_bar->th_fixed_icvs);
1175  // This loops over all the threads skipping only the leaf nodes in the
1176  // hierarchy
1177  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1178  child_tid += thr_bar->skip_per_level[1]) {
1179  kmp_bstate_t *child_bar =
1180  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1181  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1182  "releasing T#%d(%d:%d)"
1183  " go(%p): %u => %u\n",
1184  gtid, team->t.t_id, tid,
1185  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1186  child_tid, &child_bar->b_go, child_bar->b_go,
1187  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1188  // Use ngo store (if available) to both store ICVs and release child
1189  // via child's b_go
1190  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1191  }
1192  ngo_sync();
1193  }
1194  TCW_8(thr_bar->b_go,
1195  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1196  // Now, release leaf children
1197  if (thr_bar->leaf_kids) { // if there are any
1198  // We test team_change on the off-chance that the level 1 team changed.
1199  if (team_change ||
1200  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1201  if (old_leaf_kids) { // release old leaf kids
1202  thr_bar->b_go |= old_leaf_state;
1203  }
1204  // Release new leaf kids
1205  last = tid + thr_bar->skip_per_level[1];
1206  if (last > nproc)
1207  last = nproc;
1208  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1209  ++child_tid) { // skip_per_level[0]=1
1210  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1211  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1212  KA_TRACE(
1213  20,
1214  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1215  " T#%d(%d:%d) go(%p): %u => %u\n",
1216  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1217  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1218  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1219  // Release child using child's b_go flag
1220  ANNOTATE_BARRIER_BEGIN(child_thr);
1221  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1222  flag.release();
1223  }
1224  } else { // Release all children at once with leaf_state bits on my own
1225  // b_go flag
1226  thr_bar->b_go |= thr_bar->leaf_state;
1227  }
1228  }
1229  } else { // Blocktime is not infinite; do a simple hierarchical release
1230  for (int d = thr_bar->my_level - 1; d >= 0;
1231  --d) { // Release highest level threads first
1232  last = tid + thr_bar->skip_per_level[d + 1];
1233  kmp_uint32 skip = thr_bar->skip_per_level[d];
1234  if (last > nproc)
1235  last = nproc;
1236  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1237  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1238  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1239  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1240  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1241  gtid, team->t.t_id, tid,
1242  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1243  child_tid, &child_bar->b_go, child_bar->b_go,
1244  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1245  // Release child using child's b_go flag
1246  ANNOTATE_BARRIER_BEGIN(child_thr);
1247  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1248  flag.release();
1249  }
1250  }
1251  }
1252 #if KMP_BARRIER_ICV_PUSH
1253  if (propagate_icvs && !KMP_MASTER_TID(tid))
1254  // non-leaves copy ICVs from fixed ICVs to local dest
1255  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1256  &thr_bar->th_fixed_icvs);
1257 #endif // KMP_BARRIER_ICV_PUSH
1258  }
1259  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1260  "barrier type %d\n",
1261  gtid, team->t.t_id, tid, bt));
1262 }
1263 
1264 // End of Barrier Algorithms
1265 
1266 // type traits for cancellable value
1267 // if cancellable is true, then is_cancellable is a normal boolean variable
1268 // if cancellable is false, then is_cancellable is a compile time constant
1269 template <bool cancellable> struct is_cancellable {};
1270 template <> struct is_cancellable<true> {
1271  bool value;
1272  is_cancellable() : value(false) {}
1273  is_cancellable(bool b) : value(b) {}
1274  is_cancellable &operator=(bool b) {
1275  value = b;
1276  return *this;
1277  }
1278  operator bool() const { return value; }
1279 };
1280 template <> struct is_cancellable<false> {
1281  is_cancellable &operator=(bool b) { return *this; }
1282  constexpr operator bool() const { return false; }
1283 };
1284 
1285 // Internal function to do a barrier.
1286 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1287  If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1288  barrier
1289  When cancellable = false,
1290  Returns 0 if master thread, 1 if worker thread.
1291  When cancellable = true
1292  Returns 0 if not cancelled, 1 if cancelled. */
1293 template <bool cancellable = false>
1294 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1295  size_t reduce_size, void *reduce_data,
1296  void (*reduce)(void *, void *)) {
1297  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299  int tid = __kmp_tid_from_gtid(gtid);
1300  kmp_info_t *this_thr = __kmp_threads[gtid];
1301  kmp_team_t *team = this_thr->th.th_team;
1302  int status = 0;
1303  is_cancellable<cancellable> cancelled;
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
1305  ompt_data_t *my_task_data;
1306  ompt_data_t *my_parallel_data;
1307  void *return_address;
1308  ompt_sync_region_t barrier_kind;
1309 #endif
1310 
1311  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1313 
1314  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1315 #if OMPT_SUPPORT
1316  if (ompt_enabled.enabled) {
1317 #if OMPT_OPTIONAL
1318  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322  if (ompt_enabled.ompt_callback_sync_region) {
1323  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325  return_address);
1326  }
1327  if (ompt_enabled.ompt_callback_sync_region_wait) {
1328  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1330  return_address);
1331  }
1332 #endif
1333  // It is OK to report the barrier state after the barrier begin callback.
1334  // According to the OMPT specification, a compliant implementation may
1335  // even delay reporting this state until the barrier begins to wait.
1336  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1337  }
1338 #endif
1339 
1340  if (!team->t.t_serialized) {
1341 #if USE_ITT_BUILD
1342  // This value will be used in itt notify events below.
1343  void *itt_sync_obj = NULL;
1344 #if USE_ITT_NOTIFY
1345  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347 #endif
1348 #endif /* USE_ITT_BUILD */
1349  if (__kmp_tasking_mode == tskm_extra_barrier) {
1350  __kmp_tasking_barrier(team, this_thr, gtid);
1351  KA_TRACE(15,
1352  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1354  }
1355 
1356  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1357  access it when the team struct is not guaranteed to exist. */
1358  // See note about the corresponding code in __kmp_join_barrier() being
1359  // performance-critical.
1360  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1361 #if KMP_USE_MONITOR
1362  this_thr->th.th_team_bt_intervals =
1363  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364  this_thr->th.th_team_bt_set =
1365  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1366 #else
1367  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1368 #endif
1369  }
1370 
1371 #if USE_ITT_BUILD
1372  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1374 #endif /* USE_ITT_BUILD */
1375 #if USE_DEBUGGER
1376  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1377  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1378  team->t.t_bar[bt].b_master_arrived += 1;
1379  } else {
1380  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381  } // if
1382 #endif /* USE_DEBUGGER */
1383  if (reduce != NULL) {
1384  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1385  this_thr->th.th_local.reduce_data = reduce_data;
1386  }
1387 
1388  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1389  // use 0 to set up only the current team if nthreads > 1
1390  __kmp_task_team_setup(this_thr, team, 0);
1391 
1392  if (cancellable) {
1393  cancelled = __kmp_linear_barrier_gather_cancellable(
1394  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395  } else {
1396  switch (__kmp_barrier_gather_pattern[bt]) {
1397  case bp_hyper_bar: {
1398  // don't set branch bits to 0; use linear
1399  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402  break;
1403  }
1404  case bp_hierarchical_bar: {
1405  __kmp_hierarchical_barrier_gather(
1406  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407  break;
1408  }
1409  case bp_tree_bar: {
1410  // don't set branch bits to 0; use linear
1411  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1414  break;
1415  }
1416  default: {
1417  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419  }
1420  }
1421  }
1422 
1423  KMP_MB();
1424 
1425  if (KMP_MASTER_TID(tid)) {
1426  status = 0;
1427  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1429  }
1430 #if USE_DEBUGGER
1431  // Let the debugger know: all threads have arrived and are starting to
1432  // leave the barrier.
1433  team->t.t_bar[bt].b_team_arrived += 1;
1434 #endif
1435 
1436  if (__kmp_omp_cancellation) {
1437  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1438  // Reset cancellation flag for worksharing constructs
1439  if (cancel_request == cancel_loop ||
1440  cancel_request == cancel_sections) {
1441  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1442  }
1443  }
1444 #if USE_ITT_BUILD
1445  /* TODO: In the case of a split reduction barrier, the master thread may
1446  send the acquired event early, before the final summation into the shared
1447  variable is done (the final summation can be a long operation for array
1448  reductions). */
1449  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1451 #endif /* USE_ITT_BUILD */
1452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1453  // Barrier - report frame end (only if active_level == 1)
1454  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455  __kmp_forkjoin_frames_mode &&
1456  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1457  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1458  team->t.t_active_level == 1) {
1459  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1460  kmp_uint64 cur_time = __itt_get_timestamp();
1461  kmp_info_t **other_threads = team->t.t_threads;
1462  int nproc = this_thr->th.th_team_nproc;
1463  int i;
1464  switch (__kmp_forkjoin_frames_mode) {
1465  case 1:
1466  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1467  loc, nproc);
1468  this_thr->th.th_frame_time = cur_time;
1469  break;
1470  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1471  // be fixed)
1472  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1473  1, loc, nproc);
1474  break;
1475  case 3:
1476  if (__itt_metadata_add_ptr) {
1477  // Initialize with master's wait time
1478  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1479  // Set arrive time to zero to be able to check it in
1480  // __kmp_invoke_task(); the same is done inside the loop below
1481  this_thr->th.th_bar_arrive_time = 0;
1482  for (i = 1; i < nproc; ++i) {
1483  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1484  other_threads[i]->th.th_bar_arrive_time = 0;
1485  }
1486  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1487  cur_time, delta,
1488  (kmp_uint64)(reduce != NULL));
1489  }
1490  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1491  loc, nproc);
1492  this_thr->th.th_frame_time = cur_time;
1493  break;
1494  }
1495  }
1496 #endif /* USE_ITT_BUILD */
1497  } else {
1498  status = 1;
1499 #if USE_ITT_BUILD
1500  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1501  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1502 #endif /* USE_ITT_BUILD */
1503  }
1504  if ((status == 1 || !is_split) && !cancelled) {
1505  if (cancellable) {
1506  cancelled = __kmp_linear_barrier_release_cancellable(
1507  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1508  } else {
1509  switch (__kmp_barrier_release_pattern[bt]) {
1510  case bp_hyper_bar: {
1511  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1512  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1513  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1514  break;
1515  }
1516  case bp_hierarchical_bar: {
1517  __kmp_hierarchical_barrier_release(
1518  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1519  break;
1520  }
1521  case bp_tree_bar: {
1522  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1523  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1524  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1525  break;
1526  }
1527  default: {
1528  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1529  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1530  }
1531  }
1532  }
1533  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1534  __kmp_task_team_sync(this_thr, team);
1535  }
1536  }
1537 
1538 #if USE_ITT_BUILD
1539  /* GEH: TODO: Move this under if-condition above and also include in
1540  __kmp_end_split_barrier(). This will more accurately represent the actual
1541  release time of the threads for split barriers. */
1542  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1543  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1544 #endif /* USE_ITT_BUILD */
1545  } else { // Team is serialized.
1546  status = 0;
1547  if (__kmp_tasking_mode != tskm_immediate_exec) {
1548  if (this_thr->th.th_task_team != NULL) {
1549 #if USE_ITT_NOTIFY
1550  void *itt_sync_obj = NULL;
1551  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1552  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1553  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1554  }
1555 #endif
1556 
1557  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1558  TRUE);
1559  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1560  __kmp_task_team_setup(this_thr, team, 0);
1561 
1562 #if USE_ITT_BUILD
1563  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1564  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1565 #endif /* USE_ITT_BUILD */
1566  }
1567  }
1568  }
1569  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1570  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1571  __kmp_tid_from_gtid(gtid), status));
1572 
1573 #if OMPT_SUPPORT
1574  if (ompt_enabled.enabled) {
1575 #if OMPT_OPTIONAL
1576  if (ompt_enabled.ompt_callback_sync_region_wait) {
1577  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1578  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1579  return_address);
1580  }
1581  if (ompt_enabled.ompt_callback_sync_region) {
1582  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1583  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1584  return_address);
1585  }
1586 #endif
1587  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1588  }
1589 #endif
1590  ANNOTATE_BARRIER_END(&team->t.t_bar);
1591 
1592  if (cancellable)
1593  return (int)cancelled;
1594  return status;
1595 }
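// Editor's note: the following is a minimal, self-contained sketch of the
// gather/release shape that __kmp_barrier_template implements above, stripped
// of tasking, cancellation, pattern selection, and the OMPT/ITT hooks. It is
// an illustration only, not the runtime's data layout, and all names below
// are hypothetical.
#include <atomic>
struct example_split_barrier {
  std::atomic<int> arrived{0};
  std::atomic<int> go{0};
  int nthreads;
  explicit example_split_barrier(int n) : nthreads(n) {}
  // Gather phase: here the last thread to arrive takes the releasing role and
  // returns 0 (the runtime instead always gives that role to the master
  // thread); every other thread returns 1, mirroring the status convention.
  int gather() {
    if (arrived.fetch_add(1, std::memory_order_acq_rel) + 1 == nthreads)
      return 0;
    return 1;
  }
  // Release phase of a split barrier: the thread that got status 0 performs
  // this step later (cf. __kmp_end_split_barrier); threads with status 1
  // block here until they are released. Single-use for simplicity.
  void release(int status) {
    if (status == 0) {
      go.store(1, std::memory_order_release);
    } else {
      while (go.load(std::memory_order_acquire) == 0) {
      } // spin until released
    }
  }
};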
1596 
1597 // Returns 0 if master thread, 1 if worker thread.
1598 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1599  size_t reduce_size, void *reduce_data,
1600  void (*reduce)(void *, void *)) {
1601  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1602  reduce);
1603 }
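// Editor's note: a hedged sketch of how a compiler-visible entry point is
// expected to drive the wrapper above for a plain, non-split barrier with no
// reduction; the real entry point lives outside this file, so the function
// name below is illustrative only.
static void example_plain_barrier_entry(int gtid) {
  // No reduction callback and no split: every thread waits until the whole
  // team has arrived, and the 0/1 return value is not needed.
  (void)__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
}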
1604 
1605 #if defined(KMP_GOMP_COMPAT)
1606 // Returns 1 if cancelled, 0 otherwise
1607 int __kmp_barrier_gomp_cancel(int gtid) {
1608  if (__kmp_omp_cancellation) {
1609  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1610  0, NULL, NULL);
1611  if (cancelled) {
1612  int tid = __kmp_tid_from_gtid(gtid);
1613  kmp_info_t *this_thr = __kmp_threads[gtid];
1614  if (KMP_MASTER_TID(tid)) {
1615  // Master does not need to revert anything
1616  } else {
1617  // Workers need to revert their private b_arrived flag
1618  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1619  KMP_BARRIER_STATE_BUMP;
1620  }
1621  }
1622  return cancelled;
1623  }
1624  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1625  return FALSE;
1626 }
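// Editor's note: a hedged, caller-side sketch of the intended use of
// __kmp_barrier_gomp_cancel above. A GOMP-compatibility cancellation point
// would invoke it at a barrier and abandon the rest of the region when the
// team has been cancelled; the function name and the elided work are
// illustrative.
static void example_cancellable_region_body(int gtid) {
  // ... first portion of the parallel region's work ...
  if (__kmp_barrier_gomp_cancel(gtid)) {
    // The barrier observed a cancellation request; workers have already
    // reverted their b_arrived counters above, so it is safe to leave early.
    return;
  }
  // ... work that must only run if the region was not cancelled ...
}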
1627 #endif
1628 
1629 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1630  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1631  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1632  int tid = __kmp_tid_from_gtid(gtid);
1633  kmp_info_t *this_thr = __kmp_threads[gtid];
1634  kmp_team_t *team = this_thr->th.th_team;
1635 
1636  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1637  if (!team->t.t_serialized) {
1638  if (KMP_MASTER_GTID(gtid)) {
1639  switch (__kmp_barrier_release_pattern[bt]) {
1640  case bp_hyper_bar: {
1641  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1642  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1643  FALSE USE_ITT_BUILD_ARG(NULL));
1644  break;
1645  }
1646  case bp_hierarchical_bar: {
1647  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1648  FALSE USE_ITT_BUILD_ARG(NULL));
1649  break;
1650  }
1651  case bp_tree_bar: {
1652  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1653  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1654  FALSE USE_ITT_BUILD_ARG(NULL));
1655  break;
1656  }
1657  default: {
1658  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1659  FALSE USE_ITT_BUILD_ARG(NULL));
1660  }
1661  }
1662  if (__kmp_tasking_mode != tskm_immediate_exec) {
1663  __kmp_task_team_sync(this_thr, team);
1664  } // if
1665  }
1666  }
1667  ANNOTATE_BARRIER_END(&team->t.t_bar);
1668 }
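// Editor's note: a minimal sketch (not runtime code) of the split-reduction
// pattern that __kmp_barrier and __kmp_end_split_barrier are designed to
// support: gather with a reduction callback, let the master finish combining
// partial results while the workers stay parked in the release phase, then
// free the team. The callback and data pointer are caller-supplied
// assumptions, and a dedicated reduction barrier type may be used instead of
// the plain one where the runtime provides it.
static void example_split_reduction(int gtid, void *my_partial_result,
                                    void (*combine)(void *lhs, void *rhs)) {
  // Every thread contributes its partial result; the gather phase invokes
  // the callback, and workers then block until the master releases them.
  int status = __kmp_barrier(bs_plain_barrier, gtid, TRUE /* is_split */,
                             /* reduce_size = */ 0, my_partial_result, combine);
  if (status == 0) {
    // Master thread: finish any remaining bookkeeping on the combined
    // result, then release the workers waiting in the barrier.
    __kmp_end_split_barrier(bs_plain_barrier, gtid);
  }
}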
1669 
1670 void __kmp_join_barrier(int gtid) {
1671  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1672  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1673  kmp_info_t *this_thr = __kmp_threads[gtid];
1674  kmp_team_t *team;
1675  kmp_uint nproc;
1676  kmp_info_t *master_thread;
1677  int tid;
1678 #ifdef KMP_DEBUG
1679  int team_id;
1680 #endif /* KMP_DEBUG */
1681 #if USE_ITT_BUILD
1682  void *itt_sync_obj = NULL;
1683 #if USE_ITT_NOTIFY
1684  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless needed
1685  // Get object created at fork_barrier
1686  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1687 #endif
1688 #endif /* USE_ITT_BUILD */
1689  KMP_MB();
1690 
1691  // Get current info
1692  team = this_thr->th.th_team;
1693  nproc = this_thr->th.th_team_nproc;
1694  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1695  tid = __kmp_tid_from_gtid(gtid);
1696 #ifdef KMP_DEBUG
1697  team_id = team->t.t_id;
1698 #endif /* KMP_DEBUG */
1699  master_thread = this_thr->th.th_team_master;
1700 #ifdef KMP_DEBUG
1701  if (master_thread != team->t.t_threads[0]) {
1702  __kmp_print_structure();
1703  }
1704 #endif /* KMP_DEBUG */
1705  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1706  KMP_MB();
1707 
1708  // Verify state
1709  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1710  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1711  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1712  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1713  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1714  gtid, team_id, tid));
1715 
1716  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1717 #if OMPT_SUPPORT
1718  if (ompt_enabled.enabled) {
1719 #if OMPT_OPTIONAL
1720  ompt_data_t *my_task_data;
1721  ompt_data_t *my_parallel_data;
1722  void *codeptr = NULL;
1723  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1724  if (KMP_MASTER_TID(ds_tid) &&
1725  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1726  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1727  codeptr = team->t.ompt_team_info.master_return_address;
1728  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1729  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1730  if (ompt_enabled.ompt_callback_sync_region) {
1731  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1732  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1733  my_task_data, codeptr);
1734  }
1735  if (ompt_enabled.ompt_callback_sync_region_wait) {
1736  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1737  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1738  my_task_data, codeptr);
1739  }
1740  if (!KMP_MASTER_TID(ds_tid))
1741  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1742 #endif
1743  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1744  }
1745 #endif
1746 
1747  if (__kmp_tasking_mode == tskm_extra_barrier) {
1748  __kmp_tasking_barrier(team, this_thr, gtid);
1749  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1750  team_id, tid));
1751  }
1752 #ifdef KMP_DEBUG
1753  if (__kmp_tasking_mode != tskm_immediate_exec) {
1754  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1755  "%p, th_task_team = %p\n",
1756  __kmp_gtid_from_thread(this_thr), team_id,
1757  team->t.t_task_team[this_thr->th.th_task_state],
1758  this_thr->th.th_task_team));
1759  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1760  team->t.t_task_team[this_thr->th.th_task_state]);
1761  }
1762 #endif /* KMP_DEBUG */
1763 
1764  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1765  access it when the team struct is not guaranteed to exist. Doing these
1766  loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
1767  we do not perform the copy if blocktime=infinite, since the values are not
1768  used by __kmp_wait_template() in that case. */
1769  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1770 #if KMP_USE_MONITOR
1771  this_thr->th.th_team_bt_intervals =
1772  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1773  this_thr->th.th_team_bt_set =
1774  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1775 #else
1776  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1777 #endif
1778  }
1779 
1780 #if USE_ITT_BUILD
1781  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1782  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1783 #endif /* USE_ITT_BUILD */
1784 
1785  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1786  case bp_hyper_bar: {
1787  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1788  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1789  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1790  break;
1791  }
1792  case bp_hierarchical_bar: {
1793  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1794  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1795  break;
1796  }
1797  case bp_tree_bar: {
1798  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1799  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1800  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1801  break;
1802  }
1803  default: {
1804  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1805  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1806  }
1807  }
1808 
1809  /* From this point on, the team data structure may be deallocated at any time
1810  by the master thread - it is unsafe to reference it in any of the worker
1811  threads. Any per-team data items that need to be referenced before the
1812  end of the barrier should be moved to the kmp_task_team_t structs. */
1813  if (KMP_MASTER_TID(tid)) {
1814  if (__kmp_tasking_mode != tskm_immediate_exec) {
1815  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1816  }
1817  if (__kmp_display_affinity) {
1818  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1819  }
1820 #if KMP_STATS_ENABLED
1821  // Have the master thread flag the workers to indicate they are now waiting
1822  // for the next parallel region. Also wake them up so they switch their
1823  // timers to idle.
1824  for (int i = 0; i < team->t.t_nproc; ++i) {
1825  kmp_info_t *team_thread = team->t.t_threads[i];
1826  if (team_thread == this_thr)
1827  continue;
1828  team_thread->th.th_stats->setIdleFlag();
1829  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1830  team_thread->th.th_sleep_loc != NULL)
1831  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1832  team_thread->th.th_sleep_loc);
1833  }
1834 #endif
1835 #if USE_ITT_BUILD
1836  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1837  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1838 #endif /* USE_ITT_BUILD */
1839 
1840 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1841  // Join barrier - report frame end
1842  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1843  __kmp_forkjoin_frames_mode &&
1844  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1845  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1846  team->t.t_active_level == 1) {
1847  kmp_uint64 cur_time = __itt_get_timestamp();
1848  ident_t *loc = team->t.t_ident;
1849  kmp_info_t **other_threads = team->t.t_threads;
1850  int nproc = this_thr->th.th_team_nproc;
1851  int i;
1852  switch (__kmp_forkjoin_frames_mode) {
1853  case 1:
1854  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1855  loc, nproc);
1856  break;
1857  case 2:
1858  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1859  loc, nproc);
1860  break;
1861  case 3:
1862  if (__itt_metadata_add_ptr) {
1863  // Initialize with master's wait time
1864  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1865  // Set arrive time to zero to be able to check it in
1866  // __kmp_invoke_task(); the same is done inside the loop below
1867  this_thr->th.th_bar_arrive_time = 0;
1868  for (i = 1; i < nproc; ++i) {
1869  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1870  other_threads[i]->th.th_bar_arrive_time = 0;
1871  }
1872  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1873  cur_time, delta, 0);
1874  }
1875  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1876  loc, nproc);
1877  this_thr->th.th_frame_time = cur_time;
1878  break;
1879  }
1880  }
1881 #endif /* USE_ITT_BUILD */
1882  }
1883 #if USE_ITT_BUILD
1884  else {
1885  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1886  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1887  }
1888 #endif /* USE_ITT_BUILD */
1889 
1890 #if KMP_DEBUG
1891  if (KMP_MASTER_TID(tid)) {
1892  KA_TRACE(
1893  15,
1894  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1895  gtid, team_id, tid, nproc));
1896  }
1897 #endif /* KMP_DEBUG */
1898 
1899  // TODO: mark worker threads as done so they may be disbanded
1900  KMP_MB(); // Flush all pending memory write invalidates.
1901  KA_TRACE(10,
1902  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1903 
1904  ANNOTATE_BARRIER_END(&team->t.t_bar);
1905 }
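// Editor's note: the frame-mode-3 bookkeeping in the barriers above reduces
// to one imbalance metric: the sum, over the team, of the time each thread
// spent between its own arrival and the timestamp taken by the master. A
// hedged, standalone sketch of that computation (the helper name is
// illustrative; the fields are the ones used above):
static inline kmp_uint64 example_barrier_imbalance(kmp_uint64 cur_time,
                                                   kmp_info_t **threads,
                                                   int nproc) {
  kmp_uint64 delta = 0;
  for (int i = 0; i < nproc; ++i) {
    delta += cur_time - threads[i]->th.th_bar_arrive_time;
    // Reset the arrival time so stale values can be detected later, as the
    // loops above do before submitting the metadata.
    threads[i]->th.th_bar_arrive_time = 0;
  }
  return delta;
}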
1906 
1907 // TODO release worker threads' fork barriers as we are ready instead of all at
1908 // once
1909 void __kmp_fork_barrier(int gtid, int tid) {
1910  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1911  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1912  kmp_info_t *this_thr = __kmp_threads[gtid];
1913  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1914 #if USE_ITT_BUILD
1915  void *itt_sync_obj = NULL;
1916 #endif /* USE_ITT_BUILD */
1917  if (team)
1918  ANNOTATE_BARRIER_END(&team->t.t_bar);
1919 
1920  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1921  (team != NULL) ? team->t.t_id : -1, tid));
1922 
1923  // th_team pointer only valid for master thread here
1924  if (KMP_MASTER_TID(tid)) {
1925 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1926  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1927  // Create itt barrier object
1928  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1929  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1930  }
1931 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1932 
1933 #ifdef KMP_DEBUG
1934  kmp_info_t **other_threads = team->t.t_threads;
1935  int i;
1936 
1937  // Verify state
1938  KMP_MB();
1939 
1940  for (i = 1; i < team->t.t_nproc; ++i) {
1941  KA_TRACE(500,
1942  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1943  "== %u.\n",
1944  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1945  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1946  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1947  KMP_DEBUG_ASSERT(
1948  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1949  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1950  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1951  }
1952 #endif
1953 
1954  if (__kmp_tasking_mode != tskm_immediate_exec) {
1955  // 0 indicates: set up the current task team if nthreads > 1
1956  __kmp_task_team_setup(this_thr, team, 0);
1957  }
1958 
1959  /* The master thread may have changed its blocktime between the join barrier
1960  and the fork barrier. Copy the blocktime info to the thread, where
1961  __kmp_wait_template() can access it when the team struct is not
1962  guaranteed to exist. */
1963  // See note about the corresponding code in __kmp_join_barrier() being
1964  // performance-critical
1965  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1966 #if KMP_USE_MONITOR
1967  this_thr->th.th_team_bt_intervals =
1968  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1969  this_thr->th.th_team_bt_set =
1970  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1971 #else
1972  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1973 #endif
1974  }
1975  } // master
1976 
1977  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1978  case bp_hyper_bar: {
1979  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1980  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1981  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1982  break;
1983  }
1984  case bp_hierarchical_bar: {
1985  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1986  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1987  break;
1988  }
1989  case bp_tree_bar: {
1990  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1991  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1992  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1993  break;
1994  }
1995  default: {
1996  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1997  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1998  }
1999  }
2000 
2001 #if OMPT_SUPPORT
2002  if (ompt_enabled.enabled &&
2003  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2004  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2005  ompt_data_t *task_data = (team)
2006  ? OMPT_CUR_TASK_DATA(this_thr)
2007  : &(this_thr->th.ompt_thread_info.task_data);
2008  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2009 #if OMPT_OPTIONAL
2010  void *codeptr = NULL;
2011  if (KMP_MASTER_TID(ds_tid) &&
2012  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2013  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2014  codeptr = team->t.ompt_team_info.master_return_address;
2015  if (ompt_enabled.ompt_callback_sync_region_wait) {
2016  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2017  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2018  codeptr);
2019  }
2020  if (ompt_enabled.ompt_callback_sync_region) {
2021  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2022  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2023  codeptr);
2024  }
2025 #endif
2026  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2027  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2028  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2029  }
2030  }
2031 #endif
2032 
2033  // Early exit for threads being reaped while releasing the forkjoin barrier
2034  if (TCR_4(__kmp_global.g.g_done)) {
2035  this_thr->th.th_task_team = NULL;
2036 
2037 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2038  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2039  if (!KMP_MASTER_TID(tid)) {
2040  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2041  if (itt_sync_obj)
2042  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2043  }
2044  }
2045 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2046  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2047  return;
2048  }
2049 
2050  /* We can now assume that a valid team structure has been allocated by the
2051  master and propagated to all worker threads. The current thread, however,
2052  may not be part of the team, so we can't blindly assume that the team
2053  pointer is non-null. */
2054  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2055  KMP_DEBUG_ASSERT(team != NULL);
2056  tid = __kmp_tid_from_gtid(gtid);
2057 
2058 #if KMP_BARRIER_ICV_PULL
2059  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2060  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2061  implicit task has this data before this function is called. We cannot
2062  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2063  struct, because it is not always the case that the threads arrays have
2064  been allocated when __kmp_fork_call() is executed. */
2065  {
2066  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2067  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2068  // Copy the initial ICVs from the master's thread struct to the implicit
2069  // task for this tid.
2070  KA_TRACE(10,
2071  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2072  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2073  tid, FALSE);
2074  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2075  &team->t.t_threads[0]
2076  ->th.th_bar[bs_forkjoin_barrier]
2077  .bb.th_fixed_icvs);
2078  }
2079  }
2080 #endif // KMP_BARRIER_ICV_PULL
2081 
2082  if (__kmp_tasking_mode != tskm_immediate_exec) {
2083  __kmp_task_team_sync(this_thr, team);
2084  }
2085 
2086 #if KMP_AFFINITY_SUPPORTED
2087  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2088  if (proc_bind == proc_bind_intel) {
2089  // Call dynamic affinity settings
2090  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2091  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2092  }
2093  } else if (proc_bind != proc_bind_false) {
2094  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2095  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2096  __kmp_gtid_from_thread(this_thr),
2097  this_thr->th.th_current_place));
2098  } else {
2099  __kmp_affinity_set_place(gtid);
2100  }
2101  }
2102 #endif // KMP_AFFINITY_SUPPORTED
2103  // Perform the display affinity functionality
2104  if (__kmp_display_affinity) {
2105  if (team->t.t_display_affinity
2106 #if KMP_AFFINITY_SUPPORTED
2107  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2108 #endif
2109  ) {
2110  // NULL means use the affinity-format-var ICV
2111  __kmp_aux_display_affinity(gtid, NULL);
2112  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2113  this_thr->th.th_prev_level = team->t.t_level;
2114  }
2115  }
2116  if (!KMP_MASTER_TID(tid))
2117  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2118 
2119 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2120  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2121  if (!KMP_MASTER_TID(tid)) {
2122  // Get correct barrier object
2123  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2124  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2125  } // (prepare called inside barrier_release)
2126  }
2127 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2128  ANNOTATE_BARRIER_END(&team->t.t_bar);
2129  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2130  team->t.t_id, tid));
2131 }
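// Editor's note: a hedged sketch of where the join and fork barriers sit in a
// worker thread's life cycle. The runtime's real loop lives elsewhere and
// handles many more cases; the callback parameter below is a stand-in for
// invoking the thread's implicit task.
static void example_worker_lifecycle(int gtid, int tid,
                                     void (*run_implicit_task)(int gtid)) {
  for (;;) {
    // Park at the fork barrier until the master releases the team into a new
    // parallel region (or the library is shutting down).
    __kmp_fork_barrier(gtid, tid);
    if (TCR_4(__kmp_global.g.g_done))
      break; // mirrors the early-exit check in __kmp_fork_barrier above
    run_implicit_task(gtid); // execute this thread's share of the region
    // Signal completion and wait for the rest of the team.
    __kmp_join_barrier(gtid);
  }
}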
2132 
2133 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2134  kmp_internal_control_t *new_icvs, ident_t *loc) {
2135  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2136 
2137  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2138  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2139 
2140 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2141  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2142  implicit task has this data before this function is called. */
2143 #if KMP_BARRIER_ICV_PULL
2144  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
2145  remains untouched), where all of the worker threads can access them and
2146  make their own copies after the barrier. */
2147  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2148  // allocated at this point
2149  copy_icvs(
2150  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2151  new_icvs);
2152  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2153  team->t.t_threads[0], team));
2154 #elif KMP_BARRIER_ICV_PUSH
2155  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2156  // done here.
2157  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2158  team->t.t_threads[0], team));
2159 #else
2160  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2161  // time.
2162  ngo_load(new_icvs);
2163  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2164  // allocated at this point
2165  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2166  // TODO: GEH - pass in better source location info since usually NULL here
2167  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2168  f, team->t.t_threads[f], team));
2169  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2170  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2171  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2172  f, team->t.t_threads[f], team));
2173  }
2174  ngo_sync();
2175 #endif // KMP_BARRIER_ICV_PULL
2176 }
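// Editor's note: a hedged sketch contrasting the propagation schemes
// discussed above. Under the pull scheme each worker copies the master's
// fixed ICVs into its own implicit task after the fork barrier; under the
// linear scheme the master writes every worker's copy up front, as the loop
// above does. The helper names are illustrative.
#if KMP_BARRIER_ICV_PULL
static inline void example_icv_pull(kmp_team_t *team, int tid) {
  if (tid != 0) // the master's implicit task already holds the ICVs
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &team->t.t_threads[0]
                   ->th.th_bar[bs_forkjoin_barrier]
                   .bb.th_fixed_icvs);
}
#else
static inline void example_icv_copy_linear(kmp_team_t *team, int nproc,
                                           kmp_internal_control_t *new_icvs) {
  for (int f = 1; f < nproc; ++f) // the master keeps its own copy
    copy_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
}
#endif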