LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_wait_release.h"
16 #include "kmp_itt.h"
17 #include "kmp_os.h"
18 #include "kmp_stats.h"
19 #if OMPT_SUPPORT
20 #include "ompt-specific.h"
21 #endif
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #include "tsan_annotations.h"
29 
30 #if KMP_MIC && USE_NGO_STORES
31 // ICV copying
32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
36 #else
37 #define ngo_load(src) ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync() ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
42 
43 void __kmp_print_structure(void); // Forward declaration
44 
45 // ---------------------------- Barrier Algorithms ----------------------------
46 
47 // Linear Barrier
48 static void __kmp_linear_barrier_gather(
49  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52  kmp_team_t *team = this_thr->th.th_team;
53  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54  kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(
57  20,
58  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59  gtid, team->t.t_id, tid, bt));
60  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63  // Barrier imbalance - save arrive time to the thread
64  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66  __itt_get_timestamp();
67  }
68 #endif
69  // We now perform a linear reduction to signal that all of the threads have
70  // arrived.
71  if (!KMP_MASTER_TID(tid)) {
72  KA_TRACE(20,
73  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
74  "arrived(%p): %llu => %llu\n",
75  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78  // Mark arrival to master thread
79  /* After performing this write, a worker thread may not assume that the team
80  is valid any more - it could be deallocated by the master thread at any
81  time. */
82  ANNOTATE_BARRIER_BEGIN(this_thr);
83  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84  flag.release();
85  } else {
86  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87  int nproc = this_thr->th.th_team_nproc;
88  int i;
89  // Don't have to worry about sleep bit here or atomic since team setting
90  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 
92  // Collect all the worker team member threads.
93  for (i = 1; i < nproc; ++i) {
94 #if KMP_CACHE_MANAGE
95  // Prefetch next thread's arrived count
96  if (i + 1 < nproc)
97  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
98 #endif /* KMP_CACHE_MANAGE */
99  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100  "arrived(%p) == %llu\n",
101  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
102  team->t.t_id, i,
103  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
104 
105  // Wait for worker thread to arrive
106  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
107  new_state);
108  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109  ANNOTATE_BARRIER_END(other_threads[i]);
110 #if USE_ITT_BUILD && USE_ITT_NOTIFY
111  // Barrier imbalance - write min of the thread time and the other thread
112  // time to the thread.
113  if (__kmp_forkjoin_frames_mode == 2) {
114  this_thr->th.th_bar_min_time = KMP_MIN(
115  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
116  }
117 #endif
118  if (reduce) {
119  KA_TRACE(100,
120  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
121  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
122  team->t.t_id, i));
123  ANNOTATE_REDUCE_AFTER(reduce);
124  (*reduce)(this_thr->th.th_local.reduce_data,
125  other_threads[i]->th.th_local.reduce_data);
126  ANNOTATE_REDUCE_BEFORE(reduce);
127  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
128  }
129  }
130  // Don't have to worry about sleep bit here or atomic since team setting
131  team_bar->b_arrived = new_state;
132  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
133  "arrived(%p) = %llu\n",
134  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
135  new_state));
136  }
137  KA_TRACE(
138  20,
139  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
140  gtid, team->t.t_id, tid, bt));
141 }
142 
143 static void __kmp_linear_barrier_release(
144  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
145  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
146  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
147  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
148  kmp_team_t *team;
149 
150  if (KMP_MASTER_TID(tid)) {
151  unsigned int i;
152  kmp_uint32 nproc = this_thr->th.th_team_nproc;
153  kmp_info_t **other_threads;
154 
155  team = __kmp_threads[gtid]->th.th_team;
156  KMP_DEBUG_ASSERT(team != NULL);
157  other_threads = team->t.t_threads;
158 
159  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
160  "barrier type %d\n",
161  gtid, team->t.t_id, tid, bt));
162 
163  if (nproc > 1) {
164 #if KMP_BARRIER_ICV_PUSH
165  {
166  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
167  if (propagate_icvs) {
168  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
169  for (i = 1; i < nproc; ++i) {
170  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
171  team, i, FALSE);
172  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
173  &team->t.t_implicit_task_taskdata[0].td_icvs);
174  }
175  ngo_sync();
176  }
177  }
178 #endif // KMP_BARRIER_ICV_PUSH
179 
180  // Now, release all of the worker threads
181  for (i = 1; i < nproc; ++i) {
182 #if KMP_CACHE_MANAGE
183  // Prefetch next thread's go flag
184  if (i + 1 < nproc)
185  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
186 #endif /* KMP_CACHE_MANAGE */
187  KA_TRACE(
188  20,
189  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
190  "go(%p): %u => %u\n",
191  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
192  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
193  other_threads[i]->th.th_bar[bt].bb.b_go,
194  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
195  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
196  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
197  other_threads[i]);
198  flag.release();
199  }
200  }
201  } else { // Wait for the MASTER thread to release us
202  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
203  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
204  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
205  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
206  ANNOTATE_BARRIER_END(this_thr);
207 #if USE_ITT_BUILD && USE_ITT_NOTIFY
208  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
209  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
210  // disabled)
211  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
212  // Cancel wait on previous parallel region...
213  __kmp_itt_task_starting(itt_sync_obj);
214 
215  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
216  return;
217 
218  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
219  if (itt_sync_obj != NULL)
220  // Call prepare as early as possible for "new" barrier
221  __kmp_itt_task_finished(itt_sync_obj);
222  } else
223 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
224  // Early exit for reaping threads releasing forkjoin barrier
225  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
226  return;
227 // The worker thread may now assume that the team is valid.
228 #ifdef KMP_DEBUG
229  tid = __kmp_tid_from_gtid(gtid);
230  team = __kmp_threads[gtid]->th.th_team;
231 #endif
232  KMP_DEBUG_ASSERT(team != NULL);
233  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
234  KA_TRACE(20,
235  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
236  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
237  KMP_MB(); // Flush all pending memory write invalidates.
238  }
239  KA_TRACE(
240  20,
241  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
242  gtid, team->t.t_id, tid, bt));
243 }
244 
245 // Tree barrier
246 static void
247 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
248  int tid, void (*reduce)(void *, void *)
249  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
250  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
251  kmp_team_t *team = this_thr->th.th_team;
252  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
253  kmp_info_t **other_threads = team->t.t_threads;
254  kmp_uint32 nproc = this_thr->th.th_team_nproc;
255  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
256  kmp_uint32 branch_factor = 1 << branch_bits;
257  kmp_uint32 child;
258  kmp_uint32 child_tid;
259  kmp_uint64 new_state;
260 
261  KA_TRACE(
262  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
263  gtid, team->t.t_id, tid, bt));
264  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
265 
266 #if USE_ITT_BUILD && USE_ITT_NOTIFY
267  // Barrier imbalance - save arrive time to the thread
268  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
269  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
270  __itt_get_timestamp();
271  }
272 #endif
273  // Perform tree gather to wait until all threads have arrived; reduce any
274  // required data as we go
275  child_tid = (tid << branch_bits) + 1;
276  if (child_tid < nproc) {
277  // Parent threads wait for all their children to arrive
278  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
279  child = 1;
280  do {
281  kmp_info_t *child_thr = other_threads[child_tid];
282  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
283 #if KMP_CACHE_MANAGE
284  // Prefetch next thread's arrived count
285  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
286  KMP_CACHE_PREFETCH(
287  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
288 #endif /* KMP_CACHE_MANAGE */
289  KA_TRACE(20,
290  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
291  "arrived(%p) == %llu\n",
292  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
293  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
294  // Wait for child to arrive
295  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
296  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
297  ANNOTATE_BARRIER_END(child_thr);
298 #if USE_ITT_BUILD && USE_ITT_NOTIFY
299  // Barrier imbalance - write min of the thread time and a child time to
300  // the thread.
301  if (__kmp_forkjoin_frames_mode == 2) {
302  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
303  child_thr->th.th_bar_min_time);
304  }
305 #endif
306  if (reduce) {
307  KA_TRACE(100,
308  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
309  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
310  team->t.t_id, child_tid));
311  ANNOTATE_REDUCE_AFTER(reduce);
312  (*reduce)(this_thr->th.th_local.reduce_data,
313  child_thr->th.th_local.reduce_data);
314  ANNOTATE_REDUCE_BEFORE(reduce);
315  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
316  }
317  child++;
318  child_tid++;
319  } while (child <= branch_factor && child_tid < nproc);
320  }
321 
322  if (!KMP_MASTER_TID(tid)) { // Worker threads
323  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
324 
325  KA_TRACE(20,
326  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
327  "arrived(%p): %llu => %llu\n",
328  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
329  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
330  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
331 
332  // Mark arrival to parent thread
333  /* After performing this write, a worker thread may not assume that the team
334  is valid any more - it could be deallocated by the master thread at any
335  time. */
336  ANNOTATE_BARRIER_BEGIN(this_thr);
337  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
338  flag.release();
339  } else {
340  // Need to update the team arrived pointer if we are the master thread
341  if (nproc > 1) // New value was already computed above
342  team->t.t_bar[bt].b_arrived = new_state;
343  else
344  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
345  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
346  "arrived(%p) = %llu\n",
347  gtid, team->t.t_id, tid, team->t.t_id,
348  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
349  }
350  KA_TRACE(20,
351  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
352  gtid, team->t.t_id, tid, bt));
353 }
354 
355 static void __kmp_tree_barrier_release(
356  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
357  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
358  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
359  kmp_team_t *team;
360  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
361  kmp_uint32 nproc;
362  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
363  kmp_uint32 branch_factor = 1 << branch_bits;
364  kmp_uint32 child;
365  kmp_uint32 child_tid;
366 
367  // Perform a tree release for all of the threads that have been gathered
368  if (!KMP_MASTER_TID(
369  tid)) { // Handle fork barrier workers who aren't part of a team yet
370  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
371  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
372  // Wait for parent thread to release us
373  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
374  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
375  ANNOTATE_BARRIER_END(this_thr);
376 #if USE_ITT_BUILD && USE_ITT_NOTIFY
377  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
378  // In fork barrier where we could not get the object reliably (or
379  // ITTNOTIFY is disabled)
380  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
381  // Cancel wait on previous parallel region...
382  __kmp_itt_task_starting(itt_sync_obj);
383 
384  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
385  return;
386 
387  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
388  if (itt_sync_obj != NULL)
389  // Call prepare as early as possible for "new" barrier
390  __kmp_itt_task_finished(itt_sync_obj);
391  } else
392 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
393  // Early exit for reaping threads releasing forkjoin barrier
394  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
395  return;
396 
397  // The worker thread may now assume that the team is valid.
398  team = __kmp_threads[gtid]->th.th_team;
399  KMP_DEBUG_ASSERT(team != NULL);
400  tid = __kmp_tid_from_gtid(gtid);
401 
402  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
403  KA_TRACE(20,
404  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
405  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
406  KMP_MB(); // Flush all pending memory write invalidates.
407  } else {
408  team = __kmp_threads[gtid]->th.th_team;
409  KMP_DEBUG_ASSERT(team != NULL);
410  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
411  "barrier type %d\n",
412  gtid, team->t.t_id, tid, bt));
413  }
414  nproc = this_thr->th.th_team_nproc;
415  child_tid = (tid << branch_bits) + 1;
416 
417  if (child_tid < nproc) {
418  kmp_info_t **other_threads = team->t.t_threads;
419  child = 1;
420  // Parent threads release all their children
421  do {
422  kmp_info_t *child_thr = other_threads[child_tid];
423  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
424 #if KMP_CACHE_MANAGE
425  // Prefetch next thread's go count
426  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
427  KMP_CACHE_PREFETCH(
428  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
429 #endif /* KMP_CACHE_MANAGE */
430 
431 #if KMP_BARRIER_ICV_PUSH
432  {
433  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
434  if (propagate_icvs) {
435  __kmp_init_implicit_task(team->t.t_ident,
436  team->t.t_threads[child_tid], team,
437  child_tid, FALSE);
438  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
439  &team->t.t_implicit_task_taskdata[0].td_icvs);
440  }
441  }
442 #endif // KMP_BARRIER_ICV_PUSH
443  KA_TRACE(20,
444  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
445  "go(%p): %u => %u\n",
446  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
447  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
448  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
449  // Release child from barrier
450  ANNOTATE_BARRIER_BEGIN(child_thr);
451  kmp_flag_64 flag(&child_bar->b_go, child_thr);
452  flag.release();
453  child++;
454  child_tid++;
455  } while (child <= branch_factor && child_tid < nproc);
456  }
457  KA_TRACE(
458  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
459  gtid, team->t.t_id, tid, bt));
460 }
461 
462 // Hyper Barrier
463 static void
464 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
465  int tid, void (*reduce)(void *, void *)
466  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
467  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
468  kmp_team_t *team = this_thr->th.th_team;
469  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
470  kmp_info_t **other_threads = team->t.t_threads;
471  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
472  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
473  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
474  kmp_uint32 branch_factor = 1 << branch_bits;
475  kmp_uint32 offset;
476  kmp_uint32 level;
477 
478  KA_TRACE(
479  20,
480  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
481  gtid, team->t.t_id, tid, bt));
482  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
483 
484 #if USE_ITT_BUILD && USE_ITT_NOTIFY
485  // Barrier imbalance - save arrive time to the thread
486  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
487  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
488  __itt_get_timestamp();
489  }
490 #endif
491  /* Perform a hypercube-embedded tree gather to wait until all of the threads
492  have arrived, and reduce any required data as we go. */
493  kmp_flag_64 p_flag(&thr_bar->b_arrived);
494  for (level = 0, offset = 1; offset < num_threads;
495  level += branch_bits, offset <<= branch_bits) {
496  kmp_uint32 child;
497  kmp_uint32 child_tid;
498 
499  if (((tid >> level) & (branch_factor - 1)) != 0) {
500  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
501 
502  KA_TRACE(20,
503  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
504  "arrived(%p): %llu => %llu\n",
505  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
506  team->t.t_id, parent_tid, &thr_bar->b_arrived,
507  thr_bar->b_arrived,
508  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
509  // Mark arrival to parent thread
510  /* After performing this write (in the last iteration of the enclosing for
511  loop), a worker thread may not assume that the team is valid any more
512  - it could be deallocated by the master thread at any time. */
513  ANNOTATE_BARRIER_BEGIN(this_thr);
514  p_flag.set_waiter(other_threads[parent_tid]);
515  p_flag.release();
516  break;
517  }
518 
519  // Parent threads wait for children to arrive
520  if (new_state == KMP_BARRIER_UNUSED_STATE)
521  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
522  for (child = 1, child_tid = tid + (1 << level);
523  child < branch_factor && child_tid < num_threads;
524  child++, child_tid += (1 << level)) {
525  kmp_info_t *child_thr = other_threads[child_tid];
526  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
527 #if KMP_CACHE_MANAGE
528  kmp_uint32 next_child_tid = child_tid + (1 << level);
529  // Prefetch next thread's arrived count
530  if (child + 1 < branch_factor && next_child_tid < num_threads)
531  KMP_CACHE_PREFETCH(
532  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
533 #endif /* KMP_CACHE_MANAGE */
534  KA_TRACE(20,
535  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
536  "arrived(%p) == %llu\n",
537  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
538  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
539  // Wait for child to arrive
540  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
541  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
542  ANNOTATE_BARRIER_END(child_thr);
543 #if USE_ITT_BUILD && USE_ITT_NOTIFY
544  // Barrier imbalance - write min of the thread time and a child time to
545  // the thread.
546  if (__kmp_forkjoin_frames_mode == 2) {
547  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
548  child_thr->th.th_bar_min_time);
549  }
550 #endif
551  if (reduce) {
552  KA_TRACE(100,
553  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
554  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
555  team->t.t_id, child_tid));
556  ANNOTATE_REDUCE_AFTER(reduce);
557  (*reduce)(this_thr->th.th_local.reduce_data,
558  child_thr->th.th_local.reduce_data);
559  ANNOTATE_REDUCE_BEFORE(reduce);
560  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
561  }
562  }
563  }
564 
565  if (KMP_MASTER_TID(tid)) {
566  // Need to update the team arrived pointer if we are the master thread
567  if (new_state == KMP_BARRIER_UNUSED_STATE)
568  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
569  else
570  team->t.t_bar[bt].b_arrived = new_state;
571  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
572  "arrived(%p) = %llu\n",
573  gtid, team->t.t_id, tid, team->t.t_id,
574  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
575  }
576  KA_TRACE(
577  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
578  gtid, team->t.t_id, tid, bt));
579 }
580 
581 // The reverse versions seem to beat the forward versions overall
582 #define KMP_REVERSE_HYPER_BAR
583 static void __kmp_hyper_barrier_release(
584  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
585  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
586  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
587  kmp_team_t *team;
588  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
589  kmp_info_t **other_threads;
590  kmp_uint32 num_threads;
591  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
592  kmp_uint32 branch_factor = 1 << branch_bits;
593  kmp_uint32 child;
594  kmp_uint32 child_tid;
595  kmp_uint32 offset;
596  kmp_uint32 level;
597 
598  /* Perform a hypercube-embedded tree release for all of the threads that have
599  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
600  are released in the reverse order of the corresponding gather, otherwise
601  threads are released in the same order. */
602  if (KMP_MASTER_TID(tid)) { // master
603  team = __kmp_threads[gtid]->th.th_team;
604  KMP_DEBUG_ASSERT(team != NULL);
605  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
606  "barrier type %d\n",
607  gtid, team->t.t_id, tid, bt));
608 #if KMP_BARRIER_ICV_PUSH
609  if (propagate_icvs) { // master already has ICVs in final destination; copy
610  copy_icvs(&thr_bar->th_fixed_icvs,
611  &team->t.t_implicit_task_taskdata[tid].td_icvs);
612  }
613 #endif
614  } else { // Handle fork barrier workers who aren't part of a team yet
615  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
616  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
617  // Wait for parent thread to release us
618  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
619  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
620  ANNOTATE_BARRIER_END(this_thr);
621 #if USE_ITT_BUILD && USE_ITT_NOTIFY
622  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
623  // In fork barrier where we could not get the object reliably
624  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
625  // Cancel wait on previous parallel region...
626  __kmp_itt_task_starting(itt_sync_obj);
627 
628  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
629  return;
630 
631  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
632  if (itt_sync_obj != NULL)
633  // Call prepare as early as possible for "new" barrier
634  __kmp_itt_task_finished(itt_sync_obj);
635  } else
636 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
637  // Early exit for reaping threads releasing forkjoin barrier
638  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
639  return;
640 
641  // The worker thread may now assume that the team is valid.
642  team = __kmp_threads[gtid]->th.th_team;
643  KMP_DEBUG_ASSERT(team != NULL);
644  tid = __kmp_tid_from_gtid(gtid);
645 
646  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
647  KA_TRACE(20,
648  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
649  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
650  KMP_MB(); // Flush all pending memory write invalidates.
651  }
652  num_threads = this_thr->th.th_team_nproc;
653  other_threads = team->t.t_threads;
654 
655 #ifdef KMP_REVERSE_HYPER_BAR
656  // Count up to correct level for parent
657  for (level = 0, offset = 1;
658  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
659  level += branch_bits, offset <<= branch_bits)
660  ;
661 
662  // Now go down from there
663  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
664  level -= branch_bits, offset >>= branch_bits)
665 #else
666  // Go down the tree, level by level
667  for (level = 0, offset = 1; offset < num_threads;
668  level += branch_bits, offset <<= branch_bits)
669 #endif // KMP_REVERSE_HYPER_BAR
670  {
671 #ifdef KMP_REVERSE_HYPER_BAR
672  /* Now go in reverse order through the children, highest to lowest.
673  Initial setting of child is conservative here. */
674  child = num_threads >> ((level == 0) ? level : level - 1);
675  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
676  child_tid = tid + (child << level);
677  child >= 1; child--, child_tid -= (1 << level))
678 #else
679  if (((tid >> level) & (branch_factor - 1)) != 0)
680  // No need to go lower than this, since this is the level parent would be
681  // notified
682  break;
683  // Iterate through children on this level of the tree
684  for (child = 1, child_tid = tid + (1 << level);
685  child < branch_factor && child_tid < num_threads;
686  child++, child_tid += (1 << level))
687 #endif // KMP_REVERSE_HYPER_BAR
688  {
689  if (child_tid >= num_threads)
690  continue; // Child doesn't exist so keep going
691  else {
692  kmp_info_t *child_thr = other_threads[child_tid];
693  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
694 #if KMP_CACHE_MANAGE
695  kmp_uint32 next_child_tid = child_tid - (1 << level);
696 // Prefetch next thread's go count
697 #ifdef KMP_REVERSE_HYPER_BAR
698  if (child - 1 >= 1 && next_child_tid < num_threads)
699 #else
700  if (child + 1 < branch_factor && next_child_tid < num_threads)
701 #endif // KMP_REVERSE_HYPER_BAR
702  KMP_CACHE_PREFETCH(
703  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
704 #endif /* KMP_CACHE_MANAGE */
705 
706 #if KMP_BARRIER_ICV_PUSH
707  if (propagate_icvs) // push my fixed ICVs to my child
708  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
709 #endif // KMP_BARRIER_ICV_PUSH
710 
711  KA_TRACE(
712  20,
713  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
714  "go(%p): %u => %u\n",
715  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
716  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
717  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
718  // Release child from barrier
719  ANNOTATE_BARRIER_BEGIN(child_thr);
720  kmp_flag_64 flag(&child_bar->b_go, child_thr);
721  flag.release();
722  }
723  }
724  }
725 #if KMP_BARRIER_ICV_PUSH
726  if (propagate_icvs &&
727  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
728  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
729  FALSE);
730  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
731  &thr_bar->th_fixed_icvs);
732  }
733 #endif
734  KA_TRACE(
735  20,
736  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
737  gtid, team->t.t_id, tid, bt));
738 }
739 
740 // Hierarchical Barrier
741 
742 // Initialize thread barrier data
743 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
744  Performs the minimum amount of initialization required based on how the team
745  has changed. Returns true if leaf children will require both on-core and
746  traditional wake-up mechanisms. For example, if the team size increases,
747  threads already in the team will respond to on-core wakeup on their parent
748  thread, but threads newly added to the team will only be listening on the
749  their local b_go. */
750 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
751  kmp_bstate_t *thr_bar,
752  kmp_uint32 nproc, int gtid,
753  int tid, kmp_team_t *team) {
754  // Checks to determine if (re-)initialization is needed
755  bool uninitialized = thr_bar->team == NULL;
756  bool team_changed = team != thr_bar->team;
757  bool team_sz_changed = nproc != thr_bar->nproc;
758  bool tid_changed = tid != thr_bar->old_tid;
759  bool retval = false;
760 
761  if (uninitialized || team_sz_changed) {
762  __kmp_get_hierarchy(nproc, thr_bar);
763  }
764 
765  if (uninitialized || team_sz_changed || tid_changed) {
766  thr_bar->my_level = thr_bar->depth - 1; // default for master
767  thr_bar->parent_tid = -1; // default for master
768  if (!KMP_MASTER_TID(
769  tid)) { // if not master, find parent thread in hierarchy
770  kmp_uint32 d = 0;
771  while (d < thr_bar->depth) { // find parent based on level of thread in
772  // hierarchy, and note level
773  kmp_uint32 rem;
774  if (d == thr_bar->depth - 2) { // reached level right below the master
775  thr_bar->parent_tid = 0;
776  thr_bar->my_level = d;
777  break;
778  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
779  0) { // TODO: can we make this op faster?
780  // thread is not a subtree root at next level, so this is max
781  thr_bar->parent_tid = tid - rem;
782  thr_bar->my_level = d;
783  break;
784  }
785  ++d;
786  }
787  }
788  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
789  thr_bar->old_tid = tid;
790  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
791  thr_bar->team = team;
792  thr_bar->parent_bar =
793  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
794  }
795  if (uninitialized || team_changed || tid_changed) {
796  thr_bar->team = team;
797  thr_bar->parent_bar =
798  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
799  retval = true;
800  }
801  if (uninitialized || team_sz_changed || tid_changed) {
802  thr_bar->nproc = nproc;
803  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
804  if (thr_bar->my_level == 0)
805  thr_bar->leaf_kids = 0;
806  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
807  thr_bar->leaf_kids = nproc - tid - 1;
808  thr_bar->leaf_state = 0;
809  for (int i = 0; i < thr_bar->leaf_kids; ++i)
810  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
811  }
812  return retval;
813 }
814 
815 static void __kmp_hierarchical_barrier_gather(
816  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
817  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
818  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
819  kmp_team_t *team = this_thr->th.th_team;
820  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
821  kmp_uint32 nproc = this_thr->th.th_team_nproc;
822  kmp_info_t **other_threads = team->t.t_threads;
823  kmp_uint64 new_state;
824 
825  int level = team->t.t_level;
826 #if OMP_40_ENABLED
827  if (other_threads[0]
828  ->th.th_teams_microtask) // are we inside the teams construct?
829  if (this_thr->th.th_teams_size.nteams > 1)
830  ++level; // level was not increased in teams construct for team_of_masters
831 #endif
832  if (level == 1)
833  thr_bar->use_oncore_barrier = 1;
834  else
835  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
836 
837  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
838  "barrier type %d\n",
839  gtid, team->t.t_id, tid, bt));
840  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
841 
842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
843  // Barrier imbalance - save arrive time to the thread
844  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
845  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
846  }
847 #endif
848 
849  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
850  team);
851 
852  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
853  kmp_int32 child_tid;
854  new_state =
855  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
856  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
857  thr_bar->use_oncore_barrier) {
858  if (thr_bar->leaf_kids) {
859  // First, wait for leaf children to check-in on my b_arrived flag
860  kmp_uint64 leaf_state =
861  KMP_MASTER_TID(tid)
862  ? thr_bar->b_arrived | thr_bar->leaf_state
863  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
864  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
865  "for leaf kids\n",
866  gtid, team->t.t_id, tid));
867  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
868  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
869  if (reduce) {
870  ANNOTATE_REDUCE_AFTER(reduce);
871  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
872  ++child_tid) {
873  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
874  "T#%d(%d:%d)\n",
875  gtid, team->t.t_id, tid,
876  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
877  child_tid));
878  ANNOTATE_BARRIER_END(other_threads[child_tid]);
879  (*reduce)(this_thr->th.th_local.reduce_data,
880  other_threads[child_tid]->th.th_local.reduce_data);
881  }
882  ANNOTATE_REDUCE_BEFORE(reduce);
883  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
884  }
885  // clear leaf_state bits
886  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
887  }
888  // Next, wait for higher level children on each child's b_arrived flag
889  for (kmp_uint32 d = 1; d < thr_bar->my_level;
890  ++d) { // gather lowest level threads first, but skip 0
891  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
892  skip = thr_bar->skip_per_level[d];
893  if (last > nproc)
894  last = nproc;
895  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
896  kmp_info_t *child_thr = other_threads[child_tid];
897  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
898  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
899  "T#%d(%d:%d) "
900  "arrived(%p) == %llu\n",
901  gtid, team->t.t_id, tid,
902  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
903  child_tid, &child_bar->b_arrived, new_state));
904  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
905  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
906  ANNOTATE_BARRIER_END(child_thr);
907  if (reduce) {
908  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
909  "T#%d(%d:%d)\n",
910  gtid, team->t.t_id, tid,
911  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
912  child_tid));
913  ANNOTATE_REDUCE_AFTER(reduce);
914  (*reduce)(this_thr->th.th_local.reduce_data,
915  child_thr->th.th_local.reduce_data);
916  ANNOTATE_REDUCE_BEFORE(reduce);
917  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
918  }
919  }
920  }
921  } else { // Blocktime is not infinite
922  for (kmp_uint32 d = 0; d < thr_bar->my_level;
923  ++d) { // Gather lowest level threads first
924  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
925  skip = thr_bar->skip_per_level[d];
926  if (last > nproc)
927  last = nproc;
928  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
929  kmp_info_t *child_thr = other_threads[child_tid];
930  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
931  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
932  "T#%d(%d:%d) "
933  "arrived(%p) == %llu\n",
934  gtid, team->t.t_id, tid,
935  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
936  child_tid, &child_bar->b_arrived, new_state));
937  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
938  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
939  ANNOTATE_BARRIER_END(child_thr);
940  if (reduce) {
941  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
942  "T#%d(%d:%d)\n",
943  gtid, team->t.t_id, tid,
944  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
945  child_tid));
946  ANNOTATE_REDUCE_AFTER(reduce);
947  (*reduce)(this_thr->th.th_local.reduce_data,
948  child_thr->th.th_local.reduce_data);
949  ANNOTATE_REDUCE_BEFORE(reduce);
950  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
951  }
952  }
953  }
954  }
955  }
956  // All subordinates are gathered; now release parent if not master thread
957 
958  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
959  KA_TRACE(
960  20,
961  ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
962  "arrived(%p): %llu => %llu\n",
963  gtid, team->t.t_id, tid,
964  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
965  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
966  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
967  /* Mark arrival to parent: After performing this write, a worker thread may
968  not assume that the team is valid any more - it could be deallocated by
969  the master thread at any time. */
970  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
971  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
972  // flag; release it
973  ANNOTATE_BARRIER_BEGIN(this_thr);
974  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
975  flag.release();
976  } else { // Leaf does special release on the "offset" bits of parent's
977  // b_arrived flag
978  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
979  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
980  flag.set_waiter(other_threads[thr_bar->parent_tid]);
981  flag.release();
982  }
983  } else { // Master thread needs to update the team's b_arrived value
984  team->t.t_bar[bt].b_arrived = new_state;
985  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
986  "arrived(%p) = %llu\n",
987  gtid, team->t.t_id, tid, team->t.t_id,
988  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
989  }
990  // Is the team access below unsafe or just technically invalid?
991  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
992  "barrier type %d\n",
993  gtid, team->t.t_id, tid, bt));
994 }
995 
996 static void __kmp_hierarchical_barrier_release(
997  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
998  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
999  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1000  kmp_team_t *team;
1001  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1002  kmp_uint32 nproc;
1003  bool team_change = false; // indicates on-core barrier shouldn't be used
1004 
1005  if (KMP_MASTER_TID(tid)) {
1006  team = __kmp_threads[gtid]->th.th_team;
1007  KMP_DEBUG_ASSERT(team != NULL);
1008  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1009  "entered barrier type %d\n",
1010  gtid, team->t.t_id, tid, bt));
1011  } else { // Worker threads
1012  // Wait for parent thread to release me
1013  if (!thr_bar->use_oncore_barrier ||
1014  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1015  thr_bar->team == NULL) {
1016  // Use traditional method of waiting on my own b_go flag
1017  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1018  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1019  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1020  ANNOTATE_BARRIER_END(this_thr);
1021  TCW_8(thr_bar->b_go,
1022  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1023  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1024  // infinite, not nested
1025  // Wait on my "offset" bits on parent's b_go flag
1026  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1027  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1028  thr_bar->offset, bt,
1029  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1030  flag.wait(this_thr, TRUE);
1031  if (thr_bar->wait_flag ==
1032  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1033  TCW_8(thr_bar->b_go,
1034  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1035  } else { // Reset my bits on parent's b_go flag
1036  (RCAST(volatile char *,
1037  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1038  }
1039  }
1040  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1041  // Early exit for reaping threads releasing forkjoin barrier
1042  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1043  return;
1044  // The worker thread may now assume that the team is valid.
1045  team = __kmp_threads[gtid]->th.th_team;
1046  KMP_DEBUG_ASSERT(team != NULL);
1047  tid = __kmp_tid_from_gtid(gtid);
1048 
1049  KA_TRACE(
1050  20,
1051  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1052  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1053  KMP_MB(); // Flush all pending memory write invalidates.
1054  }
1055 
1056  nproc = this_thr->th.th_team_nproc;
1057  int level = team->t.t_level;
1058 #if OMP_40_ENABLED
1059  if (team->t.t_threads[0]
1060  ->th.th_teams_microtask) { // are we inside the teams construct?
1061  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1062  this_thr->th.th_teams_level == level)
1063  ++level; // level was not increased in teams construct for team_of_workers
1064  if (this_thr->th.th_teams_size.nteams > 1)
1065  ++level; // level was not increased in teams construct for team_of_masters
1066  }
1067 #endif
1068  if (level == 1)
1069  thr_bar->use_oncore_barrier = 1;
1070  else
1071  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1072 
1073  // If the team size has increased, we still communicate with old leaves via
1074  // oncore barrier.
1075  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1076  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1077  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1078  tid, team);
1079  // But if the entire team changes, we won't use oncore barrier at all
1080  if (team_change)
1081  old_leaf_kids = 0;
1082 
1083 #if KMP_BARRIER_ICV_PUSH
1084  if (propagate_icvs) {
1085  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1086  FALSE);
1087  if (KMP_MASTER_TID(
1088  tid)) { // master already has copy in final destination; copy
1089  copy_icvs(&thr_bar->th_fixed_icvs,
1090  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1091  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1092  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1093  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1094  // leaves (on-core children) pull parent's fixed ICVs directly to local
1095  // ICV store
1096  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1097  &thr_bar->parent_bar->th_fixed_icvs);
1098  // non-leaves will get ICVs piggybacked with b_go via NGO store
1099  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1100  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1101  // access
1102  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1103  else // leaves copy parent's fixed ICVs directly to local ICV store
1104  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1105  &thr_bar->parent_bar->th_fixed_icvs);
1106  }
1107  }
1108 #endif // KMP_BARRIER_ICV_PUSH
1109 
1110  // Now, release my children
1111  if (thr_bar->my_level) { // not a leaf
1112  kmp_int32 child_tid;
1113  kmp_uint32 last;
1114  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1115  thr_bar->use_oncore_barrier) {
1116  if (KMP_MASTER_TID(tid)) { // do a flat release
1117  // Set local b_go to bump children via NGO store of the cache line
1118  // containing IVCs and b_go.
1119  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1120  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1121  // the cache line
1122  ngo_load(&thr_bar->th_fixed_icvs);
1123  // This loops over all the threads skipping only the leaf nodes in the
1124  // hierarchy
1125  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1126  child_tid += thr_bar->skip_per_level[1]) {
1127  kmp_bstate_t *child_bar =
1128  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1129  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1130  "releasing T#%d(%d:%d)"
1131  " go(%p): %u => %u\n",
1132  gtid, team->t.t_id, tid,
1133  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1134  child_tid, &child_bar->b_go, child_bar->b_go,
1135  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1136  // Use ngo store (if available) to both store ICVs and release child
1137  // via child's b_go
1138  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1139  }
1140  ngo_sync();
1141  }
1142  TCW_8(thr_bar->b_go,
1143  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1144  // Now, release leaf children
1145  if (thr_bar->leaf_kids) { // if there are any
1146  // We test team_change on the off-chance that the level 1 team changed.
1147  if (team_change ||
1148  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1149  if (old_leaf_kids) { // release old leaf kids
1150  thr_bar->b_go |= old_leaf_state;
1151  }
1152  // Release new leaf kids
1153  last = tid + thr_bar->skip_per_level[1];
1154  if (last > nproc)
1155  last = nproc;
1156  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1157  ++child_tid) { // skip_per_level[0]=1
1158  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1159  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1160  KA_TRACE(
1161  20,
1162  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1163  " T#%d(%d:%d) go(%p): %u => %u\n",
1164  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1165  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1166  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1167  // Release child using child's b_go flag
1168  ANNOTATE_BARRIER_BEGIN(child_thr);
1169  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1170  flag.release();
1171  }
1172  } else { // Release all children at once with leaf_state bits on my own
1173  // b_go flag
1174  thr_bar->b_go |= thr_bar->leaf_state;
1175  }
1176  }
1177  } else { // Blocktime is not infinite; do a simple hierarchical release
1178  for (int d = thr_bar->my_level - 1; d >= 0;
1179  --d) { // Release highest level threads first
1180  last = tid + thr_bar->skip_per_level[d + 1];
1181  kmp_uint32 skip = thr_bar->skip_per_level[d];
1182  if (last > nproc)
1183  last = nproc;
1184  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1185  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1186  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1187  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1188  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1189  gtid, team->t.t_id, tid,
1190  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1191  child_tid, &child_bar->b_go, child_bar->b_go,
1192  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1193  // Release child using child's b_go flag
1194  ANNOTATE_BARRIER_BEGIN(child_thr);
1195  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1196  flag.release();
1197  }
1198  }
1199  }
1200 #if KMP_BARRIER_ICV_PUSH
1201  if (propagate_icvs && !KMP_MASTER_TID(tid))
1202  // non-leaves copy ICVs from fixed ICVs to local dest
1203  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1204  &thr_bar->th_fixed_icvs);
1205 #endif // KMP_BARRIER_ICV_PUSH
1206  }
1207  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1208  "barrier type %d\n",
1209  gtid, team->t.t_id, tid, bt));
1210 }
1211 
1212 // End of Barrier Algorithms
1213 
1214 // Internal function to do a barrier.
1215 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1216  If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1217  barrier
1218  Returns 0 if master thread, 1 if worker thread. */
1219 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1220  size_t reduce_size, void *reduce_data,
1221  void (*reduce)(void *, void *)) {
1222  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1223  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1224  int tid = __kmp_tid_from_gtid(gtid);
1225  kmp_info_t *this_thr = __kmp_threads[gtid];
1226  kmp_team_t *team = this_thr->th.th_team;
1227  int status = 0;
1228  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1229 #if OMPT_SUPPORT
1230  ompt_data_t *my_task_data;
1231  ompt_data_t *my_parallel_data;
1232  void *return_address;
1233 #endif
1234 
1235  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1236  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1237 
1238  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1239 #if OMPT_SUPPORT
1240  if (ompt_enabled.enabled) {
1241 #if OMPT_OPTIONAL
1242  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1243  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1244  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1245  if (ompt_enabled.ompt_callback_sync_region) {
1246  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1247  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1248  my_task_data, return_address);
1249  }
1250  if (ompt_enabled.ompt_callback_sync_region_wait) {
1251  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1252  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1253  my_task_data, return_address);
1254  }
1255 #endif
1256  // It is OK to report the barrier state after the barrier begin callback.
1257  // According to the OMPT specification, a compliant implementation may
1258  // even delay reporting this state until the barrier begins to wait.
1259  this_thr->th.ompt_thread_info.state = omp_state_wait_barrier;
1260  }
1261 #endif
1262 
1263  if (!team->t.t_serialized) {
1264 #if USE_ITT_BUILD
1265  // This value will be used in itt notify events below.
1266  void *itt_sync_obj = NULL;
1267 #if USE_ITT_NOTIFY
1268  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1269  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1270 #endif
1271 #endif /* USE_ITT_BUILD */
1272  if (__kmp_tasking_mode == tskm_extra_barrier) {
1273  __kmp_tasking_barrier(team, this_thr, gtid);
1274  KA_TRACE(15,
1275  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1276  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1277  }
1278 
1279  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1280  access it when the team struct is not guaranteed to exist. */
1281  // See note about the corresponding code in __kmp_join_barrier() being
1282  // performance-critical.
1283  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1284 #if KMP_USE_MONITOR
1285  this_thr->th.th_team_bt_intervals =
1286  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1287  this_thr->th.th_team_bt_set =
1288  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1289 #else
1290  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1291 #endif
1292  }
1293 
1294 #if USE_ITT_BUILD
1295  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1296  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1297 #endif /* USE_ITT_BUILD */
1298 #if USE_DEBUGGER
1299  // Let the debugger know: the thread arrived to the barrier and waiting.
1300  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1301  team->t.t_bar[bt].b_master_arrived += 1;
1302  } else {
1303  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1304  } // if
1305 #endif /* USE_DEBUGGER */
1306  if (reduce != NULL) {
1307  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1308  this_thr->th.th_local.reduce_data = reduce_data;
1309  }
1310 
1311  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1312  __kmp_task_team_setup(
1313  this_thr, team,
1314  0); // use 0 to only setup the current team if nthreads > 1
1315 
1316  switch (__kmp_barrier_gather_pattern[bt]) {
1317  case bp_hyper_bar: {
1318  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1319  // to 0; use linear
1320  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1321  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1322  break;
1323  }
1324  case bp_hierarchical_bar: {
1325  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid,
1326  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1327  break;
1328  }
1329  case bp_tree_bar: {
1330  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1331  // to 0; use linear
1332  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1333  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1334  break;
1335  }
1336  default: {
1337  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1338  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1339  }
1340  }
1341 
1342  KMP_MB();
1343 
1344  if (KMP_MASTER_TID(tid)) {
1345  status = 0;
1346  if (__kmp_tasking_mode != tskm_immediate_exec) {
1347  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1348  }
1349 #if USE_DEBUGGER
1350  // Let the debugger know: All threads are arrived and starting leaving the
1351  // barrier.
1352  team->t.t_bar[bt].b_team_arrived += 1;
1353 #endif
1354 
1355 #if OMP_40_ENABLED
1356  // Reset cancellation flag for worksharing constructs
1357  if (team->t.t_cancel_request == cancel_loop ||
1358  team->t.t_cancel_request == cancel_sections) {
1359  team->t.t_cancel_request = cancel_noreq;
1360  }
1361 #endif
1362 #if USE_ITT_BUILD
1363  /* TODO: In case of split reduction barrier, master thread may send
1364  acquired event early, before the final summation into the shared
1365  variable is done (final summation can be a long operation for array
1366  reductions). */
1367  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1368  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1369 #endif /* USE_ITT_BUILD */
1370 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1371  // Barrier - report frame end (only if active_level == 1)
1372  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1373  __kmp_forkjoin_frames_mode &&
1374 #if OMP_40_ENABLED
1375  this_thr->th.th_teams_microtask == NULL &&
1376 #endif
1377  team->t.t_active_level == 1) {
1378  kmp_uint64 cur_time = __itt_get_timestamp();
1379  kmp_info_t **other_threads = team->t.t_threads;
1380  int nproc = this_thr->th.th_team_nproc;
1381  int i;
1382  switch (__kmp_forkjoin_frames_mode) {
1383  case 1:
1384  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1385  loc, nproc);
1386  this_thr->th.th_frame_time = cur_time;
1387  break;
1388  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1389  // be fixed)
1390  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1391  1, loc, nproc);
1392  break;
1393  case 3:
1394  if (__itt_metadata_add_ptr) {
1395  // Initialize with master's wait time
1396  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1397  // Set arrive time to zero to be able to check it in
1398  // __kmp_invoke_task(); the same is done inside the loop below
1399  this_thr->th.th_bar_arrive_time = 0;
1400  for (i = 1; i < nproc; ++i) {
1401  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1402  other_threads[i]->th.th_bar_arrive_time = 0;
1403  }
1404  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1405  cur_time, delta,
1406  (kmp_uint64)(reduce != NULL));
1407  }
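        /* Worked example with hypothetical numbers: for nproc == 4, arrive
           times 10, 12, 12, 14 and cur_time == 20, the aggregate wait is
           delta = (20-10) + (20-12) + (20-12) + (20-14) = 32 time units,
           which is the imbalance value reported just above. */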
1408  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1409  loc, nproc);
1410  this_thr->th.th_frame_time = cur_time;
1411  break;
1412  }
1413  }
1414 #endif /* USE_ITT_BUILD */
1415  } else {
1416  status = 1;
1417 #if USE_ITT_BUILD
1418  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1419  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1420 #endif /* USE_ITT_BUILD */
1421  }
1422  if (status == 1 || !is_split) {
1423  switch (__kmp_barrier_release_pattern[bt]) {
1424  case bp_hyper_bar: {
1425  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1426  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1427  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1428  break;
1429  }
1430  case bp_hierarchical_bar: {
1431  __kmp_hierarchical_barrier_release(
1432  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1433  break;
1434  }
1435  case bp_tree_bar: {
1436  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1437  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1438  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1439  break;
1440  }
1441  default: {
1442  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1443  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1444  }
1445  }
1446  if (__kmp_tasking_mode != tskm_immediate_exec) {
1447  __kmp_task_team_sync(this_thr, team);
1448  }
1449  }
1450 
1451 #if USE_ITT_BUILD
1452  /* GEH: TODO: Move this under if-condition above and also include in
1453  __kmp_end_split_barrier(). This will more accurately represent the actual
1454  release time of the threads for split barriers. */
1455  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1456  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1457 #endif /* USE_ITT_BUILD */
1458  } else { // Team is serialized.
1459  status = 0;
1460  if (__kmp_tasking_mode != tskm_immediate_exec) {
1461 #if OMP_45_ENABLED
1462  if (this_thr->th.th_task_team != NULL) {
1463  void *itt_sync_obj = NULL;
1464 #if USE_ITT_NOTIFY
1465  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1466  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1467  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1468  }
1469 #endif
1470 
1471  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1472  TRUE);
1473  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1474  __kmp_task_team_setup(this_thr, team, 0);
1475 
1476 #if USE_ITT_BUILD
1477  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1478  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1479 #endif /* USE_ITT_BUILD */
1480  }
1481 #else
1482  // The task team should be NULL for serialized code (tasks will be
1483  // executed immediately)
1484  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1485  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1486 #endif
1487  }
1488  }
1489  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1490  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1491  __kmp_tid_from_gtid(gtid), status));
1492 
1493 #if OMPT_SUPPORT
1494  if (ompt_enabled.enabled) {
1495 #if OMPT_OPTIONAL
1496  if (ompt_enabled.ompt_callback_sync_region_wait) {
1497  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1498  ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1499  my_task_data, return_address);
1500  }
1501  if (ompt_enabled.ompt_callback_sync_region) {
1502  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1503  ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1504  my_task_data, return_address);
1505  }
1506 #endif
1507  this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1508  }
1509 #endif
1510  ANNOTATE_BARRIER_END(&team->t.t_bar);
1511 
1512  return status;
1513 }
1514 
1515 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1516  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1517  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1518  int tid = __kmp_tid_from_gtid(gtid);
1519  kmp_info_t *this_thr = __kmp_threads[gtid];
1520  kmp_team_t *team = this_thr->th.th_team;
1521 
1522  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1523  if (!team->t.t_serialized) {
1524  if (KMP_MASTER_GTID(gtid)) {
1525  switch (__kmp_barrier_release_pattern[bt]) {
1526  case bp_hyper_bar: {
1527  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1528  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1529  FALSE USE_ITT_BUILD_ARG(NULL));
1530  break;
1531  }
1532  case bp_hierarchical_bar: {
1533  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1534  FALSE USE_ITT_BUILD_ARG(NULL));
1535  break;
1536  }
1537  case bp_tree_bar: {
1538  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1539  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1540  FALSE USE_ITT_BUILD_ARG(NULL));
1541  break;
1542  }
1543  default: {
1544  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1545  FALSE USE_ITT_BUILD_ARG(NULL));
1546  }
1547  }
1548  if (__kmp_tasking_mode != tskm_immediate_exec) {
1549  __kmp_task_team_sync(this_thr, team);
1550  } // if
1551  }
1552  }
1553  ANNOTATE_BARRIER_END(&team->t.t_bar);
1554 }
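/* Usage sketch (illustrative, not a call site in this file): a split
   reduction barrier pairs the gather phase of __kmp_barrier() with a later
   explicit release once the partial results have been combined, roughly:

     // all threads: gather; with is_split == TRUE the workers then block in
     // the release phase while the master returns immediately
     __kmp_barrier(bs_reduction_barrier, gtid, TRUE, reduce_size, reduce_data,
                   reduce_func);
     // ... master combines the partial results into the shared variable ...
     __kmp_end_split_barrier(bs_reduction_barrier, gtid); // only the master's
                                                          // call releases

   Parameter names follow the declaration in kmp.h; treat this as a sketch of
   the calling pattern, not verbatim runtime code. */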
1555 
1556 void __kmp_join_barrier(int gtid) {
1557  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1558  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1559  kmp_info_t *this_thr = __kmp_threads[gtid];
1560  kmp_team_t *team;
1561  kmp_uint nproc;
1562  kmp_info_t *master_thread;
1563  int tid;
1564 #ifdef KMP_DEBUG
1565  int team_id;
1566 #endif /* KMP_DEBUG */
1567 #if USE_ITT_BUILD
1568  void *itt_sync_obj = NULL;
1569 #if USE_ITT_NOTIFY
1570  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine unless needed
1571  // Get object created at fork_barrier
1572  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1573 #endif
1574 #endif /* USE_ITT_BUILD */
1575  KMP_MB();
1576 
1577  // Get current info
1578  team = this_thr->th.th_team;
1579  nproc = this_thr->th.th_team_nproc;
1580  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1581  tid = __kmp_tid_from_gtid(gtid);
1582 #ifdef KMP_DEBUG
1583  team_id = team->t.t_id;
1584 #endif /* KMP_DEBUG */
1585  master_thread = this_thr->th.th_team_master;
1586 #ifdef KMP_DEBUG
1587  if (master_thread != team->t.t_threads[0]) {
1588  __kmp_print_structure();
1589  }
1590 #endif /* KMP_DEBUG */
1591  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1592  KMP_MB();
1593 
1594  // Verify state
1595  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1596  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1597  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1598  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1599  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1600  gtid, team_id, tid));
1601 
1602  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1603 #if OMPT_SUPPORT
1604  ompt_data_t *my_task_data;
1605  ompt_data_t *my_parallel_data;
1606  if (ompt_enabled.enabled) {
1607 #if OMPT_OPTIONAL
1608  void *codeptr = NULL;
1609  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1610  if (KMP_MASTER_TID(ds_tid) &&
1611  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1612  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1613  codeptr = team->t.ompt_team_info.master_return_address;
1614  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1615  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1616  if (ompt_enabled.ompt_callback_sync_region) {
1617  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1618  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1619  my_task_data, codeptr);
1620  }
1621  if (ompt_enabled.ompt_callback_sync_region_wait) {
1622  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1623  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1624  my_task_data, codeptr);
1625  }
1626 #endif
1627  this_thr->th.ompt_thread_info.state = omp_state_wait_barrier_implicit;
1628  }
1629 #endif
1630 
1631  if (__kmp_tasking_mode == tskm_extra_barrier) {
1632  __kmp_tasking_barrier(team, this_thr, gtid);
1633  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1634  team_id, tid));
1635  }
1636 #ifdef KMP_DEBUG
1637  if (__kmp_tasking_mode != tskm_immediate_exec) {
1638  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1639  "%p, th_task_team = %p\n",
1640  __kmp_gtid_from_thread(this_thr), team_id,
1641  team->t.t_task_team[this_thr->th.th_task_state],
1642  this_thr->th.th_task_team));
1643  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1644  team->t.t_task_team[this_thr->th.th_task_state]);
1645  }
1646 #endif /* KMP_DEBUG */
1647 
1648  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1649  access it when the team struct is not guaranteed to exist. Doing these
1650  loads causes a cache miss that slows down EPCC parallel by 2x. As a
1651  workaround, we do not perform the copy if blocktime=infinite, since the
1652  values are not used by __kmp_wait_template() in that case. */
1653  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1654 #if KMP_USE_MONITOR
1655  this_thr->th.th_team_bt_intervals =
1656  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1657  this_thr->th.th_team_bt_set =
1658  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1659 #else
1660  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1661 #endif
1662  }
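  /* For reference (a hypothetical user-level invocation, not runtime code):
     the copy above is skipped entirely when the user requests

       KMP_BLOCKTIME=infinite ./omp_app   # waiting threads spin, never sleep

     because __kmp_wait_template() never consults th_team_bt_intervals in
     that configuration. */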
1663 
1664 #if USE_ITT_BUILD
1665  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1666  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1667 #endif /* USE_ITT_BUILD */
1668 
1669  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1670  case bp_hyper_bar: {
1671  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1672  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1673  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1674  break;
1675  }
1676  case bp_hierarchical_bar: {
1677  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1678  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1679  break;
1680  }
1681  case bp_tree_bar: {
1682  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1683  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1684  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1685  break;
1686  }
1687  default: {
1688  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1689  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1690  }
1691  }
1692 
1693  /* From this point on, the team data structure may be deallocated at any time
1694  by the master thread - it is unsafe to reference it in any of the worker
1695  threads. Any per-team data items that need to be referenced before the
1696  end of the barrier should be moved to the kmp_task_team_t structs. */
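  /* A minimal illustration of that rule (hypothetical worker-side code, not
     taken from the runtime):

       // UNSAFE in a worker from here on:
       //   int id = team->t.t_id;            // team may already be freed
       // OK: capture such values before the gather above, or reach them via
       //     the task team, e.g. this_thr->th.th_task_team

     This is why the trace at the end of this function prints the team_id
     captured near the top rather than re-reading team->t.t_id. */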
1697  if (KMP_MASTER_TID(tid)) {
1698  if (__kmp_tasking_mode != tskm_immediate_exec) {
1699  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1700  }
1701 #if KMP_STATS_ENABLED
1702  // Have the master thread flag the workers to indicate that they are now
1703  // waiting for the next parallel region. Also wake them up so they switch
1704  // their timers to idle.
1705  for (int i = 0; i < team->t.t_nproc; ++i) {
1706  kmp_info_t *team_thread = team->t.t_threads[i];
1707  if (team_thread == this_thr)
1708  continue;
1709  team_thread->th.th_stats->setIdleFlag();
1710  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1711  team_thread->th.th_sleep_loc != NULL)
1712  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1713  team_thread->th.th_sleep_loc);
1714  }
1715 #endif
1716 #if USE_ITT_BUILD
1717  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1718  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1719 #endif /* USE_ITT_BUILD */
1720 
1721 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1722  // Join barrier - report frame end
1723  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1724  __kmp_forkjoin_frames_mode &&
1725 #if OMP_40_ENABLED
1726  this_thr->th.th_teams_microtask == NULL &&
1727 #endif
1728  team->t.t_active_level == 1) {
1729  kmp_uint64 cur_time = __itt_get_timestamp();
1730  ident_t *loc = team->t.t_ident;
1731  kmp_info_t **other_threads = team->t.t_threads;
1732  int nproc = this_thr->th.th_team_nproc;
1733  int i;
1734  switch (__kmp_forkjoin_frames_mode) {
1735  case 1:
1736  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1737  loc, nproc);
1738  break;
1739  case 2:
1740  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1741  loc, nproc);
1742  break;
1743  case 3:
1744  if (__itt_metadata_add_ptr) {
1745  // Initialize with master's wait time
1746  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1747  // Set arrive time to zero to be able to check it in
1748  // __kmp_invoke_task(); the same is done inside the loop below
1749  this_thr->th.th_bar_arrive_time = 0;
1750  for (i = 1; i < nproc; ++i) {
1751  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1752  other_threads[i]->th.th_bar_arrive_time = 0;
1753  }
1754  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1755  cur_time, delta, 0);
1756  }
1757  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1758  loc, nproc);
1759  this_thr->th.th_frame_time = cur_time;
1760  break;
1761  }
1762  }
1763 #endif /* USE_ITT_BUILD */
1764  }
1765 #if USE_ITT_BUILD
1766  else {
1767  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1768  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1769  }
1770 #endif /* USE_ITT_BUILD */
1771 
1772 #if KMP_DEBUG
1773  if (KMP_MASTER_TID(tid)) {
1774  KA_TRACE(
1775  15,
1776  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1777  gtid, team_id, tid, nproc));
1778  }
1779 #endif /* KMP_DEBUG */
1780 
1781  // TODO: now mark worker threads as done so they may be disbanded
1782  KMP_MB(); // Flush all pending memory write invalidates.
1783  KA_TRACE(10,
1784  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1785 
1786  ANNOTATE_BARRIER_END(&team->t.t_bar);
1787 }
1788 
1789 // TODO release worker threads' fork barriers as we are ready instead of all at
1790 // once
1791 void __kmp_fork_barrier(int gtid, int tid) {
1792  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1793  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1794  kmp_info_t *this_thr = __kmp_threads[gtid];
1795  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1796 #if USE_ITT_BUILD
1797  void *itt_sync_obj = NULL;
1798 #endif /* USE_ITT_BUILD */
1799  if (team)
1800  ANNOTATE_BARRIER_END(&team->t.t_bar);
1801 
1802  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1803  (team != NULL) ? team->t.t_id : -1, tid));
1804 
1805  // The th_team pointer is only valid for the master thread here
1806  if (KMP_MASTER_TID(tid)) {
1807 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1808  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1809  // Create itt barrier object
1810  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1811  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1812  }
1813 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1814 
1815 #ifdef KMP_DEBUG
1816  kmp_info_t **other_threads = team->t.t_threads;
1817  int i;
1818 
1819  // Verify state
1820  KMP_MB();
1821 
1822  for (i = 1; i < team->t.t_nproc; ++i) {
1823  KA_TRACE(500,
1824  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1825  "== %u.\n",
1826  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1827  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1828  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1829  KMP_DEBUG_ASSERT(
1830  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1831  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1832  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1833  }
1834 #endif
1835 
1836  if (__kmp_tasking_mode != tskm_immediate_exec) {
1837  // 0 indicates: set up the current task team only if nthreads > 1
1838  __kmp_task_team_setup(this_thr, team, 0);
1839  }
1840 
1841  /* The master thread may have changed its blocktime between the join barrier
1842  and the fork barrier. Copy the blocktime info to the thread, where
1843  __kmp_wait_template() can access it when the team struct is not
1844  guaranteed to exist. */
1845  // See note about the corresponding code in __kmp_join_barrier() being
1846  // performance-critical
1847  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1848 #if KMP_USE_MONITOR
1849  this_thr->th.th_team_bt_intervals =
1850  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1851  this_thr->th.th_team_bt_set =
1852  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1853 #else
1854  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1855 #endif
1856  }
1857  } // master
1858 
1859  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1860  case bp_hyper_bar: {
1861  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1862  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1863  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1864  break;
1865  }
1866  case bp_hierarchical_bar: {
1867  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1868  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1869  break;
1870  }
1871  case bp_tree_bar: {
1872  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1873  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1874  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1875  break;
1876  }
1877  default: {
1878  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1879  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1880  }
1881  }
1882 
1883 #if OMPT_SUPPORT
1884  if (ompt_enabled.enabled) {
1885  if (this_thr->th.ompt_thread_info.state ==
1886  omp_state_wait_barrier_implicit) {
1887  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1888  ompt_data_t *tId = (team) ? OMPT_CUR_TASK_DATA(this_thr)
1889  : &(this_thr->th.ompt_thread_info.task_data);
1890  this_thr->th.ompt_thread_info.state = omp_state_overhead;
1891 #if OMPT_OPTIONAL
1892  void *codeptr = NULL;
1893  if (KMP_MASTER_TID(ds_tid) &&
1894  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1895  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1896  codeptr = team->t.ompt_team_info.master_return_address;
1897  if (ompt_enabled.ompt_callback_sync_region_wait) {
1898  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1899  ompt_sync_region_barrier, ompt_scope_end, NULL, tId, codeptr);
1900  }
1901  if (ompt_enabled.ompt_callback_sync_region) {
1902  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1903  ompt_sync_region_barrier, ompt_scope_end, NULL, tId, codeptr);
1904  }
1905 #endif
1906  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
1907  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1908  ompt_scope_end, NULL, tId, 0, ds_tid);
1909  }
1910  // return to idle state
1911  this_thr->th.ompt_thread_info.state = omp_state_overhead;
1912  }
1913  }
1914 #endif
1915 
1916  // Early exit for reaping threads releasing forkjoin barrier
1917  if (TCR_4(__kmp_global.g.g_done)) {
1918  this_thr->th.th_task_team = NULL;
1919 
1920 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1921  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1922  if (!KMP_MASTER_TID(tid)) {
1923  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1924  if (itt_sync_obj)
1925  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1926  }
1927  }
1928 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1929  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1930  return;
1931  }
1932 
1933  /* We can now assume that a valid team structure has been allocated by the
1934  master and propagated to all worker threads. The current thread, however,
1935  may not be part of the team, so we can't blindly assume that the team
1936  pointer is non-null. */
1937  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1938  KMP_DEBUG_ASSERT(team != NULL);
1939  tid = __kmp_tid_from_gtid(gtid);
1940 
1941 #if KMP_BARRIER_ICV_PULL
1942  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1943  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1944  implicit task has this data before this function is called. We cannot
1945  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
1946  struct, because it is not always the case that the threads arrays have
1947  been allocated when __kmp_fork_call() is executed. */
1948  {
1949  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1950  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1951  // Copy the initial ICVs from the master's thread struct to the implicit
1952  // task for this tid.
1953  KA_TRACE(10,
1954  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1955  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
1956  tid, FALSE);
1957  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1958  &team->t.t_threads[0]
1959  ->th.th_bar[bs_forkjoin_barrier]
1960  .bb.th_fixed_icvs);
1961  }
1962  }
1963 #endif // KMP_BARRIER_ICV_PULL
1964 
1965  if (__kmp_tasking_mode != tskm_immediate_exec) {
1966  __kmp_task_team_sync(this_thr, team);
1967  }
1968 
1969 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1970  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1971  if (proc_bind == proc_bind_intel) {
1972 #endif
1973 #if KMP_AFFINITY_SUPPORTED
1974  // Call dynamic affinity settings
1975  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1976  __kmp_balanced_affinity(tid, team->t.t_nproc);
1977  }
1978 #endif // KMP_AFFINITY_SUPPORTED
1979 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1980  } else if (proc_bind != proc_bind_false) {
1981  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1982  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1983  __kmp_gtid_from_thread(this_thr),
1984  this_thr->th.th_current_place));
1985  } else {
1986  __kmp_affinity_set_place(gtid);
1987  }
1988  }
1989 #endif
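  /* Hypothetical settings that would exercise the two branches above (the
     authoritative mapping from environment to __kmp_affinity_type and
     t_proc_bind is done in kmp_settings.cpp during initialization):

       KMP_AFFINITY=balanced ./omp_app                  # affinity_balanced path
       OMP_PROC_BIND=close OMP_PLACES=cores ./omp_app   # proc_bind != false path
  */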
1990 
1991 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1992  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1993  if (!KMP_MASTER_TID(tid)) {
1994  // Get correct barrier object
1995  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1996  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1997  } // (prepare called inside barrier_release)
1998  }
1999 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2000  ANNOTATE_BARRIER_END(&team->t.t_bar);
2001  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2002  team->t.t_id, tid));
2003 }
2004 
2005 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2006  kmp_internal_control_t *new_icvs, ident_t *loc) {
2007  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2008 
2009  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2010  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2011 
2012 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2013  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2014  implicit task has this data before this function is called. */
2015 #if KMP_BARRIER_ICV_PULL
2016  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
2017  remains untouched), where all of the worker threads can access them and
2018  make their own copies after the barrier. */
2019  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2020  // allocated at this point
2021  copy_icvs(
2022  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2023  new_icvs);
2024  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2025  team->t.t_threads[0], team));
2026 #elif KMP_BARRIER_ICV_PUSH
2027  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2028  // done here.
2029  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2030  team->t.t_threads[0], team));
2031 #else
2032  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2033  // time.
2034  ngo_load(new_icvs);
2035  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2036  // allocated at this point
2037  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2038  // TODO: GEH - pass in better source location info since usually NULL here
2039  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2040  f, team->t.t_threads[f], team));
2041  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2042  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2043  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2044  f, team->t.t_threads[f], team));
2045  }
2046  ngo_sync();
2047 #endif // KMP_BARRIER_ICV_PULL
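  /* Informal summary of the three compile-time strategies above:
       KMP_BARRIER_ICV_PULL : master publishes the ICVs once in th_fixed_icvs
                              here (O(1)); each worker copies them for itself
                              in __kmp_fork_barrier().
       KMP_BARRIER_ICV_PUSH : the ICVs travel with the fork-barrier release,
                              so nothing is copied here.
       default (linear copy): this function writes the ICVs into every
                              worker's implicit task, O(nthreads) work. */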
2048 }