14 #include "kmp_wait_release.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
25 #include "tsan_annotations.h"
27 #if KMP_MIC && USE_NGO_STORES
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
40 void __kmp_print_structure(
void);
// Gather phase of the linear (flat) barrier.  Each worker posts its arrival
// on its own b_arrived flag for the primary thread; the primary spins on
// every worker's b_arrived flag in turn and, when `reduce` is non-null,
// folds each worker's reduce_data into its own.  The `cancellable` template
// parameter selects a cancellation-aware wait (kmp_flag_64<true, false>)
// whose wait() can return early -- presumably the bool return reports
// cancellation; the surrounding lines are missing from this extraction, so
// confirm against the upstream file.
// NOTE(review): this chunk is a lossy extraction -- interior statements and
// closing braces are missing throughout this function.
45 template <
bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
48 void (*reduce)(
void *,
void *) USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
49 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50 kmp_team_t *team = this_thr->th.th_team;
51 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52 kmp_info_t **other_threads = team->t.t_threads;
56 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57 gtid, team->t.t_id, tid, bt));
58 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
// Record an arrival timestamp for ITT fork/join frame reporting (modes 2/3).
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
62 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64 __itt_get_timestamp();
// Worker path: signal own arrival so the primary (tid 0) can observe it.
69 if (!KMP_MASTER_TID(tid)) {
71 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72 "arrived(%p): %llu => %llu\n",
73 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
80 ANNOTATE_BARRIER_BEGIN(this_thr);
// The flag names the primary as waiter so it can be woken if sleeping.
81 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
// Primary path: compute the state each worker must reach, then wait on
// every worker's b_arrived flag in turn.
84 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85 int nproc = this_thr->th.th_team_nproc;
88 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 for (i = 1; i < nproc; ++i) {
// Prefetch the next worker's flag cache line while waiting on this one.
95 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
97 KA_TRACE(20, (
"__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98 "arrived(%p) == %llu\n",
99 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
101 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
// Cancellable variant: wait() may return early (on cancellation).
105 kmp_flag_64<true, false> flag(
106 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
107 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
// Non-cancellable variant: plain blocking wait.
110 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
112 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
114 ANNOTATE_BARRIER_END(other_threads[i]);
// Fold the worker's minimum bar time into ours for frame reporting.
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
118 if (__kmp_forkjoin_frames_mode == 2) {
119 this_thr->th.th_bar_min_time = KMP_MIN(
120 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
125 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
// Combine the worker's reduction data into the primary's.
128 ANNOTATE_REDUCE_AFTER(reduce);
129 OMPT_REDUCTION_DECL(this_thr, gtid);
130 OMPT_REDUCTION_BEGIN;
131 (*reduce)(this_thr->th.th_local.reduce_data,
132 other_threads[i]->th.th_local.reduce_data);
134 ANNOTATE_REDUCE_BEFORE(reduce);
135 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Only the primary advances the team-wide arrived counter.
139 team_bar->b_arrived = new_state;
140 KA_TRACE(20, (
"__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141 "arrived(%p) = %llu\n",
142 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
147 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148 gtid, team->t.t_id, tid, bt));
// Release phase of the linear (flat) barrier.  The primary thread bumps each
// worker's b_go flag (optionally pushing ICVs to all implicit tasks first);
// each worker spins on its own b_go flag until the primary releases it, then
// resets the flag for the next barrier.  The `cancellable` parameter selects
// a cancellation-aware wait, as in the gather template.
// NOTE(review): lossy extraction -- interior statements and closing braces
// are missing throughout this function.
152 template <
bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
155 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
156 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
// Primary path: release every worker.
160 if (KMP_MASTER_TID(tid)) {
162 kmp_uint32 nproc = this_thr->th.th_team_nproc;
163 kmp_info_t **other_threads;
165 team = __kmp_threads[gtid]->th.th_team;
166 KMP_DEBUG_ASSERT(team != NULL);
167 other_threads = team->t.t_threads;
169 KA_TRACE(20, (
"__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
171 gtid, team->t.t_id, tid, bt));
// Push the primary's ICVs into every worker's implicit task before
// releasing them, using NGO (non-globally-ordered) stores on KMP_MIC.
174 #if KMP_BARRIER_ICV_PUSH
176 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177 if (propagate_icvs) {
178 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179 for (i = 1; i < nproc; ++i) {
180 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
182 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183 &team->t.t_implicit_task_taskdata[0].td_icvs);
// Bump each worker's b_go flag to let it through the barrier.
191 for (i = 1; i < nproc; ++i) {
195 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
199 (
"__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200 "go(%p): %u => %u\n",
201 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203 other_threads[i]->th.th_bar[bt].bb.b_go,
204 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205 ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
// Worker path: wait for own b_go flag to reach KMP_BARRIER_STATE_BUMP.
212 KA_TRACE(20, (
"__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
// Cancellable wait may return early; plain wait blocks until released.
215 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
216 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
219 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
220 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
222 ANNOTATE_BARRIER_END(this_thr);
// ITT bookkeeping around the fork/join barrier for task-start/finish events.
223 #if USE_ITT_BUILD && USE_ITT_NOTIFY
224 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
229 __kmp_itt_task_starting(itt_sync_obj);
231 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
235 if (itt_sync_obj != NULL)
237 __kmp_itt_task_finished(itt_sync_obj);
// Early exit when the runtime is shutting down at a fork/join barrier.
241 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
// tid/team may have changed while asleep; refresh them before proceeding.
245 tid = __kmp_tid_from_gtid(gtid);
246 team = __kmp_threads[gtid]->th.th_team;
248 KMP_DEBUG_ASSERT(team != NULL);
// Reset own b_go for the next barrier episode.
249 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
251 (
"__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
252 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
257 (
"__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
258 gtid, team->t.t_id, tid, bt));
262 static void __kmp_linear_barrier_gather(
263 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
264 void (*reduce)(
void *,
void *) USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
265 __kmp_linear_barrier_gather_template<false>(
266 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269 static bool __kmp_linear_barrier_gather_cancellable(
270 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
271 void (*reduce)(
void *,
void *) USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
272 return __kmp_linear_barrier_gather_template<true>(
273 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276 static void __kmp_linear_barrier_release(
277 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
278 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
279 __kmp_linear_barrier_release_template<false>(
280 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283 static bool __kmp_linear_barrier_release_cancellable(
284 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
285 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
286 return __kmp_linear_barrier_release_template<true>(
287 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
// Gather phase of the tree barrier.  Threads form a tree with fan-in
// `branch_factor` (1 << branch_bits); each thread waits for its children
// (tids (tid << branch_bits) + 1 .. + branch_factor), folds in their
// reduction data when `reduce` is non-null, then signals its own parent.
// The primary (tid 0) finally advances the team-wide arrived counter.
// NOTE(review): lossy extraction -- interior statements and closing braces
// are missing throughout this function.
292 __kmp_tree_barrier_gather(
enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
293 int tid,
void (*reduce)(
void *,
void *)
294 USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
295 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
296 kmp_team_t *team = this_thr->th.th_team;
297 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
298 kmp_info_t **other_threads = team->t.t_threads;
299 kmp_uint32 nproc = this_thr->th.th_team_nproc;
300 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
301 kmp_uint32 branch_factor = 1 << branch_bits;
303 kmp_uint32 child_tid;
304 kmp_uint64 new_state;
307 20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
308 gtid, team->t.t_id, tid, bt));
309 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
// Record arrival timestamp for ITT fork/join frame reporting (modes 2/3).
311 #if USE_ITT_BUILD && USE_ITT_NOTIFY
313 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
314 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
315 __itt_get_timestamp();
// First child of this node in the tree; if it exists, wait on children.
320 child_tid = (tid << branch_bits) + 1;
321 if (child_tid < nproc) {
323 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326 kmp_info_t *child_thr = other_threads[child_tid];
327 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
// Prefetch the next child's flag cache line while waiting on this one.
330 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
332 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
336 "arrived(%p) == %llu\n",
337 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
338 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
// Block until this child posts its arrival.
340 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
341 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
342 ANNOTATE_BARRIER_END(child_thr);
343 #if USE_ITT_BUILD && USE_ITT_NOTIFY
346 if (__kmp_forkjoin_frames_mode == 2) {
347 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
348 child_thr->th.th_bar_min_time);
353 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
354 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
355 team->t.t_id, child_tid));
// Combine the child's reduction data into this thread's.
356 ANNOTATE_REDUCE_AFTER(reduce);
357 OMPT_REDUCTION_DECL(this_thr, gtid);
358 OMPT_REDUCTION_BEGIN;
359 (*reduce)(this_thr->th.th_local.reduce_data,
360 child_thr->th.th_local.reduce_data);
362 ANNOTATE_REDUCE_BEFORE(reduce);
363 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
367 }
while (child <= branch_factor && child_tid < nproc);
// Non-primary: signal own parent in the tree.
370 if (!KMP_MASTER_TID(tid)) {
371 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
375 "arrived(%p): %llu => %llu\n",
376 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
377 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
378 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
384 ANNOTATE_BARRIER_BEGIN(this_thr);
385 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
// Primary: publish the team's new arrived state (bumped directly when this
// node had no children and new_state was never computed).
390 team->t.t_bar[bt].b_arrived = new_state;
392 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
393 KA_TRACE(20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
394 "arrived(%p) = %llu\n",
395 gtid, team->t.t_id, tid, team->t.t_id,
396 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
400 gtid, team->t.t_id, tid, bt));
// Release phase of the tree barrier.  A non-primary thread first waits for
// its own b_go flag; every thread then releases its children (tids
// (tid << branch_bits) + 1 .. + branch_factor), pushing ICVs into each
// child's implicit task first when propagate_icvs is set.
// NOTE(review): lossy extraction -- interior statements and closing braces
// are missing throughout this function.
403 static void __kmp_tree_barrier_release(
404 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
405 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
406 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
408 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
410 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
411 kmp_uint32 branch_factor = 1 << branch_bits;
413 kmp_uint32 child_tid;
// Worker path: wait until the parent bumps this thread's b_go flag.
418 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
419 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
421 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
422 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
423 ANNOTATE_BARRIER_END(this_thr);
// ITT bookkeeping around the fork/join barrier for task-start/finish events.
424 #if USE_ITT_BUILD && USE_ITT_NOTIFY
425 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
430 __kmp_itt_task_starting(itt_sync_obj);
432 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
436 if (itt_sync_obj != NULL)
438 __kmp_itt_task_finished(itt_sync_obj);
// Early exit when the runtime is shutting down at a fork/join barrier.
442 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
// team/tid may have changed while asleep; refresh before proceeding.
446 team = __kmp_threads[gtid]->th.th_team;
447 KMP_DEBUG_ASSERT(team != NULL);
448 tid = __kmp_tid_from_gtid(gtid);
// Reset own b_go for the next barrier episode.
450 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
452 (
"__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
453 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
// Primary path: no wait was needed; just pick up the team.
456 team = __kmp_threads[gtid]->th.th_team;
457 KMP_DEBUG_ASSERT(team != NULL);
458 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
460 gtid, team->t.t_id, tid, bt));
// Release children, if any.
462 nproc = this_thr->th.th_team_nproc;
463 child_tid = (tid << branch_bits) + 1;
465 if (child_tid < nproc) {
466 kmp_info_t **other_threads = team->t.t_threads;
470 kmp_info_t *child_thr = other_threads[child_tid];
471 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
// Prefetch the next child's b_go cache line while releasing this one.
474 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
476 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
// Push ICVs into the child's implicit task before waking it.
479 #if KMP_BARRIER_ICV_PUSH
481 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
482 if (propagate_icvs) {
483 __kmp_init_implicit_task(team->t.t_ident,
484 team->t.t_threads[child_tid], team,
486 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
487 &team->t.t_implicit_task_taskdata[0].td_icvs);
492 (
"__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
493 "go(%p): %u => %u\n",
494 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
495 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
496 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Bump the child's b_go flag to let it through the barrier.
498 ANNOTATE_BARRIER_BEGIN(child_thr);
499 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
503 }
while (child <= branch_factor && child_tid < nproc);
506 20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
507 gtid, team->t.t_id, tid, bt));
// Gather phase of the hypercube-embedded tree barrier.  Iterates over levels
// (offset doubles by branch_bits each round): at each level a thread either
// signals its parent and leaves the loop, or waits for and reduces the
// children at that level.  new_state is computed lazily (stays
// KMP_BARRIER_UNUSED_STATE for pure leaf threads).
// NOTE(review): lossy extraction -- interior statements and closing braces
// are missing throughout this function.
512 __kmp_hyper_barrier_gather(
enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
513 int tid,
void (*reduce)(
void *,
void *)
514 USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
515 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
516 kmp_team_t *team = this_thr->th.th_team;
517 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
518 kmp_info_t **other_threads = team->t.t_threads;
519 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
520 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
521 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
522 kmp_uint32 branch_factor = 1 << branch_bits;
528 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
529 gtid, team->t.t_id, tid, bt));
530 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
// Record arrival timestamp for ITT fork/join frame reporting (modes 2/3).
532 #if USE_ITT_BUILD && USE_ITT_NOTIFY
534 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
535 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
536 __itt_get_timestamp();
// p_flag is this thread's own arrival flag, released to the parent below.
541 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
542 for (level = 0, offset = 1; offset < num_threads;
543 level += branch_bits, offset <<= branch_bits) {
545 kmp_uint32 child_tid;
// Non-zero position within this level's group => this thread is a child:
// signal the group's parent and stop climbing.
547 if (((tid >> level) & (branch_factor - 1)) != 0) {
548 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
552 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
553 "arrived(%p): %llu => %llu\n",
554 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
555 team->t.t_id, parent_tid, &thr_bar->b_arrived,
557 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
562 ANNOTATE_BARRIER_BEGIN(this_thr);
563 p_flag.set_waiter(other_threads[parent_tid]);
// Parent at this level: lazily compute the target state, then wait for
// each child of this level in turn.
569 if (new_state == KMP_BARRIER_UNUSED_STATE)
570 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
571 for (child = 1, child_tid = tid + (1 << level);
572 child < branch_factor && child_tid < num_threads;
573 child++, child_tid += (1 << level)) {
574 kmp_info_t *child_thr = other_threads[child_tid];
575 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
// Prefetch the next child's flag cache line while waiting on this one.
577 kmp_uint32 next_child_tid = child_tid + (1 << level);
579 if (child + 1 < branch_factor && next_child_tid < num_threads)
581 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
584 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
585 "arrived(%p) == %llu\n",
586 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
587 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
588 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
589 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
590 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
591 ANNOTATE_BARRIER_END(child_thr);
593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
596 if (__kmp_forkjoin_frames_mode == 2) {
597 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
598 child_thr->th.th_bar_min_time);
603 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
604 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
605 team->t.t_id, child_tid));
// Combine the child's reduction data into this thread's.
606 ANNOTATE_REDUCE_AFTER(reduce);
607 OMPT_REDUCTION_DECL(this_thr, gtid);
608 OMPT_REDUCTION_BEGIN;
609 (*reduce)(this_thr->th.th_local.reduce_data,
610 child_thr->th.th_local.reduce_data);
612 ANNOTATE_REDUCE_BEFORE(reduce);
613 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Primary publishes the team's new arrived state (direct bump when
// new_state was never computed, i.e. the primary had no children).
618 if (KMP_MASTER_TID(tid)) {
620 if (new_state == KMP_BARRIER_UNUSED_STATE)
621 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
623 team->t.t_bar[bt].b_arrived = new_state;
624 KA_TRACE(20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
625 "arrived(%p) = %llu\n",
626 gtid, team->t.t_id, tid, team->t.t_id,
627 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
630 20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
631 gtid, team->t.t_id, tid, bt));
// Release phase of the hypercube-embedded tree barrier.  With
// KMP_REVERSE_HYPER_BAR defined, levels (and children within a level) are
// walked top-down / in reverse order of the gather phase; otherwise the same
// order as the gather is used.  ICVs propagate via thr_bar->th_fixed_icvs:
// the primary snapshots its ICVs, each parent copies them into its children
// before release, and each non-primary thread installs them into its own
// implicit task at the end.
// NOTE(review): lossy extraction -- interior statements and closing braces
// are missing throughout this function.
635 #define KMP_REVERSE_HYPER_BAR
636 static void __kmp_hyper_barrier_release(
637 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
638 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
639 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
641 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
642 kmp_info_t **other_threads;
643 kmp_uint32 num_threads;
644 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
645 kmp_uint32 branch_factor = 1 << branch_bits;
647 kmp_uint32 child_tid;
// Primary path: snapshot ICVs for push-down, then start releasing.
655 if (KMP_MASTER_TID(tid)) {
656 team = __kmp_threads[gtid]->th.th_team;
657 KMP_DEBUG_ASSERT(team != NULL);
658 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
660 gtid, team->t.t_id, tid, bt));
661 #if KMP_BARRIER_ICV_PUSH
662 if (propagate_icvs) {
663 copy_icvs(&thr_bar->th_fixed_icvs,
664 &team->t.t_implicit_task_taskdata[tid].td_icvs);
// Worker path: wait until the parent bumps this thread's b_go flag.
668 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
671 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673 ANNOTATE_BARRIER_END(this_thr);
// ITT bookkeeping around the fork/join barrier for task-start/finish events.
674 #if USE_ITT_BUILD && USE_ITT_NOTIFY
675 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
677 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
679 __kmp_itt_task_starting(itt_sync_obj);
681 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
684 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685 if (itt_sync_obj != NULL)
687 __kmp_itt_task_finished(itt_sync_obj);
// Early exit when the runtime is shutting down at a fork/join barrier.
691 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
// team/tid may have changed while asleep; refresh before proceeding.
695 team = __kmp_threads[gtid]->th.th_team;
696 KMP_DEBUG_ASSERT(team != NULL);
697 tid = __kmp_tid_from_gtid(gtid);
// Reset own b_go for the next barrier episode.
699 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
699 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
701 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
705 num_threads = this_thr->th.th_team_nproc;
706 other_threads = team->t.t_threads;
// Reverse traversal: first find this thread's topmost parent level, then
// walk back down releasing deeper levels first.
708 #ifdef KMP_REVERSE_HYPER_BAR
710 for (level = 0, offset = 1;
711 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
712 level += branch_bits, offset <<= branch_bits)
716 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
717 level -= branch_bits, offset >>= branch_bits)
// Forward traversal: same level order as the gather phase.
720 for (level = 0, offset = 1; offset < num_threads;
721 level += branch_bits, offset <<= branch_bits)
// Children of this level are visited highest-tid-first in reverse mode.
724 #ifdef KMP_REVERSE_HYPER_BAR
727 child = num_threads >> ((level == 0) ? level : level - 1);
728 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
729 child_tid = tid + (child << level);
730 child >= 1; child--, child_tid -= (1 << level))
732 if (((tid >> level) & (branch_factor - 1)) != 0)
737 for (child = 1, child_tid = tid + (1 << level);
738 child < branch_factor && child_tid < num_threads;
739 child++, child_tid += (1 << level))
742 if (child_tid >= num_threads)
745 kmp_info_t *child_thr = other_threads[child_tid];
746 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
// Prefetch the next child's b_go cache line while releasing this one
// (next child is lower-tid in reverse mode, higher-tid otherwise).
748 kmp_uint32 next_child_tid = child_tid - (1 << level);
750 #ifdef KMP_REVERSE_HYPER_BAR
751 if (child - 1 >= 1 && next_child_tid < num_threads)
753 if (child + 1 < branch_factor && next_child_tid < num_threads)
756 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
// Hand the fixed ICV snapshot down to the child before waking it.
759 #if KMP_BARRIER_ICV_PUSH
761 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
766 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767 "go(%p): %u => %u\n",
768 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Bump the child's b_go flag to let it through the barrier.
772 ANNOTATE_BARRIER_BEGIN(child_thr);
773 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
// Non-primary threads install the pushed-down ICVs into their own
// implicit task once released.
778 #if KMP_BARRIER_ICV_PUSH
779 if (propagate_icvs &&
780 !KMP_MASTER_TID(tid)) {
781 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
783 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784 &thr_bar->th_fixed_icvs);
789 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790 gtid, team->t.t_id, tid, bt));
// (Re)initialize a thread's hierarchical-barrier state (thr_bar) for the
// current team.  Recomputes the machine hierarchy placement, this thread's
// level and parent tid, its parent_bar pointer, and its leaf-child count and
// leaf_state bitmask whenever the team, team size, or tid has changed since
// the last barrier.  Return value's meaning is not visible in this
// extraction -- presumably whether the team changed; confirm upstream.
// NOTE(review): lossy extraction -- interior statements and closing braces
// are missing throughout this function.
803 static bool __kmp_init_hierarchical_barrier_thread(
enum barrier_type bt,
804 kmp_bstate_t *thr_bar,
805 kmp_uint32 nproc,
int gtid,
806 int tid, kmp_team_t *team) {
// Detect what changed since the last time this thread hit a barrier.
808 bool uninitialized = thr_bar->team == NULL;
809 bool team_changed = team != thr_bar->team;
810 bool team_sz_changed = nproc != thr_bar->nproc;
811 bool tid_changed = tid != thr_bar->old_tid;
// Rebuild the machine-hierarchy description if the size is new.
814 if (uninitialized || team_sz_changed) {
815 __kmp_get_hierarchy(nproc, thr_bar);
// Recompute this thread's level and parent within the hierarchy.
818 if (uninitialized || team_sz_changed || tid_changed) {
819 thr_bar->my_level = thr_bar->depth - 1;
820 thr_bar->parent_tid = -1;
824 while (d < thr_bar->depth) {
// Second-to-top level reports directly to the primary (tid 0).
827 if (d == thr_bar->depth - 2) {
828 thr_bar->parent_tid = 0;
829 thr_bar->my_level = d;
831 }
// Otherwise the parent is the start of this thread's group at level d+1.
else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
834 thr_bar->parent_tid = tid - rem;
835 thr_bar->my_level = d;
// Offset of this thread within its parent's flag (byte slot 7 - k).
841 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
842 (thr_bar->skip_per_level[thr_bar->my_level])),
844 thr_bar->old_tid = tid;
845 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
846 thr_bar->team = team;
847 thr_bar->parent_bar =
848 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
// Team pointer / parent_bar also refresh when only the team changed.
850 if (uninitialized || team_changed || tid_changed) {
851 thr_bar->team = team;
852 thr_bar->parent_bar =
853 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
// Recompute leaf-child bookkeeping: leaf kids are clamped to the team
// size, and leaf_state sets one byte per leaf child (high bytes first).
856 if (uninitialized || team_sz_changed || tid_changed) {
857 thr_bar->nproc = nproc;
858 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
859 if (thr_bar->my_level == 0)
860 thr_bar->leaf_kids = 0;
861 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
862 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
863 thr_bar->leaf_state = 0;
864 for (
int i = 0; i < thr_bar->leaf_kids; ++i)
865 ((
char *)&(thr_bar->leaf_state))[7 - i] = 1;
// Gather phase of the hierarchical (machine-topology-aware) barrier.  Leaf
// children under the infinite-blocktime oncore scheme all set distinct bytes
// of their parent's single b_arrived word, so the parent waits once on the
// combined leaf_state; deeper levels are gathered per-child with 64-bit
// flags.  A non-primary thread then signals its own parent (either a plain
// 64-bit flag or, in oncore mode, its byte offset in the parent's flag).
// NOTE(review): lossy extraction -- interior statements and closing braces
// are missing throughout this function.
870 static void __kmp_hierarchical_barrier_gather(
871 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
872 void (*reduce)(
void *,
void *) USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
873 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874 kmp_team_t *team = this_thr->th.th_team;
875 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876 kmp_uint32 nproc = this_thr->th.th_team_nproc;
877 kmp_info_t **other_threads = team->t.t_threads;
878 kmp_uint64 new_state;
// The oncore (single-word) scheme is disabled inside teams constructs.
880 int level = team->t.t_level;
882 ->th.th_teams_microtask)
883 if (this_thr->th.th_teams_size.nteams > 1)
886 thr_bar->use_oncore_barrier = 1;
888 thr_bar->use_oncore_barrier = 0;
890 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
892 gtid, team->t.t_id, tid, bt));
893 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
// Record arrival timestamp for ITT fork/join frame reporting (modes 2/3).
895 #if USE_ITT_BUILD && USE_ITT_NOTIFY
897 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
// Refresh hierarchy placement in case team/size/tid changed.
902 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
// Internal node: wait for children before signaling upward.
905 if (thr_bar->my_level) {
908 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
// Oncore scheme (infinite blocktime only): leaf kids each set one byte
// of this thread's own b_arrived word; wait for all bytes at once.
909 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910 thr_bar->use_oncore_barrier) {
911 if (thr_bar->leaf_kids) {
913 kmp_uint64 leaf_state =
915 ? thr_bar->b_arrived | thr_bar->leaf_state
916 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
919 gtid, team->t.t_id, tid));
920 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
921 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
// Reduce all leaf children in one pass after the combined wait.
923 ANNOTATE_REDUCE_AFTER(reduce);
924 OMPT_REDUCTION_DECL(this_thr, gtid);
925 OMPT_REDUCTION_BEGIN;
926 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
928 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
930 gtid, team->t.t_id, tid,
931 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
933 ANNOTATE_BARRIER_END(other_threads[child_tid]);
934 (*reduce)(this_thr->th.th_local.reduce_data,
935 other_threads[child_tid]->th.th_local.reduce_data);
938 ANNOTATE_REDUCE_BEFORE(reduce);
939 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Atomically clear the leaf-state bytes for the next episode.
942 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
// Non-leaf levels (d >= 1) still use per-child 64-bit flag waits.
945 for (kmp_uint32 d = 1; d < thr_bar->my_level;
947 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948 skip = thr_bar->skip_per_level[d];
951 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952 kmp_info_t *child_thr = other_threads[child_tid];
953 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
956 "arrived(%p) == %llu\n",
957 gtid, team->t.t_id, tid,
958 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959 child_tid, &child_bar->b_arrived, new_state));
960 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
961 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962 ANNOTATE_BARRIER_END(child_thr);
964 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
966 gtid, team->t.t_id, tid,
967 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
969 ANNOTATE_REDUCE_AFTER(reduce);
970 (*reduce)(this_thr->th.th_local.reduce_data,
971 child_thr->th.th_local.reduce_data);
972 ANNOTATE_REDUCE_BEFORE(reduce);
973 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Non-oncore scheme: every level (d >= 0) uses per-child flag waits.
978 for (kmp_uint32 d = 0; d < thr_bar->my_level;
980 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981 skip = thr_bar->skip_per_level[d];
984 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985 kmp_info_t *child_thr = other_threads[child_tid];
986 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
989 "arrived(%p) == %llu\n",
990 gtid, team->t.t_id, tid,
991 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992 child_tid, &child_bar->b_arrived, new_state));
993 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
994 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995 ANNOTATE_BARRIER_END(child_thr);
997 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
999 gtid, team->t.t_id, tid,
1000 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1002 ANNOTATE_REDUCE_AFTER(reduce);
1003 (*reduce)(this_thr->th.th_local.reduce_data,
1004 child_thr->th.th_local.reduce_data);
1005 ANNOTATE_REDUCE_BEFORE(reduce);
1006 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Non-primary: signal own parent.
1014 if (!KMP_MASTER_TID(tid)) {
1015 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017 gtid, team->t.t_id, tid,
1018 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
// Plain 64-bit flag to the parent unless this is a leaf in oncore mode.
1024 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025 !thr_bar->use_oncore_barrier) {
1027 ANNOTATE_BARRIER_BEGIN(this_thr);
1028 kmp_flag_64<> flag(&thr_bar->b_arrived,
1029 other_threads[thr_bar->parent_tid]);
// Oncore leaf: set this thread's byte slot in the parent's flag word.
1033 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1034 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1035 thr_bar->offset + 1);
1036 flag.set_waiter(other_threads[thr_bar->parent_tid]);
// Primary: publish the team's new arrived state.
1040 team->t.t_bar[bt].b_arrived = new_state;
1041 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1042 "arrived(%p) = %llu\n",
1043 gtid, team->t.t_id, tid, team->t.t_id,
1044 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1047 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1048 "barrier type %d\n",
1049 gtid, team->t.t_id, tid, bt));
// Release phase of the hierarchical (topology-aware) barrier: wake this
// thread's children after the gather phase completed.  Workers first wait on
// their own b_go flag (or, on the oncore fast path, on a byte of the parent's
// b_go), then propagate the release down their subtree level by level.
// NOTE(review): the embedded original line numbers in this extract are
// non-contiguous — lines were dropped during extraction; reconcile against the
// upstream file before compiling or editing logic here.
1052 static void __kmp_hierarchical_barrier_release(
1053 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
1054 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
1055 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1057 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
// team_change tracks whether __kmp_init_hierarchical_barrier_thread (below)
// observed a new team/topology for this thread.
1059 bool team_change =
false;
1061 if (KMP_MASTER_TID(tid)) {
1062 team = __kmp_threads[gtid]->th.th_team;
1063 KMP_DEBUG_ASSERT(team != NULL);
1064 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1065 "entered barrier type %d\n",
1066 gtid, team->t.t_id, tid, bt));
// Worker path: choose between the general 64-bit flag wait and the oncore
// (single-core, KMP_MAX_BLOCKTIME) fast path.
1069 if (!thr_bar->use_oncore_barrier ||
1070 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1071 thr_bar->team == NULL) {
// General case: wait on this thread's own b_go flag.
1073 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1074 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1075 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1076 ANNOTATE_BARRIER_END(this_thr);
// Reset b_go for the next barrier episode.
1077 TCW_8(thr_bar->b_go,
1078 KMP_INIT_BARRIER_STATE);
// Oncore path: wait on one byte of the parent's b_go word instead.
1082 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1083 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1084 thr_bar->offset + 1, bt,
1085 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1086 flag.wait(this_thr, TRUE);
1087 if (thr_bar->wait_flag ==
1088 KMP_BARRIER_SWITCHING) {
// Thread was switched to waiting on its own flag mid-wait; reset it.
1089 TCW_8(thr_bar->b_go,
1090 KMP_INIT_BARRIER_STATE);
// Clear this thread's byte inside the parent's b_go word.
1092 (RCAST(
volatile char *,
1093 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1096 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
// Early exit on library shutdown at the fork/join barrier.
1098 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
// The team pointer may have changed while we slept; re-read it and tid.
1101 team = __kmp_threads[gtid]->th.th_team;
1102 KMP_DEBUG_ASSERT(team != NULL);
1103 tid = __kmp_tid_from_gtid(gtid);
1107 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1108 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1112 nproc = this_thr->th.th_team_nproc;
1113 int level = team->t.t_level;
// Teams-construct special cases: adjust level / oncore usage when this
// thread participates in a league of teams.
1114 if (team->t.t_threads[0]
1115 ->th.th_teams_microtask) {
1116 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1117 this_thr->th.th_teams_level == level)
1119 if (this_thr->th.th_teams_size.nteams > 1)
1123 thr_bar->use_oncore_barrier = 1;
1125 thr_bar->use_oncore_barrier = 0;
// (Re)initialize this thread's position in the hierarchy; remember the old
// leaf bookkeeping so newly-added leaves can be handled below.
1129 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1130 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1131 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1137 #if KMP_BARRIER_ICV_PUSH
// ICV propagation: either copy fresh ICVs into our fixed-ICV slot (master),
// pull from the parent on the oncore path, or both cache and copy.
1138 if (propagate_icvs) {
1139 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1143 copy_icvs(&thr_bar->th_fixed_icvs,
1144 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1145 }
else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1146 thr_bar->use_oncore_barrier) {
1147 if (!thr_bar->my_level)
1150 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1151 &thr_bar->parent_bar->th_fixed_icvs);
1154 if (thr_bar->my_level)
1156 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1158 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1159 &thr_bar->parent_bar->th_fixed_icvs);
// Now release children, if this thread is an interior node (my_level > 0).
1165 if (thr_bar->my_level) {
1166 kmp_int32 child_tid;
1168 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1169 thr_bar->use_oncore_barrier) {
1170 if (KMP_MASTER_TID(tid)) {
// Master sets its own b_go so leaf children polling its word see it.
1173 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
// ngo_* use MIC non-globally-ordered stores when available (see header).
1176 ngo_load(&thr_bar->th_fixed_icvs);
// Release non-leaf children spaced by skip_per_level[1].
1179 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (
int)nproc;
1180 child_tid += thr_bar->skip_per_level[1]) {
1181 kmp_bstate_t *child_bar =
1182 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1183 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1184 "releasing T#%d(%d:%d)"
1185 " go(%p): %u => %u\n",
1186 gtid, team->t.t_id, tid,
1187 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1188 child_tid, &child_bar->b_go, child_bar->b_go,
1189 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Store ICVs to the child; on the oncore path this also acts as the go.
1192 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1196 TCW_8(thr_bar->b_go,
1197 KMP_INIT_BARRIER_STATE);
// Release leaf children; handle leaves added since the last episode
// (team size grew) with individual flag releases.
1199 if (thr_bar->leaf_kids) {
1202 old_leaf_kids < thr_bar->leaf_kids) {
1203 if (old_leaf_kids) {
1204 thr_bar->b_go |= old_leaf_state;
1207 last = tid + thr_bar->skip_per_level[1];
1210 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1212 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1213 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1216 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1217 " T#%d(%d:%d) go(%p): %u => %u\n",
1218 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1219 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1220 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1222 ANNOTATE_BARRIER_BEGIN(child_thr);
1223 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
// Common case: one OR into b_go releases all leaf children at once.
1228 thr_bar->b_go |= thr_bar->leaf_state;
// Non-oncore path: release children level by level, deepest last.
1232 for (
int d = thr_bar->my_level - 1; d >= 0;
1234 last = tid + thr_bar->skip_per_level[d + 1];
1235 kmp_uint32 skip = thr_bar->skip_per_level[d];
1238 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1239 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1240 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1241 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1242 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1243 gtid, team->t.t_id, tid,
1244 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1245 child_tid, &child_bar->b_go, child_bar->b_go,
1246 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1248 ANNOTATE_BARRIER_BEGIN(child_thr);
1249 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1254 #if KMP_BARRIER_ICV_PUSH
// Non-master threads copy the pushed ICVs into their implicit task.
1255 if (propagate_icvs && !KMP_MASTER_TID(tid))
1257 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1258 &thr_bar->th_fixed_icvs);
1261 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1262 "barrier type %d\n",
1263 gtid, team->t.t_id, tid, bt));
// is_cancellable<b>: compile-time switch used by __kmp_barrier_template.
// The <true> specialization carries a runtime bool recording whether the
// barrier was cancelled; the <false> specialization is an empty stand-in
// whose assignments are no-ops and which always converts to false, so the
// non-cancellable instantiation compiles away all cancellation bookkeeping.
// NOTE(review): member declarations/closing braces appear truncated in this
// extract (embedded line numbers skip); verify against upstream.
1271 template <
bool cancellable>
struct is_cancellable {};
1272 template <>
struct is_cancellable<true> {
1274 is_cancellable() : value(false) {}
1275 is_cancellable(
bool b) : value(b) {}
1276 is_cancellable &operator=(
bool b) {
1280 operator bool()
const {
return value; }
1282 template <>
struct is_cancellable<false> {
// Assignment is a no-op in the non-cancellable case.
1283 is_cancellable &operator=(
bool b) {
return *
this; }
1284 constexpr
operator bool()
const {
return false; }
// Internal entry for an explicit/split barrier.  Dispatches the gather and
// (unless split) release phases to the configured pattern (linear / tree /
// hyper / hierarchical), handles tasking, ITT profiling frames, OMPT
// callbacks, and — when instantiated with cancellable=true — OpenMP
// cancellation.  Returns nonzero iff the barrier was cancelled.
// NOTE(review): embedded line numbers skip repeatedly — statements (e.g. the
// cancellable-vs-switch #if/#else arms, 'status' computation) are missing
// from this extract; do not edit logic from this copy.
1295 template <
bool cancellable = false>
1296 static int __kmp_barrier_template(
enum barrier_type bt,
int gtid,
int is_split,
1297 size_t reduce_size,
void *reduce_data,
1298 void (*reduce)(
void *,
void *)) {
1299 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1300 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1301 int tid = __kmp_tid_from_gtid(gtid);
1302 kmp_info_t *this_thr = __kmp_threads[gtid];
1303 kmp_team_t *team = this_thr->th.th_team;
// Empty struct when cancellable == false (see is_cancellable above).
1305 is_cancellable<cancellable> cancelled;
1306 #if OMPT_SUPPORT && OMPT_OPTIONAL
1307 ompt_data_t *my_task_data;
1308 ompt_data_t *my_parallel_data;
1309 void *return_address;
1310 ompt_sync_region_t barrier_kind;
1313 KA_TRACE(15, (
"__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1314 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1316 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
// OMPT: fire sync_region / sync_region_wait "begin" callbacks.
1318 if (ompt_enabled.enabled) {
1320 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1321 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1322 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1323 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1324 if (ompt_enabled.ompt_callback_sync_region) {
1325 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1326 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1329 if (ompt_enabled.ompt_callback_sync_region_wait) {
1330 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1331 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1338 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
// Real barrier work only when the team is not serialized.
1342 if (!team->t.t_serialized) {
1345 void *itt_sync_obj = NULL;
1347 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1348 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1351 if (__kmp_tasking_mode == tskm_extra_barrier) {
1352 __kmp_tasking_barrier(team, this_thr, gtid);
1354 (
"__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1355 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
// Cache blocktime ICVs locally so spin-waits don't re-read team data.
1362 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1364 this_thr->th.th_team_bt_intervals =
1365 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1366 this_thr->th.th_team_bt_set =
1367 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1369 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1374 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1375 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
// Arrival counters (master vs worker) for stats/debugging.
1379 if (KMP_MASTER_TID(tid)) {
1380 team->t.t_bar[bt].b_master_arrived += 1;
1382 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1385 if (reduce != NULL) {
1387 this_thr->th.th_local.reduce_data = reduce_data;
1390 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1392 __kmp_task_team_setup(this_thr, team, 0);
// Gather phase: cancellable builds use the linear gather; otherwise
// dispatch on the configured gather pattern.
1395 cancelled = __kmp_linear_barrier_gather_cancellable(
1396 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1398 switch (__kmp_barrier_gather_pattern[bt]) {
1399 case bp_hyper_bar: {
1401 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1402 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1403 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1406 case bp_hierarchical_bar: {
1407 __kmp_hierarchical_barrier_gather(
1408 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1413 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1414 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1415 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1420 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
// Master-only: drain tasks, bump team arrival count, and clear any
// loop/sections cancellation request now that everyone has arrived.
1427 if (KMP_MASTER_TID(tid)) {
1429 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1430 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1435 team->t.t_bar[bt].b_team_arrived += 1;
1438 if (__kmp_omp_cancellation) {
1439 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1441 if (cancel_request == cancel_loop ||
1442 cancel_request == cancel_sections) {
1443 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1451 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1452 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1454 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// ITT frame/imbalance reporting (master, outermost active region only).
1456 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1457 __kmp_forkjoin_frames_mode &&
1458 (this_thr->th.th_teams_microtask == NULL ||
1459 this_thr->th.th_teams_size.nteams == 1) &&
1460 team->t.t_active_level == 1) {
1461 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1462 kmp_uint64 cur_time = __itt_get_timestamp();
1463 kmp_info_t **other_threads = team->t.t_threads;
1464 int nproc = this_thr->th.th_team_nproc;
1466 switch (__kmp_forkjoin_frames_mode) {
1468 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1470 this_thr->th.th_frame_time = cur_time;
1474 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1478 if (__itt_metadata_add_ptr) {
// Sum per-thread arrival deltas to report barrier imbalance.
1480 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1483 this_thr->th.th_bar_arrive_time = 0;
1484 for (i = 1; i < nproc; ++i) {
1485 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1486 other_threads[i]->th.th_bar_arrive_time = 0;
1488 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1490 (kmp_uint64)(reduce != NULL));
1492 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1494 this_thr->th.th_frame_time = cur_time;
1502 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1503 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
// Release phase: skipped for a split barrier's first half ('status' is
// computed in lines missing from this extract) or after cancellation.
1506 if ((status == 1 || !is_split) && !cancelled) {
1508 cancelled = __kmp_linear_barrier_release_cancellable(
1509 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1511 switch (__kmp_barrier_release_pattern[bt]) {
1512 case bp_hyper_bar: {
1513 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1514 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1515 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1518 case bp_hierarchical_bar: {
1519 __kmp_hierarchical_barrier_release(
1520 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1524 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1525 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1526 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1530 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1531 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1535 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1536 __kmp_task_team_sync(this_thr, team);
1544 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1545 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
// Serialized-team branch: still flush the task team if one exists.
1549 if (__kmp_tasking_mode != tskm_immediate_exec) {
1550 if (this_thr->th.th_task_team != NULL) {
1552 void *itt_sync_obj = NULL;
1553 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1554 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1555 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1559 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1561 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1562 __kmp_task_team_setup(this_thr, team, 0);
1565 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1566 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1571 KA_TRACE(15, (
"__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1572 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1573 __kmp_tid_from_gtid(gtid), status));
// OMPT: mirror the "begin" callbacks with "end" scope.
1576 if (ompt_enabled.enabled) {
1578 if (ompt_enabled.ompt_callback_sync_region_wait) {
1579 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1580 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1583 if (ompt_enabled.ompt_callback_sync_region) {
1584 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1585 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1589 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1592 ANNOTATE_BARRIER_END(&team->t.t_bar);
1595 return (
int)cancelled;
// Public barrier entry point: thin forwarder to the non-cancellable
// instantiation of __kmp_barrier_template.  (The trailing argument of the
// call and the closing brace are missing from this extract.)
1600 int __kmp_barrier(
enum barrier_type bt,
int gtid,
int is_split,
1601 size_t reduce_size,
void *reduce_data,
1602 void (*reduce)(
void *,
void *)) {
1603 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
// GOMP-compat cancellation barrier: runs a cancellable plain barrier when
// cancellation is enabled; on cancellation the master backs out its b_arrived
// bump so barrier state stays consistent.  Falls back to a plain
// non-cancellable barrier when cancellation is disabled.
1607 #if defined(KMP_GOMP_COMPAT)
1609 int __kmp_barrier_gomp_cancel(
int gtid) {
1610 if (__kmp_omp_cancellation) {
1611 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1614 int tid = __kmp_tid_from_gtid(gtid);
1615 kmp_info_t *this_thr = __kmp_threads[gtid];
1616 if (KMP_MASTER_TID(tid)) {
// Undo the arrival bump so the next barrier episode starts clean.
1620 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1621 KMP_BARRIER_STATE_BUMP;
1626 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
// Second half of a split barrier: the master runs only the release phase
// (gather already happened in the first half), using the configured release
// pattern; then every thread re-syncs its task team reference.
1631 void __kmp_end_split_barrier(
enum barrier_type bt,
int gtid) {
1632 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1633 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1634 int tid = __kmp_tid_from_gtid(gtid);
1635 kmp_info_t *this_thr = __kmp_threads[gtid];
1636 kmp_team_t *team = this_thr->th.th_team;
1638 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
// No work for a serialized team.
1639 if (!team->t.t_serialized) {
1640 if (KMP_MASTER_GTID(gtid)) {
1641 switch (__kmp_barrier_release_pattern[bt]) {
1642 case bp_hyper_bar: {
1643 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1644 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1645 FALSE USE_ITT_BUILD_ARG(NULL));
1648 case bp_hierarchical_bar: {
1649 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1650 FALSE USE_ITT_BUILD_ARG(NULL));
1654 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1655 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1656 FALSE USE_ITT_BUILD_ARG(NULL));
1660 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1661 FALSE USE_ITT_BUILD_ARG(NULL));
1664 if (__kmp_tasking_mode != tskm_immediate_exec) {
1665 __kmp_task_team_sync(this_thr, team);
1669 ANNOTATE_BARRIER_END(&team->t.t_bar);
// Join barrier at the end of a parallel region: gather-only (workers are
// released later by the fork barrier of the next region).  Handles OMPT
// implicit-barrier callbacks, tasking drain on the master, ITT frames, and
// stats.  NOTE(review): embedded line numbers skip — some statements
// (e.g. the status/team_id declarations) are missing from this extract.
1672 void __kmp_join_barrier(
int gtid) {
1673 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1674 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1675 kmp_info_t *this_thr = __kmp_threads[gtid];
1678 kmp_info_t *master_thread;
1684 void *itt_sync_obj = NULL;
1686 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1688 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1694 team = this_thr->th.th_team;
1695 nproc = this_thr->th.th_team_nproc;
1696 KMP_DEBUG_ASSERT((
int)nproc == team->t.t_nproc);
1697 tid = __kmp_tid_from_gtid(gtid);
1699 team_id = team->t.t_id;
1701 master_thread = this_thr->th.th_team_master;
// Sanity: our cached master must be thread 0 of the team.
1703 if (master_thread != team->t.t_threads[0]) {
1704 __kmp_print_structure();
1707 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1711 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1712 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1713 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1714 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1715 KA_TRACE(10, (
"__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1716 gtid, team_id, tid));
1718 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
// OMPT: implicit-barrier begin callbacks; workers stash their task data
// because the team may be freed before their "end" callback fires.
1720 if (ompt_enabled.enabled) {
1722 ompt_data_t *my_task_data;
1723 ompt_data_t *my_parallel_data;
1724 void *codeptr = NULL;
1725 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1726 if (KMP_MASTER_TID(ds_tid) &&
1727 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1728 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1729 codeptr = team->t.ompt_team_info.master_return_address;
1730 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1731 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1732 if (ompt_enabled.ompt_callback_sync_region) {
1733 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1734 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735 my_task_data, codeptr);
1737 if (ompt_enabled.ompt_callback_sync_region_wait) {
1738 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1739 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1740 my_task_data, codeptr);
1742 if (!KMP_MASTER_TID(ds_tid))
1743 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1745 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1749 if (__kmp_tasking_mode == tskm_extra_barrier) {
1750 __kmp_tasking_barrier(team, this_thr, gtid);
1751 KA_TRACE(10, (
"__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1755 if (__kmp_tasking_mode != tskm_immediate_exec) {
1756 KA_TRACE(20, (
"__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1757 "%p, th_task_team = %p\n",
1758 __kmp_gtid_from_thread(this_thr), team_id,
1759 team->t.t_task_team[this_thr->th.th_task_state],
1760 this_thr->th.th_task_team));
1761 KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1762 team->t.t_task_team[this_thr->th.th_task_state]);
// Cache blocktime ICVs locally (same pattern as __kmp_barrier_template).
1771 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1773 this_thr->th.th_team_bt_intervals =
1774 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1775 this_thr->th.th_team_bt_set =
1776 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1778 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1783 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1784 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
// Gather only — no release here; fork barrier of the next region releases.
1787 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1788 case bp_hyper_bar: {
1789 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1790 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1794 case bp_hierarchical_bar: {
1795 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1796 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1800 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1801 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1806 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1807 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
// Master: drain the task team, reset affinity-display flag, and (with
// stats) mark idle workers / wake sleepers so stats are consistent.
1815 if (KMP_MASTER_TID(tid)) {
1816 if (__kmp_tasking_mode != tskm_immediate_exec) {
1817 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1819 if (__kmp_display_affinity) {
1820 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1822 #if KMP_STATS_ENABLED
1826 for (
int i = 0; i < team->t.t_nproc; ++i) {
1827 kmp_info_t *team_thread = team->t.t_threads[i];
1828 if (team_thread == this_thr)
1830 team_thread->th.th_stats->setIdleFlag();
1831 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1832 team_thread->th.th_sleep_loc != NULL)
1833 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1834 team_thread->th.th_sleep_loc);
1838 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1839 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// ITT join-frame / imbalance reporting, same shape as in the plain barrier.
1844 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1845 __kmp_forkjoin_frames_mode &&
1846 (this_thr->th.th_teams_microtask == NULL ||
1847 this_thr->th.th_teams_size.nteams == 1) &&
1848 team->t.t_active_level == 1) {
1849 kmp_uint64 cur_time = __itt_get_timestamp();
1850 ident_t *loc = team->t.t_ident;
1851 kmp_info_t **other_threads = team->t.t_threads;
1852 int nproc = this_thr->th.th_team_nproc;
1854 switch (__kmp_forkjoin_frames_mode) {
1856 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1860 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1864 if (__itt_metadata_add_ptr) {
1866 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1869 this_thr->th.th_bar_arrive_time = 0;
1870 for (i = 1; i < nproc; ++i) {
1871 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1872 other_threads[i]->th.th_bar_arrive_time = 0;
1874 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1875 cur_time, delta, 0);
1877 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1879 this_thr->th.th_frame_time = cur_time;
1887 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1888 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1893 if (KMP_MASTER_TID(tid)) {
1896 (
"__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1897 gtid, team_id, tid, nproc));
1904 (
"__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1906 ANNOTATE_BARRIER_END(&team->t.t_bar);
// Fork barrier at the start of a parallel region: release-only (the matching
// gather was the previous join barrier).  The master verifies worker state
// and sets up tasking; workers wait for release, then pull ICVs (under
// KMP_BARRIER_ICV_PULL), re-sync task teams, and apply affinity.
// NOTE(review): embedded line numbers skip — some statements are missing
// from this extract; reconcile with upstream before editing logic.
1911 void __kmp_fork_barrier(
int gtid,
int tid) {
1912 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1913 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1914 kmp_info_t *this_thr = __kmp_threads[gtid];
// Workers may not have a valid team pointer yet; only the master does.
1915 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1917 void *itt_sync_obj = NULL;
1920 ANNOTATE_BARRIER_END(&team->t.t_bar);
1922 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1923 (team != NULL) ? team->t.t_id : -1, tid));
1926 if (KMP_MASTER_TID(tid)) {
1927 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1928 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1930 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1931 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
// Debug-only: verify every worker's b_go is in the initial (possibly
// sleeping) state and that all workers point at this team.
1936 kmp_info_t **other_threads = team->t.t_threads;
1942 for (i = 1; i < team->t.t_nproc; ++i) {
1944 (
"__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1946 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1947 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1948 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1950 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1951 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1952 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1956 if (__kmp_tasking_mode != tskm_immediate_exec) {
1958 __kmp_task_team_setup(this_thr, team, 0);
// Cache blocktime ICVs locally for the spin-wait (master branch only —
// workers have no team yet at this point).
1967 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1969 this_thr->th.th_team_bt_intervals =
1970 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1971 this_thr->th.th_team_bt_set =
1972 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1974 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
// Release phase by configured pattern; TRUE = final spin (fork barrier).
1979 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1980 case bp_hyper_bar: {
1981 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1982 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1983 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1986 case bp_hierarchical_bar: {
1987 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1988 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1992 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1993 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1994 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1998 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1999 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
// OMPT: close the implicit-barrier region begun at the join barrier; a
// worker with no team uses the task data it stashed there.
2004 if (ompt_enabled.enabled &&
2005 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2006 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2007 ompt_data_t *task_data = (team)
2008 ? OMPT_CUR_TASK_DATA(this_thr)
2009 : &(this_thr->th.ompt_thread_info.task_data);
2010 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2012 void *codeptr = NULL;
2013 if (KMP_MASTER_TID(ds_tid) &&
2014 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2015 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2016 codeptr = team->t.ompt_team_info.master_return_address;
2017 if (ompt_enabled.ompt_callback_sync_region_wait) {
2018 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2019 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2022 if (ompt_enabled.ompt_callback_sync_region) {
2023 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2024 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2028 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2029 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2030 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit);
// Library shutdown: drop the task team and leave early.
2036 if (TCR_4(__kmp_global.g.g_done)) {
2037 this_thr->th.th_task_team = NULL;
2039 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2040 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2041 if (!KMP_MASTER_TID(tid)) {
2042 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2044 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2048 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d is leaving early\n", gtid));
// Workers: team pointer is valid only after release — read it now.
2056 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2057 KMP_DEBUG_ASSERT(team != NULL);
2058 tid = __kmp_tid_from_gtid(gtid);
2060 #if KMP_BARRIER_ICV_PULL
// ICV-pull model: workers copy ICVs from the master's fixed-ICV slot.
2068 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2069 if (!KMP_MASTER_TID(tid)) {
2073 (
"__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2074 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2076 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2077 &team->t.t_threads[0]
2078 ->th.th_bar[bs_forkjoin_barrier]
2084 if (__kmp_tasking_mode != tskm_immediate_exec) {
2085 __kmp_task_team_sync(this_thr, team);
2088 #if KMP_AFFINITY_SUPPORTED
// Apply the team's proc-bind policy; balanced affinity re-binds only when
// the team size changed.
2089 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2090 if (proc_bind == proc_bind_intel) {
2092 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2093 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2095 }
else if (proc_bind != proc_bind_false) {
2096 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2097 KA_TRACE(100, (
"__kmp_fork_barrier: T#%d already in correct place %d\n",
2098 __kmp_gtid_from_thread(this_thr),
2099 this_thr->th.th_current_place));
2101 __kmp_affinity_set_place(gtid);
// OMP_DISPLAY_AFFINITY support: print once per affinity change.
2106 if (__kmp_display_affinity) {
2107 if (team->t.t_display_affinity
2108 #
if KMP_AFFINITY_SUPPORTED
2109 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2113 __kmp_aux_display_affinity(gtid, NULL);
2114 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2115 this_thr->th.th_prev_level = team->t.t_level;
2118 if (!KMP_MASTER_TID(tid))
2119 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2121 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2122 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2123 if (!KMP_MASTER_TID(tid)) {
2125 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2126 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2130 ANNOTATE_BARRIER_END(&team->t.t_bar);
2131 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2132 team->t.t_id, tid));
2135 void __kmp_setup_icv_copy(kmp_team_t *team,
int new_nproc,
2136 kmp_internal_control_t *new_icvs,
ident_t *loc) {
2137 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2139 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2140 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2145 #if KMP_BARRIER_ICV_PULL
2149 KMP_DEBUG_ASSERT(team->t.t_threads[0]);
2152 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2154 KF_TRACE(10, (
"__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2155 team->t.t_threads[0], team));
2156 #elif KMP_BARRIER_ICV_PUSH
2159 KF_TRACE(10, (
"__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2160 team->t.t_threads[0], team));
2165 KMP_DEBUG_ASSERT(team->t.t_threads[0]);
2167 for (
int f = 1; f < new_nproc; ++f) {
2169 KF_TRACE(10, (
"__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2170 f, team->t.t_threads[f], team));
2171 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2172 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2173 KF_TRACE(10, (
"__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2174 f, team->t.t_threads[f], team));